xref: /linux/mm/migrate.c (revision b24413180f5600bcb3bb70fbed5cf186b60864bd)
1*b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
2b20a3503SChristoph Lameter /*
314e0f9bcSHugh Dickins  * Memory Migration functionality - linux/mm/migrate.c
4b20a3503SChristoph Lameter  *
5b20a3503SChristoph Lameter  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
6b20a3503SChristoph Lameter  *
7b20a3503SChristoph Lameter  * Page migration was first developed in the context of the memory hotplug
8b20a3503SChristoph Lameter  * project. The main authors of the migration code are:
9b20a3503SChristoph Lameter  *
10b20a3503SChristoph Lameter  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
11b20a3503SChristoph Lameter  * Hirokazu Takahashi <taka@valinux.co.jp>
12b20a3503SChristoph Lameter  * Dave Hansen <haveblue@us.ibm.com>
13cde53535SChristoph Lameter  * Christoph Lameter
14b20a3503SChristoph Lameter  */
15b20a3503SChristoph Lameter 
16b20a3503SChristoph Lameter #include <linux/migrate.h>
17b95f1b31SPaul Gortmaker #include <linux/export.h>
18b20a3503SChristoph Lameter #include <linux/swap.h>
190697212aSChristoph Lameter #include <linux/swapops.h>
20b20a3503SChristoph Lameter #include <linux/pagemap.h>
21e23ca00bSChristoph Lameter #include <linux/buffer_head.h>
22b20a3503SChristoph Lameter #include <linux/mm_inline.h>
23b488893aSPavel Emelyanov #include <linux/nsproxy.h>
24b20a3503SChristoph Lameter #include <linux/pagevec.h>
25e9995ef9SHugh Dickins #include <linux/ksm.h>
26b20a3503SChristoph Lameter #include <linux/rmap.h>
27b20a3503SChristoph Lameter #include <linux/topology.h>
28b20a3503SChristoph Lameter #include <linux/cpu.h>
29b20a3503SChristoph Lameter #include <linux/cpuset.h>
3004e62a29SChristoph Lameter #include <linux/writeback.h>
31742755a1SChristoph Lameter #include <linux/mempolicy.h>
32742755a1SChristoph Lameter #include <linux/vmalloc.h>
3386c3a764SDavid Quigley #include <linux/security.h>
3442cb14b1SHugh Dickins #include <linux/backing-dev.h>
35bda807d4SMinchan Kim #include <linux/compaction.h>
364f5ca265SAdrian Bunk #include <linux/syscalls.h>
37290408d4SNaoya Horiguchi #include <linux/hugetlb.h>
388e6ac7faSAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
395a0e3ad6STejun Heo #include <linux/gfp.h>
40df6ad698SJérôme Glisse #include <linux/pfn_t.h>
41a5430ddaSJérôme Glisse #include <linux/memremap.h>
428315ada7SJérôme Glisse #include <linux/userfaultfd_k.h>
43bf6bddf1SRafael Aquini #include <linux/balloon_compaction.h>
44f714f4f2SMel Gorman #include <linux/mmu_notifier.h>
4533c3fc71SVladimir Davydov #include <linux/page_idle.h>
46d435edcaSVlastimil Babka #include <linux/page_owner.h>
476e84f315SIngo Molnar #include <linux/sched/mm.h>
48197e7e52SLinus Torvalds #include <linux/ptrace.h>
49b20a3503SChristoph Lameter 
500d1836c3SMichal Nazarewicz #include <asm/tlbflush.h>
510d1836c3SMichal Nazarewicz 
527b2a2d4aSMel Gorman #define CREATE_TRACE_POINTS
537b2a2d4aSMel Gorman #include <trace/events/migrate.h>
547b2a2d4aSMel Gorman 
55b20a3503SChristoph Lameter #include "internal.h"
56b20a3503SChristoph Lameter 
57b20a3503SChristoph Lameter /*
58742755a1SChristoph Lameter  * migrate_prep() needs to be called before we start compiling a list of pages
59748446bbSMel Gorman  * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
60748446bbSMel Gorman  * undesirable, use migrate_prep_local()
61b20a3503SChristoph Lameter  */
62b20a3503SChristoph Lameter int migrate_prep(void)
63b20a3503SChristoph Lameter {
64b20a3503SChristoph Lameter 	/*
65b20a3503SChristoph Lameter 	 * Clear the LRU lists so pages can be isolated.
66b20a3503SChristoph Lameter 	 * Note that pages may be moved off the LRU after we have
67b20a3503SChristoph Lameter 	 * drained them. Those pages will fail to migrate like other
68b20a3503SChristoph Lameter 	 * pages that may be busy.
69b20a3503SChristoph Lameter 	 */
70b20a3503SChristoph Lameter 	lru_add_drain_all();
71b20a3503SChristoph Lameter 
72b20a3503SChristoph Lameter 	return 0;
73b20a3503SChristoph Lameter }
74b20a3503SChristoph Lameter 
75748446bbSMel Gorman /* Do the necessary work of migrate_prep but not if it involves other CPUs */
76748446bbSMel Gorman int migrate_prep_local(void)
77748446bbSMel Gorman {
78748446bbSMel Gorman 	lru_add_drain();
79748446bbSMel Gorman 
80748446bbSMel Gorman 	return 0;
81748446bbSMel Gorman }
82748446bbSMel Gorman 
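/*
 * Illustrative sketch only (not part of this file): the typical calling
 * pattern around migrate_prep()/isolate_lru_page()/migrate_pages().
 * alloc_target_page() and migrate_one_lru_page() are hypothetical names;
 * compare the real callers in mm/mempolicy.c and later in this file.
 */
#if 0	/* example only */
static struct page *alloc_target_page(struct page *page, unsigned long private,
				      int **result)
{
	return alloc_page(GFP_HIGHUSER_MOVABLE);
}

static int migrate_one_lru_page(struct page *page)
{
	LIST_HEAD(pagelist);
	int err;

	migrate_prep();				/* drain per-CPU LRU pagevecs */
	err = isolate_lru_page(page);		/* takes its own page reference */
	if (err)
		return err;
	list_add_tail(&page->lru, &pagelist);
	/* NR_ISOLATED_* node stat accounting is elided for brevity */
	err = migrate_pages(&pagelist, alloc_target_page, NULL, 0,
			    MIGRATE_SYNC, MR_SYSCALL);
	if (err)
		/* put any pages that were not migrated back on the LRU */
		putback_movable_pages(&pagelist);
	return err;
}
#endif
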
839e5bcd61SYisheng Xie int isolate_movable_page(struct page *page, isolate_mode_t mode)
84bda807d4SMinchan Kim {
85bda807d4SMinchan Kim 	struct address_space *mapping;
86bda807d4SMinchan Kim 
87bda807d4SMinchan Kim 	/*
88bda807d4SMinchan Kim 	 * Avoid burning cycles on pages that are still under __free_pages(),
89bda807d4SMinchan Kim 	 * or that just got freed under us.
90bda807d4SMinchan Kim 	 *
91bda807d4SMinchan Kim 	 * In case we 'win' a race for a movable page that is being freed
92bda807d4SMinchan Kim 	 * under us and raise its refcount, preventing __free_pages() from
93bda807d4SMinchan Kim 	 * doing its job, the put_page() at the end of this block will take
94bda807d4SMinchan Kim 	 * care of releasing the page, thus avoiding a nasty leak.
95bda807d4SMinchan Kim 	 */
96bda807d4SMinchan Kim 	if (unlikely(!get_page_unless_zero(page)))
97bda807d4SMinchan Kim 		goto out;
98bda807d4SMinchan Kim 
99bda807d4SMinchan Kim 	/*
100bda807d4SMinchan Kim 	 * Check PageMovable before taking PG_lock because the page's owner
101bda807d4SMinchan Kim 	 * assumes that nobody touches the PG_lock of a newly allocated page,
102bda807d4SMinchan Kim 	 * so unconditionally grabbing the lock would ruin the owner's side.
103bda807d4SMinchan Kim 	 */
104bda807d4SMinchan Kim 	if (unlikely(!__PageMovable(page)))
105bda807d4SMinchan Kim 		goto out_putpage;
106bda807d4SMinchan Kim 	/*
107bda807d4SMinchan Kim 	 * As movable pages are not isolated from LRU lists, concurrent
108bda807d4SMinchan Kim 	 * compaction threads can race against page migration functions
109bda807d4SMinchan Kim 	 * as well as against a page being released.
110bda807d4SMinchan Kim 	 *
111bda807d4SMinchan Kim 	 * In order to avoid having an already isolated movable page
112bda807d4SMinchan Kim 	 * being (wrongly) re-isolated while it is under migration,
113bda807d4SMinchan Kim 	 * or to avoid attempting to isolate pages being released,
114bda807d4SMinchan Kim 	 * let's be sure we hold the page lock
115bda807d4SMinchan Kim 	 * before proceeding with the movable page isolation steps.
116bda807d4SMinchan Kim 	 */
117bda807d4SMinchan Kim 	if (unlikely(!trylock_page(page)))
118bda807d4SMinchan Kim 		goto out_putpage;
119bda807d4SMinchan Kim 
120bda807d4SMinchan Kim 	if (!PageMovable(page) || PageIsolated(page))
121bda807d4SMinchan Kim 		goto out_no_isolated;
122bda807d4SMinchan Kim 
123bda807d4SMinchan Kim 	mapping = page_mapping(page);
124bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!mapping, page);
125bda807d4SMinchan Kim 
126bda807d4SMinchan Kim 	if (!mapping->a_ops->isolate_page(page, mode))
127bda807d4SMinchan Kim 		goto out_no_isolated;
128bda807d4SMinchan Kim 
129bda807d4SMinchan Kim 	/* Driver shouldn't use PG_isolated bit of page->flags */
130bda807d4SMinchan Kim 	WARN_ON_ONCE(PageIsolated(page));
131bda807d4SMinchan Kim 	__SetPageIsolated(page);
132bda807d4SMinchan Kim 	unlock_page(page);
133bda807d4SMinchan Kim 
1349e5bcd61SYisheng Xie 	return 0;
135bda807d4SMinchan Kim 
136bda807d4SMinchan Kim out_no_isolated:
137bda807d4SMinchan Kim 	unlock_page(page);
138bda807d4SMinchan Kim out_putpage:
139bda807d4SMinchan Kim 	put_page(page);
140bda807d4SMinchan Kim out:
1419e5bcd61SYisheng Xie 	return -EBUSY;
142bda807d4SMinchan Kim }
143bda807d4SMinchan Kim 
144bda807d4SMinchan Kim /* It should be called on a page which is PG_movable */
145bda807d4SMinchan Kim void putback_movable_page(struct page *page)
146bda807d4SMinchan Kim {
147bda807d4SMinchan Kim 	struct address_space *mapping;
148bda807d4SMinchan Kim 
149bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageLocked(page), page);
150bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageMovable(page), page);
151bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageIsolated(page), page);
152bda807d4SMinchan Kim 
153bda807d4SMinchan Kim 	mapping = page_mapping(page);
154bda807d4SMinchan Kim 	mapping->a_ops->putback_page(page);
155bda807d4SMinchan Kim 	__ClearPageIsolated(page);
156bda807d4SMinchan Kim }
157bda807d4SMinchan Kim 
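/*
 * Illustrative sketch only (not part of this file): a driver that wants its
 * non-LRU pages migrated provides address_space_operations with
 * isolate_page/migratepage/putback_page and marks each page movable with
 * __SetPageMovable(). All my_* names are hypothetical placeholders; see
 * zsmalloc or balloon_compaction for in-tree users.
 */
#if 0	/* example only */
static bool my_isolate_page(struct page *page, isolate_mode_t mode)
{
	/* pin driver-private state so the page cannot go away under migration */
	return true;
}

static int my_migratepage(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	/* copy contents/metadata to newpage, then retire the old page */
	return MIGRATEPAGE_SUCCESS;
}

static void my_putback_page(struct page *page)
{
	/* undo my_isolate_page() */
}

static const struct address_space_operations my_movable_aops = {
	.isolate_page	= my_isolate_page,
	.migratepage	= my_migratepage,
	.putback_page	= my_putback_page,
};

static void my_mark_movable(struct page *page, struct address_space *mapping)
{
	/* mapping->a_ops must point at my_movable_aops */
	lock_page(page);
	__SetPageMovable(page, mapping);	/* sets PAGE_MAPPING_MOVABLE in page->mapping */
	unlock_page(page);
}
#endif
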
158b20a3503SChristoph Lameter /*
1595733c7d1SRafael Aquini  * Put previously isolated pages back onto the appropriate lists
1605733c7d1SRafael Aquini  * from where they were once taken off for compaction/migration.
1615733c7d1SRafael Aquini  *
16259c82b70SJoonsoo Kim  * This function shall be used whenever the isolated pageset has been
16359c82b70SJoonsoo Kim  * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
16459c82b70SJoonsoo Kim  * and isolate_huge_page().
1655733c7d1SRafael Aquini  */
1665733c7d1SRafael Aquini void putback_movable_pages(struct list_head *l)
1675733c7d1SRafael Aquini {
1685733c7d1SRafael Aquini 	struct page *page;
1695733c7d1SRafael Aquini 	struct page *page2;
1705733c7d1SRafael Aquini 
1715733c7d1SRafael Aquini 	list_for_each_entry_safe(page, page2, l, lru) {
17231caf665SNaoya Horiguchi 		if (unlikely(PageHuge(page))) {
17331caf665SNaoya Horiguchi 			putback_active_hugepage(page);
17431caf665SNaoya Horiguchi 			continue;
17531caf665SNaoya Horiguchi 		}
1765733c7d1SRafael Aquini 		list_del(&page->lru);
177bda807d4SMinchan Kim 		/*
178bda807d4SMinchan Kim 		 * We isolated a non-lru movable page so here we can use
179bda807d4SMinchan Kim 		 * __PageMovable because an LRU page's mapping cannot have
180bda807d4SMinchan Kim 		 * PAGE_MAPPING_MOVABLE.
181bda807d4SMinchan Kim 		 */
182b1123ea6SMinchan Kim 		if (unlikely(__PageMovable(page))) {
183bda807d4SMinchan Kim 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
184bda807d4SMinchan Kim 			lock_page(page);
185bda807d4SMinchan Kim 			if (PageMovable(page))
186bda807d4SMinchan Kim 				putback_movable_page(page);
187bf6bddf1SRafael Aquini 			else
188bda807d4SMinchan Kim 				__ClearPageIsolated(page);
189bda807d4SMinchan Kim 			unlock_page(page);
190bda807d4SMinchan Kim 			put_page(page);
191bda807d4SMinchan Kim 		} else {
192e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
193e8db67ebSNaoya Horiguchi 					page_is_file_cache(page), -hpage_nr_pages(page));
194fc280fe8SRabin Vincent 			putback_lru_page(page);
195b20a3503SChristoph Lameter 		}
196b20a3503SChristoph Lameter 	}
197bda807d4SMinchan Kim }
198b20a3503SChristoph Lameter 
1990697212aSChristoph Lameter /*
2000697212aSChristoph Lameter  * Restore a potential migration pte to a working pte entry
2010697212aSChristoph Lameter  */
202e4b82222SMinchan Kim static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
203e9995ef9SHugh Dickins 				 unsigned long addr, void *old)
2040697212aSChristoph Lameter {
2053fe87967SKirill A. Shutemov 	struct page_vma_mapped_walk pvmw = {
2063fe87967SKirill A. Shutemov 		.page = old,
2073fe87967SKirill A. Shutemov 		.vma = vma,
2083fe87967SKirill A. Shutemov 		.address = addr,
2093fe87967SKirill A. Shutemov 		.flags = PVMW_SYNC | PVMW_MIGRATION,
2103fe87967SKirill A. Shutemov 	};
2113fe87967SKirill A. Shutemov 	struct page *new;
2123fe87967SKirill A. Shutemov 	pte_t pte;
2130697212aSChristoph Lameter 	swp_entry_t entry;
2140697212aSChristoph Lameter 
2153fe87967SKirill A. Shutemov 	VM_BUG_ON_PAGE(PageTail(page), page);
2163fe87967SKirill A. Shutemov 	while (page_vma_mapped_walk(&pvmw)) {
2174b0ece6fSNaoya Horiguchi 		if (PageKsm(page))
2184b0ece6fSNaoya Horiguchi 			new = page;
2194b0ece6fSNaoya Horiguchi 		else
2203fe87967SKirill A. Shutemov 			new = page - pvmw.page->index +
2213fe87967SKirill A. Shutemov 				linear_page_index(vma, pvmw.address);
2220697212aSChristoph Lameter 
223616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
224616b8371SZi Yan 		/* PMD-mapped THP migration entry */
225616b8371SZi Yan 		if (!pvmw.pte) {
226616b8371SZi Yan 			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
227616b8371SZi Yan 			remove_migration_pmd(&pvmw, new);
228616b8371SZi Yan 			continue;
229616b8371SZi Yan 		}
230616b8371SZi Yan #endif
231616b8371SZi Yan 
2320697212aSChristoph Lameter 		get_page(new);
2336d2329f8SAndrea Arcangeli 		pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
2343fe87967SKirill A. Shutemov 		if (pte_swp_soft_dirty(*pvmw.pte))
235c3d16e16SCyrill Gorcunov 			pte = pte_mksoft_dirty(pte);
236d3cb8bf6SMel Gorman 
2373fe87967SKirill A. Shutemov 		/*
2383fe87967SKirill A. Shutemov 		 * Recheck VMA as permissions can change since migration started
2393fe87967SKirill A. Shutemov 		 */
2403fe87967SKirill A. Shutemov 		entry = pte_to_swp_entry(*pvmw.pte);
2410697212aSChristoph Lameter 		if (is_write_migration_entry(entry))
242d3cb8bf6SMel Gorman 			pte = maybe_mkwrite(pte, vma);
243d3cb8bf6SMel Gorman 
244df6ad698SJérôme Glisse 		if (unlikely(is_zone_device_page(new))) {
245df6ad698SJérôme Glisse 			if (is_device_private_page(new)) {
246a5430ddaSJérôme Glisse 				entry = make_device_private_entry(new, pte_write(pte));
247a5430ddaSJérôme Glisse 				pte = swp_entry_to_pte(entry);
248df6ad698SJérôme Glisse 			} else if (is_device_public_page(new)) {
249df6ad698SJérôme Glisse 				pte = pte_mkdevmap(pte);
250df6ad698SJérôme Glisse 				flush_dcache_page(new);
251df6ad698SJérôme Glisse 			}
252a5430ddaSJérôme Glisse 		} else
253383321abSAneesh Kumar K.V 			flush_dcache_page(new);
254a5430ddaSJérôme Glisse 
2553ef8fd7fSAndi Kleen #ifdef CONFIG_HUGETLB_PAGE
256be7517d6STony Lu 		if (PageHuge(new)) {
257290408d4SNaoya Horiguchi 			pte = pte_mkhuge(pte);
258be7517d6STony Lu 			pte = arch_make_huge_pte(pte, vma, new, 0);
259383321abSAneesh Kumar K.V 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
26004e62a29SChristoph Lameter 			if (PageAnon(new))
2613fe87967SKirill A. Shutemov 				hugepage_add_anon_rmap(new, vma, pvmw.address);
262290408d4SNaoya Horiguchi 			else
26353f9263bSKirill A. Shutemov 				page_dup_rmap(new, true);
264383321abSAneesh Kumar K.V 		} else
265383321abSAneesh Kumar K.V #endif
266383321abSAneesh Kumar K.V 		{
267383321abSAneesh Kumar K.V 			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
268383321abSAneesh Kumar K.V 
269383321abSAneesh Kumar K.V 			if (PageAnon(new))
2703fe87967SKirill A. Shutemov 				page_add_anon_rmap(new, vma, pvmw.address, false);
27104e62a29SChristoph Lameter 			else
272dd78feddSKirill A. Shutemov 				page_add_file_rmap(new, false);
273383321abSAneesh Kumar K.V 		}
274e388466dSKirill A. Shutemov 		if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
27551afb12bSHugh Dickins 			mlock_vma_page(new);
27651afb12bSHugh Dickins 
27704e62a29SChristoph Lameter 		/* No need to invalidate - it was non-present before */
2783fe87967SKirill A. Shutemov 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
2793fe87967SKirill A. Shutemov 	}
2803fe87967SKirill A. Shutemov 
281e4b82222SMinchan Kim 	return true;
2820697212aSChristoph Lameter }
2830697212aSChristoph Lameter 
2840697212aSChristoph Lameter /*
28504e62a29SChristoph Lameter  * Get rid of all migration entries and replace them by
28604e62a29SChristoph Lameter  * references to the indicated page.
28704e62a29SChristoph Lameter  */
288e388466dSKirill A. Shutemov void remove_migration_ptes(struct page *old, struct page *new, bool locked)
28904e62a29SChristoph Lameter {
290051ac83aSJoonsoo Kim 	struct rmap_walk_control rwc = {
291051ac83aSJoonsoo Kim 		.rmap_one = remove_migration_pte,
292051ac83aSJoonsoo Kim 		.arg = old,
293051ac83aSJoonsoo Kim 	};
294051ac83aSJoonsoo Kim 
295e388466dSKirill A. Shutemov 	if (locked)
296e388466dSKirill A. Shutemov 		rmap_walk_locked(new, &rwc);
297e388466dSKirill A. Shutemov 	else
298051ac83aSJoonsoo Kim 		rmap_walk(new, &rwc);
29904e62a29SChristoph Lameter }
30004e62a29SChristoph Lameter 
30104e62a29SChristoph Lameter /*
3020697212aSChristoph Lameter  * Something used the pte of a page under migration. We need to
3030697212aSChristoph Lameter  * get to the page and wait until migration is finished.
3040697212aSChristoph Lameter  * When we return from this function the fault will be retried.
3050697212aSChristoph Lameter  */
306e66f17ffSNaoya Horiguchi void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
30730dad309SNaoya Horiguchi 				spinlock_t *ptl)
3080697212aSChristoph Lameter {
30930dad309SNaoya Horiguchi 	pte_t pte;
3100697212aSChristoph Lameter 	swp_entry_t entry;
3110697212aSChristoph Lameter 	struct page *page;
3120697212aSChristoph Lameter 
31330dad309SNaoya Horiguchi 	spin_lock(ptl);
3140697212aSChristoph Lameter 	pte = *ptep;
3150697212aSChristoph Lameter 	if (!is_swap_pte(pte))
3160697212aSChristoph Lameter 		goto out;
3170697212aSChristoph Lameter 
3180697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
3190697212aSChristoph Lameter 	if (!is_migration_entry(entry))
3200697212aSChristoph Lameter 		goto out;
3210697212aSChristoph Lameter 
3220697212aSChristoph Lameter 	page = migration_entry_to_page(entry);
3230697212aSChristoph Lameter 
324e286781dSNick Piggin 	/*
325e286781dSNick Piggin 	 * Once the radix-tree replacement for page migration has started,
326e286781dSNick Piggin 	 * page_count *must* be zero. And we don't want to call
327e286781dSNick Piggin 	 * wait_on_page_locked() against a page without holding a reference.
328e286781dSNick Piggin 	 * So we use get_page_unless_zero() here. Even if it fails, the page
329e286781dSNick Piggin 	 * fault will simply occur again.
330e286781dSNick Piggin 	 */
331e286781dSNick Piggin 	if (!get_page_unless_zero(page))
332e286781dSNick Piggin 		goto out;
3330697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
3340697212aSChristoph Lameter 	wait_on_page_locked(page);
3350697212aSChristoph Lameter 	put_page(page);
3360697212aSChristoph Lameter 	return;
3370697212aSChristoph Lameter out:
3380697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
3390697212aSChristoph Lameter }
3400697212aSChristoph Lameter 
34130dad309SNaoya Horiguchi void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
34230dad309SNaoya Horiguchi 				unsigned long address)
34330dad309SNaoya Horiguchi {
34430dad309SNaoya Horiguchi 	spinlock_t *ptl = pte_lockptr(mm, pmd);
34530dad309SNaoya Horiguchi 	pte_t *ptep = pte_offset_map(pmd, address);
34630dad309SNaoya Horiguchi 	__migration_entry_wait(mm, ptep, ptl);
34730dad309SNaoya Horiguchi }
34830dad309SNaoya Horiguchi 
349cb900f41SKirill A. Shutemov void migration_entry_wait_huge(struct vm_area_struct *vma,
350cb900f41SKirill A. Shutemov 		struct mm_struct *mm, pte_t *pte)
35130dad309SNaoya Horiguchi {
352cb900f41SKirill A. Shutemov 	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
35330dad309SNaoya Horiguchi 	__migration_entry_wait(mm, pte, ptl);
35430dad309SNaoya Horiguchi }
35530dad309SNaoya Horiguchi 
356616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
357616b8371SZi Yan void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
358616b8371SZi Yan {
359616b8371SZi Yan 	spinlock_t *ptl;
360616b8371SZi Yan 	struct page *page;
361616b8371SZi Yan 
362616b8371SZi Yan 	ptl = pmd_lock(mm, pmd);
363616b8371SZi Yan 	if (!is_pmd_migration_entry(*pmd))
364616b8371SZi Yan 		goto unlock;
365616b8371SZi Yan 	page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
366616b8371SZi Yan 	if (!get_page_unless_zero(page))
367616b8371SZi Yan 		goto unlock;
368616b8371SZi Yan 	spin_unlock(ptl);
369616b8371SZi Yan 	wait_on_page_locked(page);
370616b8371SZi Yan 	put_page(page);
371616b8371SZi Yan 	return;
372616b8371SZi Yan unlock:
373616b8371SZi Yan 	spin_unlock(ptl);
374616b8371SZi Yan }
375616b8371SZi Yan #endif
376616b8371SZi Yan 
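/*
 * Illustrative sketch only (not part of this file): roughly how the fault
 * path (see do_swap_page()) parks on a migration entry until migration
 * completes; my_wait_if_migrating is a hypothetical helper.
 */
#if 0	/* example only */
static void my_wait_if_migrating(struct vm_fault *vmf, swp_entry_t entry)
{
	if (is_migration_entry(entry))
		migration_entry_wait(vmf->vma->vm_mm, vmf->pmd, vmf->address);
	/* on return the faulting access is simply retried */
}
#endif
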
377b969c4abSMel Gorman #ifdef CONFIG_BLOCK
378b969c4abSMel Gorman /* Returns true if all buffers are successfully locked */
379a6bc32b8SMel Gorman static bool buffer_migrate_lock_buffers(struct buffer_head *head,
380a6bc32b8SMel Gorman 							enum migrate_mode mode)
381b969c4abSMel Gorman {
382b969c4abSMel Gorman 	struct buffer_head *bh = head;
383b969c4abSMel Gorman 
384b969c4abSMel Gorman 	/* Simple case, sync compaction */
385a6bc32b8SMel Gorman 	if (mode != MIGRATE_ASYNC) {
386b969c4abSMel Gorman 		do {
387b969c4abSMel Gorman 			get_bh(bh);
388b969c4abSMel Gorman 			lock_buffer(bh);
389b969c4abSMel Gorman 			bh = bh->b_this_page;
390b969c4abSMel Gorman 
391b969c4abSMel Gorman 		} while (bh != head);
392b969c4abSMel Gorman 
393b969c4abSMel Gorman 		return true;
394b969c4abSMel Gorman 	}
395b969c4abSMel Gorman 
396b969c4abSMel Gorman 	/* async case, we cannot block on lock_buffer so use trylock_buffer */
397b969c4abSMel Gorman 	do {
398b969c4abSMel Gorman 		get_bh(bh);
399b969c4abSMel Gorman 		if (!trylock_buffer(bh)) {
400b969c4abSMel Gorman 			/*
401b969c4abSMel Gorman 			 * We failed to lock the buffer and cannot stall in
402b969c4abSMel Gorman 			 * async migration. Release the taken locks
403b969c4abSMel Gorman 			 */
404b969c4abSMel Gorman 			struct buffer_head *failed_bh = bh;
405b969c4abSMel Gorman 			put_bh(failed_bh);
406b969c4abSMel Gorman 			bh = head;
407b969c4abSMel Gorman 			while (bh != failed_bh) {
408b969c4abSMel Gorman 				unlock_buffer(bh);
409b969c4abSMel Gorman 				put_bh(bh);
410b969c4abSMel Gorman 				bh = bh->b_this_page;
411b969c4abSMel Gorman 			}
412b969c4abSMel Gorman 			return false;
413b969c4abSMel Gorman 		}
414b969c4abSMel Gorman 
415b969c4abSMel Gorman 		bh = bh->b_this_page;
416b969c4abSMel Gorman 	} while (bh != head);
417b969c4abSMel Gorman 	return true;
418b969c4abSMel Gorman }
419b969c4abSMel Gorman #else
420b969c4abSMel Gorman static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
421a6bc32b8SMel Gorman 							enum migrate_mode mode)
422b969c4abSMel Gorman {
423b969c4abSMel Gorman 	return true;
424b969c4abSMel Gorman }
425b969c4abSMel Gorman #endif /* CONFIG_BLOCK */
426b969c4abSMel Gorman 
427b20a3503SChristoph Lameter /*
428c3fcf8a5SChristoph Lameter  * Replace the page in the mapping.
4295b5c7120SChristoph Lameter  *
4305b5c7120SChristoph Lameter  * The number of remaining references must be:
4315b5c7120SChristoph Lameter  * 1 for anonymous pages without a mapping
4325b5c7120SChristoph Lameter  * 2 for pages with a mapping
433266cf658SDavid Howells  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
434b20a3503SChristoph Lameter  */
43536bc08ccSGu Zheng int migrate_page_move_mapping(struct address_space *mapping,
436b969c4abSMel Gorman 		struct page *newpage, struct page *page,
4378e321fefSBenjamin LaHaise 		struct buffer_head *head, enum migrate_mode mode,
4388e321fefSBenjamin LaHaise 		int extra_count)
439b20a3503SChristoph Lameter {
44042cb14b1SHugh Dickins 	struct zone *oldzone, *newzone;
44142cb14b1SHugh Dickins 	int dirty;
4428e321fefSBenjamin LaHaise 	int expected_count = 1 + extra_count;
4437cf9c2c7SNick Piggin 	void **pslot;
444b20a3503SChristoph Lameter 
4458763cb45SJérôme Glisse 	/*
446df6ad698SJérôme Glisse 	 * Device public or private pages have an extra refcount as they are
447df6ad698SJérôme Glisse 	 * ZONE_DEVICE pages.
4488763cb45SJérôme Glisse 	 */
449df6ad698SJérôme Glisse 	expected_count += is_device_private_page(page);
450df6ad698SJérôme Glisse 	expected_count += is_device_public_page(page);
4518763cb45SJérôme Glisse 
4526c5240aeSChristoph Lameter 	if (!mapping) {
4530e8c7d0fSChristoph Lameter 		/* Anonymous page without mapping */
4548e321fefSBenjamin LaHaise 		if (page_count(page) != expected_count)
4556c5240aeSChristoph Lameter 			return -EAGAIN;
456cf4b769aSHugh Dickins 
457cf4b769aSHugh Dickins 		/* No turning back from here */
458cf4b769aSHugh Dickins 		newpage->index = page->index;
459cf4b769aSHugh Dickins 		newpage->mapping = page->mapping;
460cf4b769aSHugh Dickins 		if (PageSwapBacked(page))
461fa9949daSHugh Dickins 			__SetPageSwapBacked(newpage);
462cf4b769aSHugh Dickins 
46378bd5209SRafael Aquini 		return MIGRATEPAGE_SUCCESS;
4646c5240aeSChristoph Lameter 	}
4656c5240aeSChristoph Lameter 
46642cb14b1SHugh Dickins 	oldzone = page_zone(page);
46742cb14b1SHugh Dickins 	newzone = page_zone(newpage);
46842cb14b1SHugh Dickins 
46919fd6231SNick Piggin 	spin_lock_irq(&mapping->tree_lock);
470b20a3503SChristoph Lameter 
4717cf9c2c7SNick Piggin 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
472b20a3503SChristoph Lameter  					page_index(page));
473b20a3503SChristoph Lameter 
4748e321fefSBenjamin LaHaise 	expected_count += 1 + page_has_private(page);
475e286781dSNick Piggin 	if (page_count(page) != expected_count ||
47629c1f677SMel Gorman 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
47719fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
478e23ca00bSChristoph Lameter 		return -EAGAIN;
479b20a3503SChristoph Lameter 	}
480b20a3503SChristoph Lameter 
481fe896d18SJoonsoo Kim 	if (!page_ref_freeze(page, expected_count)) {
48219fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
483e286781dSNick Piggin 		return -EAGAIN;
484e286781dSNick Piggin 	}
485e286781dSNick Piggin 
486b20a3503SChristoph Lameter 	/*
487b969c4abSMel Gorman 	 * In the async migration case of moving a page with buffers, lock the
488b969c4abSMel Gorman 	 * buffers using trylock before the mapping is moved. If the mapping
489b969c4abSMel Gorman 	 * were moved first and we then failed to lock the buffers, we could
490b969c4abSMel Gorman 	 * not move the mapping back due to an elevated page count and would
491b969c4abSMel Gorman 	 * have to block waiting on other references to be dropped.
492b969c4abSMel Gorman 	 */
493a6bc32b8SMel Gorman 	if (mode == MIGRATE_ASYNC && head &&
494a6bc32b8SMel Gorman 			!buffer_migrate_lock_buffers(head, mode)) {
495fe896d18SJoonsoo Kim 		page_ref_unfreeze(page, expected_count);
496b969c4abSMel Gorman 		spin_unlock_irq(&mapping->tree_lock);
497b969c4abSMel Gorman 		return -EAGAIN;
498b969c4abSMel Gorman 	}
499b969c4abSMel Gorman 
500b969c4abSMel Gorman 	/*
501cf4b769aSHugh Dickins 	 * Now we know that no one else is looking at the page:
502cf4b769aSHugh Dickins 	 * no turning back from here.
503b20a3503SChristoph Lameter 	 */
504cf4b769aSHugh Dickins 	newpage->index = page->index;
505cf4b769aSHugh Dickins 	newpage->mapping = page->mapping;
5067cf9c2c7SNick Piggin 	get_page(newpage);	/* add cache reference */
5076326fec1SNicholas Piggin 	if (PageSwapBacked(page)) {
5086326fec1SNicholas Piggin 		__SetPageSwapBacked(newpage);
509b20a3503SChristoph Lameter 		if (PageSwapCache(page)) {
510b20a3503SChristoph Lameter 			SetPageSwapCache(newpage);
511b20a3503SChristoph Lameter 			set_page_private(newpage, page_private(page));
512b20a3503SChristoph Lameter 		}
5136326fec1SNicholas Piggin 	} else {
5146326fec1SNicholas Piggin 		VM_BUG_ON_PAGE(PageSwapCache(page), page);
5156326fec1SNicholas Piggin 	}
516b20a3503SChristoph Lameter 
51742cb14b1SHugh Dickins 	/* Move dirty while page refs frozen and newpage not yet exposed */
51842cb14b1SHugh Dickins 	dirty = PageDirty(page);
51942cb14b1SHugh Dickins 	if (dirty) {
52042cb14b1SHugh Dickins 		ClearPageDirty(page);
52142cb14b1SHugh Dickins 		SetPageDirty(newpage);
52242cb14b1SHugh Dickins 	}
52342cb14b1SHugh Dickins 
5246d75f366SJohannes Weiner 	radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
5257cf9c2c7SNick Piggin 
5267cf9c2c7SNick Piggin 	/*
527937a94c9SJacobo Giralt 	 * Drop cache reference from old page by unfreezing
528937a94c9SJacobo Giralt 	 * to one less reference.
5297cf9c2c7SNick Piggin 	 * We know this isn't the last reference.
5307cf9c2c7SNick Piggin 	 */
531fe896d18SJoonsoo Kim 	page_ref_unfreeze(page, expected_count - 1);
5327cf9c2c7SNick Piggin 
53342cb14b1SHugh Dickins 	spin_unlock(&mapping->tree_lock);
53442cb14b1SHugh Dickins 	/* Leave irq disabled to prevent preemption while updating stats */
53542cb14b1SHugh Dickins 
5360e8c7d0fSChristoph Lameter 	/*
5370e8c7d0fSChristoph Lameter 	 * If moved to a different zone then also account
5380e8c7d0fSChristoph Lameter 	 * the page for that zone. Other VM counters will be
5390e8c7d0fSChristoph Lameter 	 * taken care of when we establish references to the
5400e8c7d0fSChristoph Lameter 	 * new page and drop references to the old page.
5410e8c7d0fSChristoph Lameter 	 *
5420e8c7d0fSChristoph Lameter 	 * Note that anonymous pages are accounted for
5434b9d0fabSMel Gorman 	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
5440e8c7d0fSChristoph Lameter 	 * are mapped to swap space.
5450e8c7d0fSChristoph Lameter 	 */
54642cb14b1SHugh Dickins 	if (newzone != oldzone) {
54711fb9989SMel Gorman 		__dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
54811fb9989SMel Gorman 		__inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
54942cb14b1SHugh Dickins 		if (PageSwapBacked(page) && !PageSwapCache(page)) {
55011fb9989SMel Gorman 			__dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
55111fb9989SMel Gorman 			__inc_node_state(newzone->zone_pgdat, NR_SHMEM);
5524b02108aSKOSAKI Motohiro 		}
55342cb14b1SHugh Dickins 		if (dirty && mapping_cap_account_dirty(mapping)) {
55411fb9989SMel Gorman 			__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
5555a1c84b4SMel Gorman 			__dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
55611fb9989SMel Gorman 			__inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
5575a1c84b4SMel Gorman 			__inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
55842cb14b1SHugh Dickins 		}
55942cb14b1SHugh Dickins 	}
56042cb14b1SHugh Dickins 	local_irq_enable();
561b20a3503SChristoph Lameter 
56278bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
563b20a3503SChristoph Lameter }
5641118dce7SRichard Weinberger EXPORT_SYMBOL(migrate_page_move_mapping);
565b20a3503SChristoph Lameter 
566b20a3503SChristoph Lameter /*
567290408d4SNaoya Horiguchi  * The expected number of remaining references is the same as that
568290408d4SNaoya Horiguchi  * of migrate_page_move_mapping().
569290408d4SNaoya Horiguchi  */
570290408d4SNaoya Horiguchi int migrate_huge_page_move_mapping(struct address_space *mapping,
571290408d4SNaoya Horiguchi 				   struct page *newpage, struct page *page)
572290408d4SNaoya Horiguchi {
573290408d4SNaoya Horiguchi 	int expected_count;
574290408d4SNaoya Horiguchi 	void **pslot;
575290408d4SNaoya Horiguchi 
576290408d4SNaoya Horiguchi 	spin_lock_irq(&mapping->tree_lock);
577290408d4SNaoya Horiguchi 
578290408d4SNaoya Horiguchi 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
579290408d4SNaoya Horiguchi 					page_index(page));
580290408d4SNaoya Horiguchi 
581290408d4SNaoya Horiguchi 	expected_count = 2 + page_has_private(page);
582290408d4SNaoya Horiguchi 	if (page_count(page) != expected_count ||
58329c1f677SMel Gorman 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
584290408d4SNaoya Horiguchi 		spin_unlock_irq(&mapping->tree_lock);
585290408d4SNaoya Horiguchi 		return -EAGAIN;
586290408d4SNaoya Horiguchi 	}
587290408d4SNaoya Horiguchi 
588fe896d18SJoonsoo Kim 	if (!page_ref_freeze(page, expected_count)) {
589290408d4SNaoya Horiguchi 		spin_unlock_irq(&mapping->tree_lock);
590290408d4SNaoya Horiguchi 		return -EAGAIN;
591290408d4SNaoya Horiguchi 	}
592290408d4SNaoya Horiguchi 
593cf4b769aSHugh Dickins 	newpage->index = page->index;
594cf4b769aSHugh Dickins 	newpage->mapping = page->mapping;
5956a93ca8fSJohannes Weiner 
596290408d4SNaoya Horiguchi 	get_page(newpage);
597290408d4SNaoya Horiguchi 
5986d75f366SJohannes Weiner 	radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
599290408d4SNaoya Horiguchi 
600fe896d18SJoonsoo Kim 	page_ref_unfreeze(page, expected_count - 1);
601290408d4SNaoya Horiguchi 
602290408d4SNaoya Horiguchi 	spin_unlock_irq(&mapping->tree_lock);
6036a93ca8fSJohannes Weiner 
60478bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
605290408d4SNaoya Horiguchi }
606290408d4SNaoya Horiguchi 
607290408d4SNaoya Horiguchi /*
60830b0a105SDave Hansen  * Gigantic pages are so large that we do not guarantee that page++ pointer
60930b0a105SDave Hansen  * arithmetic will work across the entire page.  We need something more
61030b0a105SDave Hansen  * specialized.
61130b0a105SDave Hansen  */
61230b0a105SDave Hansen static void __copy_gigantic_page(struct page *dst, struct page *src,
61330b0a105SDave Hansen 				int nr_pages)
61430b0a105SDave Hansen {
61530b0a105SDave Hansen 	int i;
61630b0a105SDave Hansen 	struct page *dst_base = dst;
61730b0a105SDave Hansen 	struct page *src_base = src;
61830b0a105SDave Hansen 
61930b0a105SDave Hansen 	for (i = 0; i < nr_pages; ) {
62030b0a105SDave Hansen 		cond_resched();
62130b0a105SDave Hansen 		copy_highpage(dst, src);
62230b0a105SDave Hansen 
62330b0a105SDave Hansen 		i++;
62430b0a105SDave Hansen 		dst = mem_map_next(dst, dst_base, i);
62530b0a105SDave Hansen 		src = mem_map_next(src, src_base, i);
62630b0a105SDave Hansen 	}
62730b0a105SDave Hansen }
62830b0a105SDave Hansen 
62930b0a105SDave Hansen static void copy_huge_page(struct page *dst, struct page *src)
63030b0a105SDave Hansen {
63130b0a105SDave Hansen 	int i;
63230b0a105SDave Hansen 	int nr_pages;
63330b0a105SDave Hansen 
63430b0a105SDave Hansen 	if (PageHuge(src)) {
63530b0a105SDave Hansen 		/* hugetlbfs page */
63630b0a105SDave Hansen 		struct hstate *h = page_hstate(src);
63730b0a105SDave Hansen 		nr_pages = pages_per_huge_page(h);
63830b0a105SDave Hansen 
63930b0a105SDave Hansen 		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
64030b0a105SDave Hansen 			__copy_gigantic_page(dst, src, nr_pages);
64130b0a105SDave Hansen 			return;
64230b0a105SDave Hansen 		}
64330b0a105SDave Hansen 	} else {
64430b0a105SDave Hansen 		/* thp page */
64530b0a105SDave Hansen 		BUG_ON(!PageTransHuge(src));
64630b0a105SDave Hansen 		nr_pages = hpage_nr_pages(src);
64730b0a105SDave Hansen 	}
64830b0a105SDave Hansen 
64930b0a105SDave Hansen 	for (i = 0; i < nr_pages; i++) {
65030b0a105SDave Hansen 		cond_resched();
65130b0a105SDave Hansen 		copy_highpage(dst + i, src + i);
65230b0a105SDave Hansen 	}
65330b0a105SDave Hansen }
65430b0a105SDave Hansen 
65530b0a105SDave Hansen /*
656b20a3503SChristoph Lameter  * Copy the page to its new location
657b20a3503SChristoph Lameter  */
6582916ecc0SJérôme Glisse void migrate_page_states(struct page *newpage, struct page *page)
659b20a3503SChristoph Lameter {
6607851a45cSRik van Riel 	int cpupid;
6617851a45cSRik van Riel 
662b20a3503SChristoph Lameter 	if (PageError(page))
663b20a3503SChristoph Lameter 		SetPageError(newpage);
664b20a3503SChristoph Lameter 	if (PageReferenced(page))
665b20a3503SChristoph Lameter 		SetPageReferenced(newpage);
666b20a3503SChristoph Lameter 	if (PageUptodate(page))
667b20a3503SChristoph Lameter 		SetPageUptodate(newpage);
668894bc310SLee Schermerhorn 	if (TestClearPageActive(page)) {
669309381feSSasha Levin 		VM_BUG_ON_PAGE(PageUnevictable(page), page);
670b20a3503SChristoph Lameter 		SetPageActive(newpage);
671418b27efSLee Schermerhorn 	} else if (TestClearPageUnevictable(page))
672418b27efSLee Schermerhorn 		SetPageUnevictable(newpage);
673b20a3503SChristoph Lameter 	if (PageChecked(page))
674b20a3503SChristoph Lameter 		SetPageChecked(newpage);
675b20a3503SChristoph Lameter 	if (PageMappedToDisk(page))
676b20a3503SChristoph Lameter 		SetPageMappedToDisk(newpage);
677b20a3503SChristoph Lameter 
67842cb14b1SHugh Dickins 	/* Move dirty on pages not done by migrate_page_move_mapping() */
67942cb14b1SHugh Dickins 	if (PageDirty(page))
680752dc185SHugh Dickins 		SetPageDirty(newpage);
681b20a3503SChristoph Lameter 
68233c3fc71SVladimir Davydov 	if (page_is_young(page))
68333c3fc71SVladimir Davydov 		set_page_young(newpage);
68433c3fc71SVladimir Davydov 	if (page_is_idle(page))
68533c3fc71SVladimir Davydov 		set_page_idle(newpage);
68633c3fc71SVladimir Davydov 
6877851a45cSRik van Riel 	/*
6887851a45cSRik van Riel 	 * Copy NUMA information to the new page, to prevent over-eager
6897851a45cSRik van Riel 	 * future migrations of this same page.
6907851a45cSRik van Riel 	 */
6917851a45cSRik van Riel 	cpupid = page_cpupid_xchg_last(page, -1);
6927851a45cSRik van Riel 	page_cpupid_xchg_last(newpage, cpupid);
6937851a45cSRik van Riel 
694e9995ef9SHugh Dickins 	ksm_migrate_page(newpage, page);
695c8d6553bSHugh Dickins 	/*
696c8d6553bSHugh Dickins 	 * Please do not reorder this without considering how mm/ksm.c's
697c8d6553bSHugh Dickins 	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
698c8d6553bSHugh Dickins 	 */
699b3b3a99cSNaoya Horiguchi 	if (PageSwapCache(page))
700b20a3503SChristoph Lameter 		ClearPageSwapCache(page);
701b20a3503SChristoph Lameter 	ClearPagePrivate(page);
702b20a3503SChristoph Lameter 	set_page_private(page, 0);
703b20a3503SChristoph Lameter 
704b20a3503SChristoph Lameter 	/*
705b20a3503SChristoph Lameter 	 * If any waiters have accumulated on the new page then
706b20a3503SChristoph Lameter 	 * wake them up.
707b20a3503SChristoph Lameter 	 */
708b20a3503SChristoph Lameter 	if (PageWriteback(newpage))
709b20a3503SChristoph Lameter 		end_page_writeback(newpage);
710d435edcaSVlastimil Babka 
711d435edcaSVlastimil Babka 	copy_page_owner(page, newpage);
71274485cf2SJohannes Weiner 
71374485cf2SJohannes Weiner 	mem_cgroup_migrate(page, newpage);
714b20a3503SChristoph Lameter }
7152916ecc0SJérôme Glisse EXPORT_SYMBOL(migrate_page_states);
7162916ecc0SJérôme Glisse 
7172916ecc0SJérôme Glisse void migrate_page_copy(struct page *newpage, struct page *page)
7182916ecc0SJérôme Glisse {
7192916ecc0SJérôme Glisse 	if (PageHuge(page) || PageTransHuge(page))
7202916ecc0SJérôme Glisse 		copy_huge_page(newpage, page);
7212916ecc0SJérôme Glisse 	else
7222916ecc0SJérôme Glisse 		copy_highpage(newpage, page);
7232916ecc0SJérôme Glisse 
7242916ecc0SJérôme Glisse 	migrate_page_states(newpage, page);
7252916ecc0SJérôme Glisse }
7261118dce7SRichard Weinberger EXPORT_SYMBOL(migrate_page_copy);
727b20a3503SChristoph Lameter 
7281d8b85ccSChristoph Lameter /************************************************************
7291d8b85ccSChristoph Lameter  *                    Migration functions
7301d8b85ccSChristoph Lameter  ***********************************************************/
7311d8b85ccSChristoph Lameter 
732b20a3503SChristoph Lameter /*
733bda807d4SMinchan Kim  * Common logic to directly migrate a single LRU page suitable for
734266cf658SDavid Howells  * pages that do not use PagePrivate/PagePrivate2.
735b20a3503SChristoph Lameter  *
736b20a3503SChristoph Lameter  * Pages are locked upon entry and exit.
737b20a3503SChristoph Lameter  */
7382d1db3b1SChristoph Lameter int migrate_page(struct address_space *mapping,
739a6bc32b8SMel Gorman 		struct page *newpage, struct page *page,
740a6bc32b8SMel Gorman 		enum migrate_mode mode)
741b20a3503SChristoph Lameter {
742b20a3503SChristoph Lameter 	int rc;
743b20a3503SChristoph Lameter 
744b20a3503SChristoph Lameter 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
745b20a3503SChristoph Lameter 
7468e321fefSBenjamin LaHaise 	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
747b20a3503SChristoph Lameter 
74878bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
749b20a3503SChristoph Lameter 		return rc;
750b20a3503SChristoph Lameter 
7512916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
752b20a3503SChristoph Lameter 		migrate_page_copy(newpage, page);
7532916ecc0SJérôme Glisse 	else
7542916ecc0SJérôme Glisse 		migrate_page_states(newpage, page);
75578bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
756b20a3503SChristoph Lameter }
757b20a3503SChristoph Lameter EXPORT_SYMBOL(migrate_page);
758b20a3503SChristoph Lameter 
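/*
 * Illustrative sketch only (not part of this file): an address_space whose
 * pages never carry PagePrivate data can wire migrate_page() up directly as
 * its migratepage callback; my_simple_aops is a hypothetical name and the
 * other methods are elided.
 */
#if 0	/* example only */
static const struct address_space_operations my_simple_aops = {
	/* readpage/writepage/... elided */
	.migratepage	= migrate_page,
};
#endif
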
7599361401eSDavid Howells #ifdef CONFIG_BLOCK
760b20a3503SChristoph Lameter /*
7611d8b85ccSChristoph Lameter  * Migration function for pages with buffers. This function can only be used
7621d8b85ccSChristoph Lameter  * if the underlying filesystem guarantees that no other references to "page"
7631d8b85ccSChristoph Lameter  * exist.
7641d8b85ccSChristoph Lameter  */
7652d1db3b1SChristoph Lameter int buffer_migrate_page(struct address_space *mapping,
766a6bc32b8SMel Gorman 		struct page *newpage, struct page *page, enum migrate_mode mode)
7671d8b85ccSChristoph Lameter {
7681d8b85ccSChristoph Lameter 	struct buffer_head *bh, *head;
7691d8b85ccSChristoph Lameter 	int rc;
7701d8b85ccSChristoph Lameter 
7711d8b85ccSChristoph Lameter 	if (!page_has_buffers(page))
772a6bc32b8SMel Gorman 		return migrate_page(mapping, newpage, page, mode);
7731d8b85ccSChristoph Lameter 
7741d8b85ccSChristoph Lameter 	head = page_buffers(page);
7751d8b85ccSChristoph Lameter 
7768e321fefSBenjamin LaHaise 	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
7771d8b85ccSChristoph Lameter 
77878bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
7791d8b85ccSChristoph Lameter 		return rc;
7801d8b85ccSChristoph Lameter 
781b969c4abSMel Gorman 	/*
782b969c4abSMel Gorman 	 * In the async case, migrate_page_move_mapping locked the buffers
783b969c4abSMel Gorman 	 * with an IRQ-safe spinlock held. In the sync case, the buffers
784b969c4abSMel Gorman 	 * need to be locked now
785b969c4abSMel Gorman 	 */
786a6bc32b8SMel Gorman 	if (mode != MIGRATE_ASYNC)
787a6bc32b8SMel Gorman 		BUG_ON(!buffer_migrate_lock_buffers(head, mode));
7881d8b85ccSChristoph Lameter 
7891d8b85ccSChristoph Lameter 	ClearPagePrivate(page);
7901d8b85ccSChristoph Lameter 	set_page_private(newpage, page_private(page));
7911d8b85ccSChristoph Lameter 	set_page_private(page, 0);
7921d8b85ccSChristoph Lameter 	put_page(page);
7931d8b85ccSChristoph Lameter 	get_page(newpage);
7941d8b85ccSChristoph Lameter 
7951d8b85ccSChristoph Lameter 	bh = head;
7961d8b85ccSChristoph Lameter 	do {
7971d8b85ccSChristoph Lameter 		set_bh_page(bh, newpage, bh_offset(bh));
7981d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
7991d8b85ccSChristoph Lameter 
8001d8b85ccSChristoph Lameter 	} while (bh != head);
8011d8b85ccSChristoph Lameter 
8021d8b85ccSChristoph Lameter 	SetPagePrivate(newpage);
8031d8b85ccSChristoph Lameter 
8042916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
8051d8b85ccSChristoph Lameter 		migrate_page_copy(newpage, page);
8062916ecc0SJérôme Glisse 	else
8072916ecc0SJérôme Glisse 		migrate_page_states(newpage, page);
8081d8b85ccSChristoph Lameter 
8091d8b85ccSChristoph Lameter 	bh = head;
8101d8b85ccSChristoph Lameter 	do {
8111d8b85ccSChristoph Lameter 		unlock_buffer(bh);
8121d8b85ccSChristoph Lameter 		put_bh(bh);
8131d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
8141d8b85ccSChristoph Lameter 
8151d8b85ccSChristoph Lameter 	} while (bh != head);
8161d8b85ccSChristoph Lameter 
81778bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
8181d8b85ccSChristoph Lameter }
8191d8b85ccSChristoph Lameter EXPORT_SYMBOL(buffer_migrate_page);
8209361401eSDavid Howells #endif
8211d8b85ccSChristoph Lameter 
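/*
 * Illustrative sketch only (not part of this file): a block-device backed
 * mapping whose pages may carry buffer_heads would instead plug in
 * buffer_migrate_page(); my_blkdev_aops is a hypothetical name and the
 * other methods are elided.
 */
#if 0	/* example only */
static const struct address_space_operations my_blkdev_aops = {
	/* readpage/writepage/... elided */
	.migratepage	= buffer_migrate_page,
};
#endif
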
82204e62a29SChristoph Lameter /*
82304e62a29SChristoph Lameter  * Writeback a page to clean the dirty state
82404e62a29SChristoph Lameter  */
82504e62a29SChristoph Lameter static int writeout(struct address_space *mapping, struct page *page)
82604e62a29SChristoph Lameter {
82704e62a29SChristoph Lameter 	struct writeback_control wbc = {
82804e62a29SChristoph Lameter 		.sync_mode = WB_SYNC_NONE,
82904e62a29SChristoph Lameter 		.nr_to_write = 1,
83004e62a29SChristoph Lameter 		.range_start = 0,
83104e62a29SChristoph Lameter 		.range_end = LLONG_MAX,
83204e62a29SChristoph Lameter 		.for_reclaim = 1
83304e62a29SChristoph Lameter 	};
83404e62a29SChristoph Lameter 	int rc;
83504e62a29SChristoph Lameter 
83604e62a29SChristoph Lameter 	if (!mapping->a_ops->writepage)
83704e62a29SChristoph Lameter 		/* No write method for the address space */
83804e62a29SChristoph Lameter 		return -EINVAL;
83904e62a29SChristoph Lameter 
84004e62a29SChristoph Lameter 	if (!clear_page_dirty_for_io(page))
84104e62a29SChristoph Lameter 		/* Someone else already triggered a write */
84204e62a29SChristoph Lameter 		return -EAGAIN;
84304e62a29SChristoph Lameter 
84404e62a29SChristoph Lameter 	/*
84504e62a29SChristoph Lameter 	 * A dirty page may imply that the underlying filesystem has
84604e62a29SChristoph Lameter 	 * the page on some queue. So the page must be clean for
84704e62a29SChristoph Lameter 	 * migration. Writeout may mean we lose the lock and the
84804e62a29SChristoph Lameter 	 * page state is no longer what we checked for earlier.
84904e62a29SChristoph Lameter 	 * At this point we know that the migration attempt cannot
85004e62a29SChristoph Lameter 	 * be successful.
85104e62a29SChristoph Lameter 	 */
852e388466dSKirill A. Shutemov 	remove_migration_ptes(page, page, false);
85304e62a29SChristoph Lameter 
85404e62a29SChristoph Lameter 	rc = mapping->a_ops->writepage(page, &wbc);
85504e62a29SChristoph Lameter 
85604e62a29SChristoph Lameter 	if (rc != AOP_WRITEPAGE_ACTIVATE)
85704e62a29SChristoph Lameter 		/* unlocked. Relock */
85804e62a29SChristoph Lameter 		lock_page(page);
85904e62a29SChristoph Lameter 
860bda8550dSHugh Dickins 	return (rc < 0) ? -EIO : -EAGAIN;
86104e62a29SChristoph Lameter }
86204e62a29SChristoph Lameter 
86304e62a29SChristoph Lameter /*
86404e62a29SChristoph Lameter  * Default handling if a filesystem does not provide a migration function.
86504e62a29SChristoph Lameter  */
8668351a6e4SChristoph Lameter static int fallback_migrate_page(struct address_space *mapping,
867a6bc32b8SMel Gorman 	struct page *newpage, struct page *page, enum migrate_mode mode)
8688351a6e4SChristoph Lameter {
869b969c4abSMel Gorman 	if (PageDirty(page)) {
870a6bc32b8SMel Gorman 		/* Only writeback pages in full synchronous migration */
8712916ecc0SJérôme Glisse 		switch (mode) {
8722916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
8732916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
8742916ecc0SJérôme Glisse 			break;
8752916ecc0SJérôme Glisse 		default:
876b969c4abSMel Gorman 			return -EBUSY;
8772916ecc0SJérôme Glisse 		}
87804e62a29SChristoph Lameter 		return writeout(mapping, page);
879b969c4abSMel Gorman 	}
8808351a6e4SChristoph Lameter 
8818351a6e4SChristoph Lameter 	/*
8828351a6e4SChristoph Lameter 	 * Buffers may be managed in a filesystem specific way.
8838351a6e4SChristoph Lameter 	 * We must have no buffers or drop them.
8848351a6e4SChristoph Lameter 	 */
885266cf658SDavid Howells 	if (page_has_private(page) &&
8868351a6e4SChristoph Lameter 	    !try_to_release_page(page, GFP_KERNEL))
8878351a6e4SChristoph Lameter 		return -EAGAIN;
8888351a6e4SChristoph Lameter 
889a6bc32b8SMel Gorman 	return migrate_page(mapping, newpage, page, mode);
8908351a6e4SChristoph Lameter }
8918351a6e4SChristoph Lameter 
8921d8b85ccSChristoph Lameter /*
893e24f0b8fSChristoph Lameter  * Move a page to a newly allocated page
894e24f0b8fSChristoph Lameter  * The page is locked and all ptes have been successfully removed.
895b20a3503SChristoph Lameter  *
896e24f0b8fSChristoph Lameter  * The new page will have replaced the old page if this function
897e24f0b8fSChristoph Lameter  * is successful.
898894bc310SLee Schermerhorn  *
899894bc310SLee Schermerhorn  * Return value:
900894bc310SLee Schermerhorn  *   < 0 - error code
90178bd5209SRafael Aquini  *  MIGRATEPAGE_SUCCESS - success
902b20a3503SChristoph Lameter  */
9033fe2011fSMel Gorman static int move_to_new_page(struct page *newpage, struct page *page,
9045c3f9a67SHugh Dickins 				enum migrate_mode mode)
905b20a3503SChristoph Lameter {
906e24f0b8fSChristoph Lameter 	struct address_space *mapping;
907bda807d4SMinchan Kim 	int rc = -EAGAIN;
908bda807d4SMinchan Kim 	bool is_lru = !__PageMovable(page);
909b20a3503SChristoph Lameter 
9107db7671fSHugh Dickins 	VM_BUG_ON_PAGE(!PageLocked(page), page);
9117db7671fSHugh Dickins 	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
912b20a3503SChristoph Lameter 
913b20a3503SChristoph Lameter 	mapping = page_mapping(page);
914bda807d4SMinchan Kim 
915bda807d4SMinchan Kim 	if (likely(is_lru)) {
916b20a3503SChristoph Lameter 		if (!mapping)
917a6bc32b8SMel Gorman 			rc = migrate_page(mapping, newpage, page, mode);
9186c5240aeSChristoph Lameter 		else if (mapping->a_ops->migratepage)
919b20a3503SChristoph Lameter 			/*
920bda807d4SMinchan Kim 			 * Most pages have a mapping and most filesystems
921bda807d4SMinchan Kim 			 * provide a migratepage callback. Anonymous pages
922bda807d4SMinchan Kim 			 * are part of swap space which also has its own
923bda807d4SMinchan Kim 			 * migratepage callback. This is the most common path
924bda807d4SMinchan Kim 			 * for page migration.
925b20a3503SChristoph Lameter 			 */
926bda807d4SMinchan Kim 			rc = mapping->a_ops->migratepage(mapping, newpage,
927bda807d4SMinchan Kim 							page, mode);
9288351a6e4SChristoph Lameter 		else
929bda807d4SMinchan Kim 			rc = fallback_migrate_page(mapping, newpage,
930bda807d4SMinchan Kim 							page, mode);
931bda807d4SMinchan Kim 	} else {
932bda807d4SMinchan Kim 		/*
933bda807d4SMinchan Kim 		 * In the case of a non-lru page, it could have been released
934bda807d4SMinchan Kim 		 * after the isolation step. In that case, we shouldn't try migration.
935bda807d4SMinchan Kim 		 */
936bda807d4SMinchan Kim 		VM_BUG_ON_PAGE(!PageIsolated(page), page);
937bda807d4SMinchan Kim 		if (!PageMovable(page)) {
938bda807d4SMinchan Kim 			rc = MIGRATEPAGE_SUCCESS;
939bda807d4SMinchan Kim 			__ClearPageIsolated(page);
940bda807d4SMinchan Kim 			goto out;
941bda807d4SMinchan Kim 		}
942bda807d4SMinchan Kim 
943bda807d4SMinchan Kim 		rc = mapping->a_ops->migratepage(mapping, newpage,
944bda807d4SMinchan Kim 						page, mode);
945bda807d4SMinchan Kim 		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
946bda807d4SMinchan Kim 			!PageIsolated(page));
947bda807d4SMinchan Kim 	}
948b20a3503SChristoph Lameter 
9495c3f9a67SHugh Dickins 	/*
9505c3f9a67SHugh Dickins 	 * When successful, old pagecache page->mapping must be cleared before
9515c3f9a67SHugh Dickins 	 * page is freed; but stats require that PageAnon be left as PageAnon.
9525c3f9a67SHugh Dickins 	 */
9535c3f9a67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
954bda807d4SMinchan Kim 		if (__PageMovable(page)) {
955bda807d4SMinchan Kim 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
956bda807d4SMinchan Kim 
957bda807d4SMinchan Kim 			/*
958bda807d4SMinchan Kim 			 * We clear PG_movable under the page lock so that no
959bda807d4SMinchan Kim 			 * compactor can try to migrate this page.
960bda807d4SMinchan Kim 			 */
961bda807d4SMinchan Kim 			__ClearPageIsolated(page);
962bda807d4SMinchan Kim 		}
963bda807d4SMinchan Kim 
964bda807d4SMinchan Kim 		/*
965bda807d4SMinchan Kim 		 * An anonymous or movable page->mapping will be cleared by
966bda807d4SMinchan Kim 		 * free_pages_prepare, so don't reset it here; keeping the
967bda807d4SMinchan Kim 		 * type lets e.g. PageAnon still work.
968bda807d4SMinchan Kim 		 */
969bda807d4SMinchan Kim 		if (!PageMappingFlags(page))
9705c3f9a67SHugh Dickins 			page->mapping = NULL;
9713fe2011fSMel Gorman 	}
972bda807d4SMinchan Kim out:
973e24f0b8fSChristoph Lameter 	return rc;
974e24f0b8fSChristoph Lameter }
975e24f0b8fSChristoph Lameter 
9760dabec93SMinchan Kim static int __unmap_and_move(struct page *page, struct page *newpage,
9779c620e2bSHugh Dickins 				int force, enum migrate_mode mode)
978e24f0b8fSChristoph Lameter {
9790dabec93SMinchan Kim 	int rc = -EAGAIN;
9802ebba6b7SHugh Dickins 	int page_was_mapped = 0;
9813f6c8272SMel Gorman 	struct anon_vma *anon_vma = NULL;
982bda807d4SMinchan Kim 	bool is_lru = !__PageMovable(page);
98395a402c3SChristoph Lameter 
984529ae9aaSNick Piggin 	if (!trylock_page(page)) {
985a6bc32b8SMel Gorman 		if (!force || mode == MIGRATE_ASYNC)
9860dabec93SMinchan Kim 			goto out;
9873e7d3449SMel Gorman 
9883e7d3449SMel Gorman 		/*
9893e7d3449SMel Gorman 		 * It's not safe for direct compaction to call lock_page.
9903e7d3449SMel Gorman 		 * For example, during page readahead pages are added locked
9913e7d3449SMel Gorman 		 * to the LRU. Later, when the IO completes the pages are
9923e7d3449SMel Gorman 		 * marked uptodate and unlocked. However, the queueing
9933e7d3449SMel Gorman 		 * could be merging multiple pages for one bio (e.g.
9943e7d3449SMel Gorman 		 * mpage_readpages). If an allocation happens for the
9953e7d3449SMel Gorman 		 * second or third page, the process can end up locking
9963e7d3449SMel Gorman 		 * the same page twice and deadlocking. Rather than
9973e7d3449SMel Gorman 		 * trying to be clever about what pages can be locked,
9983e7d3449SMel Gorman 		 * avoid the use of lock_page for direct compaction
9993e7d3449SMel Gorman 		 * altogether.
10003e7d3449SMel Gorman 		 */
10013e7d3449SMel Gorman 		if (current->flags & PF_MEMALLOC)
10020dabec93SMinchan Kim 			goto out;
10033e7d3449SMel Gorman 
1004e24f0b8fSChristoph Lameter 		lock_page(page);
1005e24f0b8fSChristoph Lameter 	}
1006e24f0b8fSChristoph Lameter 
1007e24f0b8fSChristoph Lameter 	if (PageWriteback(page)) {
100811bc82d6SAndrea Arcangeli 		/*
1009fed5b64aSJianguo Wu 		 * Only in the case of a full synchronous migration is it
1010a6bc32b8SMel Gorman 		 * necessary to wait for PageWriteback. In the async case,
1011a6bc32b8SMel Gorman 		 * the retry loop is too short and in the sync-light case,
1012a6bc32b8SMel Gorman 		 * the overhead of stalling is too much
101311bc82d6SAndrea Arcangeli 		 */
10142916ecc0SJérôme Glisse 		switch (mode) {
10152916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
10162916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
10172916ecc0SJérôme Glisse 			break;
10182916ecc0SJérôme Glisse 		default:
101911bc82d6SAndrea Arcangeli 			rc = -EBUSY;
10200a31bc97SJohannes Weiner 			goto out_unlock;
102111bc82d6SAndrea Arcangeli 		}
102211bc82d6SAndrea Arcangeli 		if (!force)
10230a31bc97SJohannes Weiner 			goto out_unlock;
1024e24f0b8fSChristoph Lameter 		wait_on_page_writeback(page);
1025e24f0b8fSChristoph Lameter 	}
102603f15c86SHugh Dickins 
1027e24f0b8fSChristoph Lameter 	/*
1028dc386d4dSKAMEZAWA Hiroyuki 	 * By try_to_unmap(), page->mapcount goes down to 0 here. In that case,
1029dc386d4dSKAMEZAWA Hiroyuki 	 * we cannot notice that the anon_vma is freed while we migrate a page.
10301ce82b69SHugh Dickins 	 * This get_anon_vma() delays freeing the anon_vma pointer until the
1031dc386d4dSKAMEZAWA Hiroyuki 	 * end of migration. File cache pages are no problem because they are
1032989f89c5SKAMEZAWA Hiroyuki 	 * protected by the page lock during migration, so we only need to
1033989f89c5SKAMEZAWA Hiroyuki 	 * care about anon pages here.
10343fe2011fSMel Gorman 	 *
103503f15c86SHugh Dickins 	 * Only page_get_anon_vma() understands the subtleties of
103603f15c86SHugh Dickins 	 * getting a hold on an anon_vma from outside one of its mms.
103703f15c86SHugh Dickins 	 * But if we cannot get anon_vma, then we won't need it anyway,
103803f15c86SHugh Dickins 	 * because that implies that the anon page is no longer mapped
103903f15c86SHugh Dickins 	 * (and cannot be remapped so long as we hold the page lock).
10403fe2011fSMel Gorman 	 */
104103f15c86SHugh Dickins 	if (PageAnon(page) && !PageKsm(page))
104203f15c86SHugh Dickins 		anon_vma = page_get_anon_vma(page);
104362e1c553SShaohua Li 
10447db7671fSHugh Dickins 	/*
10457db7671fSHugh Dickins 	 * Block others from accessing the new page when we get around to
10467db7671fSHugh Dickins 	 * establishing additional references. We are usually the only one
10477db7671fSHugh Dickins 	 * holding a reference to newpage at this point. We used to have a BUG
10487db7671fSHugh Dickins 	 * here if trylock_page(newpage) fails, but would like to allow for
10497db7671fSHugh Dickins 	 * cases where there might be a race with the previous use of newpage.
10507db7671fSHugh Dickins 	 * This is much like races on refcount of oldpage: just don't BUG().
10517db7671fSHugh Dickins 	 */
10527db7671fSHugh Dickins 	if (unlikely(!trylock_page(newpage)))
10537db7671fSHugh Dickins 		goto out_unlock;
10547db7671fSHugh Dickins 
1055bda807d4SMinchan Kim 	if (unlikely(!is_lru)) {
1056bda807d4SMinchan Kim 		rc = move_to_new_page(newpage, page, mode);
1057bda807d4SMinchan Kim 		goto out_unlock_both;
1058bda807d4SMinchan Kim 	}
1059bda807d4SMinchan Kim 
1060dc386d4dSKAMEZAWA Hiroyuki 	/*
106162e1c553SShaohua Li 	 * Corner case handling:
106262e1c553SShaohua Li 	 * 1. When a new swap-cache page is read in, it is added to the LRU
106362e1c553SShaohua Li 	 * and treated as swapcache but it has no rmap yet.
106462e1c553SShaohua Li 	 * Calling try_to_unmap() against a page->mapping==NULL page will
106562e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
106662e1c553SShaohua Li 	 * 2. An orphaned page (see truncate_complete_page) might have
106762e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
106862e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
106962e1c553SShaohua Li 	 * invisible to the vm, so the page can not be migrated.  So try to
107062e1c553SShaohua Li 	 * free the metadata, so the page can be freed.
1071dc386d4dSKAMEZAWA Hiroyuki 	 */
107262e1c553SShaohua Li 	if (!page->mapping) {
1073309381feSSasha Levin 		VM_BUG_ON_PAGE(PageAnon(page), page);
10741ce82b69SHugh Dickins 		if (page_has_private(page)) {
107562e1c553SShaohua Li 			try_to_free_buffers(page);
10767db7671fSHugh Dickins 			goto out_unlock_both;
107762e1c553SShaohua Li 		}
10787db7671fSHugh Dickins 	} else if (page_mapped(page)) {
10797db7671fSHugh Dickins 		/* Establish migration ptes */
108003f15c86SHugh Dickins 		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
108103f15c86SHugh Dickins 				page);
10822ebba6b7SHugh Dickins 		try_to_unmap(page,
1083da1b13ccSWanpeng Li 			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
10842ebba6b7SHugh Dickins 		page_was_mapped = 1;
10852ebba6b7SHugh Dickins 	}
1086dc386d4dSKAMEZAWA Hiroyuki 
1087e24f0b8fSChristoph Lameter 	if (!page_mapped(page))
10885c3f9a67SHugh Dickins 		rc = move_to_new_page(newpage, page, mode);
1089e24f0b8fSChristoph Lameter 
10905c3f9a67SHugh Dickins 	if (page_was_mapped)
10915c3f9a67SHugh Dickins 		remove_migration_ptes(page,
1092e388466dSKirill A. Shutemov 			rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
10933f6c8272SMel Gorman 
10947db7671fSHugh Dickins out_unlock_both:
10957db7671fSHugh Dickins 	unlock_page(newpage);
10967db7671fSHugh Dickins out_unlock:
10973f6c8272SMel Gorman 	/* Drop an anon_vma reference if we took one */
109876545066SRik van Riel 	if (anon_vma)
10999e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
1100b20a3503SChristoph Lameter 	unlock_page(page);
11010dabec93SMinchan Kim out:
1102c6c919ebSMinchan Kim 	/*
1103c6c919ebSMinchan Kim 	 * If migration is successful, drop our reference on the newpage.
1104c6c919ebSMinchan Kim 	 * That will not free the page because the new page owner has taken
1105c6c919ebSMinchan Kim 	 * its own reference. If it is an LRU page, also add it to the LRU
1106c6c919ebSMinchan Kim 	 * list here (putback_lru_page() drops the reference for us).
1107c6c919ebSMinchan Kim 	 */
1108c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1109b1123ea6SMinchan Kim 		if (unlikely(__PageMovable(newpage)))
1110c6c919ebSMinchan Kim 			put_page(newpage);
1111c6c919ebSMinchan Kim 		else
1112c6c919ebSMinchan Kim 			putback_lru_page(newpage);
1113c6c919ebSMinchan Kim 	}
1114c6c919ebSMinchan Kim 
11150dabec93SMinchan Kim 	return rc;
11160dabec93SMinchan Kim }
111795a402c3SChristoph Lameter 
11180dabec93SMinchan Kim /*
1119ef2a5153SGeert Uytterhoeven  * gcc 4.7 and 4.8 on arm get an ICE when inlining unmap_and_move().  Work
1120ef2a5153SGeert Uytterhoeven  * around it.
1121ef2a5153SGeert Uytterhoeven  */
1122ef2a5153SGeert Uytterhoeven #if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
1123ef2a5153SGeert Uytterhoeven #define ICE_noinline noinline
1124ef2a5153SGeert Uytterhoeven #else
1125ef2a5153SGeert Uytterhoeven #define ICE_noinline
1126ef2a5153SGeert Uytterhoeven #endif
1127ef2a5153SGeert Uytterhoeven 
1128ef2a5153SGeert Uytterhoeven /*
11290dabec93SMinchan Kim  * Obtain the lock on page, remove all ptes and migrate the page
11300dabec93SMinchan Kim  * to the newly allocated page in newpage.
11310dabec93SMinchan Kim  */
1132ef2a5153SGeert Uytterhoeven static ICE_noinline int unmap_and_move(new_page_t get_new_page,
1133ef2a5153SGeert Uytterhoeven 				   free_page_t put_new_page,
1134ef2a5153SGeert Uytterhoeven 				   unsigned long private, struct page *page,
1135add05cecSNaoya Horiguchi 				   int force, enum migrate_mode mode,
1136add05cecSNaoya Horiguchi 				   enum migrate_reason reason)
11370dabec93SMinchan Kim {
11382def7424SHugh Dickins 	int rc = MIGRATEPAGE_SUCCESS;
11390dabec93SMinchan Kim 	int *result = NULL;
11402def7424SHugh Dickins 	struct page *newpage;
11410dabec93SMinchan Kim 
11422def7424SHugh Dickins 	newpage = get_new_page(page, private, &result);
11430dabec93SMinchan Kim 	if (!newpage)
11440dabec93SMinchan Kim 		return -ENOMEM;
11450dabec93SMinchan Kim 
11460dabec93SMinchan Kim 	if (page_count(page) == 1) {
11470dabec93SMinchan Kim 		/* page was freed from under us. So we are done. */
1148c6c919ebSMinchan Kim 		ClearPageActive(page);
1149c6c919ebSMinchan Kim 		ClearPageUnevictable(page);
1150bda807d4SMinchan Kim 		if (unlikely(__PageMovable(page))) {
1151bda807d4SMinchan Kim 			lock_page(page);
1152bda807d4SMinchan Kim 			if (!PageMovable(page))
1153bda807d4SMinchan Kim 				__ClearPageIsolated(page);
1154bda807d4SMinchan Kim 			unlock_page(page);
1155bda807d4SMinchan Kim 		}
1156c6c919ebSMinchan Kim 		if (put_new_page)
1157c6c919ebSMinchan Kim 			put_new_page(newpage, private);
1158c6c919ebSMinchan Kim 		else
1159c6c919ebSMinchan Kim 			put_page(newpage);
11600dabec93SMinchan Kim 		goto out;
11610dabec93SMinchan Kim 	}
11620dabec93SMinchan Kim 
1163616b8371SZi Yan 	if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
11644d2fa965SKirill A. Shutemov 		lock_page(page);
11654d2fa965SKirill A. Shutemov 		rc = split_huge_page(page);
11664d2fa965SKirill A. Shutemov 		unlock_page(page);
11674d2fa965SKirill A. Shutemov 		if (rc)
11680dabec93SMinchan Kim 			goto out;
11694d2fa965SKirill A. Shutemov 	}
11700dabec93SMinchan Kim 
11719c620e2bSHugh Dickins 	rc = __unmap_and_move(page, newpage, force, mode);
1172c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS)
11737cd12b4aSVlastimil Babka 		set_page_owner_migrate_reason(newpage, reason);
1174bf6bddf1SRafael Aquini 
11750dabec93SMinchan Kim out:
1176e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
1177aaa994b3SChristoph Lameter 		/*
1178aaa994b3SChristoph Lameter 		 * A page that has been migrated has all references
1179aaa994b3SChristoph Lameter 		 * removed and will be freed. A page that has not been
1180aaa994b3SChristoph Lameter 		 * migrated will have kept its references and be
1181aaa994b3SChristoph Lameter 		 * restored.
1182aaa994b3SChristoph Lameter 		 */
1183aaa994b3SChristoph Lameter 		list_del(&page->lru);
11846afcf8efSMing Ling 
11856afcf8efSMing Ling 		/*
11866afcf8efSMing Ling 		 * Compaction can also migrate non-LRU pages, which are
11876afcf8efSMing Ling 		 * not accounted to NR_ISOLATED_*. They can be recognized
11886afcf8efSMing Ling 		 * as __PageMovable.
11896afcf8efSMing Ling 		 */
11906afcf8efSMing Ling 		if (likely(!__PageMovable(page)))
1191e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1192e8db67ebSNaoya Horiguchi 					page_is_file_cache(page), -hpage_nr_pages(page));
1193e24f0b8fSChristoph Lameter 	}
119468711a74SDavid Rientjes 
119595a402c3SChristoph Lameter 	/*
1196c6c919ebSMinchan Kim 	 * If migration is successful, release the reference grabbed during
1197c6c919ebSMinchan Kim 	 * isolation. Otherwise, restore the page to the right list unless
1198c6c919ebSMinchan Kim 	 * we want to retry.
119995a402c3SChristoph Lameter 	 */
1200c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1201c6c919ebSMinchan Kim 		put_page(page);
1202c6c919ebSMinchan Kim 		if (reason == MR_MEMORY_FAILURE) {
1203c6c919ebSMinchan Kim 			/*
1204c6c919ebSMinchan Kim 			 * Set PG_HWPoison on just freed page
1205c6c919ebSMinchan Kim 			 * intentionally. Although it's rather weird,
1206c6c919ebSMinchan Kim 			 * it's how HWPoison flag works at the moment.
1207c6c919ebSMinchan Kim 			 */
1208c6c919ebSMinchan Kim 			if (!test_set_page_hwpoison(page))
1209c6c919ebSMinchan Kim 				num_poisoned_pages_inc();
1210c6c919ebSMinchan Kim 		}
1211c6c919ebSMinchan Kim 	} else {
1212bda807d4SMinchan Kim 		if (rc != -EAGAIN) {
1213bda807d4SMinchan Kim 			if (likely(!__PageMovable(page))) {
1214c6c919ebSMinchan Kim 				putback_lru_page(page);
1215bda807d4SMinchan Kim 				goto put_new;
1216bda807d4SMinchan Kim 			}
1217bda807d4SMinchan Kim 
1218bda807d4SMinchan Kim 			lock_page(page);
1219bda807d4SMinchan Kim 			if (PageMovable(page))
1220bda807d4SMinchan Kim 				putback_movable_page(page);
1221bda807d4SMinchan Kim 			else
1222bda807d4SMinchan Kim 				__ClearPageIsolated(page);
1223bda807d4SMinchan Kim 			unlock_page(page);
1224bda807d4SMinchan Kim 			put_page(page);
1225bda807d4SMinchan Kim 		}
1226bda807d4SMinchan Kim put_new:
1227cf4b769aSHugh Dickins 		if (put_new_page)
122868711a74SDavid Rientjes 			put_new_page(newpage, private);
1229c6c919ebSMinchan Kim 		else
1230d6d86c0aSKonstantin Khlebnikov 			put_page(newpage);
1231c6c919ebSMinchan Kim 	}
123268711a74SDavid Rientjes 
1233742755a1SChristoph Lameter 	if (result) {
1234742755a1SChristoph Lameter 		if (rc)
1235742755a1SChristoph Lameter 			*result = rc;
1236742755a1SChristoph Lameter 		else
1237742755a1SChristoph Lameter 			*result = page_to_nid(newpage);
1238742755a1SChristoph Lameter 	}
1239e24f0b8fSChristoph Lameter 	return rc;
1240e24f0b8fSChristoph Lameter }
1241b20a3503SChristoph Lameter 
1242e24f0b8fSChristoph Lameter /*
1243290408d4SNaoya Horiguchi  * Counterpart of unmap_and_move() for hugepage migration.
1244290408d4SNaoya Horiguchi  *
1245290408d4SNaoya Horiguchi  * This function doesn't wait for the completion of hugepage I/O
1246290408d4SNaoya Horiguchi  * because there is no race between I/O and migration for hugepages.
1247290408d4SNaoya Horiguchi  * Note that currently hugepage I/O occurs only in direct I/O
1248290408d4SNaoya Horiguchi  * where no lock is held and PG_writeback is irrelevant,
1249290408d4SNaoya Horiguchi  * and the writeback status of all subpages is counted in the reference
1250290408d4SNaoya Horiguchi  * count of the head page (i.e. if all subpages of a 2MB hugepage are
1251290408d4SNaoya Horiguchi  * under direct I/O, the reference count of the head page is 512 and a bit more.)
1252290408d4SNaoya Horiguchi  * This means that when we try to migrate a hugepage whose subpages are
1253290408d4SNaoya Horiguchi  * doing direct I/O, some references remain after try_to_unmap() and
1254290408d4SNaoya Horiguchi  * hugepage migration fails without data corruption.
1255290408d4SNaoya Horiguchi  *
1256290408d4SNaoya Horiguchi  * There is also no race when direct I/O is issued on the page under migration,
1257290408d4SNaoya Horiguchi  * because then pte is replaced with migration swap entry and direct I/O code
1258290408d4SNaoya Horiguchi  * will wait in the page fault for migration to complete.
1259290408d4SNaoya Horiguchi  */
1260290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page,
126168711a74SDavid Rientjes 				free_page_t put_new_page, unsigned long private,
126268711a74SDavid Rientjes 				struct page *hpage, int force,
12637cd12b4aSVlastimil Babka 				enum migrate_mode mode, int reason)
1264290408d4SNaoya Horiguchi {
12652def7424SHugh Dickins 	int rc = -EAGAIN;
1266290408d4SNaoya Horiguchi 	int *result = NULL;
12672ebba6b7SHugh Dickins 	int page_was_mapped = 0;
126832665f2bSJoonsoo Kim 	struct page *new_hpage;
1269290408d4SNaoya Horiguchi 	struct anon_vma *anon_vma = NULL;
1270290408d4SNaoya Horiguchi 
127183467efbSNaoya Horiguchi 	/*
127283467efbSNaoya Horiguchi 	 * Movability of hugepages depends on architectures and hugepage size.
127383467efbSNaoya Horiguchi 	 * This check is necessary because some callers of hugepage migration
127483467efbSNaoya Horiguchi 	 * like soft offline and memory hotremove don't walk through page
127583467efbSNaoya Horiguchi 	 * tables or check whether the hugepage is pmd-based or not before
127683467efbSNaoya Horiguchi 	 * kicking migration.
127783467efbSNaoya Horiguchi 	 */
1278100873d7SNaoya Horiguchi 	if (!hugepage_migration_supported(page_hstate(hpage))) {
127932665f2bSJoonsoo Kim 		putback_active_hugepage(hpage);
128083467efbSNaoya Horiguchi 		return -ENOSYS;
128132665f2bSJoonsoo Kim 	}
128283467efbSNaoya Horiguchi 
128332665f2bSJoonsoo Kim 	new_hpage = get_new_page(hpage, private, &result);
1284290408d4SNaoya Horiguchi 	if (!new_hpage)
1285290408d4SNaoya Horiguchi 		return -ENOMEM;
1286290408d4SNaoya Horiguchi 
1287290408d4SNaoya Horiguchi 	if (!trylock_page(hpage)) {
12882916ecc0SJérôme Glisse 		if (!force)
1289290408d4SNaoya Horiguchi 			goto out;
12902916ecc0SJérôme Glisse 		switch (mode) {
12912916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
12922916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
12932916ecc0SJérôme Glisse 			break;
12942916ecc0SJérôme Glisse 		default:
12952916ecc0SJérôme Glisse 			goto out;
12962916ecc0SJérôme Glisse 		}
1297290408d4SNaoya Horiguchi 		lock_page(hpage);
1298290408d4SNaoya Horiguchi 	}
1299290408d4SNaoya Horiguchi 
1300746b18d4SPeter Zijlstra 	if (PageAnon(hpage))
1301746b18d4SPeter Zijlstra 		anon_vma = page_get_anon_vma(hpage);
1302290408d4SNaoya Horiguchi 
13037db7671fSHugh Dickins 	if (unlikely(!trylock_page(new_hpage)))
13047db7671fSHugh Dickins 		goto put_anon;
13057db7671fSHugh Dickins 
13062ebba6b7SHugh Dickins 	if (page_mapped(hpage)) {
13072ebba6b7SHugh Dickins 		try_to_unmap(hpage,
13082ebba6b7SHugh Dickins 			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
13092ebba6b7SHugh Dickins 		page_was_mapped = 1;
13102ebba6b7SHugh Dickins 	}
1311290408d4SNaoya Horiguchi 
1312290408d4SNaoya Horiguchi 	if (!page_mapped(hpage))
13135c3f9a67SHugh Dickins 		rc = move_to_new_page(new_hpage, hpage, mode);
1314290408d4SNaoya Horiguchi 
13155c3f9a67SHugh Dickins 	if (page_was_mapped)
13165c3f9a67SHugh Dickins 		remove_migration_ptes(hpage,
1317e388466dSKirill A. Shutemov 			rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
1318290408d4SNaoya Horiguchi 
13197db7671fSHugh Dickins 	unlock_page(new_hpage);
13207db7671fSHugh Dickins 
13217db7671fSHugh Dickins put_anon:
1322fd4a4663SHugh Dickins 	if (anon_vma)
13239e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
13248e6ac7faSAneesh Kumar K.V 
13252def7424SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
13268e6ac7faSAneesh Kumar K.V 		hugetlb_cgroup_migrate(hpage, new_hpage);
13272def7424SHugh Dickins 		put_new_page = NULL;
13287cd12b4aSVlastimil Babka 		set_page_owner_migrate_reason(new_hpage, reason);
13292def7424SHugh Dickins 	}
13308e6ac7faSAneesh Kumar K.V 
1331290408d4SNaoya Horiguchi 	unlock_page(hpage);
133209761333SHillf Danton out:
1333b8ec1ceeSNaoya Horiguchi 	if (rc != -EAGAIN)
1334b8ec1ceeSNaoya Horiguchi 		putback_active_hugepage(hpage);
1335c3114a84SAnshuman Khandual 	if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
1336c3114a84SAnshuman Khandual 		num_poisoned_pages_inc();
133768711a74SDavid Rientjes 
133868711a74SDavid Rientjes 	/*
133968711a74SDavid Rientjes 	 * If migration was not successful and there's a freeing callback, use
134068711a74SDavid Rientjes 	 * it.  Otherwise, put_page() will drop the reference grabbed during
134168711a74SDavid Rientjes 	 * isolation.
134268711a74SDavid Rientjes 	 */
13432def7424SHugh Dickins 	if (put_new_page)
134468711a74SDavid Rientjes 		put_new_page(new_hpage, private);
134568711a74SDavid Rientjes 	else
13463aaa76e1SNaoya Horiguchi 		putback_active_hugepage(new_hpage);
134768711a74SDavid Rientjes 
1348290408d4SNaoya Horiguchi 	if (result) {
1349290408d4SNaoya Horiguchi 		if (rc)
1350290408d4SNaoya Horiguchi 			*result = rc;
1351290408d4SNaoya Horiguchi 		else
1352290408d4SNaoya Horiguchi 			*result = page_to_nid(new_hpage);
1353290408d4SNaoya Horiguchi 	}
1354290408d4SNaoya Horiguchi 	return rc;
1355290408d4SNaoya Horiguchi }
1356290408d4SNaoya Horiguchi 
1357290408d4SNaoya Horiguchi /*
1358c73e5c9cSSrivatsa S. Bhat  * migrate_pages - migrate the pages specified in a list, to the free pages
1359c73e5c9cSSrivatsa S. Bhat  *		   supplied as the target for the page migration
1360e24f0b8fSChristoph Lameter  *
1361c73e5c9cSSrivatsa S. Bhat  * @from:		The list of pages to be migrated.
1362c73e5c9cSSrivatsa S. Bhat  * @get_new_page:	The function used to allocate free pages to be used
1363c73e5c9cSSrivatsa S. Bhat  *			as the target of the page migration.
136468711a74SDavid Rientjes  * @put_new_page:	The function used to free target pages if migration
136568711a74SDavid Rientjes  *			fails, or NULL if no special handling is necessary.
1366c73e5c9cSSrivatsa S. Bhat  * @private:		Private data to be passed on to get_new_page()
1367c73e5c9cSSrivatsa S. Bhat  * @mode:		The migration mode that specifies the constraints for
1368c73e5c9cSSrivatsa S. Bhat  *			page migration, if any.
1369c73e5c9cSSrivatsa S. Bhat  * @reason:		The reason for page migration.
1370e24f0b8fSChristoph Lameter  *
1371c73e5c9cSSrivatsa S. Bhat  * The function returns after 10 attempts or when no pages are movable any
1372c73e5c9cSSrivatsa S. Bhat  * more, either because the list has become empty or no retryable pages remain.
137314e0f9bcSHugh Dickins  * The caller should call putback_movable_pages() to return pages to the LRU
137428bd6578SMinchan Kim  * or free list only if ret != 0.
1375e24f0b8fSChristoph Lameter  *
1376c73e5c9cSSrivatsa S. Bhat  * Returns the number of pages that were not migrated, or an error code.
1377e24f0b8fSChristoph Lameter  */
13789c620e2bSHugh Dickins int migrate_pages(struct list_head *from, new_page_t get_new_page,
137968711a74SDavid Rientjes 		free_page_t put_new_page, unsigned long private,
138068711a74SDavid Rientjes 		enum migrate_mode mode, int reason)
1381e24f0b8fSChristoph Lameter {
1382e24f0b8fSChristoph Lameter 	int retry = 1;
1383e24f0b8fSChristoph Lameter 	int nr_failed = 0;
13845647bc29SMel Gorman 	int nr_succeeded = 0;
1385e24f0b8fSChristoph Lameter 	int pass = 0;
1386e24f0b8fSChristoph Lameter 	struct page *page;
1387e24f0b8fSChristoph Lameter 	struct page *page2;
1388e24f0b8fSChristoph Lameter 	int swapwrite = current->flags & PF_SWAPWRITE;
1389e24f0b8fSChristoph Lameter 	int rc;
13902d1db3b1SChristoph Lameter 
1391e24f0b8fSChristoph Lameter 	if (!swapwrite)
1392e24f0b8fSChristoph Lameter 		current->flags |= PF_SWAPWRITE;
1393e24f0b8fSChristoph Lameter 
1394e24f0b8fSChristoph Lameter 	for(pass = 0; pass < 10 && retry; pass++) {
1395e24f0b8fSChristoph Lameter 		retry = 0;
1396e24f0b8fSChristoph Lameter 
1397e24f0b8fSChristoph Lameter 		list_for_each_entry_safe(page, page2, from, lru) {
1398e24f0b8fSChristoph Lameter 			cond_resched();
1399e24f0b8fSChristoph Lameter 
140031caf665SNaoya Horiguchi 			if (PageHuge(page))
140131caf665SNaoya Horiguchi 				rc = unmap_and_move_huge_page(get_new_page,
140268711a74SDavid Rientjes 						put_new_page, private, page,
14037cd12b4aSVlastimil Babka 						pass > 2, mode, reason);
140431caf665SNaoya Horiguchi 			else
140568711a74SDavid Rientjes 				rc = unmap_and_move(get_new_page, put_new_page,
1406add05cecSNaoya Horiguchi 						private, page, pass > 2, mode,
1407add05cecSNaoya Horiguchi 						reason);
1408e24f0b8fSChristoph Lameter 
1409e24f0b8fSChristoph Lameter 			switch(rc) {
141095a402c3SChristoph Lameter 			case -ENOMEM:
1411dfef2ef4SDavid Rientjes 				nr_failed++;
141295a402c3SChristoph Lameter 				goto out;
1413e24f0b8fSChristoph Lameter 			case -EAGAIN:
1414b20a3503SChristoph Lameter 				retry++;
1415e24f0b8fSChristoph Lameter 				break;
141678bd5209SRafael Aquini 			case MIGRATEPAGE_SUCCESS:
14175647bc29SMel Gorman 				nr_succeeded++;
1418e24f0b8fSChristoph Lameter 				break;
1419e24f0b8fSChristoph Lameter 			default:
1420354a3363SNaoya Horiguchi 				/*
1421354a3363SNaoya Horiguchi 				 * Permanent failure (-EBUSY, -ENOSYS, etc.):
1422354a3363SNaoya Horiguchi 				 * unlike -EAGAIN case, the failed page is
1423354a3363SNaoya Horiguchi 				 * removed from migration page list and not
1424354a3363SNaoya Horiguchi 				 * retried in the next outer loop.
1425354a3363SNaoya Horiguchi 				 */
1426b20a3503SChristoph Lameter 				nr_failed++;
1427e24f0b8fSChristoph Lameter 				break;
1428b20a3503SChristoph Lameter 			}
1429b20a3503SChristoph Lameter 		}
1430e24f0b8fSChristoph Lameter 	}
1431f2f81fb2SVlastimil Babka 	nr_failed += retry;
1432f2f81fb2SVlastimil Babka 	rc = nr_failed;
143395a402c3SChristoph Lameter out:
14345647bc29SMel Gorman 	if (nr_succeeded)
14355647bc29SMel Gorman 		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
14365647bc29SMel Gorman 	if (nr_failed)
14375647bc29SMel Gorman 		count_vm_events(PGMIGRATE_FAIL, nr_failed);
14387b2a2d4aSMel Gorman 	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
14397b2a2d4aSMel Gorman 
1440b20a3503SChristoph Lameter 	if (!swapwrite)
1441b20a3503SChristoph Lameter 		current->flags &= ~PF_SWAPWRITE;
1442b20a3503SChristoph Lameter 
144395a402c3SChristoph Lameter 	return rc;
1444b20a3503SChristoph Lameter }
1445b20a3503SChristoph Lameter 
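/*
 * Example (an illustrative sketch, not used anywhere in this file): a
 * minimal migrate_pages() caller.  The helper alloc_page_on_node() is a
 * hypothetical name chosen here only to show the new_page_t shape; compare
 * new_page_node() and do_move_page_to_node_array() below for the real
 * in-tree pattern.
 *
 *	static struct page *alloc_page_on_node(struct page *page,
 *					unsigned long private, int **result)
 *	{
 *		return __alloc_pages_node((int)private,
 *				GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
 *	}
 *
 *	static int move_list_to_node(struct list_head *pagelist, int nid)
 *	{
 *		int err;
 *
 *		err = migrate_pages(pagelist, alloc_page_on_node, NULL,
 *				(unsigned long)nid, MIGRATE_SYNC, MR_SYSCALL);
 *		if (err)
 *			putback_movable_pages(pagelist);
 *		return err;
 *	}
 *
 * On success migrate_pages() has consumed every page on the list; on a
 * non-zero return the caller puts the leftover pages back, as above.
 */
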
1446742755a1SChristoph Lameter #ifdef CONFIG_NUMA
1447742755a1SChristoph Lameter /*
1448742755a1SChristoph Lameter  * Move a list of individual pages
1449742755a1SChristoph Lameter  */
1450742755a1SChristoph Lameter struct page_to_node {
1451742755a1SChristoph Lameter 	unsigned long addr;
1452742755a1SChristoph Lameter 	struct page *page;
1453742755a1SChristoph Lameter 	int node;
1454742755a1SChristoph Lameter 	int status;
1455742755a1SChristoph Lameter };
1456742755a1SChristoph Lameter 
1457742755a1SChristoph Lameter static struct page *new_page_node(struct page *p, unsigned long private,
1458742755a1SChristoph Lameter 		int **result)
1459742755a1SChristoph Lameter {
1460742755a1SChristoph Lameter 	struct page_to_node *pm = (struct page_to_node *)private;
1461742755a1SChristoph Lameter 
1462742755a1SChristoph Lameter 	while (pm->node != MAX_NUMNODES && pm->page != p)
1463742755a1SChristoph Lameter 		pm++;
1464742755a1SChristoph Lameter 
1465742755a1SChristoph Lameter 	if (pm->node == MAX_NUMNODES)
1466742755a1SChristoph Lameter 		return NULL;
1467742755a1SChristoph Lameter 
1468742755a1SChristoph Lameter 	*result = &pm->status;
1469742755a1SChristoph Lameter 
1470e632a938SNaoya Horiguchi 	if (PageHuge(p))
1471e632a938SNaoya Horiguchi 		return alloc_huge_page_node(page_hstate(compound_head(p)),
1472e632a938SNaoya Horiguchi 					pm->node);
1473e8db67ebSNaoya Horiguchi 	else if (thp_migration_supported() && PageTransHuge(p)) {
1474e8db67ebSNaoya Horiguchi 		struct page *thp;
1475e8db67ebSNaoya Horiguchi 
1476e8db67ebSNaoya Horiguchi 		thp = alloc_pages_node(pm->node,
1477e8db67ebSNaoya Horiguchi 			(GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
1478e8db67ebSNaoya Horiguchi 			HPAGE_PMD_ORDER);
1479e8db67ebSNaoya Horiguchi 		if (!thp)
1480e8db67ebSNaoya Horiguchi 			return NULL;
1481e8db67ebSNaoya Horiguchi 		prep_transhuge_page(thp);
1482e8db67ebSNaoya Horiguchi 		return thp;
1483e8db67ebSNaoya Horiguchi 	} else
148496db800fSVlastimil Babka 		return __alloc_pages_node(pm->node,
1485e97ca8e5SJohannes Weiner 				GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
1486742755a1SChristoph Lameter }
1487742755a1SChristoph Lameter 
1488742755a1SChristoph Lameter /*
1489742755a1SChristoph Lameter  * Move a set of pages as indicated in the pm array. The addr
1490742755a1SChristoph Lameter  * field must be set to the virtual address of the page to be moved
1491742755a1SChristoph Lameter  * and the node number must contain a valid target node.
14925e9a0f02SBrice Goglin  * The pm array ends with node = MAX_NUMNODES.
1493742755a1SChristoph Lameter  */
14945e9a0f02SBrice Goglin static int do_move_page_to_node_array(struct mm_struct *mm,
14955e9a0f02SBrice Goglin 				      struct page_to_node *pm,
1496742755a1SChristoph Lameter 				      int migrate_all)
1497742755a1SChristoph Lameter {
1498742755a1SChristoph Lameter 	int err;
1499742755a1SChristoph Lameter 	struct page_to_node *pp;
1500742755a1SChristoph Lameter 	LIST_HEAD(pagelist);
1501742755a1SChristoph Lameter 
1502742755a1SChristoph Lameter 	down_read(&mm->mmap_sem);
1503742755a1SChristoph Lameter 
1504742755a1SChristoph Lameter 	/*
1505742755a1SChristoph Lameter 	 * Build a list of pages to migrate
1506742755a1SChristoph Lameter 	 */
1507742755a1SChristoph Lameter 	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1508742755a1SChristoph Lameter 		struct vm_area_struct *vma;
1509742755a1SChristoph Lameter 		struct page *page;
1510e8db67ebSNaoya Horiguchi 		struct page *head;
1511e8db67ebSNaoya Horiguchi 		unsigned int follflags;
1512742755a1SChristoph Lameter 
1513742755a1SChristoph Lameter 		err = -EFAULT;
1514742755a1SChristoph Lameter 		vma = find_vma(mm, pp->addr);
151570384dc6SGleb Natapov 		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1516742755a1SChristoph Lameter 			goto set_status;
1517742755a1SChristoph Lameter 
1518d899844eSKirill A. Shutemov 		/* FOLL_DUMP to ignore special (like zero) pages */
1519e8db67ebSNaoya Horiguchi 		follflags = FOLL_GET | FOLL_DUMP;
1520e8db67ebSNaoya Horiguchi 		if (!thp_migration_supported())
1521e8db67ebSNaoya Horiguchi 			follflags |= FOLL_SPLIT;
1522e8db67ebSNaoya Horiguchi 		page = follow_page(vma, pp->addr, follflags);
152389f5b7daSLinus Torvalds 
152489f5b7daSLinus Torvalds 		err = PTR_ERR(page);
152589f5b7daSLinus Torvalds 		if (IS_ERR(page))
152689f5b7daSLinus Torvalds 			goto set_status;
152789f5b7daSLinus Torvalds 
1528742755a1SChristoph Lameter 		err = -ENOENT;
1529742755a1SChristoph Lameter 		if (!page)
1530742755a1SChristoph Lameter 			goto set_status;
1531742755a1SChristoph Lameter 
1532742755a1SChristoph Lameter 		err = page_to_nid(page);
1533742755a1SChristoph Lameter 
1534742755a1SChristoph Lameter 		if (err == pp->node)
1535742755a1SChristoph Lameter 			/*
1536742755a1SChristoph Lameter 			 * Page is already on the right node
1537742755a1SChristoph Lameter 			 */
1538742755a1SChristoph Lameter 			goto put_and_set;
1539742755a1SChristoph Lameter 
1540742755a1SChristoph Lameter 		err = -EACCES;
1541742755a1SChristoph Lameter 		if (page_mapcount(page) > 1 &&
1542742755a1SChristoph Lameter 				!migrate_all)
1543742755a1SChristoph Lameter 			goto put_and_set;
1544742755a1SChristoph Lameter 
1545e632a938SNaoya Horiguchi 		if (PageHuge(page)) {
1546e8db67ebSNaoya Horiguchi 			if (PageHead(page)) {
1547e632a938SNaoya Horiguchi 				isolate_huge_page(page, &pagelist);
1548e8db67ebSNaoya Horiguchi 				err = 0;
1549e8db67ebSNaoya Horiguchi 				pp->page = page;
1550e8db67ebSNaoya Horiguchi 			}
1551e632a938SNaoya Horiguchi 			goto put_and_set;
1552e632a938SNaoya Horiguchi 		}
1553e632a938SNaoya Horiguchi 
1554e8db67ebSNaoya Horiguchi 		pp->page = compound_head(page);
1555e8db67ebSNaoya Horiguchi 		head = compound_head(page);
1556e8db67ebSNaoya Horiguchi 		err = isolate_lru_page(head);
15576d9c285aSKOSAKI Motohiro 		if (!err) {
1558e8db67ebSNaoya Horiguchi 			list_add_tail(&head->lru, &pagelist);
1559e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(head),
1560e8db67ebSNaoya Horiguchi 				NR_ISOLATED_ANON + page_is_file_cache(head),
1561e8db67ebSNaoya Horiguchi 				hpage_nr_pages(head));
15626d9c285aSKOSAKI Motohiro 		}
1563742755a1SChristoph Lameter put_and_set:
1564742755a1SChristoph Lameter 		/*
1565742755a1SChristoph Lameter 		 * Either remove the duplicate refcount from
1566742755a1SChristoph Lameter 		 * isolate_lru_page() or drop the page ref if it was
1567742755a1SChristoph Lameter 		 * not isolated.
1568742755a1SChristoph Lameter 		 */
1569742755a1SChristoph Lameter 		put_page(page);
1570742755a1SChristoph Lameter set_status:
1571742755a1SChristoph Lameter 		pp->status = err;
1572742755a1SChristoph Lameter 	}
1573742755a1SChristoph Lameter 
1574e78bbfa8SBrice Goglin 	err = 0;
1575cf608ac1SMinchan Kim 	if (!list_empty(&pagelist)) {
157668711a74SDavid Rientjes 		err = migrate_pages(&pagelist, new_page_node, NULL,
15779c620e2bSHugh Dickins 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1578cf608ac1SMinchan Kim 		if (err)
1579e632a938SNaoya Horiguchi 			putback_movable_pages(&pagelist);
1580cf608ac1SMinchan Kim 	}
1581742755a1SChristoph Lameter 
1582742755a1SChristoph Lameter 	up_read(&mm->mmap_sem);
1583742755a1SChristoph Lameter 	return err;
1584742755a1SChristoph Lameter }
1585742755a1SChristoph Lameter 
1586742755a1SChristoph Lameter /*
15875e9a0f02SBrice Goglin  * Migrate an array of page addresses onto an array of nodes and fill
15885e9a0f02SBrice Goglin  * the corresponding array of status values.
15895e9a0f02SBrice Goglin  */
15903268c63eSChristoph Lameter static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
15915e9a0f02SBrice Goglin 			 unsigned long nr_pages,
15925e9a0f02SBrice Goglin 			 const void __user * __user *pages,
15935e9a0f02SBrice Goglin 			 const int __user *nodes,
15945e9a0f02SBrice Goglin 			 int __user *status, int flags)
15955e9a0f02SBrice Goglin {
15963140a227SBrice Goglin 	struct page_to_node *pm;
15973140a227SBrice Goglin 	unsigned long chunk_nr_pages;
15983140a227SBrice Goglin 	unsigned long chunk_start;
15993140a227SBrice Goglin 	int err;
16005e9a0f02SBrice Goglin 
16015e9a0f02SBrice Goglin 	err = -ENOMEM;
16023140a227SBrice Goglin 	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
16033140a227SBrice Goglin 	if (!pm)
16045e9a0f02SBrice Goglin 		goto out;
160535282a2dSBrice Goglin 
160635282a2dSBrice Goglin 	migrate_prep();
160735282a2dSBrice Goglin 
16085e9a0f02SBrice Goglin 	/*
16093140a227SBrice Goglin 	 * Store a chunk of the page_to_node array in a single page,
16103140a227SBrice Goglin 	 * but keep the last slot free for the end marker
16115e9a0f02SBrice Goglin 	 */
16123140a227SBrice Goglin 	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
16133140a227SBrice Goglin 
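	/*
	 * Size note (figures assume a 64-bit build with 4KiB pages, so they
	 * are illustrative only): struct page_to_node is 24 bytes
	 * (8 + 8 + 4 + 4), one page holds 4096 / 24 = 170 entries, and
	 * reserving the last slot for the MAX_NUMNODES end marker leaves
	 * chunk_nr_pages = 169 user requests per chunk.
	 */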
16143140a227SBrice Goglin 	for (chunk_start = 0;
16153140a227SBrice Goglin 	     chunk_start < nr_pages;
16163140a227SBrice Goglin 	     chunk_start += chunk_nr_pages) {
16173140a227SBrice Goglin 		int j;
16183140a227SBrice Goglin 
16193140a227SBrice Goglin 		if (chunk_start + chunk_nr_pages > nr_pages)
16203140a227SBrice Goglin 			chunk_nr_pages = nr_pages - chunk_start;
16213140a227SBrice Goglin 
16223140a227SBrice Goglin 		/* fill the chunk pm with addrs and nodes from user-space */
16233140a227SBrice Goglin 		for (j = 0; j < chunk_nr_pages; j++) {
16245e9a0f02SBrice Goglin 			const void __user *p;
16255e9a0f02SBrice Goglin 			int node;
16265e9a0f02SBrice Goglin 
16273140a227SBrice Goglin 			err = -EFAULT;
16283140a227SBrice Goglin 			if (get_user(p, pages + j + chunk_start))
16293140a227SBrice Goglin 				goto out_pm;
16303140a227SBrice Goglin 			pm[j].addr = (unsigned long) p;
16313140a227SBrice Goglin 
16323140a227SBrice Goglin 			if (get_user(node, nodes + j + chunk_start))
16335e9a0f02SBrice Goglin 				goto out_pm;
16345e9a0f02SBrice Goglin 
16355e9a0f02SBrice Goglin 			err = -ENODEV;
16366f5a55f1SLinus Torvalds 			if (node < 0 || node >= MAX_NUMNODES)
16376f5a55f1SLinus Torvalds 				goto out_pm;
16386f5a55f1SLinus Torvalds 
1639389162c2SLai Jiangshan 			if (!node_state(node, N_MEMORY))
16405e9a0f02SBrice Goglin 				goto out_pm;
16415e9a0f02SBrice Goglin 
16425e9a0f02SBrice Goglin 			err = -EACCES;
16435e9a0f02SBrice Goglin 			if (!node_isset(node, task_nodes))
16445e9a0f02SBrice Goglin 				goto out_pm;
16455e9a0f02SBrice Goglin 
16463140a227SBrice Goglin 			pm[j].node = node;
16475e9a0f02SBrice Goglin 		}
16485e9a0f02SBrice Goglin 
16493140a227SBrice Goglin 		/* End marker for this chunk */
16503140a227SBrice Goglin 		pm[chunk_nr_pages].node = MAX_NUMNODES;
16513140a227SBrice Goglin 
16523140a227SBrice Goglin 		/* Migrate this chunk */
16533140a227SBrice Goglin 		err = do_move_page_to_node_array(mm, pm,
16543140a227SBrice Goglin 						 flags & MPOL_MF_MOVE_ALL);
16553140a227SBrice Goglin 		if (err < 0)
16563140a227SBrice Goglin 			goto out_pm;
16573140a227SBrice Goglin 
16585e9a0f02SBrice Goglin 		/* Return status information */
16593140a227SBrice Goglin 		for (j = 0; j < chunk_nr_pages; j++)
16603140a227SBrice Goglin 			if (put_user(pm[j].status, status + j + chunk_start)) {
16615e9a0f02SBrice Goglin 				err = -EFAULT;
16623140a227SBrice Goglin 				goto out_pm;
16633140a227SBrice Goglin 			}
16643140a227SBrice Goglin 	}
16653140a227SBrice Goglin 	err = 0;
16665e9a0f02SBrice Goglin 
16675e9a0f02SBrice Goglin out_pm:
16683140a227SBrice Goglin 	free_page((unsigned long)pm);
16695e9a0f02SBrice Goglin out:
16705e9a0f02SBrice Goglin 	return err;
16715e9a0f02SBrice Goglin }
16725e9a0f02SBrice Goglin 
16735e9a0f02SBrice Goglin /*
16742f007e74SBrice Goglin  * Determine the nodes of an array of pages and store them in an array of status values.
1675742755a1SChristoph Lameter  */
167680bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
167780bba129SBrice Goglin 				const void __user **pages, int *status)
1678742755a1SChristoph Lameter {
16792f007e74SBrice Goglin 	unsigned long i;
1680742755a1SChristoph Lameter 
16812f007e74SBrice Goglin 	down_read(&mm->mmap_sem);
16822f007e74SBrice Goglin 
16832f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
168480bba129SBrice Goglin 		unsigned long addr = (unsigned long)(*pages);
16852f007e74SBrice Goglin 		struct vm_area_struct *vma;
16862f007e74SBrice Goglin 		struct page *page;
1687c095adbcSKOSAKI Motohiro 		int err = -EFAULT;
16882f007e74SBrice Goglin 
16892f007e74SBrice Goglin 		vma = find_vma(mm, addr);
169070384dc6SGleb Natapov 		if (!vma || addr < vma->vm_start)
1691742755a1SChristoph Lameter 			goto set_status;
1692742755a1SChristoph Lameter 
1693d899844eSKirill A. Shutemov 		/* FOLL_DUMP to ignore special (like zero) pages */
1694d899844eSKirill A. Shutemov 		page = follow_page(vma, addr, FOLL_DUMP);
169589f5b7daSLinus Torvalds 
169689f5b7daSLinus Torvalds 		err = PTR_ERR(page);
169789f5b7daSLinus Torvalds 		if (IS_ERR(page))
169889f5b7daSLinus Torvalds 			goto set_status;
169989f5b7daSLinus Torvalds 
1700d899844eSKirill A. Shutemov 		err = page ? page_to_nid(page) : -ENOENT;
1701742755a1SChristoph Lameter set_status:
170280bba129SBrice Goglin 		*status = err;
170380bba129SBrice Goglin 
170480bba129SBrice Goglin 		pages++;
170580bba129SBrice Goglin 		status++;
170680bba129SBrice Goglin 	}
170780bba129SBrice Goglin 
170880bba129SBrice Goglin 	up_read(&mm->mmap_sem);
170980bba129SBrice Goglin }
171080bba129SBrice Goglin 
171180bba129SBrice Goglin /*
171280bba129SBrice Goglin  * Determine the nodes of a user array of pages and store them in
171380bba129SBrice Goglin  * a user array of status values.
171480bba129SBrice Goglin  */
171580bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
171680bba129SBrice Goglin 			 const void __user * __user *pages,
171780bba129SBrice Goglin 			 int __user *status)
171880bba129SBrice Goglin {
171980bba129SBrice Goglin #define DO_PAGES_STAT_CHUNK_NR 16
172080bba129SBrice Goglin 	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
172180bba129SBrice Goglin 	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
172280bba129SBrice Goglin 
172387b8d1adSH. Peter Anvin 	while (nr_pages) {
172487b8d1adSH. Peter Anvin 		unsigned long chunk_nr;
172580bba129SBrice Goglin 
172687b8d1adSH. Peter Anvin 		chunk_nr = nr_pages;
172787b8d1adSH. Peter Anvin 		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
172887b8d1adSH. Peter Anvin 			chunk_nr = DO_PAGES_STAT_CHUNK_NR;
172987b8d1adSH. Peter Anvin 
173087b8d1adSH. Peter Anvin 		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
173187b8d1adSH. Peter Anvin 			break;
173280bba129SBrice Goglin 
173380bba129SBrice Goglin 		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
173480bba129SBrice Goglin 
173587b8d1adSH. Peter Anvin 		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
173687b8d1adSH. Peter Anvin 			break;
1737742755a1SChristoph Lameter 
173887b8d1adSH. Peter Anvin 		pages += chunk_nr;
173987b8d1adSH. Peter Anvin 		status += chunk_nr;
174087b8d1adSH. Peter Anvin 		nr_pages -= chunk_nr;
174187b8d1adSH. Peter Anvin 	}
174287b8d1adSH. Peter Anvin 	return nr_pages ? -EFAULT : 0;
1743742755a1SChristoph Lameter }
1744742755a1SChristoph Lameter 
1745742755a1SChristoph Lameter /*
1746742755a1SChristoph Lameter  * Move a list of pages in the address space of the process identified by
1747742755a1SChristoph Lameter  * @pid (or of the current process when @pid is 0).
1748742755a1SChristoph Lameter  */
1749938bb9f5SHeiko Carstens SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1750938bb9f5SHeiko Carstens 		const void __user * __user *, pages,
1751938bb9f5SHeiko Carstens 		const int __user *, nodes,
1752938bb9f5SHeiko Carstens 		int __user *, status, int, flags)
1753742755a1SChristoph Lameter {
1754742755a1SChristoph Lameter 	struct task_struct *task;
1755742755a1SChristoph Lameter 	struct mm_struct *mm;
17565e9a0f02SBrice Goglin 	int err;
17573268c63eSChristoph Lameter 	nodemask_t task_nodes;
1758742755a1SChristoph Lameter 
1759742755a1SChristoph Lameter 	/* Check flags */
1760742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1761742755a1SChristoph Lameter 		return -EINVAL;
1762742755a1SChristoph Lameter 
1763742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1764742755a1SChristoph Lameter 		return -EPERM;
1765742755a1SChristoph Lameter 
1766742755a1SChristoph Lameter 	/* Find the mm_struct */
1767a879bf58SGreg Thelen 	rcu_read_lock();
1768228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
1769742755a1SChristoph Lameter 	if (!task) {
1770a879bf58SGreg Thelen 		rcu_read_unlock();
1771742755a1SChristoph Lameter 		return -ESRCH;
1772742755a1SChristoph Lameter 	}
17733268c63eSChristoph Lameter 	get_task_struct(task);
1774742755a1SChristoph Lameter 
1775742755a1SChristoph Lameter 	/*
1776742755a1SChristoph Lameter 	 * Check if this process has the right to modify the specified
1777197e7e52SLinus Torvalds 	 * process. Use the regular "ptrace_may_access()" checks.
1778742755a1SChristoph Lameter 	 */
1779197e7e52SLinus Torvalds 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1780c69e8d9cSDavid Howells 		rcu_read_unlock();
1781742755a1SChristoph Lameter 		err = -EPERM;
17825e9a0f02SBrice Goglin 		goto out;
1783742755a1SChristoph Lameter 	}
1784c69e8d9cSDavid Howells 	rcu_read_unlock();
1785742755a1SChristoph Lameter 
178686c3a764SDavid Quigley 	err = security_task_movememory(task);
178786c3a764SDavid Quigley 	if (err)
1788742755a1SChristoph Lameter 		goto out;
1789742755a1SChristoph Lameter 
17903268c63eSChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
17913268c63eSChristoph Lameter 	mm = get_task_mm(task);
17923268c63eSChristoph Lameter 	put_task_struct(task);
17933268c63eSChristoph Lameter 
17946e8b09eaSSasha Levin 	if (!mm)
17956e8b09eaSSasha Levin 		return -EINVAL;
17966e8b09eaSSasha Levin 
17973268c63eSChristoph Lameter 	if (nodes)
17983268c63eSChristoph Lameter 		err = do_pages_move(mm, task_nodes, nr_pages, pages,
17993268c63eSChristoph Lameter 				    nodes, status, flags);
18003268c63eSChristoph Lameter 	else
18015e9a0f02SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
18023268c63eSChristoph Lameter 
18033268c63eSChristoph Lameter 	mmput(mm);
18043268c63eSChristoph Lameter 	return err;
1805742755a1SChristoph Lameter 
1806742755a1SChristoph Lameter out:
18073268c63eSChristoph Lameter 	put_task_struct(task);
1808742755a1SChristoph Lameter 	return err;
1809742755a1SChristoph Lameter }
1810742755a1SChristoph Lameter 
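/*
 * Userspace view (an illustrative sketch): the syscall above backs the
 * move_pages(2) interface.  A caller passes parallel arrays of page
 * addresses and target nodes and reads per-page results from the status
 * array, e.g. with the libnuma wrapper declared in <numaif.h>:
 *
 *	#include <numaif.h>
 *
 *	// Ask the kernel to move two of our own pages to node 1.
 *	static int move_two_pages_to_node1(void *addr0, void *addr1)
 *	{
 *		void *pages[2] = { addr0, addr1 };
 *		int nodes[2] = { 1, 1 };
 *		int status[2];
 *		long rc = move_pages(0, 2, pages, nodes, status, MPOL_MF_MOVE);
 *
 *		// rc < 0 is a syscall-level error; otherwise each status[i]
 *		// is the node the page ended up on, or a negative errno.
 *		return rc < 0 ? -1 : 0;
 *	}
 *
 * Passing nodes == NULL only queries the current node of each page, which
 * is the do_pages_stat() path above.
 */
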
18117039e1dbSPeter Zijlstra #ifdef CONFIG_NUMA_BALANCING
18127039e1dbSPeter Zijlstra /*
18137039e1dbSPeter Zijlstra  * Returns true if this is a safe migration target node for misplaced NUMA
18147039e1dbSPeter Zijlstra  * pages. Currently it only checks the watermarks, which is a crude check.
18157039e1dbSPeter Zijlstra  */
18167039e1dbSPeter Zijlstra static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
18173abef4e6SMel Gorman 				   unsigned long nr_migrate_pages)
18187039e1dbSPeter Zijlstra {
18197039e1dbSPeter Zijlstra 	int z;
1820599d0c95SMel Gorman 
18217039e1dbSPeter Zijlstra 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
18227039e1dbSPeter Zijlstra 		struct zone *zone = pgdat->node_zones + z;
18237039e1dbSPeter Zijlstra 
18247039e1dbSPeter Zijlstra 		if (!populated_zone(zone))
18257039e1dbSPeter Zijlstra 			continue;
18267039e1dbSPeter Zijlstra 
18277039e1dbSPeter Zijlstra 		/* Avoid waking kswapd by allocating nr_migrate_pages pages. */
18287039e1dbSPeter Zijlstra 		if (!zone_watermark_ok(zone, 0,
18297039e1dbSPeter Zijlstra 				       high_wmark_pages(zone) +
18307039e1dbSPeter Zijlstra 				       nr_migrate_pages,
18317039e1dbSPeter Zijlstra 				       0, 0))
18327039e1dbSPeter Zijlstra 			continue;
18337039e1dbSPeter Zijlstra 		return true;
18347039e1dbSPeter Zijlstra 	}
18357039e1dbSPeter Zijlstra 	return false;
18367039e1dbSPeter Zijlstra }
18377039e1dbSPeter Zijlstra 
18387039e1dbSPeter Zijlstra static struct page *alloc_misplaced_dst_page(struct page *page,
18397039e1dbSPeter Zijlstra 					   unsigned long data,
18407039e1dbSPeter Zijlstra 					   int **result)
18417039e1dbSPeter Zijlstra {
18427039e1dbSPeter Zijlstra 	int nid = (int) data;
18437039e1dbSPeter Zijlstra 	struct page *newpage;
18447039e1dbSPeter Zijlstra 
184596db800fSVlastimil Babka 	newpage = __alloc_pages_node(nid,
1846e97ca8e5SJohannes Weiner 					 (GFP_HIGHUSER_MOVABLE |
1847e97ca8e5SJohannes Weiner 					  __GFP_THISNODE | __GFP_NOMEMALLOC |
1848e97ca8e5SJohannes Weiner 					  __GFP_NORETRY | __GFP_NOWARN) &
18498479eba7SMel Gorman 					 ~__GFP_RECLAIM, 0);
1850bac0382cSHillf Danton 
18517039e1dbSPeter Zijlstra 	return newpage;
18527039e1dbSPeter Zijlstra }
18537039e1dbSPeter Zijlstra 
18547039e1dbSPeter Zijlstra /*
1855a8f60772SMel Gorman  * page migration rate limiting control.
1856a8f60772SMel Gorman  * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1857a8f60772SMel Gorman  * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs
1858a8f60772SMel Gorman  */
1859a8f60772SMel Gorman static unsigned int migrate_interval_millisecs __read_mostly = 100;
1860a8f60772SMel Gorman static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
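/*
 * With 4KiB pages the defaults above work out to 128 << (20 - 12) = 32768
 * pages, i.e. 128MiB, per 100ms window, matching the "1280M per second"
 * figure in the comment above.  Larger base pages keep the same byte
 * budget because the shift scales with PAGE_SHIFT.
 */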
1861a8f60772SMel Gorman 
1862b32967ffSMel Gorman /* Returns true if the node is migrate rate-limited after the update */
18631c30e017SMel Gorman static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
18641c30e017SMel Gorman 					unsigned long nr_pages)
1865b32967ffSMel Gorman {
1866b32967ffSMel Gorman 	/*
1867b32967ffSMel Gorman 	 * Rate-limit the amount of data that is being migrated to a node.
1868b32967ffSMel Gorman 	 * Optimal placement is no good if the memory bus is saturated and
1869b32967ffSMel Gorman 	 * all the time is being spent migrating!
1870b32967ffSMel Gorman 	 */
1871b32967ffSMel Gorman 	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
18721c5e9c27SMel Gorman 		spin_lock(&pgdat->numabalancing_migrate_lock);
1873b32967ffSMel Gorman 		pgdat->numabalancing_migrate_nr_pages = 0;
1874b32967ffSMel Gorman 		pgdat->numabalancing_migrate_next_window = jiffies +
1875b32967ffSMel Gorman 			msecs_to_jiffies(migrate_interval_millisecs);
18761c5e9c27SMel Gorman 		spin_unlock(&pgdat->numabalancing_migrate_lock);
1877b32967ffSMel Gorman 	}
1878af1839d7SMel Gorman 	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1879af1839d7SMel Gorman 		trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1880af1839d7SMel Gorman 								nr_pages);
18811c5e9c27SMel Gorman 		return true;
1882af1839d7SMel Gorman 	}
1883b32967ffSMel Gorman 
18841c5e9c27SMel Gorman 	/*
18851c5e9c27SMel Gorman 	 * This is an unlocked non-atomic update so errors are possible.
18861c5e9c27SMel Gorman 	 * The consequence is failing to migrate when we potentially should
18871c5e9c27SMel Gorman 	 * have, which is not severe enough to warrant locking. If it is ever
18881c5e9c27SMel Gorman 	 * a problem, it can be converted to a per-cpu counter.
18891c5e9c27SMel Gorman 	 */
18901c5e9c27SMel Gorman 	pgdat->numabalancing_migrate_nr_pages += nr_pages;
18911c5e9c27SMel Gorman 	return false;
1892b32967ffSMel Gorman }
1893b32967ffSMel Gorman 
18941c30e017SMel Gorman static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1895b32967ffSMel Gorman {
1896340ef390SHugh Dickins 	int page_lru;
1897b32967ffSMel Gorman 
1898309381feSSasha Levin 	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
18993abef4e6SMel Gorman 
1900b32967ffSMel Gorman 	/* Avoid migrating to a node that is nearly full */
1901340ef390SHugh Dickins 	if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1902340ef390SHugh Dickins 		return 0;
1903b32967ffSMel Gorman 
1904340ef390SHugh Dickins 	if (isolate_lru_page(page))
1905340ef390SHugh Dickins 		return 0;
1906340ef390SHugh Dickins 
1907340ef390SHugh Dickins 	/*
1908340ef390SHugh Dickins 	 * migrate_misplaced_transhuge_page() skips page migration's usual
1909340ef390SHugh Dickins 	 * check on page_count(), so we must do it here, now that the page
1910340ef390SHugh Dickins 	 * has been isolated: a GUP pin, or any other pin, prevents migration.
1911340ef390SHugh Dickins 	 * The expected page count is 3: 1 for page's mapcount and 1 for the
1912340ef390SHugh Dickins 	 * caller's pin and 1 for the reference taken by isolate_lru_page().
1913340ef390SHugh Dickins 	 */
1914340ef390SHugh Dickins 	if (PageTransHuge(page) && page_count(page) != 3) {
1915340ef390SHugh Dickins 		putback_lru_page(page);
1916b32967ffSMel Gorman 		return 0;
1917b32967ffSMel Gorman 	}
1918b32967ffSMel Gorman 
1919b32967ffSMel Gorman 	page_lru = page_is_file_cache(page);
1920599d0c95SMel Gorman 	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
1921340ef390SHugh Dickins 				hpage_nr_pages(page));
1922b32967ffSMel Gorman 
1923b32967ffSMel Gorman 	/*
1924340ef390SHugh Dickins 	 * Isolating the page has taken another reference, so the
1925340ef390SHugh Dickins 	 * caller's reference can be safely dropped without the page
1926340ef390SHugh Dickins 	 * disappearing underneath us during migration.
1927b32967ffSMel Gorman 	 */
1928b32967ffSMel Gorman 	put_page(page);
1929340ef390SHugh Dickins 	return 1;
1930b32967ffSMel Gorman }
1931b32967ffSMel Gorman 
1932de466bd6SMel Gorman bool pmd_trans_migrating(pmd_t pmd)
1933de466bd6SMel Gorman {
1934de466bd6SMel Gorman 	struct page *page = pmd_page(pmd);
1935de466bd6SMel Gorman 	return PageLocked(page);
1936de466bd6SMel Gorman }
1937de466bd6SMel Gorman 
1938a8f60772SMel Gorman /*
19397039e1dbSPeter Zijlstra  * Attempt to migrate a misplaced page to the specified destination
19407039e1dbSPeter Zijlstra  * node. Caller is expected to have an elevated reference count on
19417039e1dbSPeter Zijlstra  * the page that will be dropped by this function before returning.
19427039e1dbSPeter Zijlstra  */
19431bc115d8SMel Gorman int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
19441bc115d8SMel Gorman 			   int node)
19457039e1dbSPeter Zijlstra {
1946a8f60772SMel Gorman 	pg_data_t *pgdat = NODE_DATA(node);
1947340ef390SHugh Dickins 	int isolated;
1948b32967ffSMel Gorman 	int nr_remaining;
19497039e1dbSPeter Zijlstra 	LIST_HEAD(migratepages);
19507039e1dbSPeter Zijlstra 
19517039e1dbSPeter Zijlstra 	/*
19521bc115d8SMel Gorman 	 * Don't migrate file pages that are mapped in multiple processes
19531bc115d8SMel Gorman 	 * with execute permissions as they are probably shared libraries.
19547039e1dbSPeter Zijlstra 	 */
19551bc115d8SMel Gorman 	if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
19561bc115d8SMel Gorman 	    (vma->vm_flags & VM_EXEC))
19577039e1dbSPeter Zijlstra 		goto out;
19587039e1dbSPeter Zijlstra 
1959a8f60772SMel Gorman 	/*
1960a8f60772SMel Gorman 	 * Rate-limit the amount of data that is being migrated to a node.
1961a8f60772SMel Gorman 	 * Optimal placement is no good if the memory bus is saturated and
1962a8f60772SMel Gorman 	 * all the time is being spent migrating!
1963a8f60772SMel Gorman 	 */
1964340ef390SHugh Dickins 	if (numamigrate_update_ratelimit(pgdat, 1))
1965a8f60772SMel Gorman 		goto out;
1966a8f60772SMel Gorman 
1967b32967ffSMel Gorman 	isolated = numamigrate_isolate_page(pgdat, page);
1968b32967ffSMel Gorman 	if (!isolated)
19697039e1dbSPeter Zijlstra 		goto out;
19707039e1dbSPeter Zijlstra 
19717039e1dbSPeter Zijlstra 	list_add(&page->lru, &migratepages);
19729c620e2bSHugh Dickins 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
197368711a74SDavid Rientjes 				     NULL, node, MIGRATE_ASYNC,
197468711a74SDavid Rientjes 				     MR_NUMA_MISPLACED);
19757039e1dbSPeter Zijlstra 	if (nr_remaining) {
197659c82b70SJoonsoo Kim 		if (!list_empty(&migratepages)) {
197759c82b70SJoonsoo Kim 			list_del(&page->lru);
1978599d0c95SMel Gorman 			dec_node_page_state(page, NR_ISOLATED_ANON +
197959c82b70SJoonsoo Kim 					page_is_file_cache(page));
198059c82b70SJoonsoo Kim 			putback_lru_page(page);
198159c82b70SJoonsoo Kim 		}
19827039e1dbSPeter Zijlstra 		isolated = 0;
198303c5a6e1SMel Gorman 	} else
198403c5a6e1SMel Gorman 		count_vm_numa_event(NUMA_PAGE_MIGRATE);
19857039e1dbSPeter Zijlstra 	BUG_ON(!list_empty(&migratepages));
19867039e1dbSPeter Zijlstra 	return isolated;
1987340ef390SHugh Dickins 
1988340ef390SHugh Dickins out:
1989340ef390SHugh Dickins 	put_page(page);
1990340ef390SHugh Dickins 	return 0;
19917039e1dbSPeter Zijlstra }
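
/*
 * Call pattern (schematic sketch; variable names are placeholders): a NUMA
 * hinting fault handler that holds an extra reference on the page is
 * expected to use this function roughly as follows, with that reference
 * consumed whether or not the migration happens:
 *
 *	migrated = migrate_misplaced_page(page, vma, target_nid);
 *	if (migrated)
 *		page_nid = target_nid;	// account the fault on the new node
 *
 * A non-zero return means the page now lives on target_nid; zero means it
 * stayed (or was put back) where it was.
 */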
1992220018d3SMel Gorman #endif /* CONFIG_NUMA_BALANCING */
1993b32967ffSMel Gorman 
1994220018d3SMel Gorman #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1995340ef390SHugh Dickins /*
1996340ef390SHugh Dickins  * Migrates a THP to a given target node. page must be locked and is unlocked
1997340ef390SHugh Dickins  * before returning.
1998340ef390SHugh Dickins  */
1999b32967ffSMel Gorman int migrate_misplaced_transhuge_page(struct mm_struct *mm,
2000b32967ffSMel Gorman 				struct vm_area_struct *vma,
2001b32967ffSMel Gorman 				pmd_t *pmd, pmd_t entry,
2002b32967ffSMel Gorman 				unsigned long address,
2003b32967ffSMel Gorman 				struct page *page, int node)
2004b32967ffSMel Gorman {
2005c4088ebdSKirill A. Shutemov 	spinlock_t *ptl;
2006b32967ffSMel Gorman 	pg_data_t *pgdat = NODE_DATA(node);
2007b32967ffSMel Gorman 	int isolated = 0;
2008b32967ffSMel Gorman 	struct page *new_page = NULL;
2009b32967ffSMel Gorman 	int page_lru = page_is_file_cache(page);
2010f714f4f2SMel Gorman 	unsigned long mmun_start = address & HPAGE_PMD_MASK;
2011f714f4f2SMel Gorman 	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
2012b32967ffSMel Gorman 
2013b32967ffSMel Gorman 	/*
2014b32967ffSMel Gorman 	 * Rate-limit the amount of data that is being migrated to a node.
2015b32967ffSMel Gorman 	 * Optimal placement is no good if the memory bus is saturated and
2016b32967ffSMel Gorman 	 * all the time is being spent migrating!
2017b32967ffSMel Gorman 	 */
2018d28d4335SMel Gorman 	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
2019b32967ffSMel Gorman 		goto out_dropref;
2020b32967ffSMel Gorman 
2021b32967ffSMel Gorman 	new_page = alloc_pages_node(node,
202225160354SVlastimil Babka 		(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
2023e97ca8e5SJohannes Weiner 		HPAGE_PMD_ORDER);
2024340ef390SHugh Dickins 	if (!new_page)
2025340ef390SHugh Dickins 		goto out_fail;
20269a982250SKirill A. Shutemov 	prep_transhuge_page(new_page);
2027340ef390SHugh Dickins 
2028b32967ffSMel Gorman 	isolated = numamigrate_isolate_page(pgdat, page);
2029340ef390SHugh Dickins 	if (!isolated) {
2030b32967ffSMel Gorman 		put_page(new_page);
2031340ef390SHugh Dickins 		goto out_fail;
2032b32967ffSMel Gorman 	}
2033b0943d61SMel Gorman 
2034b32967ffSMel Gorman 	/* Prepare a page as a migration target */
203548c935adSKirill A. Shutemov 	__SetPageLocked(new_page);
2036d44d363fSShaohua Li 	if (PageSwapBacked(page))
2037fa9949daSHugh Dickins 		__SetPageSwapBacked(new_page);
2038b32967ffSMel Gorman 
2039b32967ffSMel Gorman 	/* anon mapping, we can simply copy page->mapping to the new page: */
2040b32967ffSMel Gorman 	new_page->mapping = page->mapping;
2041b32967ffSMel Gorman 	new_page->index = page->index;
2042b32967ffSMel Gorman 	migrate_page_copy(new_page, page);
2043b32967ffSMel Gorman 	WARN_ON(PageLRU(new_page));
2044b32967ffSMel Gorman 
2045b32967ffSMel Gorman 	/* Recheck the target PMD */
2046f714f4f2SMel Gorman 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2047c4088ebdSKirill A. Shutemov 	ptl = pmd_lock(mm, pmd);
2048f4e177d1SWill Deacon 	if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
2049c4088ebdSKirill A. Shutemov 		spin_unlock(ptl);
2050f714f4f2SMel Gorman 		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2051b32967ffSMel Gorman 
2052b32967ffSMel Gorman 		/* Reverse changes made by migrate_page_copy() */
2053b32967ffSMel Gorman 		if (TestClearPageActive(new_page))
2054b32967ffSMel Gorman 			SetPageActive(page);
2055b32967ffSMel Gorman 		if (TestClearPageUnevictable(new_page))
2056b32967ffSMel Gorman 			SetPageUnevictable(page);
2057b32967ffSMel Gorman 
2058b32967ffSMel Gorman 		unlock_page(new_page);
2059b32967ffSMel Gorman 		put_page(new_page);		/* Free it */
2060b32967ffSMel Gorman 
2061a54a407fSMel Gorman 		/* Retake the callers reference and putback on LRU */
2062a54a407fSMel Gorman 		get_page(page);
2063b32967ffSMel Gorman 		putback_lru_page(page);
2064599d0c95SMel Gorman 		mod_node_page_state(page_pgdat(page),
2065a54a407fSMel Gorman 			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
2066eb4489f6SMel Gorman 
2067eb4489f6SMel Gorman 		goto out_unlock;
2068b32967ffSMel Gorman 	}
2069b32967ffSMel Gorman 
207010102459SKirill A. Shutemov 	entry = mk_huge_pmd(new_page, vma->vm_page_prot);
20712b4847e7SMel Gorman 	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
2072b32967ffSMel Gorman 
20732b4847e7SMel Gorman 	/*
20742b4847e7SMel Gorman 	 * Clear the old entry under pagetable lock and establish the new PTE.
20752b4847e7SMel Gorman 	 * Any parallel GUP will either observe the old page blocking on the
20762b4847e7SMel Gorman 	 * page lock, block on the page table lock or observe the new page.
20772b4847e7SMel Gorman 	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
20782b4847e7SMel Gorman 	 * guarantee the copy is visible before the pagetable update.
20792b4847e7SMel Gorman 	 */
2080f714f4f2SMel Gorman 	flush_cache_range(vma, mmun_start, mmun_end);
2081d281ee61SKirill A. Shutemov 	page_add_anon_rmap(new_page, vma, mmun_start, true);
20828809aa2dSAneesh Kumar K.V 	pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
2083f714f4f2SMel Gorman 	set_pmd_at(mm, mmun_start, pmd, entry);
2084ce4a9cc5SStephen Rothwell 	update_mmu_cache_pmd(vma, address, &entry);
20852b4847e7SMel Gorman 
2086f4e177d1SWill Deacon 	page_ref_unfreeze(page, 2);
208751afb12bSHugh Dickins 	mlock_migrate_page(new_page, page);
2088d281ee61SKirill A. Shutemov 	page_remove_rmap(page, true);
20897cd12b4aSVlastimil Babka 	set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
20902b4847e7SMel Gorman 
2091c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
2092f714f4f2SMel Gorman 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2093b32967ffSMel Gorman 
209411de9927SMel Gorman 	/* Take an "isolate" reference and put new page on the LRU. */
209511de9927SMel Gorman 	get_page(new_page);
209611de9927SMel Gorman 	putback_lru_page(new_page);
209711de9927SMel Gorman 
2098b32967ffSMel Gorman 	unlock_page(new_page);
2099b32967ffSMel Gorman 	unlock_page(page);
2100b32967ffSMel Gorman 	put_page(page);			/* Drop the rmap reference */
2101b32967ffSMel Gorman 	put_page(page);			/* Drop the LRU isolation reference */
2102b32967ffSMel Gorman 
2103b32967ffSMel Gorman 	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
2104b32967ffSMel Gorman 	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
2105b32967ffSMel Gorman 
2106599d0c95SMel Gorman 	mod_node_page_state(page_pgdat(page),
2107b32967ffSMel Gorman 			NR_ISOLATED_ANON + page_lru,
2108b32967ffSMel Gorman 			-HPAGE_PMD_NR);
2109b32967ffSMel Gorman 	return isolated;
2110b32967ffSMel Gorman 
2111340ef390SHugh Dickins out_fail:
2112340ef390SHugh Dickins 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
2113b32967ffSMel Gorman out_dropref:
21142b4847e7SMel Gorman 	ptl = pmd_lock(mm, pmd);
21152b4847e7SMel Gorman 	if (pmd_same(*pmd, entry)) {
21164d942466SMel Gorman 		entry = pmd_modify(entry, vma->vm_page_prot);
2117f714f4f2SMel Gorman 		set_pmd_at(mm, mmun_start, pmd, entry);
2118a54a407fSMel Gorman 		update_mmu_cache_pmd(vma, address, &entry);
21192b4847e7SMel Gorman 	}
21202b4847e7SMel Gorman 	spin_unlock(ptl);
2121a54a407fSMel Gorman 
2122eb4489f6SMel Gorman out_unlock:
2123340ef390SHugh Dickins 	unlock_page(page);
2124b32967ffSMel Gorman 	put_page(page);
2125b32967ffSMel Gorman 	return 0;
2126b32967ffSMel Gorman }
21277039e1dbSPeter Zijlstra #endif /* CONFIG_NUMA_BALANCING */
21287039e1dbSPeter Zijlstra 
21297039e1dbSPeter Zijlstra #endif /* CONFIG_NUMA */
21308763cb45SJérôme Glisse 
21316b368cd4SJérôme Glisse #if defined(CONFIG_MIGRATE_VMA_HELPER)
21328763cb45SJérôme Glisse struct migrate_vma {
21338763cb45SJérôme Glisse 	struct vm_area_struct	*vma;
21348763cb45SJérôme Glisse 	unsigned long		*dst;
21358763cb45SJérôme Glisse 	unsigned long		*src;
21368763cb45SJérôme Glisse 	unsigned long		cpages;
21378763cb45SJérôme Glisse 	unsigned long		npages;
21388763cb45SJérôme Glisse 	unsigned long		start;
21398763cb45SJérôme Glisse 	unsigned long		end;
21408763cb45SJérôme Glisse };
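
/*
 * Field sketch (a summary of how the migrate_vma_collect_*() helpers below
 * use this state; the MIGRATE_PFN_* encoding lives in
 * include/linux/migrate.h): @src and @dst are caller-provided arrays with
 * one slot per PAGE_SIZE step of [@start, @end).  A populated source slot
 * packs a pfn plus flag bits, e.g.
 *
 *	migrate->src[i] = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
 *
 * @npages counts the slots filled in so far and @cpages the subset that can
 * actually be migrated; skipped slots are left as 0.
 */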
21418763cb45SJérôme Glisse 
21428763cb45SJérôme Glisse static int migrate_vma_collect_hole(unsigned long start,
21438763cb45SJérôme Glisse 				    unsigned long end,
21448763cb45SJérôme Glisse 				    struct mm_walk *walk)
21458763cb45SJérôme Glisse {
21468763cb45SJérôme Glisse 	struct migrate_vma *migrate = walk->private;
21478763cb45SJérôme Glisse 	unsigned long addr;
21488763cb45SJérôme Glisse 
21498763cb45SJérôme Glisse 	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2150e20d103bSMark Hairgrove 		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
21518315ada7SJérôme Glisse 		migrate->dst[migrate->npages] = 0;
2152e20d103bSMark Hairgrove 		migrate->npages++;
21538315ada7SJérôme Glisse 		migrate->cpages++;
21548315ada7SJérôme Glisse 	}
21558315ada7SJérôme Glisse 
21568315ada7SJérôme Glisse 	return 0;
21578315ada7SJérôme Glisse }
21588315ada7SJérôme Glisse 
21598315ada7SJérôme Glisse static int migrate_vma_collect_skip(unsigned long start,
21608315ada7SJérôme Glisse 				    unsigned long end,
21618315ada7SJérôme Glisse 				    struct mm_walk *walk)
21628315ada7SJérôme Glisse {
21638315ada7SJérôme Glisse 	struct migrate_vma *migrate = walk->private;
21648315ada7SJérôme Glisse 	unsigned long addr;
21658315ada7SJérôme Glisse 
21668315ada7SJérôme Glisse 	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
21678763cb45SJérôme Glisse 		migrate->dst[migrate->npages] = 0;
21688763cb45SJérôme Glisse 		migrate->src[migrate->npages++] = 0;
21698763cb45SJérôme Glisse 	}
21708763cb45SJérôme Glisse 
21718763cb45SJérôme Glisse 	return 0;
21728763cb45SJérôme Glisse }
21738763cb45SJérôme Glisse 
21748763cb45SJérôme Glisse static int migrate_vma_collect_pmd(pmd_t *pmdp,
21758763cb45SJérôme Glisse 				   unsigned long start,
21768763cb45SJérôme Glisse 				   unsigned long end,
21778763cb45SJérôme Glisse 				   struct mm_walk *walk)
21788763cb45SJérôme Glisse {
21798763cb45SJérôme Glisse 	struct migrate_vma *migrate = walk->private;
21808763cb45SJérôme Glisse 	struct vm_area_struct *vma = walk->vma;
21818763cb45SJérôme Glisse 	struct mm_struct *mm = vma->vm_mm;
21828c3328f1SJérôme Glisse 	unsigned long addr = start, unmapped = 0;
21838763cb45SJérôme Glisse 	spinlock_t *ptl;
21848763cb45SJérôme Glisse 	pte_t *ptep;
21858763cb45SJérôme Glisse 
21868763cb45SJérôme Glisse again:
21878763cb45SJérôme Glisse 	if (pmd_none(*pmdp))
21888763cb45SJérôme Glisse 		return migrate_vma_collect_hole(start, end, walk);
21898763cb45SJérôme Glisse 
21908763cb45SJérôme Glisse 	if (pmd_trans_huge(*pmdp)) {
21918763cb45SJérôme Glisse 		struct page *page;
21928763cb45SJérôme Glisse 
21938763cb45SJérôme Glisse 		ptl = pmd_lock(mm, pmdp);
21948763cb45SJérôme Glisse 		if (unlikely(!pmd_trans_huge(*pmdp))) {
21958763cb45SJérôme Glisse 			spin_unlock(ptl);
21968763cb45SJérôme Glisse 			goto again;
21978763cb45SJérôme Glisse 		}
21988763cb45SJérôme Glisse 
21998763cb45SJérôme Glisse 		page = pmd_page(*pmdp);
22008763cb45SJérôme Glisse 		if (is_huge_zero_page(page)) {
22018763cb45SJérôme Glisse 			spin_unlock(ptl);
22028763cb45SJérôme Glisse 			split_huge_pmd(vma, pmdp, addr);
22038763cb45SJérôme Glisse 			if (pmd_trans_unstable(pmdp))
22048315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
22058763cb45SJérôme Glisse 								walk);
22068763cb45SJérôme Glisse 		} else {
22078763cb45SJérôme Glisse 			int ret;
22088763cb45SJérôme Glisse 
22098763cb45SJérôme Glisse 			get_page(page);
22108763cb45SJérôme Glisse 			spin_unlock(ptl);
22118763cb45SJérôme Glisse 			if (unlikely(!trylock_page(page)))
22128315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
22138763cb45SJérôme Glisse 								walk);
22148763cb45SJérôme Glisse 			ret = split_huge_page(page);
22158763cb45SJérôme Glisse 			unlock_page(page);
22168763cb45SJérôme Glisse 			put_page(page);
22178315ada7SJérôme Glisse 			if (ret)
22188315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
22198315ada7SJérôme Glisse 								walk);
22208315ada7SJérôme Glisse 			if (pmd_none(*pmdp))
22218763cb45SJérôme Glisse 				return migrate_vma_collect_hole(start, end,
22228763cb45SJérôme Glisse 								walk);
22238763cb45SJérôme Glisse 		}
22248763cb45SJérôme Glisse 	}
22258763cb45SJérôme Glisse 
22268763cb45SJérôme Glisse 	if (unlikely(pmd_bad(*pmdp)))
22278315ada7SJérôme Glisse 		return migrate_vma_collect_skip(start, end, walk);
22288763cb45SJérôme Glisse 
22298763cb45SJérôme Glisse 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
22308c3328f1SJérôme Glisse 	arch_enter_lazy_mmu_mode();
22318c3328f1SJérôme Glisse 
22328763cb45SJérôme Glisse 	for (; addr < end; addr += PAGE_SIZE, ptep++) {
22338763cb45SJérôme Glisse 		unsigned long mpfn, pfn;
22348763cb45SJérôme Glisse 		struct page *page;
22358c3328f1SJérôme Glisse 		swp_entry_t entry;
22368763cb45SJérôme Glisse 		pte_t pte;
22378763cb45SJérôme Glisse 
22388763cb45SJérôme Glisse 		pte = *ptep;
22398763cb45SJérôme Glisse 		pfn = pte_pfn(pte);
22408763cb45SJérôme Glisse 
2241a5430ddaSJérôme Glisse 		if (pte_none(pte)) {
22428315ada7SJérôme Glisse 			mpfn = MIGRATE_PFN_MIGRATE;
22438315ada7SJérôme Glisse 			migrate->cpages++;
22448315ada7SJérôme Glisse 			pfn = 0;
22458763cb45SJérôme Glisse 			goto next;
22468763cb45SJérôme Glisse 		}
22478763cb45SJérôme Glisse 
2248a5430ddaSJérôme Glisse 		if (!pte_present(pte)) {
2249a5430ddaSJérôme Glisse 			mpfn = pfn = 0;
2250a5430ddaSJérôme Glisse 
2251a5430ddaSJérôme Glisse 			/*
2252a5430ddaSJérôme Glisse 			 * We only care about the special page table entry of an
2253a5430ddaSJérôme Glisse 			 * unaddressable device page. Other special swap entries
2254a5430ddaSJérôme Glisse 			 * are not migratable, and we ignore regular swapped pages.
2255a5430ddaSJérôme Glisse 			 */
2256a5430ddaSJérôme Glisse 			entry = pte_to_swp_entry(pte);
2257a5430ddaSJérôme Glisse 			if (!is_device_private_entry(entry))
2258a5430ddaSJérôme Glisse 				goto next;
2259a5430ddaSJérôme Glisse 
2260a5430ddaSJérôme Glisse 			page = device_private_entry_to_page(entry);
2261a5430ddaSJérôme Glisse 			mpfn = migrate_pfn(page_to_pfn(page))|
2262a5430ddaSJérôme Glisse 				MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2263a5430ddaSJérôme Glisse 			if (is_write_device_private_entry(entry))
2264a5430ddaSJérôme Glisse 				mpfn |= MIGRATE_PFN_WRITE;
2265a5430ddaSJérôme Glisse 		} else {
22668315ada7SJérôme Glisse 			if (is_zero_pfn(pfn)) {
22678315ada7SJérôme Glisse 				mpfn = MIGRATE_PFN_MIGRATE;
22688315ada7SJérôme Glisse 				migrate->cpages++;
22698315ada7SJérôme Glisse 				pfn = 0;
22708315ada7SJérôme Glisse 				goto next;
22718315ada7SJérôme Glisse 			}
2272df6ad698SJérôme Glisse 			page = _vm_normal_page(migrate->vma, addr, pte, true);
2273a5430ddaSJérôme Glisse 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2274a5430ddaSJérôme Glisse 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2275a5430ddaSJérôme Glisse 		}
2276a5430ddaSJérôme Glisse 
2277a5430ddaSJérôme Glisse 		/* FIXME support THP */
22788763cb45SJérôme Glisse 		if (!page || !page->mapping || PageTransCompound(page)) {
22798763cb45SJérôme Glisse 			mpfn = pfn = 0;
22808763cb45SJérôme Glisse 			goto next;
22818763cb45SJérôme Glisse 		}
2282a5430ddaSJérôme Glisse 		pfn = page_to_pfn(page);
22838763cb45SJérôme Glisse 
22848763cb45SJérôme Glisse 		/*
22858763cb45SJérôme Glisse 		 * By getting a reference on the page we pin it and that blocks
22868763cb45SJérôme Glisse 		 * any kind of migration. A side effect is that it "freezes" the
22878763cb45SJérôme Glisse 		 * pte.
22888763cb45SJérôme Glisse 		 *
22898763cb45SJérôme Glisse 		 * We drop this reference after isolating the page from the lru
22908763cb45SJérôme Glisse 		 * for non-device pages (device pages are not on the lru and thus
22918763cb45SJérôme Glisse 		 * can't be dropped from it).
22928763cb45SJérôme Glisse 		 */
22938763cb45SJérôme Glisse 		get_page(page);
22948763cb45SJérôme Glisse 		migrate->cpages++;
22958763cb45SJérôme Glisse 
22968c3328f1SJérôme Glisse 		/*
22978c3328f1SJérôme Glisse 		 * Optimize for the common case where page is only mapped once
22988c3328f1SJérôme Glisse 		 * in one process. If we can lock the page, then we can safely
22998c3328f1SJérôme Glisse 		 * set up a special migration page table entry now.
23008c3328f1SJérôme Glisse 		 */
23018c3328f1SJérôme Glisse 		if (trylock_page(page)) {
23028c3328f1SJérôme Glisse 			pte_t swp_pte;
23038c3328f1SJérôme Glisse 
23048c3328f1SJérôme Glisse 			mpfn |= MIGRATE_PFN_LOCKED;
23058c3328f1SJérôme Glisse 			ptep_get_and_clear(mm, addr, ptep);
23068c3328f1SJérôme Glisse 
23078c3328f1SJérôme Glisse 			/* Setup special migration page table entry */
23088c3328f1SJérôme Glisse 			entry = make_migration_entry(page, pte_write(pte));
23098c3328f1SJérôme Glisse 			swp_pte = swp_entry_to_pte(entry);
23108c3328f1SJérôme Glisse 			if (pte_soft_dirty(pte))
23118c3328f1SJérôme Glisse 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
23128c3328f1SJérôme Glisse 			set_pte_at(mm, addr, ptep, swp_pte);
23138c3328f1SJérôme Glisse 
23148c3328f1SJérôme Glisse 			/*
23158c3328f1SJérôme Glisse 			 * This is like regular unmap: we remove the rmap and
23168c3328f1SJérôme Glisse 			 * drop page refcount. Page won't be freed, as we took
23178c3328f1SJérôme Glisse 			 * a reference just above.
23188c3328f1SJérôme Glisse 			 */
23198c3328f1SJérôme Glisse 			page_remove_rmap(page, false);
23208c3328f1SJérôme Glisse 			put_page(page);
2321a5430ddaSJérôme Glisse 
2322a5430ddaSJérôme Glisse 			if (pte_present(pte))
23238c3328f1SJérôme Glisse 				unmapped++;
23248c3328f1SJérôme Glisse 		}
23258c3328f1SJérôme Glisse 
23268763cb45SJérôme Glisse next:
2327a5430ddaSJérôme Glisse 		migrate->dst[migrate->npages] = 0;
23288763cb45SJérôme Glisse 		migrate->src[migrate->npages++] = mpfn;
23298763cb45SJérôme Glisse 	}
23308c3328f1SJérôme Glisse 	arch_leave_lazy_mmu_mode();
23318763cb45SJérôme Glisse 	pte_unmap_unlock(ptep - 1, ptl);
23328763cb45SJérôme Glisse 
23338c3328f1SJérôme Glisse 	/* Only flush the TLB if we actually modified any entries */
23348c3328f1SJérôme Glisse 	if (unmapped)
23358c3328f1SJérôme Glisse 		flush_tlb_range(walk->vma, start, end);
23368c3328f1SJérôme Glisse 
23378763cb45SJérôme Glisse 	return 0;
23388763cb45SJérôme Glisse }
23398763cb45SJérôme Glisse 
23408763cb45SJérôme Glisse /*
23418763cb45SJérôme Glisse  * migrate_vma_collect() - collect pages over a range of virtual addresses
23428763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
23438763cb45SJérôme Glisse  *
23448763cb45SJérôme Glisse  * This will walk the CPU page table. For each virtual address backed by a
23458763cb45SJérôme Glisse  * valid page, it updates the src array and takes a reference on the page, in
23468763cb45SJérôme Glisse  * order to pin the page until we lock it and unmap it.
23478763cb45SJérôme Glisse  */
23488763cb45SJérôme Glisse static void migrate_vma_collect(struct migrate_vma *migrate)
23498763cb45SJérôme Glisse {
23508763cb45SJérôme Glisse 	struct mm_walk mm_walk;
23518763cb45SJérôme Glisse 
23528763cb45SJérôme Glisse 	mm_walk.pmd_entry = migrate_vma_collect_pmd;
23538763cb45SJérôme Glisse 	mm_walk.pte_entry = NULL;
23548763cb45SJérôme Glisse 	mm_walk.pte_hole = migrate_vma_collect_hole;
23558763cb45SJérôme Glisse 	mm_walk.hugetlb_entry = NULL;
23568763cb45SJérôme Glisse 	mm_walk.test_walk = NULL;
23578763cb45SJérôme Glisse 	mm_walk.vma = migrate->vma;
23588763cb45SJérôme Glisse 	mm_walk.mm = migrate->vma->vm_mm;
23598763cb45SJérôme Glisse 	mm_walk.private = migrate;
23608763cb45SJérôme Glisse 
23618c3328f1SJérôme Glisse 	mmu_notifier_invalidate_range_start(mm_walk.mm,
23628c3328f1SJérôme Glisse 					    migrate->start,
23638c3328f1SJérôme Glisse 					    migrate->end);
23648763cb45SJérôme Glisse 	walk_page_range(migrate->start, migrate->end, &mm_walk);
23658c3328f1SJérôme Glisse 	mmu_notifier_invalidate_range_end(mm_walk.mm,
23668c3328f1SJérôme Glisse 					  migrate->start,
23678c3328f1SJérôme Glisse 					  migrate->end);
23688763cb45SJérôme Glisse 
23698763cb45SJérôme Glisse 	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
23708763cb45SJérôme Glisse }
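
/*
 * Note on the src[] encoding: each entry written by the collection step is a
 * pfn encoded with migrate_pfn() plus status flags such as MIGRATE_PFN_MIGRATE,
 * MIGRATE_PFN_WRITE, MIGRATE_PFN_LOCKED and MIGRATE_PFN_DEVICE. A caller-side
 * sketch of decoding one entry (variable names are hypothetical):
 *
 *	struct page *spage = migrate_pfn_to_page(src[i]);
 *	bool can_migrate = src[i] & MIGRATE_PFN_MIGRATE;
 *	bool writable = src[i] & MIGRATE_PFN_WRITE;
 *
 * Entries recorded for holes or zero pages carry MIGRATE_PFN_MIGRATE without a
 * valid pfn, so migrate_pfn_to_page() yields NULL for them.
 */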
23718763cb45SJérôme Glisse 
23728763cb45SJérôme Glisse /*
23738763cb45SJérôme Glisse  * migrate_vma_check_page() - check if page is pinned or not
23748763cb45SJérôme Glisse  * @page: struct page to check
23758763cb45SJérôme Glisse  *
23768763cb45SJérôme Glisse  * Pinned pages cannot be migrated. This is the same test as in
23778763cb45SJérôme Glisse  * migrate_page_move_mapping(), except that here we allow migration of a
23788763cb45SJérôme Glisse  * ZONE_DEVICE page.
23798763cb45SJérôme Glisse  */
23808763cb45SJérôme Glisse static bool migrate_vma_check_page(struct page *page)
23818763cb45SJérôme Glisse {
23828763cb45SJérôme Glisse 	/*
23838763cb45SJérôme Glisse 	 * One extra ref because caller holds an extra reference, either from
23848763cb45SJérôme Glisse 	 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
23858763cb45SJérôme Glisse 	 * a device page.
23868763cb45SJérôme Glisse 	 */
23878763cb45SJérôme Glisse 	int extra = 1;
23888763cb45SJérôme Glisse 
23898763cb45SJérôme Glisse 	/*
23908763cb45SJérôme Glisse 	 * FIXME: support THP (transparent huge pages); they are a bit more
23918763cb45SJérôme Glisse 	 * complex to check than regular pages because they can be mapped
23928763cb45SJérôme Glisse 	 * with a pmd or with a pte (split pte mapping).
23938763cb45SJérôme Glisse 	 */
23948763cb45SJérôme Glisse 	if (PageCompound(page))
23958763cb45SJérôme Glisse 		return false;
23968763cb45SJérôme Glisse 
2397a5430ddaSJérôme Glisse 	/* Pages from ZONE_DEVICE have one extra reference */
2398a5430ddaSJérôme Glisse 	if (is_zone_device_page(page)) {
2399a5430ddaSJérôme Glisse 		/*
2400a5430ddaSJérôme Glisse 		 * Private pages can never be pinned as they have no valid pte and
2401a5430ddaSJérôme Glisse 		 * GUP will fail for them. Yet if there is a pending migration, a
2402a5430ddaSJérôme Glisse 		 * thread might try to wait on the pte migration entry and will
2403a5430ddaSJérôme Glisse 		 * bump the page reference count. Sadly there is no way to tell a
2404a5430ddaSJérôme Glisse 		 * regular pin apart from a migration wait. Hence, to avoid two
2405a5430ddaSJérôme Glisse 		 * racing threads trying to migrate back to the CPU and entering
2406a5430ddaSJérôme Glisse 		 * an infinite loop (one stopping the migration because the other
2407a5430ddaSJérôme Glisse 		 * is waiting on the pte migration entry), we always return true.
2408a5430ddaSJérôme Glisse 		 *
2409a5430ddaSJérôme Glisse 		 * FIXME: the proper solution is to rework migration_entry_wait()
2410a5430ddaSJérôme Glisse 		 * so it does not need to take a reference on the page.
2411a5430ddaSJérôme Glisse 		 */
2412a5430ddaSJérôme Glisse 		if (is_device_private_page(page))
2413a5430ddaSJérôme Glisse 			return true;
2414a5430ddaSJérôme Glisse 
2415df6ad698SJérôme Glisse 		/*
2416df6ad698SJérôme Glisse 		 * Only allow device public pages to be migrated and account
2417df6ad698SJérôme Glisse 		 * for the extra reference count implied by ZONE_DEVICE pages.
2418df6ad698SJérôme Glisse 		 */
2419df6ad698SJérôme Glisse 		if (!is_device_public_page(page))
2420a5430ddaSJérôme Glisse 			return false;
2421df6ad698SJérôme Glisse 		extra++;
2422a5430ddaSJérôme Glisse 	}
2423a5430ddaSJérôme Glisse 
2424df6ad698SJérôme Glisse 	/* For file-backed pages */
2425df6ad698SJérôme Glisse 	if (page_mapping(page))
2426df6ad698SJérôme Glisse 		extra += 1 + page_has_private(page);
2427df6ad698SJérôme Glisse 
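	/*
	 * For example, an anonymous page still mapped in exactly one process
	 * and otherwise unreferenced has page_mapcount() == 1 and
	 * page_count() == 2 (one reference for the mapping plus the one held
	 * by our caller, i.e. "extra"), so count - extra == mapcount and the
	 * page is considered migratable. Any additional reference, such as a
	 * GUP pin, tips the comparison below and the page is reported pinned.
	 */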
24288763cb45SJérôme Glisse 	if ((page_count(page) - extra) > page_mapcount(page))
24298763cb45SJérôme Glisse 		return false;
24308763cb45SJérôme Glisse 
24318763cb45SJérôme Glisse 	return true;
24328763cb45SJérôme Glisse }
24338763cb45SJérôme Glisse 
24348763cb45SJérôme Glisse /*
24358763cb45SJérôme Glisse  * migrate_vma_prepare() - lock pages and isolate them from the lru
24368763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
24378763cb45SJérôme Glisse  *
24388763cb45SJérôme Glisse  * This locks pages that have been collected by migrate_vma_collect(). Once each
24398763cb45SJérôme Glisse  * page is locked it is isolated from the lru (for non-device pages). Finally,
24408763cb45SJérôme Glisse  * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
24418763cb45SJérôme Glisse  * migrated by concurrent kernel threads.
24428763cb45SJérôme Glisse  */
24438763cb45SJérôme Glisse static void migrate_vma_prepare(struct migrate_vma *migrate)
24448763cb45SJérôme Glisse {
24458763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
24468c3328f1SJérôme Glisse 	const unsigned long start = migrate->start;
24478c3328f1SJérôme Glisse 	unsigned long addr, i, restore = 0;
24488763cb45SJérôme Glisse 	bool allow_drain = true;
24498763cb45SJérôme Glisse 
24508763cb45SJérôme Glisse 	lru_add_drain();
24518763cb45SJérôme Glisse 
24528763cb45SJérôme Glisse 	for (i = 0; (i < npages) && migrate->cpages; i++) {
24538763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
24548c3328f1SJérôme Glisse 		bool remap = true;
24558763cb45SJérôme Glisse 
24568763cb45SJérôme Glisse 		if (!page)
24578763cb45SJérôme Glisse 			continue;
24588763cb45SJérôme Glisse 
24598c3328f1SJérôme Glisse 		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
24608763cb45SJérôme Glisse 			/*
24618763cb45SJérôme Glisse 			 * Because we are migrating several pages there can be
24628763cb45SJérôme Glisse 			 * a deadlock between two concurrent migrations where
24638763cb45SJérôme Glisse 			 * each is waiting on the other's page lock.
24648763cb45SJérôme Glisse 			 *
24658763cb45SJérôme Glisse 			 * Make migrate_vma() a best-effort thing and back off
24668763cb45SJérôme Glisse 			 * for any page we cannot lock right away.
24678763cb45SJérôme Glisse 			 */
24688763cb45SJérôme Glisse 			if (!trylock_page(page)) {
24698763cb45SJérôme Glisse 				migrate->src[i] = 0;
24708763cb45SJérôme Glisse 				migrate->cpages--;
24718763cb45SJérôme Glisse 				put_page(page);
24728763cb45SJérôme Glisse 				continue;
24738763cb45SJérôme Glisse 			}
24748c3328f1SJérôme Glisse 			remap = false;
24758763cb45SJérôme Glisse 			migrate->src[i] |= MIGRATE_PFN_LOCKED;
24768c3328f1SJérôme Glisse 		}
24778763cb45SJérôme Glisse 
2478a5430ddaSJérôme Glisse 		/* ZONE_DEVICE pages are not on LRU */
2479a5430ddaSJérôme Glisse 		if (!is_zone_device_page(page)) {
24808763cb45SJérôme Glisse 			if (!PageLRU(page) && allow_drain) {
24818763cb45SJérôme Glisse 				/* Drain CPU's pagevec */
24828763cb45SJérôme Glisse 				lru_add_drain_all();
24838763cb45SJérôme Glisse 				allow_drain = false;
24848763cb45SJérôme Glisse 			}
24858763cb45SJérôme Glisse 
24868763cb45SJérôme Glisse 			if (isolate_lru_page(page)) {
24878c3328f1SJérôme Glisse 				if (remap) {
24888c3328f1SJérôme Glisse 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
24898c3328f1SJérôme Glisse 					migrate->cpages--;
24908c3328f1SJérôme Glisse 					restore++;
24918c3328f1SJérôme Glisse 				} else {
24928763cb45SJérôme Glisse 					migrate->src[i] = 0;
24938763cb45SJérôme Glisse 					unlock_page(page);
24948763cb45SJérôme Glisse 					migrate->cpages--;
24958763cb45SJérôme Glisse 					put_page(page);
24968c3328f1SJérôme Glisse 				}
24978763cb45SJérôme Glisse 				continue;
24988763cb45SJérôme Glisse 			}
24998763cb45SJérôme Glisse 
2500a5430ddaSJérôme Glisse 			/* Drop the reference we took in collect */
2501a5430ddaSJérôme Glisse 			put_page(page);
2502a5430ddaSJérôme Glisse 		}
2503a5430ddaSJérôme Glisse 
25048763cb45SJérôme Glisse 		if (!migrate_vma_check_page(page)) {
25058c3328f1SJérôme Glisse 			if (remap) {
25068c3328f1SJérôme Glisse 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
25078c3328f1SJérôme Glisse 				migrate->cpages--;
25088c3328f1SJérôme Glisse 				restore++;
25098c3328f1SJérôme Glisse 
2510a5430ddaSJérôme Glisse 				if (!is_zone_device_page(page)) {
25118c3328f1SJérôme Glisse 					get_page(page);
25128c3328f1SJérôme Glisse 					putback_lru_page(page);
2513a5430ddaSJérôme Glisse 				}
25148c3328f1SJérôme Glisse 			} else {
25158763cb45SJérôme Glisse 				migrate->src[i] = 0;
25168763cb45SJérôme Glisse 				unlock_page(page);
25178763cb45SJérôme Glisse 				migrate->cpages--;
25188763cb45SJérôme Glisse 
2519a5430ddaSJérôme Glisse 				if (!is_zone_device_page(page))
25208763cb45SJérôme Glisse 					putback_lru_page(page);
2521a5430ddaSJérôme Glisse 				else
2522a5430ddaSJérôme Glisse 					put_page(page);
25238763cb45SJérôme Glisse 			}
25248763cb45SJérôme Glisse 		}
25258763cb45SJérôme Glisse 	}
25268763cb45SJérôme Glisse 
25278c3328f1SJérôme Glisse 	for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
25288c3328f1SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25298c3328f1SJérôme Glisse 
25308c3328f1SJérôme Glisse 		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
25318c3328f1SJérôme Glisse 			continue;
25328c3328f1SJérôme Glisse 
25338c3328f1SJérôme Glisse 		remove_migration_pte(page, migrate->vma, addr, page);
25348c3328f1SJérôme Glisse 
25358c3328f1SJérôme Glisse 		migrate->src[i] = 0;
25368c3328f1SJérôme Glisse 		unlock_page(page);
25378c3328f1SJérôme Glisse 		put_page(page);
25388c3328f1SJérôme Glisse 		restore--;
25398c3328f1SJérôme Glisse 	}
25408c3328f1SJérôme Glisse }
25418c3328f1SJérôme Glisse 
25428763cb45SJérôme Glisse /*
25438763cb45SJérôme Glisse  * migrate_vma_unmap() - replace page mapping with special migration pte entry
25448763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
25458763cb45SJérôme Glisse  *
25468763cb45SJérôme Glisse  * Replace page mapping (CPU page table pte) with a special migration pte entry
25478763cb45SJérôme Glisse  * and check again if it has been pinned. Pinned pages are restored because we
25488763cb45SJérôme Glisse  * cannot migrate them.
25498763cb45SJérôme Glisse  *
25508763cb45SJérôme Glisse  * This is the last step before we call the device driver callback to allocate
25518763cb45SJérôme Glisse  * destination memory and copy contents of original page over to new page.
25528763cb45SJérôme Glisse  */
25538763cb45SJérôme Glisse static void migrate_vma_unmap(struct migrate_vma *migrate)
25548763cb45SJérôme Glisse {
25558763cb45SJérôme Glisse 	int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
25568763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
25578763cb45SJérôme Glisse 	const unsigned long start = migrate->start;
25588763cb45SJérôme Glisse 	unsigned long addr, i, restore = 0;
25598763cb45SJérôme Glisse 
25608763cb45SJérôme Glisse 	for (i = 0; i < npages; i++) {
25618763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25628763cb45SJérôme Glisse 
25638763cb45SJérôme Glisse 		if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
25648763cb45SJérôme Glisse 			continue;
25658763cb45SJérôme Glisse 
25668c3328f1SJérôme Glisse 		if (page_mapped(page)) {
25678763cb45SJérôme Glisse 			try_to_unmap(page, flags);
25688c3328f1SJérôme Glisse 			if (page_mapped(page))
25698c3328f1SJérôme Glisse 				goto restore;
25708c3328f1SJérôme Glisse 		}
25718c3328f1SJérôme Glisse 
25728c3328f1SJérôme Glisse 		if (migrate_vma_check_page(page))
25738c3328f1SJérôme Glisse 			continue;
25748c3328f1SJérôme Glisse 
25758c3328f1SJérôme Glisse restore:
25768763cb45SJérôme Glisse 		migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
25778763cb45SJérôme Glisse 		migrate->cpages--;
25788763cb45SJérôme Glisse 		restore++;
25798763cb45SJérôme Glisse 	}
25808763cb45SJérôme Glisse 
25818763cb45SJérôme Glisse 	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
25828763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25838763cb45SJérôme Glisse 
25848763cb45SJérôme Glisse 		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
25858763cb45SJérôme Glisse 			continue;
25868763cb45SJérôme Glisse 
25878763cb45SJérôme Glisse 		remove_migration_ptes(page, page, false);
25888763cb45SJérôme Glisse 
25898763cb45SJérôme Glisse 		migrate->src[i] = 0;
25908763cb45SJérôme Glisse 		unlock_page(page);
25918763cb45SJérôme Glisse 		restore--;
25928763cb45SJérôme Glisse 
2593a5430ddaSJérôme Glisse 		if (is_zone_device_page(page))
2594a5430ddaSJérôme Glisse 			put_page(page);
2595a5430ddaSJérôme Glisse 		else
25968763cb45SJérôme Glisse 			putback_lru_page(page);
25978763cb45SJérôme Glisse 	}
25988763cb45SJérôme Glisse }
25998763cb45SJérôme Glisse 
26008315ada7SJérôme Glisse static void migrate_vma_insert_page(struct migrate_vma *migrate,
26018315ada7SJérôme Glisse 				    unsigned long addr,
26028315ada7SJérôme Glisse 				    struct page *page,
26038315ada7SJérôme Glisse 				    unsigned long *src,
26048315ada7SJérôme Glisse 				    unsigned long *dst)
26058315ada7SJérôme Glisse {
26068315ada7SJérôme Glisse 	struct vm_area_struct *vma = migrate->vma;
26078315ada7SJérôme Glisse 	struct mm_struct *mm = vma->vm_mm;
26088315ada7SJérôme Glisse 	struct mem_cgroup *memcg;
26098315ada7SJérôme Glisse 	bool flush = false;
26108315ada7SJérôme Glisse 	spinlock_t *ptl;
26118315ada7SJérôme Glisse 	pte_t entry;
26128315ada7SJérôme Glisse 	pgd_t *pgdp;
26138315ada7SJérôme Glisse 	p4d_t *p4dp;
26148315ada7SJérôme Glisse 	pud_t *pudp;
26158315ada7SJérôme Glisse 	pmd_t *pmdp;
26168315ada7SJérôme Glisse 	pte_t *ptep;
26178315ada7SJérôme Glisse 
26188315ada7SJérôme Glisse 	/* Only allow populating anonymous memory */
26198315ada7SJérôme Glisse 	if (!vma_is_anonymous(vma))
26208315ada7SJérôme Glisse 		goto abort;
26218315ada7SJérôme Glisse 
26228315ada7SJérôme Glisse 	pgdp = pgd_offset(mm, addr);
26238315ada7SJérôme Glisse 	p4dp = p4d_alloc(mm, pgdp, addr);
26248315ada7SJérôme Glisse 	if (!p4dp)
26258315ada7SJérôme Glisse 		goto abort;
26268315ada7SJérôme Glisse 	pudp = pud_alloc(mm, p4dp, addr);
26278315ada7SJérôme Glisse 	if (!pudp)
26288315ada7SJérôme Glisse 		goto abort;
26298315ada7SJérôme Glisse 	pmdp = pmd_alloc(mm, pudp, addr);
26308315ada7SJérôme Glisse 	if (!pmdp)
26318315ada7SJérôme Glisse 		goto abort;
26328315ada7SJérôme Glisse 
26338315ada7SJérôme Glisse 	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
26348315ada7SJérôme Glisse 		goto abort;
26358315ada7SJérôme Glisse 
26368315ada7SJérôme Glisse 	/*
26378315ada7SJérôme Glisse 	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
26388315ada7SJérôme Glisse 	 * pte_offset_map() on pmds where a huge pmd might be created
26398315ada7SJérôme Glisse 	 * from a different thread.
26408315ada7SJérôme Glisse 	 *
26418315ada7SJérôme Glisse 	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
26428315ada7SJérôme Glisse 	 * parallel threads are excluded by other means.
26438315ada7SJérôme Glisse 	 *
26448315ada7SJérôme Glisse 	 * Here we only have down_read(mmap_sem).
26458315ada7SJérôme Glisse 	 */
26468315ada7SJérôme Glisse 	if (pte_alloc(mm, pmdp, addr))
26478315ada7SJérôme Glisse 		goto abort;
26488315ada7SJérôme Glisse 
26498315ada7SJérôme Glisse 	/* See the comment in pte_alloc_one_map() */
26508315ada7SJérôme Glisse 	if (unlikely(pmd_trans_unstable(pmdp)))
26518315ada7SJérôme Glisse 		goto abort;
26528315ada7SJérôme Glisse 
26538315ada7SJérôme Glisse 	if (unlikely(anon_vma_prepare(vma)))
26548315ada7SJérôme Glisse 		goto abort;
26558315ada7SJérôme Glisse 	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
26568315ada7SJérôme Glisse 		goto abort;
26578315ada7SJérôme Glisse 
26588315ada7SJérôme Glisse 	/*
26598315ada7SJérôme Glisse 	 * The memory barrier inside __SetPageUptodate makes sure that
26608315ada7SJérôme Glisse 	 * preceding stores to the page contents become visible before
26618315ada7SJérôme Glisse 	 * the set_pte_at() write.
26628315ada7SJérôme Glisse 	 */
26638315ada7SJérôme Glisse 	__SetPageUptodate(page);
26648315ada7SJérôme Glisse 
2665df6ad698SJérôme Glisse 	if (is_zone_device_page(page)) {
2666df6ad698SJérôme Glisse 		if (is_device_private_page(page)) {
26678315ada7SJérôme Glisse 			swp_entry_t swp_entry;
26688315ada7SJérôme Glisse 
26698315ada7SJérôme Glisse 			swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
26708315ada7SJérôme Glisse 			entry = swp_entry_to_pte(swp_entry);
2671df6ad698SJérôme Glisse 		} else if (is_device_public_page(page)) {
2672df6ad698SJérôme Glisse 			entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
2673df6ad698SJérôme Glisse 			if (vma->vm_flags & VM_WRITE)
2674df6ad698SJérôme Glisse 				entry = pte_mkwrite(pte_mkdirty(entry));
2675df6ad698SJérôme Glisse 			entry = pte_mkdevmap(entry);
2676df6ad698SJérôme Glisse 		}
26778315ada7SJérôme Glisse 	} else {
26788315ada7SJérôme Glisse 		entry = mk_pte(page, vma->vm_page_prot);
26798315ada7SJérôme Glisse 		if (vma->vm_flags & VM_WRITE)
26808315ada7SJérôme Glisse 			entry = pte_mkwrite(pte_mkdirty(entry));
26818315ada7SJérôme Glisse 	}
26828315ada7SJérôme Glisse 
26838315ada7SJérôme Glisse 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
26848315ada7SJérôme Glisse 
26858315ada7SJérôme Glisse 	if (pte_present(*ptep)) {
26868315ada7SJérôme Glisse 		unsigned long pfn = pte_pfn(*ptep);
26878315ada7SJérôme Glisse 
26888315ada7SJérôme Glisse 		if (!is_zero_pfn(pfn)) {
26898315ada7SJérôme Glisse 			pte_unmap_unlock(ptep, ptl);
26908315ada7SJérôme Glisse 			mem_cgroup_cancel_charge(page, memcg, false);
26918315ada7SJérôme Glisse 			goto abort;
26928315ada7SJérôme Glisse 		}
26938315ada7SJérôme Glisse 		flush = true;
26948315ada7SJérôme Glisse 	} else if (!pte_none(*ptep)) {
26958315ada7SJérôme Glisse 		pte_unmap_unlock(ptep, ptl);
26968315ada7SJérôme Glisse 		mem_cgroup_cancel_charge(page, memcg, false);
26978315ada7SJérôme Glisse 		goto abort;
26988315ada7SJérôme Glisse 	}
26998315ada7SJérôme Glisse 
27008315ada7SJérôme Glisse 	/*
27018315ada7SJérôme Glisse 	 * Check for userfaultfd but do not deliver the fault. Instead,
27028315ada7SJérôme Glisse 	 * just back off.
27038315ada7SJérôme Glisse 	 */
27048315ada7SJérôme Glisse 	if (userfaultfd_missing(vma)) {
27058315ada7SJérôme Glisse 		pte_unmap_unlock(ptep, ptl);
27068315ada7SJérôme Glisse 		mem_cgroup_cancel_charge(page, memcg, false);
27078315ada7SJérôme Glisse 		goto abort;
27088315ada7SJérôme Glisse 	}
27098315ada7SJérôme Glisse 
27108315ada7SJérôme Glisse 	inc_mm_counter(mm, MM_ANONPAGES);
27118315ada7SJérôme Glisse 	page_add_new_anon_rmap(page, vma, addr, false);
27128315ada7SJérôme Glisse 	mem_cgroup_commit_charge(page, memcg, false, false);
27138315ada7SJérôme Glisse 	if (!is_zone_device_page(page))
27148315ada7SJérôme Glisse 		lru_cache_add_active_or_unevictable(page, vma);
27158315ada7SJérôme Glisse 	get_page(page);
27168315ada7SJérôme Glisse 
27178315ada7SJérôme Glisse 	if (flush) {
27188315ada7SJérôme Glisse 		flush_cache_page(vma, addr, pte_pfn(*ptep));
27198315ada7SJérôme Glisse 		ptep_clear_flush_notify(vma, addr, ptep);
27208315ada7SJérôme Glisse 		set_pte_at_notify(mm, addr, ptep, entry);
27218315ada7SJérôme Glisse 		update_mmu_cache(vma, addr, ptep);
27228315ada7SJérôme Glisse 	} else {
27238315ada7SJérôme Glisse 		/* No need to invalidate - it was non-present before */
27248315ada7SJérôme Glisse 		set_pte_at(mm, addr, ptep, entry);
27258315ada7SJérôme Glisse 		update_mmu_cache(vma, addr, ptep);
27268315ada7SJérôme Glisse 	}
27278315ada7SJérôme Glisse 
27288315ada7SJérôme Glisse 	pte_unmap_unlock(ptep, ptl);
27298315ada7SJérôme Glisse 	*src = MIGRATE_PFN_MIGRATE;
27308315ada7SJérôme Glisse 	return;
27318315ada7SJérôme Glisse 
27328315ada7SJérôme Glisse abort:
27338315ada7SJérôme Glisse 	*src &= ~MIGRATE_PFN_MIGRATE;
27348315ada7SJérôme Glisse }
27358315ada7SJérôme Glisse 
27368763cb45SJérôme Glisse /*
27378763cb45SJérôme Glisse  * migrate_vma_pages() - migrate meta-data from src page to dst page
27388763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
27398763cb45SJérôme Glisse  *
27408763cb45SJérôme Glisse  * This migrates struct page meta-data from source struct page to destination
27418763cb45SJérôme Glisse  * struct page. This effectively finishes the migration from source page to the
27428763cb45SJérôme Glisse  * destination page.
27438763cb45SJérôme Glisse  */
27448763cb45SJérôme Glisse static void migrate_vma_pages(struct migrate_vma *migrate)
27458763cb45SJérôme Glisse {
27468763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
27478763cb45SJérôme Glisse 	const unsigned long start = migrate->start;
27488315ada7SJérôme Glisse 	struct vm_area_struct *vma = migrate->vma;
27498315ada7SJérôme Glisse 	struct mm_struct *mm = vma->vm_mm;
27508315ada7SJérôme Glisse 	unsigned long addr, i, mmu_start;
27518315ada7SJérôme Glisse 	bool notified = false;
27528763cb45SJérôme Glisse 
27538763cb45SJérôme Glisse 	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
27548763cb45SJérôme Glisse 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
27558763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
27568763cb45SJérôme Glisse 		struct address_space *mapping;
27578763cb45SJérôme Glisse 		int r;
27588763cb45SJérôme Glisse 
27598315ada7SJérôme Glisse 		if (!newpage) {
27608315ada7SJérôme Glisse 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
27618763cb45SJérôme Glisse 			continue;
27628315ada7SJérôme Glisse 		}
27638315ada7SJérôme Glisse 
27648315ada7SJérôme Glisse 		if (!page) {
27658315ada7SJérôme Glisse 			if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
27668763cb45SJérôme Glisse 				continue;
27678315ada7SJérôme Glisse 			}
27688315ada7SJérôme Glisse 			if (!notified) {
27698315ada7SJérôme Glisse 				mmu_start = addr;
27708315ada7SJérôme Glisse 				notified = true;
27718315ada7SJérôme Glisse 				mmu_notifier_invalidate_range_start(mm,
27728315ada7SJérôme Glisse 								mmu_start,
27738315ada7SJérôme Glisse 								migrate->end);
27748315ada7SJérôme Glisse 			}
27758315ada7SJérôme Glisse 			migrate_vma_insert_page(migrate, addr, newpage,
27768315ada7SJérôme Glisse 						&migrate->src[i],
27778315ada7SJérôme Glisse 						&migrate->dst[i]);
27788315ada7SJérôme Glisse 			continue;
27798315ada7SJérôme Glisse 		}
27808763cb45SJérôme Glisse 
27818763cb45SJérôme Glisse 		mapping = page_mapping(page);
27828763cb45SJérôme Glisse 
2783a5430ddaSJérôme Glisse 		if (is_zone_device_page(newpage)) {
2784a5430ddaSJérôme Glisse 			if (is_device_private_page(newpage)) {
2785a5430ddaSJérôme Glisse 				/*
2786a5430ddaSJérôme Glisse 				 * For now, only private anonymous memory is supported
2787a5430ddaSJérôme Glisse 				 * when migrating to un-addressable device memory.
2788a5430ddaSJérôme Glisse 				 */
2789a5430ddaSJérôme Glisse 				if (mapping) {
2790a5430ddaSJérôme Glisse 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2791a5430ddaSJérôme Glisse 					continue;
2792a5430ddaSJérôme Glisse 				}
2793df6ad698SJérôme Glisse 			} else if (!is_device_public_page(newpage)) {
2794a5430ddaSJérôme Glisse 				/*
2795a5430ddaSJérôme Glisse 				 * Other types of ZONE_DEVICE pages are not
2796a5430ddaSJérôme Glisse 				 * supported.
2797a5430ddaSJérôme Glisse 				 */
2798a5430ddaSJérôme Glisse 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2799a5430ddaSJérôme Glisse 				continue;
2800a5430ddaSJérôme Glisse 			}
2801a5430ddaSJérôme Glisse 		}
2802a5430ddaSJérôme Glisse 
28038763cb45SJérôme Glisse 		r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
28048763cb45SJérôme Glisse 		if (r != MIGRATEPAGE_SUCCESS)
28058763cb45SJérôme Glisse 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
28068763cb45SJérôme Glisse 	}
28078315ada7SJérôme Glisse 
28088315ada7SJérôme Glisse 	if (notified)
28098315ada7SJérôme Glisse 		mmu_notifier_invalidate_range_end(mm, mmu_start,
28108315ada7SJérôme Glisse 						  migrate->end);
28118763cb45SJérôme Glisse }
28128763cb45SJérôme Glisse 
28138763cb45SJérôme Glisse /*
28148763cb45SJérôme Glisse  * migrate_vma_finalize() - restore CPU page table entry
28158763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
28168763cb45SJérôme Glisse  *
28178763cb45SJérôme Glisse  * This replaces the special migration pte entry with either a mapping to the
28188763cb45SJérôme Glisse  * new page if migration was successful for that page, or to the original page
28198763cb45SJérôme Glisse  * otherwise.
28208763cb45SJérôme Glisse  *
28218763cb45SJérôme Glisse  * This also unlocks the pages and puts them back on the lru, or drops the extra
28228763cb45SJérôme Glisse  * refcount, for device pages.
28238763cb45SJérôme Glisse  */
28248763cb45SJérôme Glisse static void migrate_vma_finalize(struct migrate_vma *migrate)
28258763cb45SJérôme Glisse {
28268763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
28278763cb45SJérôme Glisse 	unsigned long i;
28288763cb45SJérôme Glisse 
28298763cb45SJérôme Glisse 	for (i = 0; i < npages; i++) {
28308763cb45SJérôme Glisse 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
28318763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
28328763cb45SJérôme Glisse 
28338315ada7SJérôme Glisse 		if (!page) {
28348315ada7SJérôme Glisse 			if (newpage) {
28358315ada7SJérôme Glisse 				unlock_page(newpage);
28368315ada7SJérôme Glisse 				put_page(newpage);
28378315ada7SJérôme Glisse 			}
28388763cb45SJérôme Glisse 			continue;
28398315ada7SJérôme Glisse 		}
28408315ada7SJérôme Glisse 
28418763cb45SJérôme Glisse 		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
28428763cb45SJérôme Glisse 			if (newpage) {
28438763cb45SJérôme Glisse 				unlock_page(newpage);
28448763cb45SJérôme Glisse 				put_page(newpage);
28458763cb45SJérôme Glisse 			}
28468763cb45SJérôme Glisse 			newpage = page;
28478763cb45SJérôme Glisse 		}
28488763cb45SJérôme Glisse 
28498763cb45SJérôme Glisse 		remove_migration_ptes(page, newpage, false);
28508763cb45SJérôme Glisse 		unlock_page(page);
28518763cb45SJérôme Glisse 		migrate->cpages--;
28528763cb45SJérôme Glisse 
2853a5430ddaSJérôme Glisse 		if (is_zone_device_page(page))
2854a5430ddaSJérôme Glisse 			put_page(page);
2855a5430ddaSJérôme Glisse 		else
28568763cb45SJérôme Glisse 			putback_lru_page(page);
28578763cb45SJérôme Glisse 
28588763cb45SJérôme Glisse 		if (newpage != page) {
28598763cb45SJérôme Glisse 			unlock_page(newpage);
2860a5430ddaSJérôme Glisse 			if (is_zone_device_page(newpage))
2861a5430ddaSJérôme Glisse 				put_page(newpage);
2862a5430ddaSJérôme Glisse 			else
28638763cb45SJérôme Glisse 				putback_lru_page(newpage);
28648763cb45SJérôme Glisse 		}
28658763cb45SJérôme Glisse 	}
28668763cb45SJérôme Glisse }
28678763cb45SJérôme Glisse 
28688763cb45SJérôme Glisse /*
28698763cb45SJérôme Glisse  * migrate_vma() - migrate a range of memory inside vma
28708763cb45SJérôme Glisse  *
28718763cb45SJérôme Glisse  * @ops: migration callbacks for allocating destination memory and copying
28728763cb45SJérôme Glisse  * @vma: virtual memory area containing the range to be migrated
28738763cb45SJérôme Glisse  * @start: start address of the range to migrate (inclusive)
28748763cb45SJérôme Glisse  * @end: end address of the range to migrate (exclusive)
28758763cb45SJérôme Glisse  * @src: array of unsigned long entries encoding source pfns (see migrate_pfn())
28768763cb45SJérôme Glisse  * @dst: array of unsigned long entries encoding destination pfns
28778763cb45SJérôme Glisse  * @private: pointer passed back to each of the callbacks
28788763cb45SJérôme Glisse  * Returns: 0 on success, error code otherwise
28798763cb45SJérôme Glisse  *
28808763cb45SJérôme Glisse  * This function tries to migrate a range of virtual addresses, using callbacks
28818763cb45SJérôme Glisse  * to allocate and copy memory from source to destination. First it collects
28828763cb45SJérôme Glisse  * all the pages backing each virtual address in the range, saving them in the
28838763cb45SJérôme Glisse  * src array. Then it locks those pages and unmaps them. Once the pages
28848763cb45SJérôme Glisse  * are locked and unmapped, it checks whether each page is pinned or not. Pages
28858763cb45SJérôme Glisse  * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
28868763cb45SJérôme Glisse  * in the corresponding src array entry. It then restores any pages that are
28878763cb45SJérôme Glisse  * pinned, by remapping and unlocking those pages.
28888763cb45SJérôme Glisse  *
28898763cb45SJérôme Glisse  * At this point it calls the alloc_and_copy() callback. For documentation on
28908763cb45SJérôme Glisse  * what is expected from that callback, see struct migrate_vma_ops comments in
28918763cb45SJérôme Glisse  * include/linux/migrate.h
28928763cb45SJérôme Glisse  *
28938763cb45SJérôme Glisse  * After the alloc_and_copy() callback, this function goes over each entry in
28948763cb45SJérôme Glisse  * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
28958763cb45SJérôme Glisse  * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
28968763cb45SJérôme Glisse  * then the function tries to migrate struct page information from the source
28978763cb45SJérôme Glisse  * struct page to the destination struct page. If it fails to migrate the struct
28988763cb45SJérôme Glisse  * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
28998763cb45SJérôme Glisse  * array.
29008763cb45SJérôme Glisse  *
29018763cb45SJérôme Glisse  * At this point all successfully migrated pages have an entry in the src
29028763cb45SJérôme Glisse  * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
29038763cb45SJérôme Glisse  * array entry with MIGRATE_PFN_VALID flag set.
29048763cb45SJérôme Glisse  *
29058763cb45SJérôme Glisse  * It then calls the finalize_and_map() callback. See comments for "struct
29068763cb45SJérôme Glisse  * migrate_vma_ops", in include/linux/migrate.h for details about
29078763cb45SJérôme Glisse  * finalize_and_map() behavior.
29088763cb45SJérôme Glisse  *
29098763cb45SJérôme Glisse  * After the finalize_and_map() callback, for successfully migrated pages, this
29108763cb45SJérôme Glisse  * function updates the CPU page table to point to new pages, otherwise it
29118763cb45SJérôme Glisse  * restores the CPU page table to point to the original source pages.
29128763cb45SJérôme Glisse  *
29138763cb45SJérôme Glisse  * The function returns 0 after the above steps, even if no pages were migrated
29148763cb45SJérôme Glisse  * (it only returns an error if any of the arguments are invalid).
29158763cb45SJérôme Glisse  *
29168763cb45SJérôme Glisse  * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
29178763cb45SJérôme Glisse  * unsigned long entries.
29188763cb45SJérôme Glisse  */
29198763cb45SJérôme Glisse int migrate_vma(const struct migrate_vma_ops *ops,
29208763cb45SJérôme Glisse 		struct vm_area_struct *vma,
29218763cb45SJérôme Glisse 		unsigned long start,
29228763cb45SJérôme Glisse 		unsigned long end,
29238763cb45SJérôme Glisse 		unsigned long *src,
29248763cb45SJérôme Glisse 		unsigned long *dst,
29258763cb45SJérôme Glisse 		void *private)
29268763cb45SJérôme Glisse {
29278763cb45SJérôme Glisse 	struct migrate_vma migrate;
29288763cb45SJérôme Glisse 
29298763cb45SJérôme Glisse 	/* Sanity check the arguments */
29308763cb45SJérôme Glisse 	start &= PAGE_MASK;
29318763cb45SJérôme Glisse 	end &= PAGE_MASK;
29328763cb45SJérôme Glisse 	if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
29338763cb45SJérôme Glisse 		return -EINVAL;
29348763cb45SJérôme Glisse 	if (start < vma->vm_start || start >= vma->vm_end)
29358763cb45SJérôme Glisse 		return -EINVAL;
29368763cb45SJérôme Glisse 	if (end <= vma->vm_start || end > vma->vm_end)
29378763cb45SJérôme Glisse 		return -EINVAL;
29388763cb45SJérôme Glisse 	if (!ops || !src || !dst || start >= end)
29398763cb45SJérôme Glisse 		return -EINVAL;
29408763cb45SJérôme Glisse 
29418763cb45SJérôme Glisse 	memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
29428763cb45SJérôme Glisse 	migrate.src = src;
29438763cb45SJérôme Glisse 	migrate.dst = dst;
29448763cb45SJérôme Glisse 	migrate.start = start;
29458763cb45SJérôme Glisse 	migrate.npages = 0;
29468763cb45SJérôme Glisse 	migrate.cpages = 0;
29478763cb45SJérôme Glisse 	migrate.end = end;
29488763cb45SJérôme Glisse 	migrate.vma = vma;
29498763cb45SJérôme Glisse 
29508763cb45SJérôme Glisse 	/* Collect, and try to unmap source pages */
29518763cb45SJérôme Glisse 	migrate_vma_collect(&migrate);
29528763cb45SJérôme Glisse 	if (!migrate.cpages)
29538763cb45SJérôme Glisse 		return 0;
29548763cb45SJérôme Glisse 
29558763cb45SJérôme Glisse 	/* Lock and isolate page */
29568763cb45SJérôme Glisse 	migrate_vma_prepare(&migrate);
29578763cb45SJérôme Glisse 	if (!migrate.cpages)
29588763cb45SJérôme Glisse 		return 0;
29598763cb45SJérôme Glisse 
29608763cb45SJérôme Glisse 	/* Unmap pages */
29618763cb45SJérôme Glisse 	migrate_vma_unmap(&migrate);
29628763cb45SJérôme Glisse 	if (!migrate.cpages)
29638763cb45SJérôme Glisse 		return 0;
29648763cb45SJérôme Glisse 
29658763cb45SJérôme Glisse 	/*
29668763cb45SJérôme Glisse 	 * At this point pages are locked and unmapped, and thus they have
29678763cb45SJérôme Glisse 	 * stable content and can safely be copied to destination memory that
29688763cb45SJérôme Glisse 	 * is allocated by the callback.
29698763cb45SJérôme Glisse 	 *
29708763cb45SJérôme Glisse 	 * Note that migration can fail in migrate_vma_pages() for each
29718763cb45SJérôme Glisse 	 * individual page.
29728763cb45SJérôme Glisse 	 */
29738763cb45SJérôme Glisse 	ops->alloc_and_copy(vma, src, dst, start, end, private);
29748763cb45SJérôme Glisse 
29758763cb45SJérôme Glisse 	/* This does the real migration of struct page */
29768763cb45SJérôme Glisse 	migrate_vma_pages(&migrate);
29778763cb45SJérôme Glisse 
29788763cb45SJérôme Glisse 	ops->finalize_and_map(vma, src, dst, start, end, private);
29798763cb45SJérôme Glisse 
29808763cb45SJérôme Glisse 	/* Unlock and remap pages */
29818763cb45SJérôme Glisse 	migrate_vma_finalize(&migrate);
29828763cb45SJérôme Glisse 
29838763cb45SJérôme Glisse 	return 0;
29848763cb45SJérôme Glisse }
29858763cb45SJérôme Glisse EXPORT_SYMBOL(migrate_vma);
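
/*
 * Example usage sketch: a driver migrating private anonymous memory to
 * ordinary system memory would provide both callbacks and then call
 * migrate_vma(). The my_* names below are hypothetical and the exact
 * callback prototypes are the ones defined by struct migrate_vma_ops in
 * include/linux/migrate.h; src and dst are caller-allocated arrays with
 * (end - start) >> PAGE_SHIFT entries each.
 *
 *	static void my_alloc_and_copy(struct vm_area_struct *vma,
 *				      const unsigned long *src,
 *				      unsigned long *dst,
 *				      unsigned long start,
 *				      unsigned long end,
 *				      void *private)
 *	{
 *		unsigned long addr, i;
 *
 *		for (i = 0, addr = start; addr < end; addr += PAGE_SIZE, i++) {
 *			struct page *spage = migrate_pfn_to_page(src[i]);
 *			struct page *dpage;
 *
 *			if (!spage || !(src[i] & MIGRATE_PFN_MIGRATE))
 *				continue;
 *
 *			dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
 *			if (!dpage)
 *				continue;
 *			lock_page(dpage);
 *			copy_highpage(dpage, spage);
 *			dst[i] = migrate_pfn(page_to_pfn(dpage)) |
 *				 MIGRATE_PFN_LOCKED;
 *		}
 *	}
 *
 *	static void my_finalize_and_map(struct vm_area_struct *vma,
 *					const unsigned long *src,
 *					const unsigned long *dst,
 *					unsigned long start,
 *					unsigned long end,
 *					void *private)
 *	{
 *		// A real driver would check src[i] & MIGRATE_PFN_MIGRATE
 *		// here to learn which pages actually migrated.
 *	}
 *
 *	static const struct migrate_vma_ops my_migrate_ops = {
 *		.alloc_and_copy		= my_alloc_and_copy,
 *		.finalize_and_map	= my_finalize_and_map,
 *	};
 *
 *	ret = migrate_vma(&my_migrate_ops, vma, start, end, src, dst, NULL);
 *
 * Destination pages are locked before being handed back in dst[] because
 * migrate_vma_finalize() unlocks them; leaving dst[i] at zero for an entry
 * simply skips migration of that page.
 */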
29866b368cd4SJérôme Glisse #endif /* defined(MIGRATE_VMA_HELPER) */
2987