xref: /linux/mm/migrate.c (revision 8315ada7f095bfa2cae0cd1e915b95bf6226897d)
1b20a3503SChristoph Lameter /*
214e0f9bcSHugh Dickins  * Memory Migration functionality - linux/mm/migrate.c
3b20a3503SChristoph Lameter  *
4b20a3503SChristoph Lameter  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5b20a3503SChristoph Lameter  *
6b20a3503SChristoph Lameter  * Page migration was first developed in the context of the memory hotplug
7b20a3503SChristoph Lameter  * project. The main authors of the migration code are:
8b20a3503SChristoph Lameter  *
9b20a3503SChristoph Lameter  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10b20a3503SChristoph Lameter  * Hirokazu Takahashi <taka@valinux.co.jp>
11b20a3503SChristoph Lameter  * Dave Hansen <haveblue@us.ibm.com>
12cde53535SChristoph Lameter  * Christoph Lameter
13b20a3503SChristoph Lameter  */
14b20a3503SChristoph Lameter 
15b20a3503SChristoph Lameter #include <linux/migrate.h>
16b95f1b31SPaul Gortmaker #include <linux/export.h>
17b20a3503SChristoph Lameter #include <linux/swap.h>
180697212aSChristoph Lameter #include <linux/swapops.h>
19b20a3503SChristoph Lameter #include <linux/pagemap.h>
20e23ca00bSChristoph Lameter #include <linux/buffer_head.h>
21b20a3503SChristoph Lameter #include <linux/mm_inline.h>
22b488893aSPavel Emelyanov #include <linux/nsproxy.h>
23b20a3503SChristoph Lameter #include <linux/pagevec.h>
24e9995ef9SHugh Dickins #include <linux/ksm.h>
25b20a3503SChristoph Lameter #include <linux/rmap.h>
26b20a3503SChristoph Lameter #include <linux/topology.h>
27b20a3503SChristoph Lameter #include <linux/cpu.h>
28b20a3503SChristoph Lameter #include <linux/cpuset.h>
2904e62a29SChristoph Lameter #include <linux/writeback.h>
30742755a1SChristoph Lameter #include <linux/mempolicy.h>
31742755a1SChristoph Lameter #include <linux/vmalloc.h>
3286c3a764SDavid Quigley #include <linux/security.h>
3342cb14b1SHugh Dickins #include <linux/backing-dev.h>
34bda807d4SMinchan Kim #include <linux/compaction.h>
354f5ca265SAdrian Bunk #include <linux/syscalls.h>
36290408d4SNaoya Horiguchi #include <linux/hugetlb.h>
378e6ac7faSAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
385a0e3ad6STejun Heo #include <linux/gfp.h>
39a5430ddaSJérôme Glisse #include <linux/memremap.h>
40*8315ada7SJérôme Glisse #include <linux/userfaultfd_k.h>
41bf6bddf1SRafael Aquini #include <linux/balloon_compaction.h>
42f714f4f2SMel Gorman #include <linux/mmu_notifier.h>
4333c3fc71SVladimir Davydov #include <linux/page_idle.h>
44d435edcaSVlastimil Babka #include <linux/page_owner.h>
456e84f315SIngo Molnar #include <linux/sched/mm.h>
46197e7e52SLinus Torvalds #include <linux/ptrace.h>
47b20a3503SChristoph Lameter 
480d1836c3SMichal Nazarewicz #include <asm/tlbflush.h>
490d1836c3SMichal Nazarewicz 
507b2a2d4aSMel Gorman #define CREATE_TRACE_POINTS
517b2a2d4aSMel Gorman #include <trace/events/migrate.h>
527b2a2d4aSMel Gorman 
53b20a3503SChristoph Lameter #include "internal.h"
54b20a3503SChristoph Lameter 
55b20a3503SChristoph Lameter /*
56742755a1SChristoph Lameter  * migrate_prep() needs to be called before we start compiling a list of pages
57748446bbSMel Gorman  * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
58748446bbSMel Gorman  * undesirable, use migrate_prep_local()
59b20a3503SChristoph Lameter  */
60b20a3503SChristoph Lameter int migrate_prep(void)
61b20a3503SChristoph Lameter {
62b20a3503SChristoph Lameter 	/*
63b20a3503SChristoph Lameter 	 * Clear the LRU lists so pages can be isolated.
64b20a3503SChristoph Lameter 	 * Note that pages may be moved off the LRU after we have
65b20a3503SChristoph Lameter 	 * drained them. Those pages will fail to migrate like other
66b20a3503SChristoph Lameter 	 * pages that may be busy.
67b20a3503SChristoph Lameter 	 */
68b20a3503SChristoph Lameter 	lru_add_drain_all();
69b20a3503SChristoph Lameter 
70b20a3503SChristoph Lameter 	return 0;
71b20a3503SChristoph Lameter }
72b20a3503SChristoph Lameter 
73748446bbSMel Gorman /* Do the necessary work of migrate_prep but not if it involves other CPUs */
74748446bbSMel Gorman int migrate_prep_local(void)
75748446bbSMel Gorman {
76748446bbSMel Gorman 	lru_add_drain();
77748446bbSMel Gorman 
78748446bbSMel Gorman 	return 0;
79748446bbSMel Gorman }
80748446bbSMel Gorman 
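/*
 * Illustrative sketch of the intended call order (the example_* helper
 * below is hypothetical, not part of the kernel API): drain the per-cpu
 * LRU caches with migrate_prep(), isolate the candidate pages onto a
 * private list, hand the list to migrate_pages() and put back whatever
 * could not be migrated.
 */
static int __maybe_unused example_migrate_list(struct list_head *pagelist,
					       new_page_t get_new_page,
					       unsigned long private)
{
	int err;

	migrate_prep();
	/* ... isolate_lru_page() each candidate onto @pagelist ... */
	err = migrate_pages(pagelist, get_new_page, NULL, private,
			    MIGRATE_SYNC, MR_SYSCALL);
	if (err)
		putback_movable_pages(pagelist);
	return err;
}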
819e5bcd61SYisheng Xie int isolate_movable_page(struct page *page, isolate_mode_t mode)
82bda807d4SMinchan Kim {
83bda807d4SMinchan Kim 	struct address_space *mapping;
84bda807d4SMinchan Kim 
85bda807d4SMinchan Kim 	/*
86bda807d4SMinchan Kim 	 * Avoid burning cycles on pages that are still under __free_pages(),
87bda807d4SMinchan Kim 	 * or that just got freed under us.
88bda807d4SMinchan Kim 	 *
89bda807d4SMinchan Kim 	 * In case we 'win' a race for a movable page being freed under us and
90bda807d4SMinchan Kim 	 * raise its refcount, preventing __free_pages() from doing its job,
91bda807d4SMinchan Kim 	 * the put_page() at the end of this block will take care of
92bda807d4SMinchan Kim 	 * releasing this page, thus avoiding a nasty leak.
93bda807d4SMinchan Kim 	 */
94bda807d4SMinchan Kim 	if (unlikely(!get_page_unless_zero(page)))
95bda807d4SMinchan Kim 		goto out;
96bda807d4SMinchan Kim 
97bda807d4SMinchan Kim 	/*
98bda807d4SMinchan Kim 	 * Check PageMovable before taking the PG_lock, because the page's owner
99bda807d4SMinchan Kim 	 * assumes that nobody touches the PG_lock of a newly allocated page,
100bda807d4SMinchan Kim 	 * so unconditionally grabbing the lock ruins the page owner's side.
101bda807d4SMinchan Kim 	 */
102bda807d4SMinchan Kim 	if (unlikely(!__PageMovable(page)))
103bda807d4SMinchan Kim 		goto out_putpage;
104bda807d4SMinchan Kim 	/*
105bda807d4SMinchan Kim 	 * As movable pages are not isolated from LRU lists, concurrent
106bda807d4SMinchan Kim 	 * compaction threads can race against page migration functions
107bda807d4SMinchan Kim 	 * as well as race against a page being released.
108bda807d4SMinchan Kim 	 *
109bda807d4SMinchan Kim 	 * In order to avoid having an already isolated movable page
110bda807d4SMinchan Kim 	 * being (wrongly) re-isolated while it is under migration,
111bda807d4SMinchan Kim 	 * or to avoid attempting to isolate pages being released,
112bda807d4SMinchan Kim 	 * let's be sure we have the page lock
113bda807d4SMinchan Kim 	 * before proceeding with the movable page isolation steps.
114bda807d4SMinchan Kim 	 */
115bda807d4SMinchan Kim 	if (unlikely(!trylock_page(page)))
116bda807d4SMinchan Kim 		goto out_putpage;
117bda807d4SMinchan Kim 
118bda807d4SMinchan Kim 	if (!PageMovable(page) || PageIsolated(page))
119bda807d4SMinchan Kim 		goto out_no_isolated;
120bda807d4SMinchan Kim 
121bda807d4SMinchan Kim 	mapping = page_mapping(page);
122bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!mapping, page);
123bda807d4SMinchan Kim 
124bda807d4SMinchan Kim 	if (!mapping->a_ops->isolate_page(page, mode))
125bda807d4SMinchan Kim 		goto out_no_isolated;
126bda807d4SMinchan Kim 
127bda807d4SMinchan Kim 	/* Driver shouldn't use PG_isolated bit of page->flags */
128bda807d4SMinchan Kim 	WARN_ON_ONCE(PageIsolated(page));
129bda807d4SMinchan Kim 	__SetPageIsolated(page);
130bda807d4SMinchan Kim 	unlock_page(page);
131bda807d4SMinchan Kim 
1329e5bcd61SYisheng Xie 	return 0;
133bda807d4SMinchan Kim 
134bda807d4SMinchan Kim out_no_isolated:
135bda807d4SMinchan Kim 	unlock_page(page);
136bda807d4SMinchan Kim out_putpage:
137bda807d4SMinchan Kim 	put_page(page);
138bda807d4SMinchan Kim out:
1399e5bcd61SYisheng Xie 	return -EBUSY;
140bda807d4SMinchan Kim }
141bda807d4SMinchan Kim 
142bda807d4SMinchan Kim /* It should be called on a page which is PG_movable */
143bda807d4SMinchan Kim void putback_movable_page(struct page *page)
144bda807d4SMinchan Kim {
145bda807d4SMinchan Kim 	struct address_space *mapping;
146bda807d4SMinchan Kim 
147bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageLocked(page), page);
148bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageMovable(page), page);
149bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageIsolated(page), page);
150bda807d4SMinchan Kim 
151bda807d4SMinchan Kim 	mapping = page_mapping(page);
152bda807d4SMinchan Kim 	mapping->a_ops->putback_page(page);
153bda807d4SMinchan Kim 	__ClearPageIsolated(page);
154bda807d4SMinchan Kim }
155bda807d4SMinchan Kim 
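/*
 * Sketch of the driver-side contract that isolate_movable_page() and
 * putback_movable_page() rely on (all example_movable_* names below are
 * hypothetical, shown only for illustration): a non-lru movable page
 * publishes its callbacks through page->mapping->a_ops, and the
 * migration core invokes them with the page locked.
 */
static bool example_movable_isolate(struct page *page, isolate_mode_t mode)
{
	/* unhook the page from the driver's internal lists and agree */
	return true;
}

static int example_movable_migratepage(struct address_space *mapping,
					struct page *newpage, struct page *page,
					enum migrate_mode mode)
{
	/* copy driver-private state from @page to @newpage */
	return MIGRATEPAGE_SUCCESS;
}

static void example_movable_putback(struct page *page)
{
	/* migration failed: re-link the page into the driver's lists */
}

static const struct address_space_operations example_movable_aops __maybe_unused = {
	.isolate_page	= example_movable_isolate,
	.migratepage	= example_movable_migratepage,
	.putback_page	= example_movable_putback,
};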
156b20a3503SChristoph Lameter /*
1575733c7d1SRafael Aquini  * Put previously isolated pages back onto the appropriate lists
1585733c7d1SRafael Aquini  * from where they were once taken off for compaction/migration.
1595733c7d1SRafael Aquini  *
16059c82b70SJoonsoo Kim  * This function shall be used whenever the isolated pageset has been
16159c82b70SJoonsoo Kim  * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
16259c82b70SJoonsoo Kim  * and isolate_huge_page().
1635733c7d1SRafael Aquini  */
1645733c7d1SRafael Aquini void putback_movable_pages(struct list_head *l)
1655733c7d1SRafael Aquini {
1665733c7d1SRafael Aquini 	struct page *page;
1675733c7d1SRafael Aquini 	struct page *page2;
1685733c7d1SRafael Aquini 
1695733c7d1SRafael Aquini 	list_for_each_entry_safe(page, page2, l, lru) {
17031caf665SNaoya Horiguchi 		if (unlikely(PageHuge(page))) {
17131caf665SNaoya Horiguchi 			putback_active_hugepage(page);
17231caf665SNaoya Horiguchi 			continue;
17331caf665SNaoya Horiguchi 		}
1745733c7d1SRafael Aquini 		list_del(&page->lru);
175bda807d4SMinchan Kim 		/*
176bda807d4SMinchan Kim 		 * We isolated a non-lru movable page so here we can use
177bda807d4SMinchan Kim 		 * __PageMovable, because an LRU page's mapping cannot have
178bda807d4SMinchan Kim 		 * PAGE_MAPPING_MOVABLE set.
179bda807d4SMinchan Kim 		 */
180b1123ea6SMinchan Kim 		if (unlikely(__PageMovable(page))) {
181bda807d4SMinchan Kim 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
182bda807d4SMinchan Kim 			lock_page(page);
183bda807d4SMinchan Kim 			if (PageMovable(page))
184bda807d4SMinchan Kim 				putback_movable_page(page);
185bf6bddf1SRafael Aquini 			else
186bda807d4SMinchan Kim 				__ClearPageIsolated(page);
187bda807d4SMinchan Kim 			unlock_page(page);
188bda807d4SMinchan Kim 			put_page(page);
189bda807d4SMinchan Kim 		} else {
190e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
191e8db67ebSNaoya Horiguchi 					page_is_file_cache(page), -hpage_nr_pages(page));
192fc280fe8SRabin Vincent 			putback_lru_page(page);
193b20a3503SChristoph Lameter 		}
194b20a3503SChristoph Lameter 	}
195bda807d4SMinchan Kim }
196b20a3503SChristoph Lameter 
1970697212aSChristoph Lameter /*
1980697212aSChristoph Lameter  * Restore a potential migration pte to a working pte entry
1990697212aSChristoph Lameter  */
200e4b82222SMinchan Kim static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
201e9995ef9SHugh Dickins 				 unsigned long addr, void *old)
2020697212aSChristoph Lameter {
2033fe87967SKirill A. Shutemov 	struct page_vma_mapped_walk pvmw = {
2043fe87967SKirill A. Shutemov 		.page = old,
2053fe87967SKirill A. Shutemov 		.vma = vma,
2063fe87967SKirill A. Shutemov 		.address = addr,
2073fe87967SKirill A. Shutemov 		.flags = PVMW_SYNC | PVMW_MIGRATION,
2083fe87967SKirill A. Shutemov 	};
2093fe87967SKirill A. Shutemov 	struct page *new;
2103fe87967SKirill A. Shutemov 	pte_t pte;
2110697212aSChristoph Lameter 	swp_entry_t entry;
2120697212aSChristoph Lameter 
2133fe87967SKirill A. Shutemov 	VM_BUG_ON_PAGE(PageTail(page), page);
2143fe87967SKirill A. Shutemov 	while (page_vma_mapped_walk(&pvmw)) {
2154b0ece6fSNaoya Horiguchi 		if (PageKsm(page))
2164b0ece6fSNaoya Horiguchi 			new = page;
2174b0ece6fSNaoya Horiguchi 		else
2183fe87967SKirill A. Shutemov 			new = page - pvmw.page->index +
2193fe87967SKirill A. Shutemov 				linear_page_index(vma, pvmw.address);
2200697212aSChristoph Lameter 
221616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
222616b8371SZi Yan 		/* PMD-mapped THP migration entry */
223616b8371SZi Yan 		if (!pvmw.pte) {
224616b8371SZi Yan 			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
225616b8371SZi Yan 			remove_migration_pmd(&pvmw, new);
226616b8371SZi Yan 			continue;
227616b8371SZi Yan 		}
228616b8371SZi Yan #endif
229616b8371SZi Yan 
2300697212aSChristoph Lameter 		get_page(new);
2316d2329f8SAndrea Arcangeli 		pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
2323fe87967SKirill A. Shutemov 		if (pte_swp_soft_dirty(*pvmw.pte))
233c3d16e16SCyrill Gorcunov 			pte = pte_mksoft_dirty(pte);
234d3cb8bf6SMel Gorman 
2353fe87967SKirill A. Shutemov 		/*
2363fe87967SKirill A. Shutemov 		 * Recheck VMA as permissions can change since migration started
2373fe87967SKirill A. Shutemov 		 */
2383fe87967SKirill A. Shutemov 		entry = pte_to_swp_entry(*pvmw.pte);
2390697212aSChristoph Lameter 		if (is_write_migration_entry(entry))
240d3cb8bf6SMel Gorman 			pte = maybe_mkwrite(pte, vma);
241d3cb8bf6SMel Gorman 
242a5430ddaSJérôme Glisse 		if (unlikely(is_zone_device_page(new)) &&
243a5430ddaSJérôme Glisse 		    is_device_private_page(new)) {
244a5430ddaSJérôme Glisse 			entry = make_device_private_entry(new, pte_write(pte));
245a5430ddaSJérôme Glisse 			pte = swp_entry_to_pte(entry);
246a5430ddaSJérôme Glisse 		} else
247383321abSAneesh Kumar K.V 			flush_dcache_page(new);
248a5430ddaSJérôme Glisse 
2493ef8fd7fSAndi Kleen #ifdef CONFIG_HUGETLB_PAGE
250be7517d6STony Lu 		if (PageHuge(new)) {
251290408d4SNaoya Horiguchi 			pte = pte_mkhuge(pte);
252be7517d6STony Lu 			pte = arch_make_huge_pte(pte, vma, new, 0);
253383321abSAneesh Kumar K.V 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
25404e62a29SChristoph Lameter 			if (PageAnon(new))
2553fe87967SKirill A. Shutemov 				hugepage_add_anon_rmap(new, vma, pvmw.address);
256290408d4SNaoya Horiguchi 			else
25753f9263bSKirill A. Shutemov 				page_dup_rmap(new, true);
258383321abSAneesh Kumar K.V 		} else
259383321abSAneesh Kumar K.V #endif
260383321abSAneesh Kumar K.V 		{
261383321abSAneesh Kumar K.V 			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
262383321abSAneesh Kumar K.V 
263383321abSAneesh Kumar K.V 			if (PageAnon(new))
2643fe87967SKirill A. Shutemov 				page_add_anon_rmap(new, vma, pvmw.address, false);
26504e62a29SChristoph Lameter 			else
266dd78feddSKirill A. Shutemov 				page_add_file_rmap(new, false);
267383321abSAneesh Kumar K.V 		}
268e388466dSKirill A. Shutemov 		if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
26951afb12bSHugh Dickins 			mlock_vma_page(new);
27051afb12bSHugh Dickins 
27104e62a29SChristoph Lameter 		/* No need to invalidate - it was non-present before */
2723fe87967SKirill A. Shutemov 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
2733fe87967SKirill A. Shutemov 	}
2743fe87967SKirill A. Shutemov 
275e4b82222SMinchan Kim 	return true;
2760697212aSChristoph Lameter }
2770697212aSChristoph Lameter 
2780697212aSChristoph Lameter /*
27904e62a29SChristoph Lameter  * Get rid of all migration entries and replace them by
28004e62a29SChristoph Lameter  * references to the indicated page.
28104e62a29SChristoph Lameter  */
282e388466dSKirill A. Shutemov void remove_migration_ptes(struct page *old, struct page *new, bool locked)
28304e62a29SChristoph Lameter {
284051ac83aSJoonsoo Kim 	struct rmap_walk_control rwc = {
285051ac83aSJoonsoo Kim 		.rmap_one = remove_migration_pte,
286051ac83aSJoonsoo Kim 		.arg = old,
287051ac83aSJoonsoo Kim 	};
288051ac83aSJoonsoo Kim 
289e388466dSKirill A. Shutemov 	if (locked)
290e388466dSKirill A. Shutemov 		rmap_walk_locked(new, &rwc);
291e388466dSKirill A. Shutemov 	else
292051ac83aSJoonsoo Kim 		rmap_walk(new, &rwc);
29304e62a29SChristoph Lameter }
29404e62a29SChristoph Lameter 
29504e62a29SChristoph Lameter /*
2960697212aSChristoph Lameter  * Something used the pte of a page under migration. We need to
2970697212aSChristoph Lameter  * get to the page and wait until migration is finished.
2980697212aSChristoph Lameter  * When we return from this function the fault will be retried.
2990697212aSChristoph Lameter  */
300e66f17ffSNaoya Horiguchi void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
30130dad309SNaoya Horiguchi 				spinlock_t *ptl)
3020697212aSChristoph Lameter {
30330dad309SNaoya Horiguchi 	pte_t pte;
3040697212aSChristoph Lameter 	swp_entry_t entry;
3050697212aSChristoph Lameter 	struct page *page;
3060697212aSChristoph Lameter 
30730dad309SNaoya Horiguchi 	spin_lock(ptl);
3080697212aSChristoph Lameter 	pte = *ptep;
3090697212aSChristoph Lameter 	if (!is_swap_pte(pte))
3100697212aSChristoph Lameter 		goto out;
3110697212aSChristoph Lameter 
3120697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
3130697212aSChristoph Lameter 	if (!is_migration_entry(entry))
3140697212aSChristoph Lameter 		goto out;
3150697212aSChristoph Lameter 
3160697212aSChristoph Lameter 	page = migration_entry_to_page(entry);
3170697212aSChristoph Lameter 
318e286781dSNick Piggin 	/*
319e286781dSNick Piggin 	 * Once the radix-tree replacement for page migration has started,
320e286781dSNick Piggin 	 * page_count *must* be zero. And we don't want to call
321e286781dSNick Piggin 	 * wait_on_page_locked() against a page without a get_page().
322e286781dSNick Piggin 	 * So we use get_page_unless_zero() here. Even if it fails, the page
323e286781dSNick Piggin 	 * fault will occur again.
324e286781dSNick Piggin 	 */
325e286781dSNick Piggin 	if (!get_page_unless_zero(page))
326e286781dSNick Piggin 		goto out;
3270697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
3280697212aSChristoph Lameter 	wait_on_page_locked(page);
3290697212aSChristoph Lameter 	put_page(page);
3300697212aSChristoph Lameter 	return;
3310697212aSChristoph Lameter out:
3320697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
3330697212aSChristoph Lameter }
3340697212aSChristoph Lameter 
33530dad309SNaoya Horiguchi void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
33630dad309SNaoya Horiguchi 				unsigned long address)
33730dad309SNaoya Horiguchi {
33830dad309SNaoya Horiguchi 	spinlock_t *ptl = pte_lockptr(mm, pmd);
33930dad309SNaoya Horiguchi 	pte_t *ptep = pte_offset_map(pmd, address);
34030dad309SNaoya Horiguchi 	__migration_entry_wait(mm, ptep, ptl);
34130dad309SNaoya Horiguchi }
34230dad309SNaoya Horiguchi 
343cb900f41SKirill A. Shutemov void migration_entry_wait_huge(struct vm_area_struct *vma,
344cb900f41SKirill A. Shutemov 		struct mm_struct *mm, pte_t *pte)
34530dad309SNaoya Horiguchi {
346cb900f41SKirill A. Shutemov 	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
34730dad309SNaoya Horiguchi 	__migration_entry_wait(mm, pte, ptl);
34830dad309SNaoya Horiguchi }
34930dad309SNaoya Horiguchi 
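/*
 * Sketch of the fault-side usage (modelled on do_swap_page(); the
 * example_* wrapper itself is hypothetical): when a fault finds a
 * migration entry in the pte, wait for the migration to complete and
 * let the fault be retried afterwards.
 */
static void __maybe_unused example_wait_if_migrating(struct mm_struct *mm,
						     pmd_t *pmd,
						     unsigned long address,
						     pte_t orig_pte)
{
	swp_entry_t entry;

	if (!is_swap_pte(orig_pte))
		return;

	entry = pte_to_swp_entry(orig_pte);
	if (is_migration_entry(entry))
		migration_entry_wait(mm, pmd, address);
}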
350616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
351616b8371SZi Yan void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
352616b8371SZi Yan {
353616b8371SZi Yan 	spinlock_t *ptl;
354616b8371SZi Yan 	struct page *page;
355616b8371SZi Yan 
356616b8371SZi Yan 	ptl = pmd_lock(mm, pmd);
357616b8371SZi Yan 	if (!is_pmd_migration_entry(*pmd))
358616b8371SZi Yan 		goto unlock;
359616b8371SZi Yan 	page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
360616b8371SZi Yan 	if (!get_page_unless_zero(page))
361616b8371SZi Yan 		goto unlock;
362616b8371SZi Yan 	spin_unlock(ptl);
363616b8371SZi Yan 	wait_on_page_locked(page);
364616b8371SZi Yan 	put_page(page);
365616b8371SZi Yan 	return;
366616b8371SZi Yan unlock:
367616b8371SZi Yan 	spin_unlock(ptl);
368616b8371SZi Yan }
369616b8371SZi Yan #endif
370616b8371SZi Yan 
371b969c4abSMel Gorman #ifdef CONFIG_BLOCK
372b969c4abSMel Gorman /* Returns true if all buffers are successfully locked */
373a6bc32b8SMel Gorman static bool buffer_migrate_lock_buffers(struct buffer_head *head,
374a6bc32b8SMel Gorman 							enum migrate_mode mode)
375b969c4abSMel Gorman {
376b969c4abSMel Gorman 	struct buffer_head *bh = head;
377b969c4abSMel Gorman 
378b969c4abSMel Gorman 	/* Simple case, sync compaction */
379a6bc32b8SMel Gorman 	if (mode != MIGRATE_ASYNC) {
380b969c4abSMel Gorman 		do {
381b969c4abSMel Gorman 			get_bh(bh);
382b969c4abSMel Gorman 			lock_buffer(bh);
383b969c4abSMel Gorman 			bh = bh->b_this_page;
384b969c4abSMel Gorman 
385b969c4abSMel Gorman 		} while (bh != head);
386b969c4abSMel Gorman 
387b969c4abSMel Gorman 		return true;
388b969c4abSMel Gorman 	}
389b969c4abSMel Gorman 
390b969c4abSMel Gorman 	/* async case, we cannot block on lock_buffer so use trylock_buffer */
391b969c4abSMel Gorman 	do {
392b969c4abSMel Gorman 		get_bh(bh);
393b969c4abSMel Gorman 		if (!trylock_buffer(bh)) {
394b969c4abSMel Gorman 			/*
395b969c4abSMel Gorman 			 * We failed to lock the buffer and cannot stall in
396b969c4abSMel Gorman 			 * async migration. Release the taken locks
397b969c4abSMel Gorman 			 */
398b969c4abSMel Gorman 			struct buffer_head *failed_bh = bh;
399b969c4abSMel Gorman 			put_bh(failed_bh);
400b969c4abSMel Gorman 			bh = head;
401b969c4abSMel Gorman 			while (bh != failed_bh) {
402b969c4abSMel Gorman 				unlock_buffer(bh);
403b969c4abSMel Gorman 				put_bh(bh);
404b969c4abSMel Gorman 				bh = bh->b_this_page;
405b969c4abSMel Gorman 			}
406b969c4abSMel Gorman 			return false;
407b969c4abSMel Gorman 		}
408b969c4abSMel Gorman 
409b969c4abSMel Gorman 		bh = bh->b_this_page;
410b969c4abSMel Gorman 	} while (bh != head);
411b969c4abSMel Gorman 	return true;
412b969c4abSMel Gorman }
413b969c4abSMel Gorman #else
414b969c4abSMel Gorman static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
415a6bc32b8SMel Gorman 							enum migrate_mode mode)
416b969c4abSMel Gorman {
417b969c4abSMel Gorman 	return true;
418b969c4abSMel Gorman }
419b969c4abSMel Gorman #endif /* CONFIG_BLOCK */
420b969c4abSMel Gorman 
421b20a3503SChristoph Lameter /*
422c3fcf8a5SChristoph Lameter  * Replace the page in the mapping.
4235b5c7120SChristoph Lameter  *
4245b5c7120SChristoph Lameter  * The number of remaining references must be:
4255b5c7120SChristoph Lameter  * 1 for anonymous pages without a mapping
4265b5c7120SChristoph Lameter  * 2 for pages with a mapping
427266cf658SDavid Howells  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
428b20a3503SChristoph Lameter  */
42936bc08ccSGu Zheng int migrate_page_move_mapping(struct address_space *mapping,
430b969c4abSMel Gorman 		struct page *newpage, struct page *page,
4318e321fefSBenjamin LaHaise 		struct buffer_head *head, enum migrate_mode mode,
4328e321fefSBenjamin LaHaise 		int extra_count)
433b20a3503SChristoph Lameter {
43442cb14b1SHugh Dickins 	struct zone *oldzone, *newzone;
43542cb14b1SHugh Dickins 	int dirty;
4368e321fefSBenjamin LaHaise 	int expected_count = 1 + extra_count;
4377cf9c2c7SNick Piggin 	void **pslot;
438b20a3503SChristoph Lameter 
4398763cb45SJérôme Glisse 	/*
4408763cb45SJérôme Glisse 	 * ZONE_DEVICE pages have one extra refcount that is always held by their device.
4418763cb45SJérôme Glisse 	 *
4428763cb45SJérôme Glisse 	 * Note that DAX memory will never reach that point as it does not have
4438763cb45SJérôme Glisse 	 * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
4448763cb45SJérôme Glisse 	 */
4458763cb45SJérôme Glisse 	expected_count += is_zone_device_page(page);
4468763cb45SJérôme Glisse 
4476c5240aeSChristoph Lameter 	if (!mapping) {
4480e8c7d0fSChristoph Lameter 		/* Anonymous page without mapping */
4498e321fefSBenjamin LaHaise 		if (page_count(page) != expected_count)
4506c5240aeSChristoph Lameter 			return -EAGAIN;
451cf4b769aSHugh Dickins 
452cf4b769aSHugh Dickins 		/* No turning back from here */
453cf4b769aSHugh Dickins 		newpage->index = page->index;
454cf4b769aSHugh Dickins 		newpage->mapping = page->mapping;
455cf4b769aSHugh Dickins 		if (PageSwapBacked(page))
456fa9949daSHugh Dickins 			__SetPageSwapBacked(newpage);
457cf4b769aSHugh Dickins 
45878bd5209SRafael Aquini 		return MIGRATEPAGE_SUCCESS;
4596c5240aeSChristoph Lameter 	}
4606c5240aeSChristoph Lameter 
46142cb14b1SHugh Dickins 	oldzone = page_zone(page);
46242cb14b1SHugh Dickins 	newzone = page_zone(newpage);
46342cb14b1SHugh Dickins 
46419fd6231SNick Piggin 	spin_lock_irq(&mapping->tree_lock);
465b20a3503SChristoph Lameter 
4667cf9c2c7SNick Piggin 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
467b20a3503SChristoph Lameter  					page_index(page));
468b20a3503SChristoph Lameter 
4698e321fefSBenjamin LaHaise 	expected_count += 1 + page_has_private(page);
470e286781dSNick Piggin 	if (page_count(page) != expected_count ||
47129c1f677SMel Gorman 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
47219fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
473e23ca00bSChristoph Lameter 		return -EAGAIN;
474b20a3503SChristoph Lameter 	}
475b20a3503SChristoph Lameter 
476fe896d18SJoonsoo Kim 	if (!page_ref_freeze(page, expected_count)) {
47719fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
478e286781dSNick Piggin 		return -EAGAIN;
479e286781dSNick Piggin 	}
480e286781dSNick Piggin 
481b20a3503SChristoph Lameter 	/*
482b969c4abSMel Gorman 	 * In the async migration case of moving a page with buffers, lock the
483b969c4abSMel Gorman 	 * buffers using trylock before the mapping is moved. If the mapping
484b969c4abSMel Gorman 	 * were moved first and we then failed to lock the buffers, we could
485b969c4abSMel Gorman 	 * not move the mapping back due to an elevated page count, and would
486b969c4abSMel Gorman 	 * have to block waiting on other references to be dropped.
487b969c4abSMel Gorman 	 */
488a6bc32b8SMel Gorman 	if (mode == MIGRATE_ASYNC && head &&
489a6bc32b8SMel Gorman 			!buffer_migrate_lock_buffers(head, mode)) {
490fe896d18SJoonsoo Kim 		page_ref_unfreeze(page, expected_count);
491b969c4abSMel Gorman 		spin_unlock_irq(&mapping->tree_lock);
492b969c4abSMel Gorman 		return -EAGAIN;
493b969c4abSMel Gorman 	}
494b969c4abSMel Gorman 
495b969c4abSMel Gorman 	/*
496cf4b769aSHugh Dickins 	 * Now we know that no one else is looking at the page:
497cf4b769aSHugh Dickins 	 * no turning back from here.
498b20a3503SChristoph Lameter 	 */
499cf4b769aSHugh Dickins 	newpage->index = page->index;
500cf4b769aSHugh Dickins 	newpage->mapping = page->mapping;
5017cf9c2c7SNick Piggin 	get_page(newpage);	/* add cache reference */
5026326fec1SNicholas Piggin 	if (PageSwapBacked(page)) {
5036326fec1SNicholas Piggin 		__SetPageSwapBacked(newpage);
504b20a3503SChristoph Lameter 		if (PageSwapCache(page)) {
505b20a3503SChristoph Lameter 			SetPageSwapCache(newpage);
506b20a3503SChristoph Lameter 			set_page_private(newpage, page_private(page));
507b20a3503SChristoph Lameter 		}
5086326fec1SNicholas Piggin 	} else {
5096326fec1SNicholas Piggin 		VM_BUG_ON_PAGE(PageSwapCache(page), page);
5106326fec1SNicholas Piggin 	}
511b20a3503SChristoph Lameter 
51242cb14b1SHugh Dickins 	/* Move dirty while page refs frozen and newpage not yet exposed */
51342cb14b1SHugh Dickins 	dirty = PageDirty(page);
51442cb14b1SHugh Dickins 	if (dirty) {
51542cb14b1SHugh Dickins 		ClearPageDirty(page);
51642cb14b1SHugh Dickins 		SetPageDirty(newpage);
51742cb14b1SHugh Dickins 	}
51842cb14b1SHugh Dickins 
5196d75f366SJohannes Weiner 	radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
5207cf9c2c7SNick Piggin 
5217cf9c2c7SNick Piggin 	/*
522937a94c9SJacobo Giralt 	 * Drop cache reference from old page by unfreezing
523937a94c9SJacobo Giralt 	 * to one less reference.
5247cf9c2c7SNick Piggin 	 * We know this isn't the last reference.
5257cf9c2c7SNick Piggin 	 */
526fe896d18SJoonsoo Kim 	page_ref_unfreeze(page, expected_count - 1);
5277cf9c2c7SNick Piggin 
52842cb14b1SHugh Dickins 	spin_unlock(&mapping->tree_lock);
52942cb14b1SHugh Dickins 	/* Leave irq disabled to prevent preemption while updating stats */
53042cb14b1SHugh Dickins 
5310e8c7d0fSChristoph Lameter 	/*
5320e8c7d0fSChristoph Lameter 	 * If moved to a different zone then also account
5330e8c7d0fSChristoph Lameter 	 * the page for that zone. Other VM counters will be
5340e8c7d0fSChristoph Lameter 	 * taken care of when we establish references to the
5350e8c7d0fSChristoph Lameter 	 * new page and drop references to the old page.
5360e8c7d0fSChristoph Lameter 	 *
5370e8c7d0fSChristoph Lameter 	 * Note that anonymous pages are accounted for
5384b9d0fabSMel Gorman 	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
5390e8c7d0fSChristoph Lameter 	 * are mapped to swap space.
5400e8c7d0fSChristoph Lameter 	 */
54142cb14b1SHugh Dickins 	if (newzone != oldzone) {
54211fb9989SMel Gorman 		__dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
54311fb9989SMel Gorman 		__inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
54442cb14b1SHugh Dickins 		if (PageSwapBacked(page) && !PageSwapCache(page)) {
54511fb9989SMel Gorman 			__dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
54611fb9989SMel Gorman 			__inc_node_state(newzone->zone_pgdat, NR_SHMEM);
5474b02108aSKOSAKI Motohiro 		}
54842cb14b1SHugh Dickins 		if (dirty && mapping_cap_account_dirty(mapping)) {
54911fb9989SMel Gorman 			__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
5505a1c84b4SMel Gorman 			__dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
55111fb9989SMel Gorman 			__inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
5525a1c84b4SMel Gorman 			__inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
55342cb14b1SHugh Dickins 		}
55442cb14b1SHugh Dickins 	}
55542cb14b1SHugh Dickins 	local_irq_enable();
556b20a3503SChristoph Lameter 
55778bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
558b20a3503SChristoph Lameter }
5591118dce7SRichard Weinberger EXPORT_SYMBOL(migrate_page_move_mapping);
560b20a3503SChristoph Lameter 
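/*
 * Sketch of a filesystem ->migratepage implementation built on top of
 * migrate_page_move_mapping() (modelled loosely on how fs/aio.c migrates
 * its ring pages, which hold one extra private reference per page; the
 * example_fs_* name is hypothetical):
 */
static int __maybe_unused example_fs_migratepage(struct address_space *mapping,
						 struct page *newpage,
						 struct page *page,
						 enum migrate_mode mode)
{
	int rc;

	/* the owner keeps one private reference on @page, hence extra_count = 1 */
	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 1);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	/* ... swap the owner's private pointer from @page to @newpage ... */
	migrate_page_copy(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}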
561b20a3503SChristoph Lameter /*
562290408d4SNaoya Horiguchi  * The expected number of remaining references is the same as that
563290408d4SNaoya Horiguchi  * of migrate_page_move_mapping().
564290408d4SNaoya Horiguchi  */
565290408d4SNaoya Horiguchi int migrate_huge_page_move_mapping(struct address_space *mapping,
566290408d4SNaoya Horiguchi 				   struct page *newpage, struct page *page)
567290408d4SNaoya Horiguchi {
568290408d4SNaoya Horiguchi 	int expected_count;
569290408d4SNaoya Horiguchi 	void **pslot;
570290408d4SNaoya Horiguchi 
571290408d4SNaoya Horiguchi 	spin_lock_irq(&mapping->tree_lock);
572290408d4SNaoya Horiguchi 
573290408d4SNaoya Horiguchi 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
574290408d4SNaoya Horiguchi 					page_index(page));
575290408d4SNaoya Horiguchi 
576290408d4SNaoya Horiguchi 	expected_count = 2 + page_has_private(page);
577290408d4SNaoya Horiguchi 	if (page_count(page) != expected_count ||
57829c1f677SMel Gorman 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
579290408d4SNaoya Horiguchi 		spin_unlock_irq(&mapping->tree_lock);
580290408d4SNaoya Horiguchi 		return -EAGAIN;
581290408d4SNaoya Horiguchi 	}
582290408d4SNaoya Horiguchi 
583fe896d18SJoonsoo Kim 	if (!page_ref_freeze(page, expected_count)) {
584290408d4SNaoya Horiguchi 		spin_unlock_irq(&mapping->tree_lock);
585290408d4SNaoya Horiguchi 		return -EAGAIN;
586290408d4SNaoya Horiguchi 	}
587290408d4SNaoya Horiguchi 
588cf4b769aSHugh Dickins 	newpage->index = page->index;
589cf4b769aSHugh Dickins 	newpage->mapping = page->mapping;
5906a93ca8fSJohannes Weiner 
591290408d4SNaoya Horiguchi 	get_page(newpage);
592290408d4SNaoya Horiguchi 
5936d75f366SJohannes Weiner 	radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
594290408d4SNaoya Horiguchi 
595fe896d18SJoonsoo Kim 	page_ref_unfreeze(page, expected_count - 1);
596290408d4SNaoya Horiguchi 
597290408d4SNaoya Horiguchi 	spin_unlock_irq(&mapping->tree_lock);
5986a93ca8fSJohannes Weiner 
59978bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
600290408d4SNaoya Horiguchi }
601290408d4SNaoya Horiguchi 
602290408d4SNaoya Horiguchi /*
60330b0a105SDave Hansen  * Gigantic pages are so large that we do not guarantee that page++ pointer
60430b0a105SDave Hansen  * arithmetic will work across the entire page.  We need something more
60530b0a105SDave Hansen  * specialized.
60630b0a105SDave Hansen  */
60730b0a105SDave Hansen static void __copy_gigantic_page(struct page *dst, struct page *src,
60830b0a105SDave Hansen 				int nr_pages)
60930b0a105SDave Hansen {
61030b0a105SDave Hansen 	int i;
61130b0a105SDave Hansen 	struct page *dst_base = dst;
61230b0a105SDave Hansen 	struct page *src_base = src;
61330b0a105SDave Hansen 
61430b0a105SDave Hansen 	for (i = 0; i < nr_pages; ) {
61530b0a105SDave Hansen 		cond_resched();
61630b0a105SDave Hansen 		copy_highpage(dst, src);
61730b0a105SDave Hansen 
61830b0a105SDave Hansen 		i++;
61930b0a105SDave Hansen 		dst = mem_map_next(dst, dst_base, i);
62030b0a105SDave Hansen 		src = mem_map_next(src, src_base, i);
62130b0a105SDave Hansen 	}
62230b0a105SDave Hansen }
62330b0a105SDave Hansen 
62430b0a105SDave Hansen static void copy_huge_page(struct page *dst, struct page *src)
62530b0a105SDave Hansen {
62630b0a105SDave Hansen 	int i;
62730b0a105SDave Hansen 	int nr_pages;
62830b0a105SDave Hansen 
62930b0a105SDave Hansen 	if (PageHuge(src)) {
63030b0a105SDave Hansen 		/* hugetlbfs page */
63130b0a105SDave Hansen 		struct hstate *h = page_hstate(src);
63230b0a105SDave Hansen 		nr_pages = pages_per_huge_page(h);
63330b0a105SDave Hansen 
63430b0a105SDave Hansen 		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
63530b0a105SDave Hansen 			__copy_gigantic_page(dst, src, nr_pages);
63630b0a105SDave Hansen 			return;
63730b0a105SDave Hansen 		}
63830b0a105SDave Hansen 	} else {
63930b0a105SDave Hansen 		/* thp page */
64030b0a105SDave Hansen 		BUG_ON(!PageTransHuge(src));
64130b0a105SDave Hansen 		nr_pages = hpage_nr_pages(src);
64230b0a105SDave Hansen 	}
64330b0a105SDave Hansen 
64430b0a105SDave Hansen 	for (i = 0; i < nr_pages; i++) {
64530b0a105SDave Hansen 		cond_resched();
64630b0a105SDave Hansen 		copy_highpage(dst + i, src + i);
64730b0a105SDave Hansen 	}
64830b0a105SDave Hansen }
64930b0a105SDave Hansen 
65030b0a105SDave Hansen /*
651b20a3503SChristoph Lameter  * Copy the page to its new location
652b20a3503SChristoph Lameter  */
6532916ecc0SJérôme Glisse void migrate_page_states(struct page *newpage, struct page *page)
654b20a3503SChristoph Lameter {
6557851a45cSRik van Riel 	int cpupid;
6567851a45cSRik van Riel 
657b20a3503SChristoph Lameter 	if (PageError(page))
658b20a3503SChristoph Lameter 		SetPageError(newpage);
659b20a3503SChristoph Lameter 	if (PageReferenced(page))
660b20a3503SChristoph Lameter 		SetPageReferenced(newpage);
661b20a3503SChristoph Lameter 	if (PageUptodate(page))
662b20a3503SChristoph Lameter 		SetPageUptodate(newpage);
663894bc310SLee Schermerhorn 	if (TestClearPageActive(page)) {
664309381feSSasha Levin 		VM_BUG_ON_PAGE(PageUnevictable(page), page);
665b20a3503SChristoph Lameter 		SetPageActive(newpage);
666418b27efSLee Schermerhorn 	} else if (TestClearPageUnevictable(page))
667418b27efSLee Schermerhorn 		SetPageUnevictable(newpage);
668b20a3503SChristoph Lameter 	if (PageChecked(page))
669b20a3503SChristoph Lameter 		SetPageChecked(newpage);
670b20a3503SChristoph Lameter 	if (PageMappedToDisk(page))
671b20a3503SChristoph Lameter 		SetPageMappedToDisk(newpage);
672b20a3503SChristoph Lameter 
67342cb14b1SHugh Dickins 	/* Move dirty on pages not done by migrate_page_move_mapping() */
67442cb14b1SHugh Dickins 	if (PageDirty(page))
675752dc185SHugh Dickins 		SetPageDirty(newpage);
676b20a3503SChristoph Lameter 
67733c3fc71SVladimir Davydov 	if (page_is_young(page))
67833c3fc71SVladimir Davydov 		set_page_young(newpage);
67933c3fc71SVladimir Davydov 	if (page_is_idle(page))
68033c3fc71SVladimir Davydov 		set_page_idle(newpage);
68133c3fc71SVladimir Davydov 
6827851a45cSRik van Riel 	/*
6837851a45cSRik van Riel 	 * Copy NUMA information to the new page, to prevent over-eager
6847851a45cSRik van Riel 	 * future migrations of this same page.
6857851a45cSRik van Riel 	 */
6867851a45cSRik van Riel 	cpupid = page_cpupid_xchg_last(page, -1);
6877851a45cSRik van Riel 	page_cpupid_xchg_last(newpage, cpupid);
6887851a45cSRik van Riel 
689e9995ef9SHugh Dickins 	ksm_migrate_page(newpage, page);
690c8d6553bSHugh Dickins 	/*
691c8d6553bSHugh Dickins 	 * Please do not reorder this without considering how mm/ksm.c's
692c8d6553bSHugh Dickins 	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
693c8d6553bSHugh Dickins 	 */
694b3b3a99cSNaoya Horiguchi 	if (PageSwapCache(page))
695b20a3503SChristoph Lameter 		ClearPageSwapCache(page);
696b20a3503SChristoph Lameter 	ClearPagePrivate(page);
697b20a3503SChristoph Lameter 	set_page_private(page, 0);
698b20a3503SChristoph Lameter 
699b20a3503SChristoph Lameter 	/*
700b20a3503SChristoph Lameter 	 * If any waiters have accumulated on the new page then
701b20a3503SChristoph Lameter 	 * wake them up.
702b20a3503SChristoph Lameter 	 */
703b20a3503SChristoph Lameter 	if (PageWriteback(newpage))
704b20a3503SChristoph Lameter 		end_page_writeback(newpage);
705d435edcaSVlastimil Babka 
706d435edcaSVlastimil Babka 	copy_page_owner(page, newpage);
70774485cf2SJohannes Weiner 
70874485cf2SJohannes Weiner 	mem_cgroup_migrate(page, newpage);
709b20a3503SChristoph Lameter }
7102916ecc0SJérôme Glisse EXPORT_SYMBOL(migrate_page_states);
7112916ecc0SJérôme Glisse 
7122916ecc0SJérôme Glisse void migrate_page_copy(struct page *newpage, struct page *page)
7132916ecc0SJérôme Glisse {
7142916ecc0SJérôme Glisse 	if (PageHuge(page) || PageTransHuge(page))
7152916ecc0SJérôme Glisse 		copy_huge_page(newpage, page);
7162916ecc0SJérôme Glisse 	else
7172916ecc0SJérôme Glisse 		copy_highpage(newpage, page);
7182916ecc0SJérôme Glisse 
7192916ecc0SJérôme Glisse 	migrate_page_states(newpage, page);
7202916ecc0SJérôme Glisse }
7211118dce7SRichard Weinberger EXPORT_SYMBOL(migrate_page_copy);
722b20a3503SChristoph Lameter 
7231d8b85ccSChristoph Lameter /************************************************************
7241d8b85ccSChristoph Lameter  *                    Migration functions
7251d8b85ccSChristoph Lameter  ***********************************************************/
7261d8b85ccSChristoph Lameter 
727b20a3503SChristoph Lameter /*
728bda807d4SMinchan Kim  * Common logic to directly migrate a single LRU page suitable for
729266cf658SDavid Howells  * pages that do not use PagePrivate/PagePrivate2.
730b20a3503SChristoph Lameter  *
731b20a3503SChristoph Lameter  * Pages are locked upon entry and exit.
732b20a3503SChristoph Lameter  */
7332d1db3b1SChristoph Lameter int migrate_page(struct address_space *mapping,
734a6bc32b8SMel Gorman 		struct page *newpage, struct page *page,
735a6bc32b8SMel Gorman 		enum migrate_mode mode)
736b20a3503SChristoph Lameter {
737b20a3503SChristoph Lameter 	int rc;
738b20a3503SChristoph Lameter 
739b20a3503SChristoph Lameter 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
740b20a3503SChristoph Lameter 
7418e321fefSBenjamin LaHaise 	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
742b20a3503SChristoph Lameter 
74378bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
744b20a3503SChristoph Lameter 		return rc;
745b20a3503SChristoph Lameter 
7462916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
747b20a3503SChristoph Lameter 		migrate_page_copy(newpage, page);
7482916ecc0SJérôme Glisse 	else
7492916ecc0SJérôme Glisse 		migrate_page_states(newpage, page);
75078bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
751b20a3503SChristoph Lameter }
752b20a3503SChristoph Lameter EXPORT_SYMBOL(migrate_page);
753b20a3503SChristoph Lameter 
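/*
 * Filesystems whose pages carry no private data can point their
 * migratepage callback straight at migrate_page(); a minimal,
 * purely illustrative aops wiring (example_simple_aops is hypothetical):
 */
static const struct address_space_operations example_simple_aops __maybe_unused = {
	.migratepage	= migrate_page,
};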
7549361401eSDavid Howells #ifdef CONFIG_BLOCK
755b20a3503SChristoph Lameter /*
7561d8b85ccSChristoph Lameter  * Migration function for pages with buffers. This function can only be used
7571d8b85ccSChristoph Lameter  * if the underlying filesystem guarantees that no other references to "page"
7581d8b85ccSChristoph Lameter  * exist.
7591d8b85ccSChristoph Lameter  */
7602d1db3b1SChristoph Lameter int buffer_migrate_page(struct address_space *mapping,
761a6bc32b8SMel Gorman 		struct page *newpage, struct page *page, enum migrate_mode mode)
7621d8b85ccSChristoph Lameter {
7631d8b85ccSChristoph Lameter 	struct buffer_head *bh, *head;
7641d8b85ccSChristoph Lameter 	int rc;
7651d8b85ccSChristoph Lameter 
7661d8b85ccSChristoph Lameter 	if (!page_has_buffers(page))
767a6bc32b8SMel Gorman 		return migrate_page(mapping, newpage, page, mode);
7681d8b85ccSChristoph Lameter 
7691d8b85ccSChristoph Lameter 	head = page_buffers(page);
7701d8b85ccSChristoph Lameter 
7718e321fefSBenjamin LaHaise 	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
7721d8b85ccSChristoph Lameter 
77378bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
7741d8b85ccSChristoph Lameter 		return rc;
7751d8b85ccSChristoph Lameter 
776b969c4abSMel Gorman 	/*
777b969c4abSMel Gorman 	 * In the async case, migrate_page_move_mapping locked the buffers
778b969c4abSMel Gorman 	 * with an IRQ-safe spinlock held. In the sync case, the buffers
779b969c4abSMel Gorman 	 * need to be locked now
780b969c4abSMel Gorman 	 */
781a6bc32b8SMel Gorman 	if (mode != MIGRATE_ASYNC)
782a6bc32b8SMel Gorman 		BUG_ON(!buffer_migrate_lock_buffers(head, mode));
7831d8b85ccSChristoph Lameter 
7841d8b85ccSChristoph Lameter 	ClearPagePrivate(page);
7851d8b85ccSChristoph Lameter 	set_page_private(newpage, page_private(page));
7861d8b85ccSChristoph Lameter 	set_page_private(page, 0);
7871d8b85ccSChristoph Lameter 	put_page(page);
7881d8b85ccSChristoph Lameter 	get_page(newpage);
7891d8b85ccSChristoph Lameter 
7901d8b85ccSChristoph Lameter 	bh = head;
7911d8b85ccSChristoph Lameter 	do {
7921d8b85ccSChristoph Lameter 		set_bh_page(bh, newpage, bh_offset(bh));
7931d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
7941d8b85ccSChristoph Lameter 
7951d8b85ccSChristoph Lameter 	} while (bh != head);
7961d8b85ccSChristoph Lameter 
7971d8b85ccSChristoph Lameter 	SetPagePrivate(newpage);
7981d8b85ccSChristoph Lameter 
7992916ecc0SJérôme Glisse 	if (mode != MIGRATE_SYNC_NO_COPY)
8001d8b85ccSChristoph Lameter 		migrate_page_copy(newpage, page);
8012916ecc0SJérôme Glisse 	else
8022916ecc0SJérôme Glisse 		migrate_page_states(newpage, page);
8031d8b85ccSChristoph Lameter 
8041d8b85ccSChristoph Lameter 	bh = head;
8051d8b85ccSChristoph Lameter 	do {
8061d8b85ccSChristoph Lameter 		unlock_buffer(bh);
8071d8b85ccSChristoph Lameter 		put_bh(bh);
8081d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
8091d8b85ccSChristoph Lameter 
8101d8b85ccSChristoph Lameter 	} while (bh != head);
8111d8b85ccSChristoph Lameter 
81278bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
8131d8b85ccSChristoph Lameter }
8141d8b85ccSChristoph Lameter EXPORT_SYMBOL(buffer_migrate_page);
8159361401eSDavid Howells #endif
8161d8b85ccSChristoph Lameter 
81704e62a29SChristoph Lameter /*
81804e62a29SChristoph Lameter  * Writeback a page to clean the dirty state
81904e62a29SChristoph Lameter  */
82004e62a29SChristoph Lameter static int writeout(struct address_space *mapping, struct page *page)
82104e62a29SChristoph Lameter {
82204e62a29SChristoph Lameter 	struct writeback_control wbc = {
82304e62a29SChristoph Lameter 		.sync_mode = WB_SYNC_NONE,
82404e62a29SChristoph Lameter 		.nr_to_write = 1,
82504e62a29SChristoph Lameter 		.range_start = 0,
82604e62a29SChristoph Lameter 		.range_end = LLONG_MAX,
82704e62a29SChristoph Lameter 		.for_reclaim = 1
82804e62a29SChristoph Lameter 	};
82904e62a29SChristoph Lameter 	int rc;
83004e62a29SChristoph Lameter 
83104e62a29SChristoph Lameter 	if (!mapping->a_ops->writepage)
83204e62a29SChristoph Lameter 		/* No write method for the address space */
83304e62a29SChristoph Lameter 		return -EINVAL;
83404e62a29SChristoph Lameter 
83504e62a29SChristoph Lameter 	if (!clear_page_dirty_for_io(page))
83604e62a29SChristoph Lameter 		/* Someone else already triggered a write */
83704e62a29SChristoph Lameter 		return -EAGAIN;
83804e62a29SChristoph Lameter 
83904e62a29SChristoph Lameter 	/*
84004e62a29SChristoph Lameter 	 * A dirty page may imply that the underlying filesystem has
84104e62a29SChristoph Lameter 	 * the page on some queue. So the page must be clean for
84204e62a29SChristoph Lameter 	 * migration. Writeout may mean we lose the lock and the
84304e62a29SChristoph Lameter 	 * page state is no longer what we checked for earlier.
84404e62a29SChristoph Lameter 	 * At this point we know that the migration attempt cannot
84504e62a29SChristoph Lameter 	 * be successful.
84604e62a29SChristoph Lameter 	 */
847e388466dSKirill A. Shutemov 	remove_migration_ptes(page, page, false);
84804e62a29SChristoph Lameter 
84904e62a29SChristoph Lameter 	rc = mapping->a_ops->writepage(page, &wbc);
85004e62a29SChristoph Lameter 
85104e62a29SChristoph Lameter 	if (rc != AOP_WRITEPAGE_ACTIVATE)
85204e62a29SChristoph Lameter 		/* unlocked. Relock */
85304e62a29SChristoph Lameter 		lock_page(page);
85404e62a29SChristoph Lameter 
855bda8550dSHugh Dickins 	return (rc < 0) ? -EIO : -EAGAIN;
85604e62a29SChristoph Lameter }
85704e62a29SChristoph Lameter 
85804e62a29SChristoph Lameter /*
85904e62a29SChristoph Lameter  * Default handling if a filesystem does not provide a migration function.
86004e62a29SChristoph Lameter  */
8618351a6e4SChristoph Lameter static int fallback_migrate_page(struct address_space *mapping,
862a6bc32b8SMel Gorman 	struct page *newpage, struct page *page, enum migrate_mode mode)
8638351a6e4SChristoph Lameter {
864b969c4abSMel Gorman 	if (PageDirty(page)) {
865a6bc32b8SMel Gorman 		/* Only writeback pages in full synchronous migration */
8662916ecc0SJérôme Glisse 		switch (mode) {
8672916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
8682916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
8692916ecc0SJérôme Glisse 			break;
8702916ecc0SJérôme Glisse 		default:
871b969c4abSMel Gorman 			return -EBUSY;
8722916ecc0SJérôme Glisse 		}
87304e62a29SChristoph Lameter 		return writeout(mapping, page);
874b969c4abSMel Gorman 	}
8758351a6e4SChristoph Lameter 
8768351a6e4SChristoph Lameter 	/*
8778351a6e4SChristoph Lameter 	 * Buffers may be managed in a filesystem specific way.
8788351a6e4SChristoph Lameter 	 * We must have no buffers or drop them.
8798351a6e4SChristoph Lameter 	 */
880266cf658SDavid Howells 	if (page_has_private(page) &&
8818351a6e4SChristoph Lameter 	    !try_to_release_page(page, GFP_KERNEL))
8828351a6e4SChristoph Lameter 		return -EAGAIN;
8838351a6e4SChristoph Lameter 
884a6bc32b8SMel Gorman 	return migrate_page(mapping, newpage, page, mode);
8858351a6e4SChristoph Lameter }
8868351a6e4SChristoph Lameter 
8871d8b85ccSChristoph Lameter /*
888e24f0b8fSChristoph Lameter  * Move a page to a newly allocated page
889e24f0b8fSChristoph Lameter  * The page is locked and all ptes have been successfully removed.
890b20a3503SChristoph Lameter  *
891e24f0b8fSChristoph Lameter  * The new page will have replaced the old page if this function
892e24f0b8fSChristoph Lameter  * is successful.
893894bc310SLee Schermerhorn  *
894894bc310SLee Schermerhorn  * Return value:
895894bc310SLee Schermerhorn  *   < 0 - error code
89678bd5209SRafael Aquini  *  MIGRATEPAGE_SUCCESS - success
897b20a3503SChristoph Lameter  */
8983fe2011fSMel Gorman static int move_to_new_page(struct page *newpage, struct page *page,
8995c3f9a67SHugh Dickins 				enum migrate_mode mode)
900b20a3503SChristoph Lameter {
901e24f0b8fSChristoph Lameter 	struct address_space *mapping;
902bda807d4SMinchan Kim 	int rc = -EAGAIN;
903bda807d4SMinchan Kim 	bool is_lru = !__PageMovable(page);
904b20a3503SChristoph Lameter 
9057db7671fSHugh Dickins 	VM_BUG_ON_PAGE(!PageLocked(page), page);
9067db7671fSHugh Dickins 	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
907b20a3503SChristoph Lameter 
908b20a3503SChristoph Lameter 	mapping = page_mapping(page);
909bda807d4SMinchan Kim 
910bda807d4SMinchan Kim 	if (likely(is_lru)) {
911b20a3503SChristoph Lameter 		if (!mapping)
912a6bc32b8SMel Gorman 			rc = migrate_page(mapping, newpage, page, mode);
9136c5240aeSChristoph Lameter 		else if (mapping->a_ops->migratepage)
914b20a3503SChristoph Lameter 			/*
915bda807d4SMinchan Kim 			 * Most pages have a mapping and most filesystems
916bda807d4SMinchan Kim 			 * provide a migratepage callback. Anonymous pages
917bda807d4SMinchan Kim 			 * are part of swap space which also has its own
918bda807d4SMinchan Kim 			 * migratepage callback. This is the most common path
919bda807d4SMinchan Kim 			 * for page migration.
920b20a3503SChristoph Lameter 			 */
921bda807d4SMinchan Kim 			rc = mapping->a_ops->migratepage(mapping, newpage,
922bda807d4SMinchan Kim 							page, mode);
9238351a6e4SChristoph Lameter 		else
924bda807d4SMinchan Kim 			rc = fallback_migrate_page(mapping, newpage,
925bda807d4SMinchan Kim 							page, mode);
926bda807d4SMinchan Kim 	} else {
927bda807d4SMinchan Kim 		/*
928bda807d4SMinchan Kim 		 * In the case of a non-lru page, it could have been released
929bda807d4SMinchan Kim 		 * after the isolation step. In that case, we shouldn't try migration.
930bda807d4SMinchan Kim 		 */
931bda807d4SMinchan Kim 		VM_BUG_ON_PAGE(!PageIsolated(page), page);
932bda807d4SMinchan Kim 		if (!PageMovable(page)) {
933bda807d4SMinchan Kim 			rc = MIGRATEPAGE_SUCCESS;
934bda807d4SMinchan Kim 			__ClearPageIsolated(page);
935bda807d4SMinchan Kim 			goto out;
936bda807d4SMinchan Kim 		}
937bda807d4SMinchan Kim 
938bda807d4SMinchan Kim 		rc = mapping->a_ops->migratepage(mapping, newpage,
939bda807d4SMinchan Kim 						page, mode);
940bda807d4SMinchan Kim 		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
941bda807d4SMinchan Kim 			!PageIsolated(page));
942bda807d4SMinchan Kim 	}
943b20a3503SChristoph Lameter 
9445c3f9a67SHugh Dickins 	/*
9455c3f9a67SHugh Dickins 	 * When successful, old pagecache page->mapping must be cleared before
9465c3f9a67SHugh Dickins 	 * page is freed; but stats require that PageAnon be left as PageAnon.
9475c3f9a67SHugh Dickins 	 */
9485c3f9a67SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
949bda807d4SMinchan Kim 		if (__PageMovable(page)) {
950bda807d4SMinchan Kim 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
951bda807d4SMinchan Kim 
952bda807d4SMinchan Kim 			/*
953bda807d4SMinchan Kim 			 * We clear PG_movable under page_lock so that no compactor
954bda807d4SMinchan Kim 			 * can try to migrate this page.
955bda807d4SMinchan Kim 			 */
956bda807d4SMinchan Kim 			__ClearPageIsolated(page);
957bda807d4SMinchan Kim 		}
958bda807d4SMinchan Kim 
959bda807d4SMinchan Kim 		/*
960bda807d4SMinchan Kim 		 * Anonymous and movable page->mapping will be cleared by
961bda807d4SMinchan Kim 		 * free_pages_prepare, so don't reset it here; that keeps
962bda807d4SMinchan Kim 		 * type checks such as PageAnon() working.
963bda807d4SMinchan Kim 		 */
964bda807d4SMinchan Kim 		if (!PageMappingFlags(page))
9655c3f9a67SHugh Dickins 			page->mapping = NULL;
9663fe2011fSMel Gorman 	}
967bda807d4SMinchan Kim out:
968e24f0b8fSChristoph Lameter 	return rc;
969e24f0b8fSChristoph Lameter }
970e24f0b8fSChristoph Lameter 
9710dabec93SMinchan Kim static int __unmap_and_move(struct page *page, struct page *newpage,
9729c620e2bSHugh Dickins 				int force, enum migrate_mode mode)
973e24f0b8fSChristoph Lameter {
9740dabec93SMinchan Kim 	int rc = -EAGAIN;
9752ebba6b7SHugh Dickins 	int page_was_mapped = 0;
9763f6c8272SMel Gorman 	struct anon_vma *anon_vma = NULL;
977bda807d4SMinchan Kim 	bool is_lru = !__PageMovable(page);
97895a402c3SChristoph Lameter 
979529ae9aaSNick Piggin 	if (!trylock_page(page)) {
980a6bc32b8SMel Gorman 		if (!force || mode == MIGRATE_ASYNC)
9810dabec93SMinchan Kim 			goto out;
9823e7d3449SMel Gorman 
9833e7d3449SMel Gorman 		/*
9843e7d3449SMel Gorman 		 * It's not safe for direct compaction to call lock_page.
9853e7d3449SMel Gorman 		 * For example, during page readahead pages are added locked
9863e7d3449SMel Gorman 		 * to the LRU. Later, when the IO completes the pages are
9873e7d3449SMel Gorman 		 * marked uptodate and unlocked. However, the queueing
9883e7d3449SMel Gorman 		 * could be merging multiple pages for one bio (e.g.
9893e7d3449SMel Gorman 		 * mpage_readpages). If an allocation happens for the
9903e7d3449SMel Gorman 		 * second or third page, the process can end up locking
9913e7d3449SMel Gorman 		 * the same page twice and deadlocking. Rather than
9923e7d3449SMel Gorman 		 * trying to be clever about what pages can be locked,
9933e7d3449SMel Gorman 		 * avoid the use of lock_page for direct compaction
9943e7d3449SMel Gorman 		 * altogether.
9953e7d3449SMel Gorman 		 */
9963e7d3449SMel Gorman 		if (current->flags & PF_MEMALLOC)
9970dabec93SMinchan Kim 			goto out;
9983e7d3449SMel Gorman 
999e24f0b8fSChristoph Lameter 		lock_page(page);
1000e24f0b8fSChristoph Lameter 	}
1001e24f0b8fSChristoph Lameter 
1002e24f0b8fSChristoph Lameter 	if (PageWriteback(page)) {
100311bc82d6SAndrea Arcangeli 		/*
1004fed5b64aSJianguo Wu 		 * Only in the case of a full synchronous migration is it
1005a6bc32b8SMel Gorman 		 * necessary to wait for PageWriteback. In the async case,
1006a6bc32b8SMel Gorman 		 * the retry loop is too short and in the sync-light case,
1007a6bc32b8SMel Gorman 		 * the overhead of stalling is too much
100811bc82d6SAndrea Arcangeli 		 */
10092916ecc0SJérôme Glisse 		switch (mode) {
10102916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
10112916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
10122916ecc0SJérôme Glisse 			break;
10132916ecc0SJérôme Glisse 		default:
101411bc82d6SAndrea Arcangeli 			rc = -EBUSY;
10150a31bc97SJohannes Weiner 			goto out_unlock;
101611bc82d6SAndrea Arcangeli 		}
101711bc82d6SAndrea Arcangeli 		if (!force)
10180a31bc97SJohannes Weiner 			goto out_unlock;
1019e24f0b8fSChristoph Lameter 		wait_on_page_writeback(page);
1020e24f0b8fSChristoph Lameter 	}
102103f15c86SHugh Dickins 
1022e24f0b8fSChristoph Lameter 	/*
1023dc386d4dSKAMEZAWA Hiroyuki 	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
1024dc386d4dSKAMEZAWA Hiroyuki 	 * we could not notice that the anon_vma is freed while we migrate a page.
10251ce82b69SHugh Dickins 	 * This get_anon_vma() delays freeing the anon_vma pointer until the end
1026dc386d4dSKAMEZAWA Hiroyuki 	 * of migration. File cache pages are no problem because of page_lock():
1027989f89c5SKAMEZAWA Hiroyuki 	 * file caches may use write_page() or lock_page() during migration, so
1028989f89c5SKAMEZAWA Hiroyuki 	 * we only need to care about anon pages here.
10293fe2011fSMel Gorman 	 *
103003f15c86SHugh Dickins 	 * Only page_get_anon_vma() understands the subtleties of
103103f15c86SHugh Dickins 	 * getting a hold on an anon_vma from outside one of its mms.
103203f15c86SHugh Dickins 	 * But if we cannot get anon_vma, then we won't need it anyway,
103303f15c86SHugh Dickins 	 * because that implies that the anon page is no longer mapped
103403f15c86SHugh Dickins 	 * (and cannot be remapped so long as we hold the page lock).
10353fe2011fSMel Gorman 	 */
103603f15c86SHugh Dickins 	if (PageAnon(page) && !PageKsm(page))
103703f15c86SHugh Dickins 		anon_vma = page_get_anon_vma(page);
103862e1c553SShaohua Li 
10397db7671fSHugh Dickins 	/*
10407db7671fSHugh Dickins 	 * Block others from accessing the new page when we get around to
10417db7671fSHugh Dickins 	 * establishing additional references. We are usually the only one
10427db7671fSHugh Dickins 	 * holding a reference to newpage at this point. We used to have a BUG
10437db7671fSHugh Dickins 	 * here if trylock_page(newpage) fails, but would like to allow for
10447db7671fSHugh Dickins 	 * cases where there might be a race with the previous use of newpage.
10457db7671fSHugh Dickins 	 * This is much like races on refcount of oldpage: just don't BUG().
10467db7671fSHugh Dickins 	 */
10477db7671fSHugh Dickins 	if (unlikely(!trylock_page(newpage)))
10487db7671fSHugh Dickins 		goto out_unlock;
10497db7671fSHugh Dickins 
1050bda807d4SMinchan Kim 	if (unlikely(!is_lru)) {
1051bda807d4SMinchan Kim 		rc = move_to_new_page(newpage, page, mode);
1052bda807d4SMinchan Kim 		goto out_unlock_both;
1053bda807d4SMinchan Kim 	}
1054bda807d4SMinchan Kim 
1055dc386d4dSKAMEZAWA Hiroyuki 	/*
105662e1c553SShaohua Li 	 * Corner case handling:
105762e1c553SShaohua Li 	 * 1. When a new swap-cache page is read in, it is added to the LRU
105862e1c553SShaohua Li 	 * and treated as swapcache but it has no rmap yet.
105962e1c553SShaohua Li 	 * Calling try_to_unmap() against a page->mapping==NULL page will
106062e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
106162e1c553SShaohua Li 	 * 2. An orphaned page (see truncate_complete_page) might have
106262e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
106362e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
106462e1c553SShaohua Li 	 * invisible to the vm, so the page can not be migrated.  So try to
106562e1c553SShaohua Li 	 * free the metadata, so the page can be freed.
1066dc386d4dSKAMEZAWA Hiroyuki 	 */
106762e1c553SShaohua Li 	if (!page->mapping) {
1068309381feSSasha Levin 		VM_BUG_ON_PAGE(PageAnon(page), page);
10691ce82b69SHugh Dickins 		if (page_has_private(page)) {
107062e1c553SShaohua Li 			try_to_free_buffers(page);
10717db7671fSHugh Dickins 			goto out_unlock_both;
107262e1c553SShaohua Li 		}
10737db7671fSHugh Dickins 	} else if (page_mapped(page)) {
10747db7671fSHugh Dickins 		/* Establish migration ptes */
107503f15c86SHugh Dickins 		VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
107603f15c86SHugh Dickins 				page);
10772ebba6b7SHugh Dickins 		try_to_unmap(page,
1078da1b13ccSWanpeng Li 			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
10792ebba6b7SHugh Dickins 		page_was_mapped = 1;
10802ebba6b7SHugh Dickins 	}
1081dc386d4dSKAMEZAWA Hiroyuki 
1082e24f0b8fSChristoph Lameter 	if (!page_mapped(page))
10835c3f9a67SHugh Dickins 		rc = move_to_new_page(newpage, page, mode);
1084e24f0b8fSChristoph Lameter 
10855c3f9a67SHugh Dickins 	if (page_was_mapped)
10865c3f9a67SHugh Dickins 		remove_migration_ptes(page,
1087e388466dSKirill A. Shutemov 			rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
10883f6c8272SMel Gorman 
10897db7671fSHugh Dickins out_unlock_both:
10907db7671fSHugh Dickins 	unlock_page(newpage);
10917db7671fSHugh Dickins out_unlock:
10923f6c8272SMel Gorman 	/* Drop an anon_vma reference if we took one */
109376545066SRik van Riel 	if (anon_vma)
10949e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
1095b20a3503SChristoph Lameter 	unlock_page(page);
10960dabec93SMinchan Kim out:
1097c6c919ebSMinchan Kim 	/*
1098c6c919ebSMinchan Kim 	 * If migration is successful, decrease the refcount of newpage.
1099c6c919ebSMinchan Kim 	 * This will not free the page because the new page owner has
1100c6c919ebSMinchan Kim 	 * already taken a reference. Also, if it is an LRU page, add it
1101c6c919ebSMinchan Kim 	 * back to the LRU list here.
1102c6c919ebSMinchan Kim 	 */
1103c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1104b1123ea6SMinchan Kim 		if (unlikely(__PageMovable(newpage)))
1105c6c919ebSMinchan Kim 			put_page(newpage);
1106c6c919ebSMinchan Kim 		else
1107c6c919ebSMinchan Kim 			putback_lru_page(newpage);
1108c6c919ebSMinchan Kim 	}
1109c6c919ebSMinchan Kim 
11100dabec93SMinchan Kim 	return rc;
11110dabec93SMinchan Kim }
111295a402c3SChristoph Lameter 
11130dabec93SMinchan Kim /*
1114ef2a5153SGeert Uytterhoeven  * gcc 4.7 and 4.8 on arm get an ICE when inlining unmap_and_move().  Work
1115ef2a5153SGeert Uytterhoeven  * around it.
1116ef2a5153SGeert Uytterhoeven  */
1117ef2a5153SGeert Uytterhoeven #if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
1118ef2a5153SGeert Uytterhoeven #define ICE_noinline noinline
1119ef2a5153SGeert Uytterhoeven #else
1120ef2a5153SGeert Uytterhoeven #define ICE_noinline
1121ef2a5153SGeert Uytterhoeven #endif
1122ef2a5153SGeert Uytterhoeven 
1123ef2a5153SGeert Uytterhoeven /*
11240dabec93SMinchan Kim  * Obtain the lock on page, remove all ptes and migrate the page
11250dabec93SMinchan Kim  * to the newly allocated page in newpage.
11260dabec93SMinchan Kim  */
1127ef2a5153SGeert Uytterhoeven static ICE_noinline int unmap_and_move(new_page_t get_new_page,
1128ef2a5153SGeert Uytterhoeven 				   free_page_t put_new_page,
1129ef2a5153SGeert Uytterhoeven 				   unsigned long private, struct page *page,
1130add05cecSNaoya Horiguchi 				   int force, enum migrate_mode mode,
1131add05cecSNaoya Horiguchi 				   enum migrate_reason reason)
11320dabec93SMinchan Kim {
11332def7424SHugh Dickins 	int rc = MIGRATEPAGE_SUCCESS;
11340dabec93SMinchan Kim 	int *result = NULL;
11352def7424SHugh Dickins 	struct page *newpage;
11360dabec93SMinchan Kim 
11372def7424SHugh Dickins 	newpage = get_new_page(page, private, &result);
11380dabec93SMinchan Kim 	if (!newpage)
11390dabec93SMinchan Kim 		return -ENOMEM;
11400dabec93SMinchan Kim 
11410dabec93SMinchan Kim 	if (page_count(page) == 1) {
11420dabec93SMinchan Kim 		/* page was freed from under us. So we are done. */
1143c6c919ebSMinchan Kim 		ClearPageActive(page);
1144c6c919ebSMinchan Kim 		ClearPageUnevictable(page);
1145bda807d4SMinchan Kim 		if (unlikely(__PageMovable(page))) {
1146bda807d4SMinchan Kim 			lock_page(page);
1147bda807d4SMinchan Kim 			if (!PageMovable(page))
1148bda807d4SMinchan Kim 				__ClearPageIsolated(page);
1149bda807d4SMinchan Kim 			unlock_page(page);
1150bda807d4SMinchan Kim 		}
1151c6c919ebSMinchan Kim 		if (put_new_page)
1152c6c919ebSMinchan Kim 			put_new_page(newpage, private);
1153c6c919ebSMinchan Kim 		else
1154c6c919ebSMinchan Kim 			put_page(newpage);
11550dabec93SMinchan Kim 		goto out;
11560dabec93SMinchan Kim 	}
11570dabec93SMinchan Kim 
1158616b8371SZi Yan 	if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
11594d2fa965SKirill A. Shutemov 		lock_page(page);
11604d2fa965SKirill A. Shutemov 		rc = split_huge_page(page);
11614d2fa965SKirill A. Shutemov 		unlock_page(page);
11624d2fa965SKirill A. Shutemov 		if (rc)
11630dabec93SMinchan Kim 			goto out;
11644d2fa965SKirill A. Shutemov 	}
11650dabec93SMinchan Kim 
11669c620e2bSHugh Dickins 	rc = __unmap_and_move(page, newpage, force, mode);
1167c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS)
11687cd12b4aSVlastimil Babka 		set_page_owner_migrate_reason(newpage, reason);
1169bf6bddf1SRafael Aquini 
11700dabec93SMinchan Kim out:
1171e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
1172aaa994b3SChristoph Lameter 		/*
1173aaa994b3SChristoph Lameter 		 * A page that has been migrated has all references
1174aaa994b3SChristoph Lameter 		 * removed and will be freed. A page that has not been
1175aaa994b3SChristoph Lameter 		 * migrated will have kept its references and be
1176aaa994b3SChristoph Lameter 		 * restored.
1177aaa994b3SChristoph Lameter 		 */
1178aaa994b3SChristoph Lameter 		list_del(&page->lru);
11796afcf8efSMing Ling 
11806afcf8efSMing Ling 		/*
11816afcf8efSMing Ling 		 * Compaction can also migrate non-LRU pages which are
11826afcf8efSMing Ling 		 * not accounted to NR_ISOLATED_*. They can be recognized
11836afcf8efSMing Ling 		 * as __PageMovable.
11846afcf8efSMing Ling 		 */
11856afcf8efSMing Ling 		if (likely(!__PageMovable(page)))
1186e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1187e8db67ebSNaoya Horiguchi 					page_is_file_cache(page), -hpage_nr_pages(page));
1188e24f0b8fSChristoph Lameter 	}
118968711a74SDavid Rientjes 
119095a402c3SChristoph Lameter 	/*
1191c6c919ebSMinchan Kim 	 * If migration is successful, release the reference grabbed during
1192c6c919ebSMinchan Kim 	 * isolation. Otherwise, restore the page to the right list unless
1193c6c919ebSMinchan Kim 	 * we want to retry.
119495a402c3SChristoph Lameter 	 */
1195c6c919ebSMinchan Kim 	if (rc == MIGRATEPAGE_SUCCESS) {
1196c6c919ebSMinchan Kim 		put_page(page);
1197c6c919ebSMinchan Kim 		if (reason == MR_MEMORY_FAILURE) {
1198c6c919ebSMinchan Kim 			/*
1199c6c919ebSMinchan Kim 			 * Set PG_HWPoison on just freed page
1200c6c919ebSMinchan Kim 			 * intentionally. Although it's rather weird,
1201c6c919ebSMinchan Kim 			 * it's how HWPoison flag works at the moment.
1202c6c919ebSMinchan Kim 			 */
1203c6c919ebSMinchan Kim 			if (!test_set_page_hwpoison(page))
1204c6c919ebSMinchan Kim 				num_poisoned_pages_inc();
1205c6c919ebSMinchan Kim 		}
1206c6c919ebSMinchan Kim 	} else {
1207bda807d4SMinchan Kim 		if (rc != -EAGAIN) {
1208bda807d4SMinchan Kim 			if (likely(!__PageMovable(page))) {
1209c6c919ebSMinchan Kim 				putback_lru_page(page);
1210bda807d4SMinchan Kim 				goto put_new;
1211bda807d4SMinchan Kim 			}
1212bda807d4SMinchan Kim 
1213bda807d4SMinchan Kim 			lock_page(page);
1214bda807d4SMinchan Kim 			if (PageMovable(page))
1215bda807d4SMinchan Kim 				putback_movable_page(page);
1216bda807d4SMinchan Kim 			else
1217bda807d4SMinchan Kim 				__ClearPageIsolated(page);
1218bda807d4SMinchan Kim 			unlock_page(page);
1219bda807d4SMinchan Kim 			put_page(page);
1220bda807d4SMinchan Kim 		}
1221bda807d4SMinchan Kim put_new:
1222cf4b769aSHugh Dickins 		if (put_new_page)
122368711a74SDavid Rientjes 			put_new_page(newpage, private);
1224c6c919ebSMinchan Kim 		else
1225d6d86c0aSKonstantin Khlebnikov 			put_page(newpage);
1226c6c919ebSMinchan Kim 	}
122768711a74SDavid Rientjes 
1228742755a1SChristoph Lameter 	if (result) {
1229742755a1SChristoph Lameter 		if (rc)
1230742755a1SChristoph Lameter 			*result = rc;
1231742755a1SChristoph Lameter 		else
1232742755a1SChristoph Lameter 			*result = page_to_nid(newpage);
1233742755a1SChristoph Lameter 	}
1234e24f0b8fSChristoph Lameter 	return rc;
1235e24f0b8fSChristoph Lameter }
1236b20a3503SChristoph Lameter 
1237e24f0b8fSChristoph Lameter /*
1238290408d4SNaoya Horiguchi  * Counterpart of unmap_and_move() for hugepage migration.
1239290408d4SNaoya Horiguchi  *
1240290408d4SNaoya Horiguchi  * This function doesn't wait for the completion of hugepage I/O
1241290408d4SNaoya Horiguchi  * because there is no race between I/O and migration for hugepages.
1242290408d4SNaoya Horiguchi  * Note that currently hugepage I/O occurs only in direct I/O
1243290408d4SNaoya Horiguchi  * where no lock is held and PG_writeback is irrelevant,
1244290408d4SNaoya Horiguchi  * and the writeback status of all subpages is counted in the reference
1245290408d4SNaoya Horiguchi  * count of the head page (i.e. if all subpages of a 2MB hugepage are
1246290408d4SNaoya Horiguchi  * under direct I/O, the reference count of the head page is 512 and a bit more.)
1247290408d4SNaoya Horiguchi  * This means that when we try to migrate a hugepage whose subpages are
1248290408d4SNaoya Horiguchi  * doing direct I/O, some references remain after try_to_unmap() and
1249290408d4SNaoya Horiguchi  * hugepage migration fails without data corruption.
1250290408d4SNaoya Horiguchi  *
1251290408d4SNaoya Horiguchi  * There is also no race when direct I/O is issued on the page under migration,
1252290408d4SNaoya Horiguchi  * because then pte is replaced with migration swap entry and direct I/O code
1253290408d4SNaoya Horiguchi  * will wait in the page fault for migration to complete.
1254290408d4SNaoya Horiguchi  */
1255290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page,
125668711a74SDavid Rientjes 				free_page_t put_new_page, unsigned long private,
125768711a74SDavid Rientjes 				struct page *hpage, int force,
12587cd12b4aSVlastimil Babka 				enum migrate_mode mode, int reason)
1259290408d4SNaoya Horiguchi {
12602def7424SHugh Dickins 	int rc = -EAGAIN;
1261290408d4SNaoya Horiguchi 	int *result = NULL;
12622ebba6b7SHugh Dickins 	int page_was_mapped = 0;
126332665f2bSJoonsoo Kim 	struct page *new_hpage;
1264290408d4SNaoya Horiguchi 	struct anon_vma *anon_vma = NULL;
1265290408d4SNaoya Horiguchi 
126683467efbSNaoya Horiguchi 	/*
126783467efbSNaoya Horiguchi 	 * Movability of hugepages depends on architectures and hugepage size.
126883467efbSNaoya Horiguchi 	 * This check is necessary because some callers of hugepage migration
126983467efbSNaoya Horiguchi 	 * like soft offline and memory hotremove don't walk through page
127083467efbSNaoya Horiguchi 	 * tables or check whether the hugepage is pmd-based or not before
127183467efbSNaoya Horiguchi 	 * kicking migration.
127283467efbSNaoya Horiguchi 	 */
1273100873d7SNaoya Horiguchi 	if (!hugepage_migration_supported(page_hstate(hpage))) {
127432665f2bSJoonsoo Kim 		putback_active_hugepage(hpage);
127583467efbSNaoya Horiguchi 		return -ENOSYS;
127632665f2bSJoonsoo Kim 	}
127783467efbSNaoya Horiguchi 
127832665f2bSJoonsoo Kim 	new_hpage = get_new_page(hpage, private, &result);
1279290408d4SNaoya Horiguchi 	if (!new_hpage)
1280290408d4SNaoya Horiguchi 		return -ENOMEM;
1281290408d4SNaoya Horiguchi 
1282290408d4SNaoya Horiguchi 	if (!trylock_page(hpage)) {
12832916ecc0SJérôme Glisse 		if (!force)
1284290408d4SNaoya Horiguchi 			goto out;
12852916ecc0SJérôme Glisse 		switch (mode) {
12862916ecc0SJérôme Glisse 		case MIGRATE_SYNC:
12872916ecc0SJérôme Glisse 		case MIGRATE_SYNC_NO_COPY:
12882916ecc0SJérôme Glisse 			break;
12892916ecc0SJérôme Glisse 		default:
12902916ecc0SJérôme Glisse 			goto out;
12912916ecc0SJérôme Glisse 		}
1292290408d4SNaoya Horiguchi 		lock_page(hpage);
1293290408d4SNaoya Horiguchi 	}
1294290408d4SNaoya Horiguchi 
1295746b18d4SPeter Zijlstra 	if (PageAnon(hpage))
1296746b18d4SPeter Zijlstra 		anon_vma = page_get_anon_vma(hpage);
1297290408d4SNaoya Horiguchi 
12987db7671fSHugh Dickins 	if (unlikely(!trylock_page(new_hpage)))
12997db7671fSHugh Dickins 		goto put_anon;
13007db7671fSHugh Dickins 
13012ebba6b7SHugh Dickins 	if (page_mapped(hpage)) {
13022ebba6b7SHugh Dickins 		try_to_unmap(hpage,
13032ebba6b7SHugh Dickins 			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
13042ebba6b7SHugh Dickins 		page_was_mapped = 1;
13052ebba6b7SHugh Dickins 	}
1306290408d4SNaoya Horiguchi 
1307290408d4SNaoya Horiguchi 	if (!page_mapped(hpage))
13085c3f9a67SHugh Dickins 		rc = move_to_new_page(new_hpage, hpage, mode);
1309290408d4SNaoya Horiguchi 
13105c3f9a67SHugh Dickins 	if (page_was_mapped)
13115c3f9a67SHugh Dickins 		remove_migration_ptes(hpage,
1312e388466dSKirill A. Shutemov 			rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
1313290408d4SNaoya Horiguchi 
13147db7671fSHugh Dickins 	unlock_page(new_hpage);
13157db7671fSHugh Dickins 
13167db7671fSHugh Dickins put_anon:
1317fd4a4663SHugh Dickins 	if (anon_vma)
13189e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
13198e6ac7faSAneesh Kumar K.V 
13202def7424SHugh Dickins 	if (rc == MIGRATEPAGE_SUCCESS) {
13218e6ac7faSAneesh Kumar K.V 		hugetlb_cgroup_migrate(hpage, new_hpage);
13222def7424SHugh Dickins 		put_new_page = NULL;
13237cd12b4aSVlastimil Babka 		set_page_owner_migrate_reason(new_hpage, reason);
13242def7424SHugh Dickins 	}
13258e6ac7faSAneesh Kumar K.V 
1326290408d4SNaoya Horiguchi 	unlock_page(hpage);
132709761333SHillf Danton out:
1328b8ec1ceeSNaoya Horiguchi 	if (rc != -EAGAIN)
1329b8ec1ceeSNaoya Horiguchi 		putback_active_hugepage(hpage);
1330c3114a84SAnshuman Khandual 	if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
1331c3114a84SAnshuman Khandual 		num_poisoned_pages_inc();
133268711a74SDavid Rientjes 
133368711a74SDavid Rientjes 	/*
133468711a74SDavid Rientjes 	 * If migration was not successful and there's a freeing callback, use
133568711a74SDavid Rientjes 	 * it.  Otherwise, put_page() will drop the reference grabbed during
133668711a74SDavid Rientjes 	 * isolation.
133768711a74SDavid Rientjes 	 */
13382def7424SHugh Dickins 	if (put_new_page)
133968711a74SDavid Rientjes 		put_new_page(new_hpage, private);
134068711a74SDavid Rientjes 	else
13413aaa76e1SNaoya Horiguchi 		putback_active_hugepage(new_hpage);
134268711a74SDavid Rientjes 
1343290408d4SNaoya Horiguchi 	if (result) {
1344290408d4SNaoya Horiguchi 		if (rc)
1345290408d4SNaoya Horiguchi 			*result = rc;
1346290408d4SNaoya Horiguchi 		else
1347290408d4SNaoya Horiguchi 			*result = page_to_nid(new_hpage);
1348290408d4SNaoya Horiguchi 	}
1349290408d4SNaoya Horiguchi 	return rc;
1350290408d4SNaoya Horiguchi }
1351290408d4SNaoya Horiguchi 
1352290408d4SNaoya Horiguchi /*
1353c73e5c9cSSrivatsa S. Bhat  * migrate_pages - migrate the pages specified in a list, to the free pages
1354c73e5c9cSSrivatsa S. Bhat  *		   supplied as the target for the page migration
1355e24f0b8fSChristoph Lameter  *
1356c73e5c9cSSrivatsa S. Bhat  * @from:		The list of pages to be migrated.
1357c73e5c9cSSrivatsa S. Bhat  * @get_new_page:	The function used to allocate free pages to be used
1358c73e5c9cSSrivatsa S. Bhat  *			as the target of the page migration.
135968711a74SDavid Rientjes  * @put_new_page:	The function used to free target pages if migration
136068711a74SDavid Rientjes  *			fails, or NULL if no special handling is necessary.
1361c73e5c9cSSrivatsa S. Bhat  * @private:		Private data to be passed on to get_new_page()
1362c73e5c9cSSrivatsa S. Bhat  * @mode:		The migration mode that specifies the constraints for
1363c73e5c9cSSrivatsa S. Bhat  *			page migration, if any.
1364c73e5c9cSSrivatsa S. Bhat  * @reason:		The reason for page migration.
1365e24f0b8fSChristoph Lameter  *
1366c73e5c9cSSrivatsa S. Bhat  * The function returns after 10 attempts or when no pages are movable any more
1367c73e5c9cSSrivatsa S. Bhat  * because the list has become empty or no retryable pages remain.
136814e0f9bcSHugh Dickins  * The caller should call putback_movable_pages() to return pages to the LRU
136928bd6578SMinchan Kim  * or free list only if ret != 0.
1370e24f0b8fSChristoph Lameter  *
1371c73e5c9cSSrivatsa S. Bhat  * Returns the number of pages that were not migrated, or an error code.
1372e24f0b8fSChristoph Lameter  */
13739c620e2bSHugh Dickins int migrate_pages(struct list_head *from, new_page_t get_new_page,
137468711a74SDavid Rientjes 		free_page_t put_new_page, unsigned long private,
137568711a74SDavid Rientjes 		enum migrate_mode mode, int reason)
1376e24f0b8fSChristoph Lameter {
1377e24f0b8fSChristoph Lameter 	int retry = 1;
1378e24f0b8fSChristoph Lameter 	int nr_failed = 0;
13795647bc29SMel Gorman 	int nr_succeeded = 0;
1380e24f0b8fSChristoph Lameter 	int pass = 0;
1381e24f0b8fSChristoph Lameter 	struct page *page;
1382e24f0b8fSChristoph Lameter 	struct page *page2;
1383e24f0b8fSChristoph Lameter 	int swapwrite = current->flags & PF_SWAPWRITE;
1384e24f0b8fSChristoph Lameter 	int rc;
13852d1db3b1SChristoph Lameter 
1386e24f0b8fSChristoph Lameter 	if (!swapwrite)
1387e24f0b8fSChristoph Lameter 		current->flags |= PF_SWAPWRITE;
1388e24f0b8fSChristoph Lameter 
1389e24f0b8fSChristoph Lameter 	for(pass = 0; pass < 10 && retry; pass++) {
1390e24f0b8fSChristoph Lameter 		retry = 0;
1391e24f0b8fSChristoph Lameter 
1392e24f0b8fSChristoph Lameter 		list_for_each_entry_safe(page, page2, from, lru) {
1393e24f0b8fSChristoph Lameter 			cond_resched();
1394e24f0b8fSChristoph Lameter 
139531caf665SNaoya Horiguchi 			if (PageHuge(page))
139631caf665SNaoya Horiguchi 				rc = unmap_and_move_huge_page(get_new_page,
139768711a74SDavid Rientjes 						put_new_page, private, page,
13987cd12b4aSVlastimil Babka 						pass > 2, mode, reason);
139931caf665SNaoya Horiguchi 			else
140068711a74SDavid Rientjes 				rc = unmap_and_move(get_new_page, put_new_page,
1401add05cecSNaoya Horiguchi 						private, page, pass > 2, mode,
1402add05cecSNaoya Horiguchi 						reason);
1403e24f0b8fSChristoph Lameter 
1404e24f0b8fSChristoph Lameter 			switch(rc) {
140595a402c3SChristoph Lameter 			case -ENOMEM:
1406dfef2ef4SDavid Rientjes 				nr_failed++;
140795a402c3SChristoph Lameter 				goto out;
1408e24f0b8fSChristoph Lameter 			case -EAGAIN:
1409b20a3503SChristoph Lameter 				retry++;
1410e24f0b8fSChristoph Lameter 				break;
141178bd5209SRafael Aquini 			case MIGRATEPAGE_SUCCESS:
14125647bc29SMel Gorman 				nr_succeeded++;
1413e24f0b8fSChristoph Lameter 				break;
1414e24f0b8fSChristoph Lameter 			default:
1415354a3363SNaoya Horiguchi 				/*
1416354a3363SNaoya Horiguchi 				 * Permanent failure (-EBUSY, -ENOSYS, etc.):
1417354a3363SNaoya Horiguchi 				 * unlike -EAGAIN case, the failed page is
1418354a3363SNaoya Horiguchi 				 * removed from migration page list and not
1419354a3363SNaoya Horiguchi 				 * retried in the next outer loop.
1420354a3363SNaoya Horiguchi 				 */
1421b20a3503SChristoph Lameter 				nr_failed++;
1422e24f0b8fSChristoph Lameter 				break;
1423b20a3503SChristoph Lameter 			}
1424b20a3503SChristoph Lameter 		}
1425e24f0b8fSChristoph Lameter 	}
1426f2f81fb2SVlastimil Babka 	nr_failed += retry;
1427f2f81fb2SVlastimil Babka 	rc = nr_failed;
142895a402c3SChristoph Lameter out:
14295647bc29SMel Gorman 	if (nr_succeeded)
14305647bc29SMel Gorman 		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
14315647bc29SMel Gorman 	if (nr_failed)
14325647bc29SMel Gorman 		count_vm_events(PGMIGRATE_FAIL, nr_failed);
14337b2a2d4aSMel Gorman 	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
14347b2a2d4aSMel Gorman 
1435b20a3503SChristoph Lameter 	if (!swapwrite)
1436b20a3503SChristoph Lameter 		current->flags &= ~PF_SWAPWRITE;
1437b20a3503SChristoph Lameter 
143895a402c3SChristoph Lameter 	return rc;
1439b20a3503SChristoph Lameter }
1440b20a3503SChristoph Lameter 
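/*
 * Illustrative caller sketch -- NOT part of the original file.  It shows how
 * the interface documented above could be used: allocate target pages on a
 * chosen node via a new_page_t callback and put back whatever could not be
 * migrated.  The names "example_alloc_target" and "example_migrate_list" are
 * made up; the callback signature and the error handling mirror the real
 * callers later in this file (e.g. do_move_page_to_node_array()).
 */
static struct page *example_alloc_target(struct page *page,
					 unsigned long private, int **result)
{
	/* @private carries the target node id, as in new_page_node() below */
	return __alloc_pages_node((int)private,
				GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}

static int example_migrate_list(struct list_head *pagelist, int target_nid)
{
	int err;

	/* pages on @pagelist must already be isolated (isolate_lru_page()) */
	err = migrate_pages(pagelist, example_alloc_target, NULL,
			(unsigned long)target_nid, MIGRATE_SYNC, MR_SYSCALL);
	if (err)
		/* give any pages that could not be migrated back */
		putback_movable_pages(pagelist);
	return err;
}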
1441742755a1SChristoph Lameter #ifdef CONFIG_NUMA
1442742755a1SChristoph Lameter /*
1443742755a1SChristoph Lameter  * Move a list of individual pages
1444742755a1SChristoph Lameter  */
1445742755a1SChristoph Lameter struct page_to_node {
1446742755a1SChristoph Lameter 	unsigned long addr;
1447742755a1SChristoph Lameter 	struct page *page;
1448742755a1SChristoph Lameter 	int node;
1449742755a1SChristoph Lameter 	int status;
1450742755a1SChristoph Lameter };
1451742755a1SChristoph Lameter 
1452742755a1SChristoph Lameter static struct page *new_page_node(struct page *p, unsigned long private,
1453742755a1SChristoph Lameter 		int **result)
1454742755a1SChristoph Lameter {
1455742755a1SChristoph Lameter 	struct page_to_node *pm = (struct page_to_node *)private;
1456742755a1SChristoph Lameter 
1457742755a1SChristoph Lameter 	while (pm->node != MAX_NUMNODES && pm->page != p)
1458742755a1SChristoph Lameter 		pm++;
1459742755a1SChristoph Lameter 
1460742755a1SChristoph Lameter 	if (pm->node == MAX_NUMNODES)
1461742755a1SChristoph Lameter 		return NULL;
1462742755a1SChristoph Lameter 
1463742755a1SChristoph Lameter 	*result = &pm->status;
1464742755a1SChristoph Lameter 
1465e632a938SNaoya Horiguchi 	if (PageHuge(p))
1466e632a938SNaoya Horiguchi 		return alloc_huge_page_node(page_hstate(compound_head(p)),
1467e632a938SNaoya Horiguchi 					pm->node);
1468e8db67ebSNaoya Horiguchi 	else if (thp_migration_supported() && PageTransHuge(p)) {
1469e8db67ebSNaoya Horiguchi 		struct page *thp;
1470e8db67ebSNaoya Horiguchi 
1471e8db67ebSNaoya Horiguchi 		thp = alloc_pages_node(pm->node,
1472e8db67ebSNaoya Horiguchi 			(GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
1473e8db67ebSNaoya Horiguchi 			HPAGE_PMD_ORDER);
1474e8db67ebSNaoya Horiguchi 		if (!thp)
1475e8db67ebSNaoya Horiguchi 			return NULL;
1476e8db67ebSNaoya Horiguchi 		prep_transhuge_page(thp);
1477e8db67ebSNaoya Horiguchi 		return thp;
1478e8db67ebSNaoya Horiguchi 	} else
147996db800fSVlastimil Babka 		return __alloc_pages_node(pm->node,
1480e97ca8e5SJohannes Weiner 				GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
1481742755a1SChristoph Lameter }
1482742755a1SChristoph Lameter 
1483742755a1SChristoph Lameter /*
1484742755a1SChristoph Lameter  * Move a set of pages as indicated in the pm array. The addr
1485742755a1SChristoph Lameter  * field must be set to the virtual address of the page to be moved
1486742755a1SChristoph Lameter  * and the node number must contain a valid target node.
14875e9a0f02SBrice Goglin  * The pm array ends with node = MAX_NUMNODES.
1488742755a1SChristoph Lameter  */
14895e9a0f02SBrice Goglin static int do_move_page_to_node_array(struct mm_struct *mm,
14905e9a0f02SBrice Goglin 				      struct page_to_node *pm,
1491742755a1SChristoph Lameter 				      int migrate_all)
1492742755a1SChristoph Lameter {
1493742755a1SChristoph Lameter 	int err;
1494742755a1SChristoph Lameter 	struct page_to_node *pp;
1495742755a1SChristoph Lameter 	LIST_HEAD(pagelist);
1496742755a1SChristoph Lameter 
1497742755a1SChristoph Lameter 	down_read(&mm->mmap_sem);
1498742755a1SChristoph Lameter 
1499742755a1SChristoph Lameter 	/*
1500742755a1SChristoph Lameter 	 * Build a list of pages to migrate
1501742755a1SChristoph Lameter 	 */
1502742755a1SChristoph Lameter 	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1503742755a1SChristoph Lameter 		struct vm_area_struct *vma;
1504742755a1SChristoph Lameter 		struct page *page;
1505e8db67ebSNaoya Horiguchi 		struct page *head;
1506e8db67ebSNaoya Horiguchi 		unsigned int follflags;
1507742755a1SChristoph Lameter 
1508742755a1SChristoph Lameter 		err = -EFAULT;
1509742755a1SChristoph Lameter 		vma = find_vma(mm, pp->addr);
151070384dc6SGleb Natapov 		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1511742755a1SChristoph Lameter 			goto set_status;
1512742755a1SChristoph Lameter 
1513d899844eSKirill A. Shutemov 		/* FOLL_DUMP to ignore special (like zero) pages */
1514e8db67ebSNaoya Horiguchi 		follflags = FOLL_GET | FOLL_DUMP;
1515e8db67ebSNaoya Horiguchi 		if (!thp_migration_supported())
1516e8db67ebSNaoya Horiguchi 			follflags |= FOLL_SPLIT;
1517e8db67ebSNaoya Horiguchi 		page = follow_page(vma, pp->addr, follflags);
151889f5b7daSLinus Torvalds 
151989f5b7daSLinus Torvalds 		err = PTR_ERR(page);
152089f5b7daSLinus Torvalds 		if (IS_ERR(page))
152189f5b7daSLinus Torvalds 			goto set_status;
152289f5b7daSLinus Torvalds 
1523742755a1SChristoph Lameter 		err = -ENOENT;
1524742755a1SChristoph Lameter 		if (!page)
1525742755a1SChristoph Lameter 			goto set_status;
1526742755a1SChristoph Lameter 
1527742755a1SChristoph Lameter 		err = page_to_nid(page);
1528742755a1SChristoph Lameter 
1529742755a1SChristoph Lameter 		if (err == pp->node)
1530742755a1SChristoph Lameter 			/*
1531742755a1SChristoph Lameter 			 * Node already in the right place
1532742755a1SChristoph Lameter 			 */
1533742755a1SChristoph Lameter 			goto put_and_set;
1534742755a1SChristoph Lameter 
1535742755a1SChristoph Lameter 		err = -EACCES;
1536742755a1SChristoph Lameter 		if (page_mapcount(page) > 1 &&
1537742755a1SChristoph Lameter 				!migrate_all)
1538742755a1SChristoph Lameter 			goto put_and_set;
1539742755a1SChristoph Lameter 
1540e632a938SNaoya Horiguchi 		if (PageHuge(page)) {
1541e8db67ebSNaoya Horiguchi 			if (PageHead(page)) {
1542e632a938SNaoya Horiguchi 				isolate_huge_page(page, &pagelist);
1543e8db67ebSNaoya Horiguchi 				err = 0;
1544e8db67ebSNaoya Horiguchi 				pp->page = page;
1545e8db67ebSNaoya Horiguchi 			}
1546e632a938SNaoya Horiguchi 			goto put_and_set;
1547e632a938SNaoya Horiguchi 		}
1548e632a938SNaoya Horiguchi 
1549e8db67ebSNaoya Horiguchi 		pp->page = compound_head(page);
1550e8db67ebSNaoya Horiguchi 		head = compound_head(page);
1551e8db67ebSNaoya Horiguchi 		err = isolate_lru_page(head);
15526d9c285aSKOSAKI Motohiro 		if (!err) {
1553e8db67ebSNaoya Horiguchi 			list_add_tail(&head->lru, &pagelist);
1554e8db67ebSNaoya Horiguchi 			mod_node_page_state(page_pgdat(head),
1555e8db67ebSNaoya Horiguchi 				NR_ISOLATED_ANON + page_is_file_cache(head),
1556e8db67ebSNaoya Horiguchi 				hpage_nr_pages(head));
15576d9c285aSKOSAKI Motohiro 		}
1558742755a1SChristoph Lameter put_and_set:
1559742755a1SChristoph Lameter 		/*
1560742755a1SChristoph Lameter 		 * Either remove the duplicate refcount from
1561742755a1SChristoph Lameter 		 * isolate_lru_page() or drop the page ref if it was
1562742755a1SChristoph Lameter 		 * not isolated.
1563742755a1SChristoph Lameter 		 */
1564742755a1SChristoph Lameter 		put_page(page);
1565742755a1SChristoph Lameter set_status:
1566742755a1SChristoph Lameter 		pp->status = err;
1567742755a1SChristoph Lameter 	}
1568742755a1SChristoph Lameter 
1569e78bbfa8SBrice Goglin 	err = 0;
1570cf608ac1SMinchan Kim 	if (!list_empty(&pagelist)) {
157168711a74SDavid Rientjes 		err = migrate_pages(&pagelist, new_page_node, NULL,
15729c620e2bSHugh Dickins 				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1573cf608ac1SMinchan Kim 		if (err)
1574e632a938SNaoya Horiguchi 			putback_movable_pages(&pagelist);
1575cf608ac1SMinchan Kim 	}
1576742755a1SChristoph Lameter 
1577742755a1SChristoph Lameter 	up_read(&mm->mmap_sem);
1578742755a1SChristoph Lameter 	return err;
1579742755a1SChristoph Lameter }
1580742755a1SChristoph Lameter 
1581742755a1SChristoph Lameter /*
15825e9a0f02SBrice Goglin  * Migrate an array of page addresses onto an array of nodes and fill in
15835e9a0f02SBrice Goglin  * the corresponding status array.
15845e9a0f02SBrice Goglin  */
15853268c63eSChristoph Lameter static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
15865e9a0f02SBrice Goglin 			 unsigned long nr_pages,
15875e9a0f02SBrice Goglin 			 const void __user * __user *pages,
15885e9a0f02SBrice Goglin 			 const int __user *nodes,
15895e9a0f02SBrice Goglin 			 int __user *status, int flags)
15905e9a0f02SBrice Goglin {
15913140a227SBrice Goglin 	struct page_to_node *pm;
15923140a227SBrice Goglin 	unsigned long chunk_nr_pages;
15933140a227SBrice Goglin 	unsigned long chunk_start;
15943140a227SBrice Goglin 	int err;
15955e9a0f02SBrice Goglin 
15965e9a0f02SBrice Goglin 	err = -ENOMEM;
15973140a227SBrice Goglin 	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
15983140a227SBrice Goglin 	if (!pm)
15995e9a0f02SBrice Goglin 		goto out;
160035282a2dSBrice Goglin 
160135282a2dSBrice Goglin 	migrate_prep();
160235282a2dSBrice Goglin 
16035e9a0f02SBrice Goglin 	/*
16043140a227SBrice Goglin 	 * Store a chunk of the page_to_node array in a page,
16053140a227SBrice Goglin 	 * but keep the last entry as an end marker
16065e9a0f02SBrice Goglin 	 */
16073140a227SBrice Goglin 	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
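	/*
	 * Worked example (illustrative, assuming a typical 64-bit build):
	 * sizeof(struct page_to_node) is 24 bytes, so a 4KB page holds 170
	 * entries; reserving the last one as the end marker leaves 169
	 * addr/node pairs per chunk.
	 */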
16083140a227SBrice Goglin 
16093140a227SBrice Goglin 	for (chunk_start = 0;
16103140a227SBrice Goglin 	     chunk_start < nr_pages;
16113140a227SBrice Goglin 	     chunk_start += chunk_nr_pages) {
16123140a227SBrice Goglin 		int j;
16133140a227SBrice Goglin 
16143140a227SBrice Goglin 		if (chunk_start + chunk_nr_pages > nr_pages)
16153140a227SBrice Goglin 			chunk_nr_pages = nr_pages - chunk_start;
16163140a227SBrice Goglin 
16173140a227SBrice Goglin 		/* fill the chunk pm with addrs and nodes from user-space */
16183140a227SBrice Goglin 		for (j = 0; j < chunk_nr_pages; j++) {
16195e9a0f02SBrice Goglin 			const void __user *p;
16205e9a0f02SBrice Goglin 			int node;
16215e9a0f02SBrice Goglin 
16223140a227SBrice Goglin 			err = -EFAULT;
16233140a227SBrice Goglin 			if (get_user(p, pages + j + chunk_start))
16243140a227SBrice Goglin 				goto out_pm;
16253140a227SBrice Goglin 			pm[j].addr = (unsigned long) p;
16263140a227SBrice Goglin 
16273140a227SBrice Goglin 			if (get_user(node, nodes + j + chunk_start))
16285e9a0f02SBrice Goglin 				goto out_pm;
16295e9a0f02SBrice Goglin 
16305e9a0f02SBrice Goglin 			err = -ENODEV;
16316f5a55f1SLinus Torvalds 			if (node < 0 || node >= MAX_NUMNODES)
16326f5a55f1SLinus Torvalds 				goto out_pm;
16336f5a55f1SLinus Torvalds 
1634389162c2SLai Jiangshan 			if (!node_state(node, N_MEMORY))
16355e9a0f02SBrice Goglin 				goto out_pm;
16365e9a0f02SBrice Goglin 
16375e9a0f02SBrice Goglin 			err = -EACCES;
16385e9a0f02SBrice Goglin 			if (!node_isset(node, task_nodes))
16395e9a0f02SBrice Goglin 				goto out_pm;
16405e9a0f02SBrice Goglin 
16413140a227SBrice Goglin 			pm[j].node = node;
16425e9a0f02SBrice Goglin 		}
16435e9a0f02SBrice Goglin 
16443140a227SBrice Goglin 		/* End marker for this chunk */
16453140a227SBrice Goglin 		pm[chunk_nr_pages].node = MAX_NUMNODES;
16463140a227SBrice Goglin 
16473140a227SBrice Goglin 		/* Migrate this chunk */
16483140a227SBrice Goglin 		err = do_move_page_to_node_array(mm, pm,
16493140a227SBrice Goglin 						 flags & MPOL_MF_MOVE_ALL);
16503140a227SBrice Goglin 		if (err < 0)
16513140a227SBrice Goglin 			goto out_pm;
16523140a227SBrice Goglin 
16535e9a0f02SBrice Goglin 		/* Return status information */
16543140a227SBrice Goglin 		for (j = 0; j < chunk_nr_pages; j++)
16553140a227SBrice Goglin 			if (put_user(pm[j].status, status + j + chunk_start)) {
16565e9a0f02SBrice Goglin 				err = -EFAULT;
16573140a227SBrice Goglin 				goto out_pm;
16583140a227SBrice Goglin 			}
16593140a227SBrice Goglin 	}
16603140a227SBrice Goglin 	err = 0;
16615e9a0f02SBrice Goglin 
16625e9a0f02SBrice Goglin out_pm:
16633140a227SBrice Goglin 	free_page((unsigned long)pm);
16645e9a0f02SBrice Goglin out:
16655e9a0f02SBrice Goglin 	return err;
16665e9a0f02SBrice Goglin }
16675e9a0f02SBrice Goglin 
16685e9a0f02SBrice Goglin /*
16692f007e74SBrice Goglin  * Determine the nodes of an array of pages and store them in an array of status.
1670742755a1SChristoph Lameter  */
167180bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
167280bba129SBrice Goglin 				const void __user **pages, int *status)
1673742755a1SChristoph Lameter {
16742f007e74SBrice Goglin 	unsigned long i;
1675742755a1SChristoph Lameter 
16762f007e74SBrice Goglin 	down_read(&mm->mmap_sem);
16772f007e74SBrice Goglin 
16782f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
167980bba129SBrice Goglin 		unsigned long addr = (unsigned long)(*pages);
16802f007e74SBrice Goglin 		struct vm_area_struct *vma;
16812f007e74SBrice Goglin 		struct page *page;
1682c095adbcSKOSAKI Motohiro 		int err = -EFAULT;
16832f007e74SBrice Goglin 
16842f007e74SBrice Goglin 		vma = find_vma(mm, addr);
168570384dc6SGleb Natapov 		if (!vma || addr < vma->vm_start)
1686742755a1SChristoph Lameter 			goto set_status;
1687742755a1SChristoph Lameter 
1688d899844eSKirill A. Shutemov 		/* FOLL_DUMP to ignore special (like zero) pages */
1689d899844eSKirill A. Shutemov 		page = follow_page(vma, addr, FOLL_DUMP);
169089f5b7daSLinus Torvalds 
169189f5b7daSLinus Torvalds 		err = PTR_ERR(page);
169289f5b7daSLinus Torvalds 		if (IS_ERR(page))
169389f5b7daSLinus Torvalds 			goto set_status;
169489f5b7daSLinus Torvalds 
1695d899844eSKirill A. Shutemov 		err = page ? page_to_nid(page) : -ENOENT;
1696742755a1SChristoph Lameter set_status:
169780bba129SBrice Goglin 		*status = err;
169880bba129SBrice Goglin 
169980bba129SBrice Goglin 		pages++;
170080bba129SBrice Goglin 		status++;
170180bba129SBrice Goglin 	}
170280bba129SBrice Goglin 
170380bba129SBrice Goglin 	up_read(&mm->mmap_sem);
170480bba129SBrice Goglin }
170580bba129SBrice Goglin 
170680bba129SBrice Goglin /*
170780bba129SBrice Goglin  * Determine the nodes of a user array of pages and store them in
170880bba129SBrice Goglin  * a user array of status.
170980bba129SBrice Goglin  */
171080bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
171180bba129SBrice Goglin 			 const void __user * __user *pages,
171280bba129SBrice Goglin 			 int __user *status)
171380bba129SBrice Goglin {
171480bba129SBrice Goglin #define DO_PAGES_STAT_CHUNK_NR 16
171580bba129SBrice Goglin 	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
171680bba129SBrice Goglin 	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
171780bba129SBrice Goglin 
171887b8d1adSH. Peter Anvin 	while (nr_pages) {
171987b8d1adSH. Peter Anvin 		unsigned long chunk_nr;
172080bba129SBrice Goglin 
172187b8d1adSH. Peter Anvin 		chunk_nr = nr_pages;
172287b8d1adSH. Peter Anvin 		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
172387b8d1adSH. Peter Anvin 			chunk_nr = DO_PAGES_STAT_CHUNK_NR;
172487b8d1adSH. Peter Anvin 
172587b8d1adSH. Peter Anvin 		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
172687b8d1adSH. Peter Anvin 			break;
172780bba129SBrice Goglin 
172880bba129SBrice Goglin 		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
172980bba129SBrice Goglin 
173087b8d1adSH. Peter Anvin 		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
173187b8d1adSH. Peter Anvin 			break;
1732742755a1SChristoph Lameter 
173387b8d1adSH. Peter Anvin 		pages += chunk_nr;
173487b8d1adSH. Peter Anvin 		status += chunk_nr;
173587b8d1adSH. Peter Anvin 		nr_pages -= chunk_nr;
173687b8d1adSH. Peter Anvin 	}
173787b8d1adSH. Peter Anvin 	return nr_pages ? -EFAULT : 0;
1738742755a1SChristoph Lameter }
1739742755a1SChristoph Lameter 
1740742755a1SChristoph Lameter /*
1741742755a1SChristoph Lameter  * Move a list of pages in the address space of the currently executing
1742742755a1SChristoph Lameter  * process.
1743742755a1SChristoph Lameter  */
1744938bb9f5SHeiko Carstens SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1745938bb9f5SHeiko Carstens 		const void __user * __user *, pages,
1746938bb9f5SHeiko Carstens 		const int __user *, nodes,
1747938bb9f5SHeiko Carstens 		int __user *, status, int, flags)
1748742755a1SChristoph Lameter {
1749742755a1SChristoph Lameter 	struct task_struct *task;
1750742755a1SChristoph Lameter 	struct mm_struct *mm;
17515e9a0f02SBrice Goglin 	int err;
17523268c63eSChristoph Lameter 	nodemask_t task_nodes;
1753742755a1SChristoph Lameter 
1754742755a1SChristoph Lameter 	/* Check flags */
1755742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1756742755a1SChristoph Lameter 		return -EINVAL;
1757742755a1SChristoph Lameter 
1758742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1759742755a1SChristoph Lameter 		return -EPERM;
1760742755a1SChristoph Lameter 
1761742755a1SChristoph Lameter 	/* Find the mm_struct */
1762a879bf58SGreg Thelen 	rcu_read_lock();
1763228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
1764742755a1SChristoph Lameter 	if (!task) {
1765a879bf58SGreg Thelen 		rcu_read_unlock();
1766742755a1SChristoph Lameter 		return -ESRCH;
1767742755a1SChristoph Lameter 	}
17683268c63eSChristoph Lameter 	get_task_struct(task);
1769742755a1SChristoph Lameter 
1770742755a1SChristoph Lameter 	/*
1771742755a1SChristoph Lameter 	 * Check if this process has the right to modify the specified
1772197e7e52SLinus Torvalds 	 * process. Use the regular "ptrace_may_access()" checks.
1773742755a1SChristoph Lameter 	 */
1774197e7e52SLinus Torvalds 	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1775c69e8d9cSDavid Howells 		rcu_read_unlock();
1776742755a1SChristoph Lameter 		err = -EPERM;
17775e9a0f02SBrice Goglin 		goto out;
1778742755a1SChristoph Lameter 	}
1779c69e8d9cSDavid Howells 	rcu_read_unlock();
1780742755a1SChristoph Lameter 
178186c3a764SDavid Quigley 	err = security_task_movememory(task);
178286c3a764SDavid Quigley 	if (err)
1783742755a1SChristoph Lameter 		goto out;
1784742755a1SChristoph Lameter 
17853268c63eSChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
17863268c63eSChristoph Lameter 	mm = get_task_mm(task);
17873268c63eSChristoph Lameter 	put_task_struct(task);
17883268c63eSChristoph Lameter 
17896e8b09eaSSasha Levin 	if (!mm)
17906e8b09eaSSasha Levin 		return -EINVAL;
17916e8b09eaSSasha Levin 
17923268c63eSChristoph Lameter 	if (nodes)
17933268c63eSChristoph Lameter 		err = do_pages_move(mm, task_nodes, nr_pages, pages,
17943268c63eSChristoph Lameter 				    nodes, status, flags);
17953268c63eSChristoph Lameter 	else
17965e9a0f02SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
17973268c63eSChristoph Lameter 
17983268c63eSChristoph Lameter 	mmput(mm);
17993268c63eSChristoph Lameter 	return err;
1800742755a1SChristoph Lameter 
1801742755a1SChristoph Lameter out:
18023268c63eSChristoph Lameter 	put_task_struct(task);
1803742755a1SChristoph Lameter 	return err;
1804742755a1SChristoph Lameter }
1805742755a1SChristoph Lameter 
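/*
 * Illustrative userspace sketch -- NOT part of the original file.  Rough
 * usage of the move_pages(2) syscall defined above, using the libnuma
 * prototype from <numaif.h>; "addr" is a hypothetical address in the
 * calling process and error handling is omitted:
 *
 *	void *pages[1] = { addr };
 *	int node = 1, status = -1;
 *
 *	move_pages(0, 1, pages, NULL, &status, 0);		// query only
 *	move_pages(0, 1, pages, &node, &status, MPOL_MF_MOVE);	// move to node 1
 *
 * With nodes == NULL the kernel only reports the current node of each page
 * (do_pages_stat() above); with a nodes array it attempts the move
 * (do_pages_move() above) and writes the per-page result into status[].
 */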
18067039e1dbSPeter Zijlstra #ifdef CONFIG_NUMA_BALANCING
18077039e1dbSPeter Zijlstra /*
18087039e1dbSPeter Zijlstra  * Returns true if this is a safe migration target node for misplaced NUMA
18097039e1dbSPeter Zijlstra  * pages. Currently it only checks the watermarks, which is a crude check.
18107039e1dbSPeter Zijlstra  */
18117039e1dbSPeter Zijlstra static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
18123abef4e6SMel Gorman 				   unsigned long nr_migrate_pages)
18137039e1dbSPeter Zijlstra {
18147039e1dbSPeter Zijlstra 	int z;
1815599d0c95SMel Gorman 
18167039e1dbSPeter Zijlstra 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
18177039e1dbSPeter Zijlstra 		struct zone *zone = pgdat->node_zones + z;
18187039e1dbSPeter Zijlstra 
18197039e1dbSPeter Zijlstra 		if (!populated_zone(zone))
18207039e1dbSPeter Zijlstra 			continue;
18217039e1dbSPeter Zijlstra 
18227039e1dbSPeter Zijlstra 		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
18237039e1dbSPeter Zijlstra 		if (!zone_watermark_ok(zone, 0,
18247039e1dbSPeter Zijlstra 				       high_wmark_pages(zone) +
18257039e1dbSPeter Zijlstra 				       nr_migrate_pages,
18267039e1dbSPeter Zijlstra 				       0, 0))
18277039e1dbSPeter Zijlstra 			continue;
18287039e1dbSPeter Zijlstra 		return true;
18297039e1dbSPeter Zijlstra 	}
18307039e1dbSPeter Zijlstra 	return false;
18317039e1dbSPeter Zijlstra }
18327039e1dbSPeter Zijlstra 
18337039e1dbSPeter Zijlstra static struct page *alloc_misplaced_dst_page(struct page *page,
18347039e1dbSPeter Zijlstra 					   unsigned long data,
18357039e1dbSPeter Zijlstra 					   int **result)
18367039e1dbSPeter Zijlstra {
18377039e1dbSPeter Zijlstra 	int nid = (int) data;
18387039e1dbSPeter Zijlstra 	struct page *newpage;
18397039e1dbSPeter Zijlstra 
184096db800fSVlastimil Babka 	newpage = __alloc_pages_node(nid,
1841e97ca8e5SJohannes Weiner 					 (GFP_HIGHUSER_MOVABLE |
1842e97ca8e5SJohannes Weiner 					  __GFP_THISNODE | __GFP_NOMEMALLOC |
1843e97ca8e5SJohannes Weiner 					  __GFP_NORETRY | __GFP_NOWARN) &
18448479eba7SMel Gorman 					 ~__GFP_RECLAIM, 0);
1845bac0382cSHillf Danton 
18467039e1dbSPeter Zijlstra 	return newpage;
18477039e1dbSPeter Zijlstra }
18487039e1dbSPeter Zijlstra 
18497039e1dbSPeter Zijlstra /*
1850a8f60772SMel Gorman  * page migration rate limiting control.
1851a8f60772SMel Gorman  * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1852a8f60772SMel Gorman  * window of time. Default here says do not migrate more than 1280M per second.
1853a8f60772SMel Gorman  */
1854a8f60772SMel Gorman static unsigned int migrate_interval_millisecs __read_mostly = 100;
1855a8f60772SMel Gorman static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
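/*
 * Worked example (illustrative): ratelimit_pages is 128 << (20 - PAGE_SHIFT)
 * base pages, i.e. 128MB worth of memory regardless of PAGE_SIZE; with the
 * default 100ms window that is ten windows per second, which is where the
 * "1280M per second" figure above comes from.
 */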
1856a8f60772SMel Gorman 
1857b32967ffSMel Gorman /* Returns true if the node is migrate rate-limited after the update */
18581c30e017SMel Gorman static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
18591c30e017SMel Gorman 					unsigned long nr_pages)
1860b32967ffSMel Gorman {
1861b32967ffSMel Gorman 	/*
1862b32967ffSMel Gorman 	 * Rate-limit the amount of data that is being migrated to a node.
1863b32967ffSMel Gorman 	 * Optimal placement is no good if the memory bus is saturated and
1864b32967ffSMel Gorman 	 * all the time is being spent migrating!
1865b32967ffSMel Gorman 	 */
1866b32967ffSMel Gorman 	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
18671c5e9c27SMel Gorman 		spin_lock(&pgdat->numabalancing_migrate_lock);
1868b32967ffSMel Gorman 		pgdat->numabalancing_migrate_nr_pages = 0;
1869b32967ffSMel Gorman 		pgdat->numabalancing_migrate_next_window = jiffies +
1870b32967ffSMel Gorman 			msecs_to_jiffies(migrate_interval_millisecs);
18711c5e9c27SMel Gorman 		spin_unlock(&pgdat->numabalancing_migrate_lock);
1872b32967ffSMel Gorman 	}
1873af1839d7SMel Gorman 	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
1874af1839d7SMel Gorman 		trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
1875af1839d7SMel Gorman 								nr_pages);
18761c5e9c27SMel Gorman 		return true;
1877af1839d7SMel Gorman 	}
1878b32967ffSMel Gorman 
18791c5e9c27SMel Gorman 	/*
18801c5e9c27SMel Gorman 	 * This is an unlocked non-atomic update so errors are possible.
18811c5e9c27SMel Gorman 	 * The consequence is failing to migrate when we potentially should
18821c5e9c27SMel Gorman 	 * have, which is not severe enough to warrant locking. If it is ever
18831c5e9c27SMel Gorman 	 * a problem, it can be converted to a per-cpu counter.
18841c5e9c27SMel Gorman 	 */
18851c5e9c27SMel Gorman 	pgdat->numabalancing_migrate_nr_pages += nr_pages;
18861c5e9c27SMel Gorman 	return false;
1887b32967ffSMel Gorman }
1888b32967ffSMel Gorman 
18891c30e017SMel Gorman static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1890b32967ffSMel Gorman {
1891340ef390SHugh Dickins 	int page_lru;
1892b32967ffSMel Gorman 
1893309381feSSasha Levin 	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
18943abef4e6SMel Gorman 
1895b32967ffSMel Gorman 	/* Avoid migrating to a node that is nearly full */
1896340ef390SHugh Dickins 	if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1897340ef390SHugh Dickins 		return 0;
1898b32967ffSMel Gorman 
1899340ef390SHugh Dickins 	if (isolate_lru_page(page))
1900340ef390SHugh Dickins 		return 0;
1901340ef390SHugh Dickins 
1902340ef390SHugh Dickins 	/*
1903340ef390SHugh Dickins 	 * migrate_misplaced_transhuge_page() skips page migration's usual
1904340ef390SHugh Dickins 	 * check on page_count(), so we must do it here, now that the page
1905340ef390SHugh Dickins 	 * has been isolated: a GUP pin, or any other pin, prevents migration.
1906340ef390SHugh Dickins 	 * The expected page count is 3: 1 for the page's mapcount, 1 for the
1907340ef390SHugh Dickins 	 * caller's pin and 1 for the reference taken by isolate_lru_page().
1908340ef390SHugh Dickins 	 */
1909340ef390SHugh Dickins 	if (PageTransHuge(page) && page_count(page) != 3) {
1910340ef390SHugh Dickins 		putback_lru_page(page);
1911b32967ffSMel Gorman 		return 0;
1912b32967ffSMel Gorman 	}
1913b32967ffSMel Gorman 
1914b32967ffSMel Gorman 	page_lru = page_is_file_cache(page);
1915599d0c95SMel Gorman 	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
1916340ef390SHugh Dickins 				hpage_nr_pages(page));
1917b32967ffSMel Gorman 
1918b32967ffSMel Gorman 	/*
1919340ef390SHugh Dickins 	 * Isolating the page has taken another reference, so the
1920340ef390SHugh Dickins 	 * caller's reference can be safely dropped without the page
1921340ef390SHugh Dickins 	 * disappearing underneath us during migration.
1922b32967ffSMel Gorman 	 */
1923b32967ffSMel Gorman 	put_page(page);
1924340ef390SHugh Dickins 	return 1;
1925b32967ffSMel Gorman }
1926b32967ffSMel Gorman 
1927de466bd6SMel Gorman bool pmd_trans_migrating(pmd_t pmd)
1928de466bd6SMel Gorman {
1929de466bd6SMel Gorman 	struct page *page = pmd_page(pmd);
1930de466bd6SMel Gorman 	return PageLocked(page);
1931de466bd6SMel Gorman }
1932de466bd6SMel Gorman 
1933a8f60772SMel Gorman /*
19347039e1dbSPeter Zijlstra  * Attempt to migrate a misplaced page to the specified destination
19357039e1dbSPeter Zijlstra  * node. Caller is expected to have an elevated reference count on
19367039e1dbSPeter Zijlstra  * the page that will be dropped by this function before returning.
19377039e1dbSPeter Zijlstra  */
19381bc115d8SMel Gorman int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
19391bc115d8SMel Gorman 			   int node)
19407039e1dbSPeter Zijlstra {
1941a8f60772SMel Gorman 	pg_data_t *pgdat = NODE_DATA(node);
1942340ef390SHugh Dickins 	int isolated;
1943b32967ffSMel Gorman 	int nr_remaining;
19447039e1dbSPeter Zijlstra 	LIST_HEAD(migratepages);
19457039e1dbSPeter Zijlstra 
19467039e1dbSPeter Zijlstra 	/*
19471bc115d8SMel Gorman 	 * Don't migrate file pages that are mapped in multiple processes
19481bc115d8SMel Gorman 	 * with execute permissions as they are probably shared libraries.
19497039e1dbSPeter Zijlstra 	 */
19501bc115d8SMel Gorman 	if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
19511bc115d8SMel Gorman 	    (vma->vm_flags & VM_EXEC))
19527039e1dbSPeter Zijlstra 		goto out;
19537039e1dbSPeter Zijlstra 
1954a8f60772SMel Gorman 	/*
1955a8f60772SMel Gorman 	 * Rate-limit the amount of data that is being migrated to a node.
1956a8f60772SMel Gorman 	 * Optimal placement is no good if the memory bus is saturated and
1957a8f60772SMel Gorman 	 * all the time is being spent migrating!
1958a8f60772SMel Gorman 	 */
1959340ef390SHugh Dickins 	if (numamigrate_update_ratelimit(pgdat, 1))
1960a8f60772SMel Gorman 		goto out;
1961a8f60772SMel Gorman 
1962b32967ffSMel Gorman 	isolated = numamigrate_isolate_page(pgdat, page);
1963b32967ffSMel Gorman 	if (!isolated)
19647039e1dbSPeter Zijlstra 		goto out;
19657039e1dbSPeter Zijlstra 
19667039e1dbSPeter Zijlstra 	list_add(&page->lru, &migratepages);
19679c620e2bSHugh Dickins 	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
196868711a74SDavid Rientjes 				     NULL, node, MIGRATE_ASYNC,
196968711a74SDavid Rientjes 				     MR_NUMA_MISPLACED);
19707039e1dbSPeter Zijlstra 	if (nr_remaining) {
197159c82b70SJoonsoo Kim 		if (!list_empty(&migratepages)) {
197259c82b70SJoonsoo Kim 			list_del(&page->lru);
1973599d0c95SMel Gorman 			dec_node_page_state(page, NR_ISOLATED_ANON +
197459c82b70SJoonsoo Kim 					page_is_file_cache(page));
197559c82b70SJoonsoo Kim 			putback_lru_page(page);
197659c82b70SJoonsoo Kim 		}
19777039e1dbSPeter Zijlstra 		isolated = 0;
197803c5a6e1SMel Gorman 	} else
197903c5a6e1SMel Gorman 		count_vm_numa_event(NUMA_PAGE_MIGRATE);
19807039e1dbSPeter Zijlstra 	BUG_ON(!list_empty(&migratepages));
19817039e1dbSPeter Zijlstra 	return isolated;
1982340ef390SHugh Dickins 
1983340ef390SHugh Dickins out:
1984340ef390SHugh Dickins 	put_page(page);
1985340ef390SHugh Dickins 	return 0;
19867039e1dbSPeter Zijlstra }
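/*
 * Illustrative caller sketch -- NOT part of the original file.  The NUMA
 * hinting fault path (do_numa_page() in mm/memory.c) is expected to do
 * roughly the following around a call to migrate_misplaced_page() above;
 * details vary by kernel version:
 *
 *	get_page(page);				// the elevated ref dropped above
 *	pte_unmap_unlock(ptep, ptl);		// cannot migrate under the PTL
 *	migrated = migrate_misplaced_page(page, vma, target_nid);
 *	if (migrated)
 *		page_nid = target_nid;		// account fault to the new node
 */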
1987220018d3SMel Gorman #endif /* CONFIG_NUMA_BALANCING */
1988b32967ffSMel Gorman 
1989220018d3SMel Gorman #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1990340ef390SHugh Dickins /*
1991340ef390SHugh Dickins  * Migrates a THP to a given target node. page must be locked and is unlocked
1992340ef390SHugh Dickins  * before returning.
1993340ef390SHugh Dickins  */
1994b32967ffSMel Gorman int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1995b32967ffSMel Gorman 				struct vm_area_struct *vma,
1996b32967ffSMel Gorman 				pmd_t *pmd, pmd_t entry,
1997b32967ffSMel Gorman 				unsigned long address,
1998b32967ffSMel Gorman 				struct page *page, int node)
1999b32967ffSMel Gorman {
2000c4088ebdSKirill A. Shutemov 	spinlock_t *ptl;
2001b32967ffSMel Gorman 	pg_data_t *pgdat = NODE_DATA(node);
2002b32967ffSMel Gorman 	int isolated = 0;
2003b32967ffSMel Gorman 	struct page *new_page = NULL;
2004b32967ffSMel Gorman 	int page_lru = page_is_file_cache(page);
2005f714f4f2SMel Gorman 	unsigned long mmun_start = address & HPAGE_PMD_MASK;
2006f714f4f2SMel Gorman 	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
2007b32967ffSMel Gorman 
2008b32967ffSMel Gorman 	/*
2009b32967ffSMel Gorman 	 * Rate-limit the amount of data that is being migrated to a node.
2010b32967ffSMel Gorman 	 * Optimal placement is no good if the memory bus is saturated and
2011b32967ffSMel Gorman 	 * all the time is being spent migrating!
2012b32967ffSMel Gorman 	 */
2013d28d4335SMel Gorman 	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
2014b32967ffSMel Gorman 		goto out_dropref;
2015b32967ffSMel Gorman 
2016b32967ffSMel Gorman 	new_page = alloc_pages_node(node,
201725160354SVlastimil Babka 		(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
2018e97ca8e5SJohannes Weiner 		HPAGE_PMD_ORDER);
2019340ef390SHugh Dickins 	if (!new_page)
2020340ef390SHugh Dickins 		goto out_fail;
20219a982250SKirill A. Shutemov 	prep_transhuge_page(new_page);
2022340ef390SHugh Dickins 
2023b32967ffSMel Gorman 	isolated = numamigrate_isolate_page(pgdat, page);
2024340ef390SHugh Dickins 	if (!isolated) {
2025b32967ffSMel Gorman 		put_page(new_page);
2026340ef390SHugh Dickins 		goto out_fail;
2027b32967ffSMel Gorman 	}
2028b0943d61SMel Gorman 
2029b32967ffSMel Gorman 	/* Prepare a page as a migration target */
203048c935adSKirill A. Shutemov 	__SetPageLocked(new_page);
2031d44d363fSShaohua Li 	if (PageSwapBacked(page))
2032fa9949daSHugh Dickins 		__SetPageSwapBacked(new_page);
2033b32967ffSMel Gorman 
2034b32967ffSMel Gorman 	/* anon mapping, we can simply copy page->mapping to the new page: */
2035b32967ffSMel Gorman 	new_page->mapping = page->mapping;
2036b32967ffSMel Gorman 	new_page->index = page->index;
2037b32967ffSMel Gorman 	migrate_page_copy(new_page, page);
2038b32967ffSMel Gorman 	WARN_ON(PageLRU(new_page));
2039b32967ffSMel Gorman 
2040b32967ffSMel Gorman 	/* Recheck the target PMD */
2041f714f4f2SMel Gorman 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2042c4088ebdSKirill A. Shutemov 	ptl = pmd_lock(mm, pmd);
2043f4e177d1SWill Deacon 	if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
2044c4088ebdSKirill A. Shutemov 		spin_unlock(ptl);
2045f714f4f2SMel Gorman 		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2046b32967ffSMel Gorman 
2047b32967ffSMel Gorman 		/* Reverse changes made by migrate_page_copy() */
2048b32967ffSMel Gorman 		if (TestClearPageActive(new_page))
2049b32967ffSMel Gorman 			SetPageActive(page);
2050b32967ffSMel Gorman 		if (TestClearPageUnevictable(new_page))
2051b32967ffSMel Gorman 			SetPageUnevictable(page);
2052b32967ffSMel Gorman 
2053b32967ffSMel Gorman 		unlock_page(new_page);
2054b32967ffSMel Gorman 		put_page(new_page);		/* Free it */
2055b32967ffSMel Gorman 
2056a54a407fSMel Gorman 		/* Retake the caller's reference and put back on the LRU */
2057a54a407fSMel Gorman 		get_page(page);
2058b32967ffSMel Gorman 		putback_lru_page(page);
2059599d0c95SMel Gorman 		mod_node_page_state(page_pgdat(page),
2060a54a407fSMel Gorman 			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
2061eb4489f6SMel Gorman 
2062eb4489f6SMel Gorman 		goto out_unlock;
2063b32967ffSMel Gorman 	}
2064b32967ffSMel Gorman 
206510102459SKirill A. Shutemov 	entry = mk_huge_pmd(new_page, vma->vm_page_prot);
20662b4847e7SMel Gorman 	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
2067b32967ffSMel Gorman 
20682b4847e7SMel Gorman 	/*
20692b4847e7SMel Gorman 	 * Clear the old entry under pagetable lock and establish the new PTE.
20702b4847e7SMel Gorman 	 * Any parallel GUP will either observe the old page blocking on the
20712b4847e7SMel Gorman 	 * page lock, block on the page table lock or observe the new page.
20722b4847e7SMel Gorman 	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
20732b4847e7SMel Gorman 	 * guarantee the copy is visible before the pagetable update.
20742b4847e7SMel Gorman 	 */
2075f714f4f2SMel Gorman 	flush_cache_range(vma, mmun_start, mmun_end);
2076d281ee61SKirill A. Shutemov 	page_add_anon_rmap(new_page, vma, mmun_start, true);
20778809aa2dSAneesh Kumar K.V 	pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
2078f714f4f2SMel Gorman 	set_pmd_at(mm, mmun_start, pmd, entry);
2079ce4a9cc5SStephen Rothwell 	update_mmu_cache_pmd(vma, address, &entry);
20802b4847e7SMel Gorman 
2081f4e177d1SWill Deacon 	page_ref_unfreeze(page, 2);
208251afb12bSHugh Dickins 	mlock_migrate_page(new_page, page);
2083d281ee61SKirill A. Shutemov 	page_remove_rmap(page, true);
20847cd12b4aSVlastimil Babka 	set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
20852b4847e7SMel Gorman 
2086c4088ebdSKirill A. Shutemov 	spin_unlock(ptl);
2087f714f4f2SMel Gorman 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2088b32967ffSMel Gorman 
208911de9927SMel Gorman 	/* Take an "isolate" reference and put new page on the LRU. */
209011de9927SMel Gorman 	get_page(new_page);
209111de9927SMel Gorman 	putback_lru_page(new_page);
209211de9927SMel Gorman 
2093b32967ffSMel Gorman 	unlock_page(new_page);
2094b32967ffSMel Gorman 	unlock_page(page);
2095b32967ffSMel Gorman 	put_page(page);			/* Drop the rmap reference */
2096b32967ffSMel Gorman 	put_page(page);			/* Drop the LRU isolation reference */
2097b32967ffSMel Gorman 
2098b32967ffSMel Gorman 	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
2099b32967ffSMel Gorman 	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
2100b32967ffSMel Gorman 
2101599d0c95SMel Gorman 	mod_node_page_state(page_pgdat(page),
2102b32967ffSMel Gorman 			NR_ISOLATED_ANON + page_lru,
2103b32967ffSMel Gorman 			-HPAGE_PMD_NR);
2104b32967ffSMel Gorman 	return isolated;
2105b32967ffSMel Gorman 
2106340ef390SHugh Dickins out_fail:
2107340ef390SHugh Dickins 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
2108b32967ffSMel Gorman out_dropref:
21092b4847e7SMel Gorman 	ptl = pmd_lock(mm, pmd);
21102b4847e7SMel Gorman 	if (pmd_same(*pmd, entry)) {
21114d942466SMel Gorman 		entry = pmd_modify(entry, vma->vm_page_prot);
2112f714f4f2SMel Gorman 		set_pmd_at(mm, mmun_start, pmd, entry);
2113a54a407fSMel Gorman 		update_mmu_cache_pmd(vma, address, &entry);
21142b4847e7SMel Gorman 	}
21152b4847e7SMel Gorman 	spin_unlock(ptl);
2116a54a407fSMel Gorman 
2117eb4489f6SMel Gorman out_unlock:
2118340ef390SHugh Dickins 	unlock_page(page);
2119b32967ffSMel Gorman 	put_page(page);
2120b32967ffSMel Gorman 	return 0;
2121b32967ffSMel Gorman }
21227039e1dbSPeter Zijlstra #endif /* CONFIG_NUMA_BALANCING */
21237039e1dbSPeter Zijlstra 
21247039e1dbSPeter Zijlstra #endif /* CONFIG_NUMA */
21258763cb45SJérôme Glisse 
21268763cb45SJérôme Glisse 
21278763cb45SJérôme Glisse struct migrate_vma {
21288763cb45SJérôme Glisse 	struct vm_area_struct	*vma;
21298763cb45SJérôme Glisse 	unsigned long		*dst;
21308763cb45SJérôme Glisse 	unsigned long		*src;
21318763cb45SJérôme Glisse 	unsigned long		cpages;
21328763cb45SJérôme Glisse 	unsigned long		npages;
21338763cb45SJérôme Glisse 	unsigned long		start;
21348763cb45SJérôme Glisse 	unsigned long		end;
21358763cb45SJérôme Glisse };
21368763cb45SJérôme Glisse 
21378763cb45SJérôme Glisse static int migrate_vma_collect_hole(unsigned long start,
21388763cb45SJérôme Glisse 				    unsigned long end,
21398763cb45SJérôme Glisse 				    struct mm_walk *walk)
21408763cb45SJérôme Glisse {
21418763cb45SJérôme Glisse 	struct migrate_vma *migrate = walk->private;
21428763cb45SJérôme Glisse 	unsigned long addr;
21438763cb45SJérôme Glisse 
21448763cb45SJérôme Glisse 	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2145*8315ada7SJérôme Glisse 		migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE;
2146*8315ada7SJérôme Glisse 		migrate->dst[migrate->npages] = 0;
2147*8315ada7SJérôme Glisse 		migrate->cpages++;
2148*8315ada7SJérôme Glisse 	}
2149*8315ada7SJérôme Glisse 
2150*8315ada7SJérôme Glisse 	return 0;
2151*8315ada7SJérôme Glisse }
2152*8315ada7SJérôme Glisse 
2153*8315ada7SJérôme Glisse static int migrate_vma_collect_skip(unsigned long start,
2154*8315ada7SJérôme Glisse 				    unsigned long end,
2155*8315ada7SJérôme Glisse 				    struct mm_walk *walk)
2156*8315ada7SJérôme Glisse {
2157*8315ada7SJérôme Glisse 	struct migrate_vma *migrate = walk->private;
2158*8315ada7SJérôme Glisse 	unsigned long addr;
2159*8315ada7SJérôme Glisse 
2160*8315ada7SJérôme Glisse 	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
21618763cb45SJérôme Glisse 		migrate->dst[migrate->npages] = 0;
21628763cb45SJérôme Glisse 		migrate->src[migrate->npages++] = 0;
21638763cb45SJérôme Glisse 	}
21648763cb45SJérôme Glisse 
21658763cb45SJérôme Glisse 	return 0;
21668763cb45SJérôme Glisse }
21678763cb45SJérôme Glisse 
21688763cb45SJérôme Glisse static int migrate_vma_collect_pmd(pmd_t *pmdp,
21698763cb45SJérôme Glisse 				   unsigned long start,
21708763cb45SJérôme Glisse 				   unsigned long end,
21718763cb45SJérôme Glisse 				   struct mm_walk *walk)
21728763cb45SJérôme Glisse {
21738763cb45SJérôme Glisse 	struct migrate_vma *migrate = walk->private;
21748763cb45SJérôme Glisse 	struct vm_area_struct *vma = walk->vma;
21758763cb45SJérôme Glisse 	struct mm_struct *mm = vma->vm_mm;
21768c3328f1SJérôme Glisse 	unsigned long addr = start, unmapped = 0;
21778763cb45SJérôme Glisse 	spinlock_t *ptl;
21788763cb45SJérôme Glisse 	pte_t *ptep;
21798763cb45SJérôme Glisse 
21808763cb45SJérôme Glisse again:
21818763cb45SJérôme Glisse 	if (pmd_none(*pmdp))
21828763cb45SJérôme Glisse 		return migrate_vma_collect_hole(start, end, walk);
21838763cb45SJérôme Glisse 
21848763cb45SJérôme Glisse 	if (pmd_trans_huge(*pmdp)) {
21858763cb45SJérôme Glisse 		struct page *page;
21868763cb45SJérôme Glisse 
21878763cb45SJérôme Glisse 		ptl = pmd_lock(mm, pmdp);
21888763cb45SJérôme Glisse 		if (unlikely(!pmd_trans_huge(*pmdp))) {
21898763cb45SJérôme Glisse 			spin_unlock(ptl);
21908763cb45SJérôme Glisse 			goto again;
21918763cb45SJérôme Glisse 		}
21928763cb45SJérôme Glisse 
21938763cb45SJérôme Glisse 		page = pmd_page(*pmdp);
21948763cb45SJérôme Glisse 		if (is_huge_zero_page(page)) {
21958763cb45SJérôme Glisse 			spin_unlock(ptl);
21968763cb45SJérôme Glisse 			split_huge_pmd(vma, pmdp, addr);
21978763cb45SJérôme Glisse 			if (pmd_trans_unstable(pmdp))
2198*8315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
21998763cb45SJérôme Glisse 								walk);
22008763cb45SJérôme Glisse 		} else {
22018763cb45SJérôme Glisse 			int ret;
22028763cb45SJérôme Glisse 
22038763cb45SJérôme Glisse 			get_page(page);
22048763cb45SJérôme Glisse 			spin_unlock(ptl);
22058763cb45SJérôme Glisse 			if (unlikely(!trylock_page(page)))
2206*8315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
22078763cb45SJérôme Glisse 								walk);
22088763cb45SJérôme Glisse 			ret = split_huge_page(page);
22098763cb45SJérôme Glisse 			unlock_page(page);
22108763cb45SJérôme Glisse 			put_page(page);
2211*8315ada7SJérôme Glisse 			if (ret)
2212*8315ada7SJérôme Glisse 				return migrate_vma_collect_skip(start, end,
2213*8315ada7SJérôme Glisse 								walk);
2214*8315ada7SJérôme Glisse 			if (pmd_none(*pmdp))
22158763cb45SJérôme Glisse 				return migrate_vma_collect_hole(start, end,
22168763cb45SJérôme Glisse 								walk);
22178763cb45SJérôme Glisse 		}
22188763cb45SJérôme Glisse 	}
22198763cb45SJérôme Glisse 
22208763cb45SJérôme Glisse 	if (unlikely(pmd_bad(*pmdp)))
2221*8315ada7SJérôme Glisse 		return migrate_vma_collect_skip(start, end, walk);
22228763cb45SJérôme Glisse 
22238763cb45SJérôme Glisse 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
22248c3328f1SJérôme Glisse 	arch_enter_lazy_mmu_mode();
22258c3328f1SJérôme Glisse 
22268763cb45SJérôme Glisse 	for (; addr < end; addr += PAGE_SIZE, ptep++) {
22278763cb45SJérôme Glisse 		unsigned long mpfn, pfn;
22288763cb45SJérôme Glisse 		struct page *page;
22298c3328f1SJérôme Glisse 		swp_entry_t entry;
22308763cb45SJérôme Glisse 		pte_t pte;
22318763cb45SJérôme Glisse 
22328763cb45SJérôme Glisse 		pte = *ptep;
22338763cb45SJérôme Glisse 		pfn = pte_pfn(pte);
22348763cb45SJérôme Glisse 
2235a5430ddaSJérôme Glisse 		if (pte_none(pte)) {
2236*8315ada7SJérôme Glisse 			mpfn = MIGRATE_PFN_MIGRATE;
2237*8315ada7SJérôme Glisse 			migrate->cpages++;
2238*8315ada7SJérôme Glisse 			pfn = 0;
22398763cb45SJérôme Glisse 			goto next;
22408763cb45SJérôme Glisse 		}
22418763cb45SJérôme Glisse 
2242a5430ddaSJérôme Glisse 		if (!pte_present(pte)) {
2243a5430ddaSJérôme Glisse 			mpfn = pfn = 0;
2244a5430ddaSJérôme Glisse 
2245a5430ddaSJérôme Glisse 			/*
2246a5430ddaSJérôme Glisse 			 * We only care about the unaddressable device page
2247a5430ddaSJérôme Glisse 			 * special page table entry. Other special swap entries
2248a5430ddaSJérôme Glisse 			 * are not migratable, and we ignore regular swapped pages.
2249a5430ddaSJérôme Glisse 			 */
2250a5430ddaSJérôme Glisse 			entry = pte_to_swp_entry(pte);
2251a5430ddaSJérôme Glisse 			if (!is_device_private_entry(entry))
2252a5430ddaSJérôme Glisse 				goto next;
2253a5430ddaSJérôme Glisse 
2254a5430ddaSJérôme Glisse 			page = device_private_entry_to_page(entry);
2255a5430ddaSJérôme Glisse 			mpfn = migrate_pfn(page_to_pfn(page))|
2256a5430ddaSJérôme Glisse 				MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
2257a5430ddaSJérôme Glisse 			if (is_write_device_private_entry(entry))
2258a5430ddaSJérôme Glisse 				mpfn |= MIGRATE_PFN_WRITE;
2259a5430ddaSJérôme Glisse 		} else {
2260*8315ada7SJérôme Glisse 			if (is_zero_pfn(pfn)) {
2261*8315ada7SJérôme Glisse 				mpfn = MIGRATE_PFN_MIGRATE;
2262*8315ada7SJérôme Glisse 				migrate->cpages++;
2263*8315ada7SJérôme Glisse 				pfn = 0;
2264*8315ada7SJérôme Glisse 				goto next;
2265*8315ada7SJérôme Glisse 			}
22668763cb45SJérôme Glisse 			page = vm_normal_page(migrate->vma, addr, pte);
2267a5430ddaSJérôme Glisse 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2268a5430ddaSJérôme Glisse 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2269a5430ddaSJérôme Glisse 		}
2270a5430ddaSJérôme Glisse 
2271a5430ddaSJérôme Glisse 		/* FIXME support THP */
22728763cb45SJérôme Glisse 		if (!page || !page->mapping || PageTransCompound(page)) {
22738763cb45SJérôme Glisse 			mpfn = pfn = 0;
22748763cb45SJérôme Glisse 			goto next;
22758763cb45SJérôme Glisse 		}
2276a5430ddaSJérôme Glisse 		pfn = page_to_pfn(page);
22778763cb45SJérôme Glisse 
22788763cb45SJérôme Glisse 		/*
22798763cb45SJérôme Glisse 		 * By getting a reference on the page we pin it and that blocks
22808763cb45SJérôme Glisse 		 * any kind of migration. A side effect is that it "freezes"
22818763cb45SJérôme Glisse 		 * the pte.
22828763cb45SJérôme Glisse 		 *
22838763cb45SJérôme Glisse 		 * We drop this reference after isolating the page from the lru
22848763cb45SJérôme Glisse 		 * for non-device pages (device pages are not on the lru and
22858763cb45SJérôme Glisse 		 * thus can't be dropped from it).
22868763cb45SJérôme Glisse 		 */
22878763cb45SJérôme Glisse 		get_page(page);
22888763cb45SJérôme Glisse 		migrate->cpages++;
22898763cb45SJérôme Glisse 
22908c3328f1SJérôme Glisse 		/*
22918c3328f1SJérôme Glisse 		 * Optimize for the common case where page is only mapped once
22928c3328f1SJérôme Glisse 		 * in one process. If we can lock the page, then we can safely
22938c3328f1SJérôme Glisse 		 * set up a special migration page table entry now.
22948c3328f1SJérôme Glisse 		 */
22958c3328f1SJérôme Glisse 		if (trylock_page(page)) {
22968c3328f1SJérôme Glisse 			pte_t swp_pte;
22978c3328f1SJérôme Glisse 
22988c3328f1SJérôme Glisse 			mpfn |= MIGRATE_PFN_LOCKED;
22998c3328f1SJérôme Glisse 			ptep_get_and_clear(mm, addr, ptep);
23008c3328f1SJérôme Glisse 
23018c3328f1SJérôme Glisse 			/* Setup special migration page table entry */
23028c3328f1SJérôme Glisse 			entry = make_migration_entry(page, pte_write(pte));
23038c3328f1SJérôme Glisse 			swp_pte = swp_entry_to_pte(entry);
23048c3328f1SJérôme Glisse 			if (pte_soft_dirty(pte))
23058c3328f1SJérôme Glisse 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
23068c3328f1SJérôme Glisse 			set_pte_at(mm, addr, ptep, swp_pte);
23078c3328f1SJérôme Glisse 
23088c3328f1SJérôme Glisse 			/*
23098c3328f1SJérôme Glisse 			 * This is like regular unmap: we remove the rmap and
23108c3328f1SJérôme Glisse 			 * drop page refcount. Page won't be freed, as we took
23118c3328f1SJérôme Glisse 			 * a reference just above.
23128c3328f1SJérôme Glisse 			 */
23138c3328f1SJérôme Glisse 			page_remove_rmap(page, false);
23148c3328f1SJérôme Glisse 			put_page(page);
2315a5430ddaSJérôme Glisse 
2316a5430ddaSJérôme Glisse 			if (pte_present(pte))
23178c3328f1SJérôme Glisse 				unmapped++;
23188c3328f1SJérôme Glisse 		}
23198c3328f1SJérôme Glisse 
23208763cb45SJérôme Glisse next:
2321a5430ddaSJérôme Glisse 		migrate->dst[migrate->npages] = 0;
23228763cb45SJérôme Glisse 		migrate->src[migrate->npages++] = mpfn;
23238763cb45SJérôme Glisse 	}
23248c3328f1SJérôme Glisse 	arch_leave_lazy_mmu_mode();
23258763cb45SJérôme Glisse 	pte_unmap_unlock(ptep - 1, ptl);
23268763cb45SJérôme Glisse 
23278c3328f1SJérôme Glisse 	/* Only flush the TLB if we actually modified any entries */
23288c3328f1SJérôme Glisse 	if (unmapped)
23298c3328f1SJérôme Glisse 		flush_tlb_range(walk->vma, start, end);
23308c3328f1SJérôme Glisse 
23318763cb45SJérôme Glisse 	return 0;
23328763cb45SJérôme Glisse }
23338763cb45SJérôme Glisse 
23348763cb45SJérôme Glisse /*
23358763cb45SJérôme Glisse  * migrate_vma_collect() - collect pages over a range of virtual addresses
23368763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
23378763cb45SJérôme Glisse  *
23388763cb45SJérôme Glisse  * This will walk the CPU page table. For each virtual address backed by a
23398763cb45SJérôme Glisse  * valid page, it updates the src array and takes a reference on the page, in
23408763cb45SJérôme Glisse  * order to pin the page until we lock it and unmap it.
23418763cb45SJérôme Glisse  */
23428763cb45SJérôme Glisse static void migrate_vma_collect(struct migrate_vma *migrate)
23438763cb45SJérôme Glisse {
23448763cb45SJérôme Glisse 	struct mm_walk mm_walk;
23458763cb45SJérôme Glisse 
23468763cb45SJérôme Glisse 	mm_walk.pmd_entry = migrate_vma_collect_pmd;
23478763cb45SJérôme Glisse 	mm_walk.pte_entry = NULL;
23488763cb45SJérôme Glisse 	mm_walk.pte_hole = migrate_vma_collect_hole;
23498763cb45SJérôme Glisse 	mm_walk.hugetlb_entry = NULL;
23508763cb45SJérôme Glisse 	mm_walk.test_walk = NULL;
23518763cb45SJérôme Glisse 	mm_walk.vma = migrate->vma;
23528763cb45SJérôme Glisse 	mm_walk.mm = migrate->vma->vm_mm;
23538763cb45SJérôme Glisse 	mm_walk.private = migrate;
23548763cb45SJérôme Glisse 
23558c3328f1SJérôme Glisse 	mmu_notifier_invalidate_range_start(mm_walk.mm,
23568c3328f1SJérôme Glisse 					    migrate->start,
23578c3328f1SJérôme Glisse 					    migrate->end);
23588763cb45SJérôme Glisse 	walk_page_range(migrate->start, migrate->end, &mm_walk);
23598c3328f1SJérôme Glisse 	mmu_notifier_invalidate_range_end(mm_walk.mm,
23608c3328f1SJérôme Glisse 					  migrate->start,
23618c3328f1SJérôme Glisse 					  migrate->end);
23628763cb45SJérôme Glisse 
23638763cb45SJérôme Glisse 	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
23648763cb45SJérôme Glisse }
23658763cb45SJérôme Glisse 
23668763cb45SJérôme Glisse /*
23678763cb45SJérôme Glisse  * migrate_vma_check_page() - check if page is pinned or not
23688763cb45SJérôme Glisse  * @page: struct page to check
23698763cb45SJérôme Glisse  *
23708763cb45SJérôme Glisse  * Pinned pages cannot be migrated. This is the same test as in
23718763cb45SJérôme Glisse  * migrate_page_move_mapping(), except that here we allow migration of a
23728763cb45SJérôme Glisse  * ZONE_DEVICE page.
23738763cb45SJérôme Glisse  */
23748763cb45SJérôme Glisse static bool migrate_vma_check_page(struct page *page)
23758763cb45SJérôme Glisse {
23768763cb45SJérôme Glisse 	/*
23778763cb45SJérôme Glisse 	 * One extra ref because caller holds an extra reference, either from
23788763cb45SJérôme Glisse 	 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
23798763cb45SJérôme Glisse 	 * a device page.
23808763cb45SJérôme Glisse 	 */
23818763cb45SJérôme Glisse 	int extra = 1;
23828763cb45SJérôme Glisse 
23838763cb45SJérôme Glisse 	/*
23848763cb45SJérôme Glisse 	 * FIXME support THP (transparent huge page), it is bit more complex to
23858763cb45SJérôme Glisse 	 * check them than regular pages, because they can be mapped with a pmd
23868763cb45SJérôme Glisse 	 * or with a pte (split pte mapping).
23878763cb45SJérôme Glisse 	 */
23888763cb45SJérôme Glisse 	if (PageCompound(page))
23898763cb45SJérôme Glisse 		return false;
23908763cb45SJérôme Glisse 
2391a5430ddaSJérôme Glisse 	/* Pages from ZONE_DEVICE have one extra reference */
2392a5430ddaSJérôme Glisse 	if (is_zone_device_page(page)) {
2393a5430ddaSJérôme Glisse 		/*
2394a5430ddaSJérôme Glisse 		 * Private pages can never be pinned as they have no valid pte
2395a5430ddaSJérôme Glisse 		 * and GUP will fail for them. Yet if there is a pending
2396a5430ddaSJérôme Glisse 		 * migration a thread might try to wait on the pte migration
2397a5430ddaSJérôme Glisse 		 * entry and will bump the page reference count. Sadly there is
2398a5430ddaSJérôme Glisse 		 * no way to differentiate a regular pin from a migration wait.
2399a5430ddaSJérôme Glisse 		 * Hence, to avoid two racing threads looping forever migrating
2400a5430ddaSJérôme Glisse 		 * back to the CPU (one stopping the migration because the other
2401a5430ddaSJérôme Glisse 		 * is waiting on the pte migration entry), we always return true.
2402a5430ddaSJérôme Glisse 		 *
2403a5430ddaSJérôme Glisse 		 * FIXME proper solution is to rework migration_entry_wait() so
2404a5430ddaSJérôme Glisse 		 * it does not need to take a reference on page.
2405a5430ddaSJérôme Glisse 		 */
2406a5430ddaSJérôme Glisse 		if (is_device_private_page(page))
2407a5430ddaSJérôme Glisse 			return true;
2408a5430ddaSJérôme Glisse 
2409a5430ddaSJérôme Glisse 		/* Other ZONE_DEVICE memory types are not supported */
2410a5430ddaSJérôme Glisse 		return false;
2411a5430ddaSJérôme Glisse 	}
2412a5430ddaSJérôme Glisse 
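	/*
	 * For example (assuming no other users of the page), an anonymous
	 * page mapped by a single pte that the caller has already isolated
	 * from the lru reaches this check with page_count() == 2 (one
	 * reference for the pte mapping, one for the lru isolation) and
	 * page_mapcount() == 1, so with extra == 1 the comparison below is
	 * false and the page is considered migratable. Any additional
	 * reference, such as a get_user_pages() pin, tips the comparison and
	 * the page is treated as pinned.
	 */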
24138763cb45SJérôme Glisse 	if ((page_count(page) - extra) > page_mapcount(page))
24148763cb45SJérôme Glisse 		return false;
24158763cb45SJérôme Glisse 
24168763cb45SJérôme Glisse 	return true;
24178763cb45SJérôme Glisse }
24188763cb45SJérôme Glisse 
24198763cb45SJérôme Glisse /*
24208763cb45SJérôme Glisse  * migrate_vma_prepare() - lock pages and isolate them from the lru
24218763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
24228763cb45SJérôme Glisse  *
24238763cb45SJérôme Glisse  * This locks pages that have been collected by migrate_vma_collect(). Once each
24248763cb45SJérôme Glisse  * page is locked it is isolated from the lru (for non-device pages). Finally,
24258763cb45SJérôme Glisse  * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
24268763cb45SJérôme Glisse  * migrated by concurrent kernel threads.
24278763cb45SJérôme Glisse  */
24288763cb45SJérôme Glisse static void migrate_vma_prepare(struct migrate_vma *migrate)
24298763cb45SJérôme Glisse {
24308763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
24318c3328f1SJérôme Glisse 	const unsigned long start = migrate->start;
24328c3328f1SJérôme Glisse 	unsigned long addr, i, restore = 0;
24338763cb45SJérôme Glisse 	bool allow_drain = true;
24348763cb45SJérôme Glisse 
24358763cb45SJérôme Glisse 	lru_add_drain();
24368763cb45SJérôme Glisse 
24378763cb45SJérôme Glisse 	for (i = 0; (i < npages) && migrate->cpages; i++) {
24388763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
24398c3328f1SJérôme Glisse 		bool remap = true;
24408763cb45SJérôme Glisse 
24418763cb45SJérôme Glisse 		if (!page)
24428763cb45SJérôme Glisse 			continue;
24438763cb45SJérôme Glisse 
24448c3328f1SJérôme Glisse 		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
24458763cb45SJérôme Glisse 			/*
24468763cb45SJérôme Glisse 			 * Because we are migrating several pages there can be
24478763cb45SJérôme Glisse 			 * a deadlock between two concurrent migrations, each
24488763cb45SJérôme Glisse 			 * waiting on a page the other has already locked.
24498763cb45SJérôme Glisse 			 *
24508763cb45SJérôme Glisse 			 * Make migrate_vma() a best effort thing and back off
24518763cb45SJérôme Glisse 			 * for any page we can not lock right away.
24528763cb45SJérôme Glisse 			 */
24538763cb45SJérôme Glisse 			if (!trylock_page(page)) {
24548763cb45SJérôme Glisse 				migrate->src[i] = 0;
24558763cb45SJérôme Glisse 				migrate->cpages--;
24568763cb45SJérôme Glisse 				put_page(page);
24578763cb45SJérôme Glisse 				continue;
24588763cb45SJérôme Glisse 			}
24598c3328f1SJérôme Glisse 			remap = false;
24608763cb45SJérôme Glisse 			migrate->src[i] |= MIGRATE_PFN_LOCKED;
24618c3328f1SJérôme Glisse 		}
24628763cb45SJérôme Glisse 
2463a5430ddaSJérôme Glisse 		/* ZONE_DEVICE pages are not on LRU */
2464a5430ddaSJérôme Glisse 		if (!is_zone_device_page(page)) {
24658763cb45SJérôme Glisse 			if (!PageLRU(page) && allow_drain) {
24668763cb45SJérôme Glisse 				/* Drain CPU's pagevec */
24678763cb45SJérôme Glisse 				lru_add_drain_all();
24688763cb45SJérôme Glisse 				allow_drain = false;
24698763cb45SJérôme Glisse 			}
24708763cb45SJérôme Glisse 
24718763cb45SJérôme Glisse 			if (isolate_lru_page(page)) {
24728c3328f1SJérôme Glisse 				if (remap) {
24738c3328f1SJérôme Glisse 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
24748c3328f1SJérôme Glisse 					migrate->cpages--;
24758c3328f1SJérôme Glisse 					restore++;
24768c3328f1SJérôme Glisse 				} else {
24778763cb45SJérôme Glisse 					migrate->src[i] = 0;
24788763cb45SJérôme Glisse 					unlock_page(page);
24798763cb45SJérôme Glisse 					migrate->cpages--;
24808763cb45SJérôme Glisse 					put_page(page);
24818c3328f1SJérôme Glisse 				}
24828763cb45SJérôme Glisse 				continue;
24838763cb45SJérôme Glisse 			}
24848763cb45SJérôme Glisse 
2485a5430ddaSJérôme Glisse 			/* Drop the reference we took in collect */
2486a5430ddaSJérôme Glisse 			put_page(page);
2487a5430ddaSJérôme Glisse 		}
2488a5430ddaSJérôme Glisse 
24898763cb45SJérôme Glisse 		if (!migrate_vma_check_page(page)) {
24908c3328f1SJérôme Glisse 			if (remap) {
24918c3328f1SJérôme Glisse 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
24928c3328f1SJérôme Glisse 				migrate->cpages--;
24938c3328f1SJérôme Glisse 				restore++;
24948c3328f1SJérôme Glisse 
2495a5430ddaSJérôme Glisse 				if (!is_zone_device_page(page)) {
24968c3328f1SJérôme Glisse 					get_page(page);
24978c3328f1SJérôme Glisse 					putback_lru_page(page);
2498a5430ddaSJérôme Glisse 				}
24998c3328f1SJérôme Glisse 			} else {
25008763cb45SJérôme Glisse 				migrate->src[i] = 0;
25018763cb45SJérôme Glisse 				unlock_page(page);
25028763cb45SJérôme Glisse 				migrate->cpages--;
25038763cb45SJérôme Glisse 
2504a5430ddaSJérôme Glisse 				if (!is_zone_device_page(page))
25058763cb45SJérôme Glisse 					putback_lru_page(page);
2506a5430ddaSJérôme Glisse 				else
2507a5430ddaSJérôme Glisse 					put_page(page);
25088763cb45SJérôme Glisse 			}
25098763cb45SJérôme Glisse 		}
25108763cb45SJérôme Glisse 	}
25118763cb45SJérôme Glisse 
25128c3328f1SJérôme Glisse 	for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
25138c3328f1SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25148c3328f1SJérôme Glisse 
25158c3328f1SJérôme Glisse 		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
25168c3328f1SJérôme Glisse 			continue;
25178c3328f1SJérôme Glisse 
25188c3328f1SJérôme Glisse 		remove_migration_pte(page, migrate->vma, addr, page);
25198c3328f1SJérôme Glisse 
25208c3328f1SJérôme Glisse 		migrate->src[i] = 0;
25218c3328f1SJérôme Glisse 		unlock_page(page);
25228c3328f1SJérôme Glisse 		put_page(page);
25238c3328f1SJérôme Glisse 		restore--;
25248c3328f1SJérôme Glisse 	}
25258c3328f1SJérôme Glisse }
25268c3328f1SJérôme Glisse 
25278763cb45SJérôme Glisse /*
25288763cb45SJérôme Glisse  * migrate_vma_unmap() - replace page mapping with special migration pte entry
25298763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
25308763cb45SJérôme Glisse  *
25318763cb45SJérôme Glisse  * Replace page mapping (CPU page table pte) with a special migration pte entry
25328763cb45SJérôme Glisse  * and check again if it has been pinned. Pinned pages are restored because we
25338763cb45SJérôme Glisse  * cannot migrate them.
25348763cb45SJérôme Glisse  *
25358763cb45SJérôme Glisse  * This is the last step before we call the device driver callback to allocate
25368763cb45SJérôme Glisse  * destination memory and copy the contents of the original page to the new page.
25378763cb45SJérôme Glisse  */
25388763cb45SJérôme Glisse static void migrate_vma_unmap(struct migrate_vma *migrate)
25398763cb45SJérôme Glisse {
25408763cb45SJérôme Glisse 	int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
25418763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
25428763cb45SJérôme Glisse 	const unsigned long start = migrate->start;
25438763cb45SJérôme Glisse 	unsigned long addr, i, restore = 0;
25448763cb45SJérôme Glisse 
25458763cb45SJérôme Glisse 	for (i = 0; i < npages; i++) {
25468763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25478763cb45SJérôme Glisse 
25488763cb45SJérôme Glisse 		if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
25498763cb45SJérôme Glisse 			continue;
25508763cb45SJérôme Glisse 
25518c3328f1SJérôme Glisse 		if (page_mapped(page)) {
25528763cb45SJérôme Glisse 			try_to_unmap(page, flags);
25538c3328f1SJérôme Glisse 			if (page_mapped(page))
25548c3328f1SJérôme Glisse 				goto restore;
25558c3328f1SJérôme Glisse 		}
25568c3328f1SJérôme Glisse 
25578c3328f1SJérôme Glisse 		if (migrate_vma_check_page(page))
25588c3328f1SJérôme Glisse 			continue;
25598c3328f1SJérôme Glisse 
25608c3328f1SJérôme Glisse restore:
25618763cb45SJérôme Glisse 		migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
25628763cb45SJérôme Glisse 		migrate->cpages--;
25638763cb45SJérôme Glisse 		restore++;
25648763cb45SJérôme Glisse 	}
25658763cb45SJérôme Glisse 
25668763cb45SJérôme Glisse 	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
25678763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25688763cb45SJérôme Glisse 
25698763cb45SJérôme Glisse 		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
25708763cb45SJérôme Glisse 			continue;
25718763cb45SJérôme Glisse 
25728763cb45SJérôme Glisse 		remove_migration_ptes(page, page, false);
25738763cb45SJérôme Glisse 
25748763cb45SJérôme Glisse 		migrate->src[i] = 0;
25758763cb45SJérôme Glisse 		unlock_page(page);
25768763cb45SJérôme Glisse 		restore--;
25778763cb45SJérôme Glisse 
2578a5430ddaSJérôme Glisse 		if (is_zone_device_page(page))
2579a5430ddaSJérôme Glisse 			put_page(page);
2580a5430ddaSJérôme Glisse 		else
25818763cb45SJérôme Glisse 			putback_lru_page(page);
25828763cb45SJérôme Glisse 	}
25838763cb45SJérôme Glisse }
25848763cb45SJérôme Glisse 
2585*8315ada7SJérôme Glisse static void migrate_vma_insert_page(struct migrate_vma *migrate,
2586*8315ada7SJérôme Glisse 				    unsigned long addr,
2587*8315ada7SJérôme Glisse 				    struct page *page,
2588*8315ada7SJérôme Glisse 				    unsigned long *src,
2589*8315ada7SJérôme Glisse 				    unsigned long *dst)
2590*8315ada7SJérôme Glisse {
2591*8315ada7SJérôme Glisse 	struct vm_area_struct *vma = migrate->vma;
2592*8315ada7SJérôme Glisse 	struct mm_struct *mm = vma->vm_mm;
2593*8315ada7SJérôme Glisse 	struct mem_cgroup *memcg;
2594*8315ada7SJérôme Glisse 	bool flush = false;
2595*8315ada7SJérôme Glisse 	spinlock_t *ptl;
2596*8315ada7SJérôme Glisse 	pte_t entry;
2597*8315ada7SJérôme Glisse 	pgd_t *pgdp;
2598*8315ada7SJérôme Glisse 	p4d_t *p4dp;
2599*8315ada7SJérôme Glisse 	pud_t *pudp;
2600*8315ada7SJérôme Glisse 	pmd_t *pmdp;
2601*8315ada7SJérôme Glisse 	pte_t *ptep;
2602*8315ada7SJérôme Glisse 
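	/*
	 * What follows mirrors the anonymous fault path in
	 * do_anonymous_page(): make sure the page table levels exist, charge
	 * the page to the memcg, then map it under the pte lock while
	 * updating rmap and lru state. Any failure simply clears
	 * MIGRATE_PFN_MIGRATE so the entry is skipped.
	 */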
2603*8315ada7SJérôme Glisse 	/* Only allow populating anonymous memory */
2604*8315ada7SJérôme Glisse 	if (!vma_is_anonymous(vma))
2605*8315ada7SJérôme Glisse 		goto abort;
2606*8315ada7SJérôme Glisse 
2607*8315ada7SJérôme Glisse 	pgdp = pgd_offset(mm, addr);
2608*8315ada7SJérôme Glisse 	p4dp = p4d_alloc(mm, pgdp, addr);
2609*8315ada7SJérôme Glisse 	if (!p4dp)
2610*8315ada7SJérôme Glisse 		goto abort;
2611*8315ada7SJérôme Glisse 	pudp = pud_alloc(mm, p4dp, addr);
2612*8315ada7SJérôme Glisse 	if (!pudp)
2613*8315ada7SJérôme Glisse 		goto abort;
2614*8315ada7SJérôme Glisse 	pmdp = pmd_alloc(mm, pudp, addr);
2615*8315ada7SJérôme Glisse 	if (!pmdp)
2616*8315ada7SJérôme Glisse 		goto abort;
2617*8315ada7SJérôme Glisse 
2618*8315ada7SJérôme Glisse 	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
2619*8315ada7SJérôme Glisse 		goto abort;
2620*8315ada7SJérôme Glisse 
2621*8315ada7SJérôme Glisse 	/*
2622*8315ada7SJérôme Glisse 	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
2623*8315ada7SJérôme Glisse 	 * pte_offset_map() on pmds where a huge pmd might be created
2624*8315ada7SJérôme Glisse 	 * from a different thread.
2625*8315ada7SJérôme Glisse 	 *
2626*8315ada7SJérôme Glisse 	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2627*8315ada7SJérôme Glisse 	 * parallel threads are excluded by other means.
2628*8315ada7SJérôme Glisse 	 *
2629*8315ada7SJérôme Glisse 	 * Here we only have down_read(mmap_sem).
2630*8315ada7SJérôme Glisse 	 */
2631*8315ada7SJérôme Glisse 	if (pte_alloc(mm, pmdp, addr))
2632*8315ada7SJérôme Glisse 		goto abort;
2633*8315ada7SJérôme Glisse 
2634*8315ada7SJérôme Glisse 	/* See the comment in pte_alloc_one_map() */
2635*8315ada7SJérôme Glisse 	if (unlikely(pmd_trans_unstable(pmdp)))
2636*8315ada7SJérôme Glisse 		goto abort;
2637*8315ada7SJérôme Glisse 
2638*8315ada7SJérôme Glisse 	if (unlikely(anon_vma_prepare(vma)))
2639*8315ada7SJérôme Glisse 		goto abort;
2640*8315ada7SJérôme Glisse 	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2641*8315ada7SJérôme Glisse 		goto abort;
2642*8315ada7SJérôme Glisse 
2643*8315ada7SJérôme Glisse 	/*
2644*8315ada7SJérôme Glisse 	 * The memory barrier inside __SetPageUptodate makes sure that
2645*8315ada7SJérôme Glisse 	 * preceding stores to the page contents become visible before
2646*8315ada7SJérôme Glisse 	 * the set_pte_at() write.
2647*8315ada7SJérôme Glisse 	 */
2648*8315ada7SJérôme Glisse 	__SetPageUptodate(page);
2649*8315ada7SJérôme Glisse 
2650*8315ada7SJérôme Glisse 	if (is_zone_device_page(page) && is_device_private_page(page)) {
2651*8315ada7SJérôme Glisse 		swp_entry_t swp_entry;
2652*8315ada7SJérôme Glisse 
2653*8315ada7SJérôme Glisse 		swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
2654*8315ada7SJérôme Glisse 		entry = swp_entry_to_pte(swp_entry);
2655*8315ada7SJérôme Glisse 	} else {
2656*8315ada7SJérôme Glisse 		entry = mk_pte(page, vma->vm_page_prot);
2657*8315ada7SJérôme Glisse 		if (vma->vm_flags & VM_WRITE)
2658*8315ada7SJérôme Glisse 			entry = pte_mkwrite(pte_mkdirty(entry));
2659*8315ada7SJérôme Glisse 	}
2660*8315ada7SJérôme Glisse 
2661*8315ada7SJérôme Glisse 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2662*8315ada7SJérôme Glisse 
2663*8315ada7SJérôme Glisse 	if (pte_present(*ptep)) {
2664*8315ada7SJérôme Glisse 		unsigned long pfn = pte_pfn(*ptep);
2665*8315ada7SJérôme Glisse 
2666*8315ada7SJérôme Glisse 		if (!is_zero_pfn(pfn)) {
2667*8315ada7SJérôme Glisse 			pte_unmap_unlock(ptep, ptl);
2668*8315ada7SJérôme Glisse 			mem_cgroup_cancel_charge(page, memcg, false);
2669*8315ada7SJérôme Glisse 			goto abort;
2670*8315ada7SJérôme Glisse 		}
2671*8315ada7SJérôme Glisse 		flush = true;
2672*8315ada7SJérôme Glisse 	} else if (!pte_none(*ptep)) {
2673*8315ada7SJérôme Glisse 		pte_unmap_unlock(ptep, ptl);
2674*8315ada7SJérôme Glisse 		mem_cgroup_cancel_charge(page, memcg, false);
2675*8315ada7SJérôme Glisse 		goto abort;
2676*8315ada7SJérôme Glisse 	}
2677*8315ada7SJérôme Glisse 
2678*8315ada7SJérôme Glisse 	/*
2679*8315ada7SJérôme Glisse 	 * Check for userfaultfd but do not deliver the fault. Instead,
2680*8315ada7SJérôme Glisse 	 * just back off.
2681*8315ada7SJérôme Glisse 	 */
2682*8315ada7SJérôme Glisse 	if (userfaultfd_missing(vma)) {
2683*8315ada7SJérôme Glisse 		pte_unmap_unlock(ptep, ptl);
2684*8315ada7SJérôme Glisse 		mem_cgroup_cancel_charge(page, memcg, false);
2685*8315ada7SJérôme Glisse 		goto abort;
2686*8315ada7SJérôme Glisse 	}
2687*8315ada7SJérôme Glisse 
2688*8315ada7SJérôme Glisse 	inc_mm_counter(mm, MM_ANONPAGES);
2689*8315ada7SJérôme Glisse 	page_add_new_anon_rmap(page, vma, addr, false);
2690*8315ada7SJérôme Glisse 	mem_cgroup_commit_charge(page, memcg, false, false);
2691*8315ada7SJérôme Glisse 	if (!is_zone_device_page(page))
2692*8315ada7SJérôme Glisse 		lru_cache_add_active_or_unevictable(page, vma);
2693*8315ada7SJérôme Glisse 	get_page(page);
2694*8315ada7SJérôme Glisse 
2695*8315ada7SJérôme Glisse 	if (flush) {
2696*8315ada7SJérôme Glisse 		flush_cache_page(vma, addr, pte_pfn(*ptep));
2697*8315ada7SJérôme Glisse 		ptep_clear_flush_notify(vma, addr, ptep);
2698*8315ada7SJérôme Glisse 		set_pte_at_notify(mm, addr, ptep, entry);
2699*8315ada7SJérôme Glisse 		update_mmu_cache(vma, addr, ptep);
2700*8315ada7SJérôme Glisse 	} else {
2701*8315ada7SJérôme Glisse 		/* No need to invalidate - it was non-present before */
2702*8315ada7SJérôme Glisse 		set_pte_at(mm, addr, ptep, entry);
2703*8315ada7SJérôme Glisse 		update_mmu_cache(vma, addr, ptep);
2704*8315ada7SJérôme Glisse 	}
2705*8315ada7SJérôme Glisse 
2706*8315ada7SJérôme Glisse 	pte_unmap_unlock(ptep, ptl);
2707*8315ada7SJérôme Glisse 	*src = MIGRATE_PFN_MIGRATE;
2708*8315ada7SJérôme Glisse 	return;
2709*8315ada7SJérôme Glisse 
2710*8315ada7SJérôme Glisse abort:
2711*8315ada7SJérôme Glisse 	*src &= ~MIGRATE_PFN_MIGRATE;
2712*8315ada7SJérôme Glisse }
2713*8315ada7SJérôme Glisse 
27148763cb45SJérôme Glisse /*
27158763cb45SJérôme Glisse  * migrate_vma_pages() - migrate meta-data from src page to dst page
27168763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
27178763cb45SJérôme Glisse  *
27188763cb45SJérôme Glisse  * This migrates struct page meta-data from source struct page to destination
27198763cb45SJérôme Glisse  * struct page. This effectively finishes the migration from source page to the
27208763cb45SJérôme Glisse  * destination page.
27218763cb45SJérôme Glisse  */
27228763cb45SJérôme Glisse static void migrate_vma_pages(struct migrate_vma *migrate)
27238763cb45SJérôme Glisse {
27248763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
27258763cb45SJérôme Glisse 	const unsigned long start = migrate->start;
2726*8315ada7SJérôme Glisse 	struct vm_area_struct *vma = migrate->vma;
2727*8315ada7SJérôme Glisse 	struct mm_struct *mm = vma->vm_mm;
2728*8315ada7SJérôme Glisse 	unsigned long addr, i, mmu_start;
2729*8315ada7SJérôme Glisse 	bool notified = false;
27308763cb45SJérôme Glisse 
27318763cb45SJérôme Glisse 	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
27328763cb45SJérôme Glisse 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
27338763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
27348763cb45SJérôme Glisse 		struct address_space *mapping;
27358763cb45SJérôme Glisse 		int r;
27368763cb45SJérôme Glisse 
2737*8315ada7SJérôme Glisse 		if (!newpage) {
2738*8315ada7SJérôme Glisse 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
27398763cb45SJérôme Glisse 			continue;
2740*8315ada7SJérôme Glisse 		}
2741*8315ada7SJérôme Glisse 
2742*8315ada7SJérôme Glisse 		if (!page) {
2743*8315ada7SJérôme Glisse 			if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
27448763cb45SJérôme Glisse 				continue;
2745*8315ada7SJérôme Glisse 			}
2746*8315ada7SJérôme Glisse 			if (!notified) {
2747*8315ada7SJérôme Glisse 				mmu_start = addr;
2748*8315ada7SJérôme Glisse 				notified = true;
2749*8315ada7SJérôme Glisse 				mmu_notifier_invalidate_range_start(mm,
2750*8315ada7SJérôme Glisse 								mmu_start,
2751*8315ada7SJérôme Glisse 								migrate->end);
2752*8315ada7SJérôme Glisse 			}
2753*8315ada7SJérôme Glisse 			migrate_vma_insert_page(migrate, addr, newpage,
2754*8315ada7SJérôme Glisse 						&migrate->src[i],
2755*8315ada7SJérôme Glisse 						&migrate->dst[i]);
2756*8315ada7SJérôme Glisse 			continue;
2757*8315ada7SJérôme Glisse 		}
27588763cb45SJérôme Glisse 
27598763cb45SJérôme Glisse 		mapping = page_mapping(page);
27608763cb45SJérôme Glisse 
2761a5430ddaSJérôme Glisse 		if (is_zone_device_page(newpage)) {
2762a5430ddaSJérôme Glisse 			if (is_device_private_page(newpage)) {
2763a5430ddaSJérôme Glisse 				/*
2764a5430ddaSJérôme Glisse 				 * For now we only support migrating private
2765a5430ddaSJérôme Glisse 				 * anonymous memory to un-addressable device memory.
2766a5430ddaSJérôme Glisse 				 */
2767a5430ddaSJérôme Glisse 				if (mapping) {
2768a5430ddaSJérôme Glisse 					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2769a5430ddaSJérôme Glisse 					continue;
2770a5430ddaSJérôme Glisse 				}
2771a5430ddaSJérôme Glisse 			} else {
2772a5430ddaSJérôme Glisse 				/*
2773a5430ddaSJérôme Glisse 				 * Other types of ZONE_DEVICE page are not
2774a5430ddaSJérôme Glisse 				 * supported.
2775a5430ddaSJérôme Glisse 				 */
2776a5430ddaSJérôme Glisse 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2777a5430ddaSJérôme Glisse 				continue;
2778a5430ddaSJérôme Glisse 			}
2779a5430ddaSJérôme Glisse 		}
2780a5430ddaSJérôme Glisse 
27818763cb45SJérôme Glisse 		r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
27828763cb45SJérôme Glisse 		if (r != MIGRATEPAGE_SUCCESS)
27838763cb45SJérôme Glisse 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
27848763cb45SJérôme Glisse 	}
2785*8315ada7SJérôme Glisse 
2786*8315ada7SJérôme Glisse 	if (notified)
2787*8315ada7SJérôme Glisse 		mmu_notifier_invalidate_range_end(mm, mmu_start,
2788*8315ada7SJérôme Glisse 						  migrate->end);
27898763cb45SJérôme Glisse }
27908763cb45SJérôme Glisse 
27918763cb45SJérôme Glisse /*
27928763cb45SJérôme Glisse  * migrate_vma_finalize() - restore CPU page table entry
27938763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
27948763cb45SJérôme Glisse  *
27958763cb45SJérôme Glisse  * This replaces the special migration pte entry with either a mapping to the
27968763cb45SJérôme Glisse  * new page if migration was successful for that page, or to the original page
27978763cb45SJérôme Glisse  * otherwise.
27988763cb45SJérôme Glisse  *
27998763cb45SJérôme Glisse  * This also unlocks the pages and puts them back on the lru, or, for device
28008763cb45SJérôme Glisse  * pages, drops the extra refcount instead.
28018763cb45SJérôme Glisse  */
28028763cb45SJérôme Glisse static void migrate_vma_finalize(struct migrate_vma *migrate)
28038763cb45SJérôme Glisse {
28048763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
28058763cb45SJérôme Glisse 	unsigned long i;
28068763cb45SJérôme Glisse 
28078763cb45SJérôme Glisse 	for (i = 0; i < npages; i++) {
28088763cb45SJérôme Glisse 		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
28098763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
28108763cb45SJérôme Glisse 
2811*8315ada7SJérôme Glisse 		if (!page) {
2812*8315ada7SJérôme Glisse 			if (newpage) {
2813*8315ada7SJérôme Glisse 				unlock_page(newpage);
2814*8315ada7SJérôme Glisse 				put_page(newpage);
2815*8315ada7SJérôme Glisse 			}
28168763cb45SJérôme Glisse 			continue;
2817*8315ada7SJérôme Glisse 		}
2818*8315ada7SJérôme Glisse 
28198763cb45SJérôme Glisse 		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
28208763cb45SJérôme Glisse 			if (newpage) {
28218763cb45SJérôme Glisse 				unlock_page(newpage);
28228763cb45SJérôme Glisse 				put_page(newpage);
28238763cb45SJérôme Glisse 			}
28248763cb45SJérôme Glisse 			newpage = page;
28258763cb45SJérôme Glisse 		}
28268763cb45SJérôme Glisse 
28278763cb45SJérôme Glisse 		remove_migration_ptes(page, newpage, false);
28288763cb45SJérôme Glisse 		unlock_page(page);
28298763cb45SJérôme Glisse 		migrate->cpages--;
28308763cb45SJérôme Glisse 
2831a5430ddaSJérôme Glisse 		if (is_zone_device_page(page))
2832a5430ddaSJérôme Glisse 			put_page(page);
2833a5430ddaSJérôme Glisse 		else
28348763cb45SJérôme Glisse 			putback_lru_page(page);
28358763cb45SJérôme Glisse 
28368763cb45SJérôme Glisse 		if (newpage != page) {
28378763cb45SJérôme Glisse 			unlock_page(newpage);
2838a5430ddaSJérôme Glisse 			if (is_zone_device_page(newpage))
2839a5430ddaSJérôme Glisse 				put_page(newpage);
2840a5430ddaSJérôme Glisse 			else
28418763cb45SJérôme Glisse 				putback_lru_page(newpage);
28428763cb45SJérôme Glisse 		}
28438763cb45SJérôme Glisse 	}
28448763cb45SJérôme Glisse }
28458763cb45SJérôme Glisse 
28468763cb45SJérôme Glisse /*
28478763cb45SJérôme Glisse  * migrate_vma() - migrate a range of memory inside vma
28488763cb45SJérôme Glisse  *
28498763cb45SJérôme Glisse  * @ops: migration callback for allocating destination memory and copying
28508763cb45SJérôme Glisse  * @vma: virtual memory area containing the range to be migrated
28518763cb45SJérôme Glisse  * @start: start address of the range to migrate (inclusive)
28528763cb45SJérôme Glisse  * @end: end address of the range to migrate (exclusive)
28538763cb45SJérôme Glisse  * @src: array of unsigned long encoding source pfns and MIGRATE_PFN_* flags
28548763cb45SJérôme Glisse  * @dst: array of unsigned long encoding destination pfns the same way
28558763cb45SJérôme Glisse  * @private: pointer passed back to each of the callbacks
28568763cb45SJérôme Glisse  * Returns: 0 on success, error code otherwise
28578763cb45SJérôme Glisse  *
28588763cb45SJérôme Glisse  * This function tries to migrate a range of virtual addresses, using callbacks
28598763cb45SJérôme Glisse  * to allocate and copy memory from source to destination. First it collects
28608763cb45SJérôme Glisse  * all the pages backing each virtual address in the range, saving them
28618763cb45SJérôme Glisse  * inside the src array. Then it locks those pages and unmaps them. Once the pages
28628763cb45SJérôme Glisse  * are locked and unmapped, it checks whether each page is pinned or not. Pages
28638763cb45SJérôme Glisse  * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
28648763cb45SJérôme Glisse  * in the corresponding src array entry. It then restores any pages that are
28658763cb45SJérôme Glisse  * pinned, by remapping and unlocking those pages.
28668763cb45SJérôme Glisse  *
28678763cb45SJérôme Glisse  * At this point it calls the alloc_and_copy() callback. For documentation on
28688763cb45SJérôme Glisse  * what is expected from that callback, see struct migrate_vma_ops comments in
28698763cb45SJérôme Glisse  * include/linux/migrate.h
28708763cb45SJérôme Glisse  *
28718763cb45SJérôme Glisse  * After the alloc_and_copy() callback, this function goes over each entry in
28728763cb45SJérôme Glisse  * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
28738763cb45SJérôme Glisse  * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
28748763cb45SJérôme Glisse  * then the function tries to migrate struct page information from the source
28758763cb45SJérôme Glisse  * struct page to the destination struct page. If it fails to migrate the struct
28768763cb45SJérôme Glisse  * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
28778763cb45SJérôme Glisse  * array.
28788763cb45SJérôme Glisse  *
28798763cb45SJérôme Glisse  * At this point all successfully migrated pages have an entry in the src
28808763cb45SJérôme Glisse  * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
28818763cb45SJérôme Glisse  * array entry with MIGRATE_PFN_VALID flag set.
28828763cb45SJérôme Glisse  *
28838763cb45SJérôme Glisse  * It then calls the finalize_and_map() callback. See comments for "struct
28848763cb45SJérôme Glisse  * migrate_vma_ops", in include/linux/migrate.h for details about
28858763cb45SJérôme Glisse  * finalize_and_map() behavior.
28868763cb45SJérôme Glisse  *
28878763cb45SJérôme Glisse  * After the finalize_and_map() callback, for successfully migrated pages, this
28888763cb45SJérôme Glisse  * function updates the CPU page table to point to new pages, otherwise it
28898763cb45SJérôme Glisse  * restores the CPU page table to point to the original source pages.
28908763cb45SJérôme Glisse  *
28918763cb45SJérôme Glisse  * Function returns 0 after the above steps, even if no pages were migrated
28928763cb45SJérôme Glisse  * (The function only returns an error if any of the arguments are invalid.)
28938763cb45SJérôme Glisse  *
28948763cb45SJérôme Glisse  * Both the src and dst arrays must be big enough for (end - start) >> PAGE_SHIFT
28958763cb45SJérôme Glisse  * unsigned long entries.
28968763cb45SJérôme Glisse  */
28978763cb45SJérôme Glisse int migrate_vma(const struct migrate_vma_ops *ops,
28988763cb45SJérôme Glisse 		struct vm_area_struct *vma,
28998763cb45SJérôme Glisse 		unsigned long start,
29008763cb45SJérôme Glisse 		unsigned long end,
29018763cb45SJérôme Glisse 		unsigned long *src,
29028763cb45SJérôme Glisse 		unsigned long *dst,
29038763cb45SJérôme Glisse 		void *private)
29048763cb45SJérôme Glisse {
29058763cb45SJérôme Glisse 	struct migrate_vma migrate;
29068763cb45SJérôme Glisse 
29078763cb45SJérôme Glisse 	/* Sanity check the arguments */
29088763cb45SJérôme Glisse 	start &= PAGE_MASK;
29098763cb45SJérôme Glisse 	end &= PAGE_MASK;
29108763cb45SJérôme Glisse 	if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
29118763cb45SJérôme Glisse 		return -EINVAL;
29128763cb45SJérôme Glisse 	if (start < vma->vm_start || start >= vma->vm_end)
29138763cb45SJérôme Glisse 		return -EINVAL;
29148763cb45SJérôme Glisse 	if (end <= vma->vm_start || end > vma->vm_end)
29158763cb45SJérôme Glisse 		return -EINVAL;
29168763cb45SJérôme Glisse 	if (!ops || !src || !dst || start >= end)
29178763cb45SJérôme Glisse 		return -EINVAL;
29188763cb45SJérôme Glisse 
29198763cb45SJérôme Glisse 	memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
29208763cb45SJérôme Glisse 	migrate.src = src;
29218763cb45SJérôme Glisse 	migrate.dst = dst;
29228763cb45SJérôme Glisse 	migrate.start = start;
29238763cb45SJérôme Glisse 	migrate.npages = 0;
29248763cb45SJérôme Glisse 	migrate.cpages = 0;
29258763cb45SJérôme Glisse 	migrate.end = end;
29268763cb45SJérôme Glisse 	migrate.vma = vma;
29278763cb45SJérôme Glisse 
29288763cb45SJérôme Glisse 	/* Collect, and try to unmap source pages */
29298763cb45SJérôme Glisse 	migrate_vma_collect(&migrate);
29308763cb45SJérôme Glisse 	if (!migrate.cpages)
29318763cb45SJérôme Glisse 		return 0;
29328763cb45SJérôme Glisse 
29338763cb45SJérôme Glisse 	/* Lock and isolate page */
29348763cb45SJérôme Glisse 	migrate_vma_prepare(&migrate);
29358763cb45SJérôme Glisse 	if (!migrate.cpages)
29368763cb45SJérôme Glisse 		return 0;
29378763cb45SJérôme Glisse 
29388763cb45SJérôme Glisse 	/* Unmap pages */
29398763cb45SJérôme Glisse 	migrate_vma_unmap(&migrate);
29408763cb45SJérôme Glisse 	if (!migrate.cpages)
29418763cb45SJérôme Glisse 		return 0;
29428763cb45SJérôme Glisse 
29438763cb45SJérôme Glisse 	/*
29448763cb45SJérôme Glisse 	 * At this point pages are locked and unmapped, and thus they have
29458763cb45SJérôme Glisse 	 * stable content and can safely be copied to destination memory that
29468763cb45SJérôme Glisse 	 * is allocated by the callback.
29478763cb45SJérôme Glisse 	 *
29488763cb45SJérôme Glisse 	 * Note that migration can fail in migrate_vma_pages() for each
29498763cb45SJérôme Glisse 	 * individual page.
29508763cb45SJérôme Glisse 	 */
29518763cb45SJérôme Glisse 	ops->alloc_and_copy(vma, src, dst, start, end, private);
29528763cb45SJérôme Glisse 
29538763cb45SJérôme Glisse 	/* This does the real migration of struct page */
29548763cb45SJérôme Glisse 	migrate_vma_pages(&migrate);
29558763cb45SJérôme Glisse 
29568763cb45SJérôme Glisse 	ops->finalize_and_map(vma, src, dst, start, end, private);
29578763cb45SJérôme Glisse 
29588763cb45SJérôme Glisse 	/* Unlock and remap pages */
29598763cb45SJérôme Glisse 	migrate_vma_finalize(&migrate);
29608763cb45SJérôme Glisse 
29618763cb45SJérôme Glisse 	return 0;
29628763cb45SJérôme Glisse }
29638763cb45SJérôme Glisse EXPORT_SYMBOL(migrate_vma);
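
/*
 * Example: a minimal sketch of how a device driver might drive migrate_vma()
 * for a page-aligned range already known to lie inside @vma, with mmap_sem
 * held for read. It assumes the struct migrate_vma_ops callback prototypes and
 * the migrate_pfn()/MIGRATE_PFN_* helpers declared in include/linux/migrate.h;
 * the demo_* names and NPAGES are made up for this sketch, error handling is
 * left out, and a real driver would typically hand out one of its own device
 * pages instead of calling alloc_page_vma().
 *
 *	static void demo_alloc_and_copy(struct vm_area_struct *vma,
 *					const unsigned long *src,
 *					unsigned long *dst,
 *					unsigned long start,
 *					unsigned long end,
 *					void *private)
 *	{
 *		unsigned long addr, i;
 *
 *		for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
 *			struct page *spage, *dpage;
 *
 *			if (!(src[i] & MIGRATE_PFN_MIGRATE))
 *				continue;
 *
 *			dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);
 *			if (!dpage)
 *				continue;
 *
 *			lock_page(dpage);
 *			spage = migrate_pfn_to_page(src[i]);
 *			if (spage)
 *				copy_highpage(dpage, spage);
 *			else
 *				clear_highpage(dpage);
 *
 *			dst[i] = migrate_pfn(page_to_pfn(dpage)) |
 *				 MIGRATE_PFN_LOCKED;
 *		}
 *	}
 *
 *	static void demo_finalize_and_map(struct vm_area_struct *vma,
 *					  const unsigned long *src,
 *					  const unsigned long *dst,
 *					  unsigned long start,
 *					  unsigned long end,
 *					  void *private)
 *	{
 *		unsigned long i, migrated = 0;
 *
 *		for (i = 0; i < (end - start) >> PAGE_SHIFT; i++)
 *			if (src[i] & MIGRATE_PFN_MIGRATE)
 *				migrated++;
 *		pr_debug("%lu pages migrated\n", migrated);
 *	}
 *
 *	static const struct migrate_vma_ops demo_migrate_ops = {
 *		.alloc_and_copy		= demo_alloc_and_copy,
 *		.finalize_and_map	= demo_finalize_and_map,
 *	};
 *
 * and then, for a range of NPAGES pages starting at start:
 *
 *	unsigned long src[NPAGES], dst[NPAGES];
 *
 *	migrate_vma(&demo_migrate_ops, vma, start,
 *		    start + NPAGES * PAGE_SIZE, src, dst, NULL);
 *
 * Entries of src that still have MIGRATE_PFN_MIGRATE set after the call were
 * successfully migrated to the pages the driver provided in dst.
 */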
2964