xref: /linux/mm/migrate.c (revision 78bd52097d04205a33a8014a1b8ac01cf1ae9d06)
1b20a3503SChristoph Lameter /*
2b20a3503SChristoph Lameter  * Memory Migration functionality - linux/mm/migrate.c
3b20a3503SChristoph Lameter  *
4b20a3503SChristoph Lameter  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5b20a3503SChristoph Lameter  *
6b20a3503SChristoph Lameter  * Page migration was first developed in the context of the memory hotplug
7b20a3503SChristoph Lameter  * project. The main authors of the migration code are:
8b20a3503SChristoph Lameter  *
9b20a3503SChristoph Lameter  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10b20a3503SChristoph Lameter  * Hirokazu Takahashi <taka@valinux.co.jp>
11b20a3503SChristoph Lameter  * Dave Hansen <haveblue@us.ibm.com>
12cde53535SChristoph Lameter  * Christoph Lameter
13b20a3503SChristoph Lameter  */
14b20a3503SChristoph Lameter 
15b20a3503SChristoph Lameter #include <linux/migrate.h>
16b95f1b31SPaul Gortmaker #include <linux/export.h>
17b20a3503SChristoph Lameter #include <linux/swap.h>
180697212aSChristoph Lameter #include <linux/swapops.h>
19b20a3503SChristoph Lameter #include <linux/pagemap.h>
20e23ca00bSChristoph Lameter #include <linux/buffer_head.h>
21b20a3503SChristoph Lameter #include <linux/mm_inline.h>
22b488893aSPavel Emelyanov #include <linux/nsproxy.h>
23b20a3503SChristoph Lameter #include <linux/pagevec.h>
24e9995ef9SHugh Dickins #include <linux/ksm.h>
25b20a3503SChristoph Lameter #include <linux/rmap.h>
26b20a3503SChristoph Lameter #include <linux/topology.h>
27b20a3503SChristoph Lameter #include <linux/cpu.h>
28b20a3503SChristoph Lameter #include <linux/cpuset.h>
2904e62a29SChristoph Lameter #include <linux/writeback.h>
30742755a1SChristoph Lameter #include <linux/mempolicy.h>
31742755a1SChristoph Lameter #include <linux/vmalloc.h>
3286c3a764SDavid Quigley #include <linux/security.h>
338a9f3ccdSBalbir Singh #include <linux/memcontrol.h>
344f5ca265SAdrian Bunk #include <linux/syscalls.h>
35290408d4SNaoya Horiguchi #include <linux/hugetlb.h>
368e6ac7faSAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
375a0e3ad6STejun Heo #include <linux/gfp.h>
38b20a3503SChristoph Lameter 
390d1836c3SMichal Nazarewicz #include <asm/tlbflush.h>
400d1836c3SMichal Nazarewicz 
41b20a3503SChristoph Lameter #include "internal.h"
42b20a3503SChristoph Lameter 
43b20a3503SChristoph Lameter /*
44742755a1SChristoph Lameter  * migrate_prep() needs to be called before we start compiling a list of pages
45748446bbSMel Gorman  * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
46748446bbSMel Gorman  * undesirable, use migrate_prep_local()
47b20a3503SChristoph Lameter  */
48b20a3503SChristoph Lameter int migrate_prep(void)
49b20a3503SChristoph Lameter {
50b20a3503SChristoph Lameter 	/*
51b20a3503SChristoph Lameter 	 * Clear the LRU lists so pages can be isolated.
52b20a3503SChristoph Lameter 	 * Note that pages may be moved off the LRU after we have
53b20a3503SChristoph Lameter 	 * drained them. Those pages will fail to migrate like other
54b20a3503SChristoph Lameter 	 * pages that may be busy.
55b20a3503SChristoph Lameter 	 */
56b20a3503SChristoph Lameter 	lru_add_drain_all();
57b20a3503SChristoph Lameter 
58b20a3503SChristoph Lameter 	return 0;
59b20a3503SChristoph Lameter }
60b20a3503SChristoph Lameter 
61748446bbSMel Gorman /* Do the necessary work of migrate_prep but not if it involves other CPUs */
62748446bbSMel Gorman int migrate_prep_local(void)
63748446bbSMel Gorman {
64748446bbSMel Gorman 	lru_add_drain();
65748446bbSMel Gorman 
66748446bbSMel Gorman 	return 0;
67748446bbSMel Gorman }
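
/*
 * A minimal usage sketch of the above, modelled on
 * do_move_page_to_node_array() below ("page" and "pagelist" stand in for the
 * caller's own variables, and error handling is elided): drain the per-CPU
 * LRU pagevecs first, then isolate each page that is to be migrated:
 *
 *	LIST_HEAD(pagelist);
 *
 *	migrate_prep();
 *	...
 *	if (!isolate_lru_page(page)) {
 *		list_add_tail(&page->lru, &pagelist);
 *		inc_zone_page_state(page, NR_ISOLATED_ANON +
 *				    page_is_file_cache(page));
 *	}
 */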
68748446bbSMel Gorman 
69b20a3503SChristoph Lameter /*
70894bc310SLee Schermerhorn  * Add isolated pages on the list back to the LRU under page lock
71894bc310SLee Schermerhorn  * to avoid leaking evictable pages back onto the unevictable list.
72b20a3503SChristoph Lameter  */
73e13861d8SMinchan Kim void putback_lru_pages(struct list_head *l)
74b20a3503SChristoph Lameter {
75b20a3503SChristoph Lameter 	struct page *page;
76b20a3503SChristoph Lameter 	struct page *page2;
77b20a3503SChristoph Lameter 
78b20a3503SChristoph Lameter 	list_for_each_entry_safe(page, page2, l, lru) {
79e24f0b8fSChristoph Lameter 		list_del(&page->lru);
80a731286dSKOSAKI Motohiro 		dec_zone_page_state(page, NR_ISOLATED_ANON +
816c0b1351SJohannes Weiner 				page_is_file_cache(page));
82894bc310SLee Schermerhorn 		putback_lru_page(page);
83b20a3503SChristoph Lameter 	}
84b20a3503SChristoph Lameter }
85b20a3503SChristoph Lameter 
860697212aSChristoph Lameter /*
870697212aSChristoph Lameter  * Restore a potential migration pte to a working pte entry
880697212aSChristoph Lameter  */
89e9995ef9SHugh Dickins static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
90e9995ef9SHugh Dickins 				 unsigned long addr, void *old)
910697212aSChristoph Lameter {
920697212aSChristoph Lameter 	struct mm_struct *mm = vma->vm_mm;
930697212aSChristoph Lameter 	swp_entry_t entry;
940697212aSChristoph Lameter  	pmd_t *pmd;
950697212aSChristoph Lameter 	pte_t *ptep, pte;
960697212aSChristoph Lameter  	spinlock_t *ptl;
970697212aSChristoph Lameter 
98290408d4SNaoya Horiguchi 	if (unlikely(PageHuge(new))) {
99290408d4SNaoya Horiguchi 		ptep = huge_pte_offset(mm, addr);
100290408d4SNaoya Horiguchi 		if (!ptep)
101290408d4SNaoya Horiguchi 			goto out;
102290408d4SNaoya Horiguchi 		ptl = &mm->page_table_lock;
103290408d4SNaoya Horiguchi 	} else {
1046219049aSBob Liu 		pmd = mm_find_pmd(mm, addr);
1056219049aSBob Liu 		if (!pmd)
106e9995ef9SHugh Dickins 			goto out;
107500d65d4SAndrea Arcangeli 		if (pmd_trans_huge(*pmd))
108500d65d4SAndrea Arcangeli 			goto out;
1090697212aSChristoph Lameter 
1100697212aSChristoph Lameter 		ptep = pte_offset_map(pmd, addr);
1110697212aSChristoph Lameter 
112486cf46fSHugh Dickins 		/*
113486cf46fSHugh Dickins 		 * Peek to check is_swap_pte() before taking ptlock?  No, we
114486cf46fSHugh Dickins 		 * can race mremap's move_ptes(), which skips anon_vma lock.
115486cf46fSHugh Dickins 		 */
1160697212aSChristoph Lameter 
1170697212aSChristoph Lameter 		ptl = pte_lockptr(mm, pmd);
118290408d4SNaoya Horiguchi 	}
119290408d4SNaoya Horiguchi 
1200697212aSChristoph Lameter  	spin_lock(ptl);
1210697212aSChristoph Lameter 	pte = *ptep;
1220697212aSChristoph Lameter 	if (!is_swap_pte(pte))
123e9995ef9SHugh Dickins 		goto unlock;
1240697212aSChristoph Lameter 
1250697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
1260697212aSChristoph Lameter 
127e9995ef9SHugh Dickins 	if (!is_migration_entry(entry) ||
128e9995ef9SHugh Dickins 	    migration_entry_to_page(entry) != old)
129e9995ef9SHugh Dickins 		goto unlock;
1300697212aSChristoph Lameter 
1310697212aSChristoph Lameter 	get_page(new);
1320697212aSChristoph Lameter 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
1330697212aSChristoph Lameter 	if (is_write_migration_entry(entry))
1340697212aSChristoph Lameter 		pte = pte_mkwrite(pte);
1353ef8fd7fSAndi Kleen #ifdef CONFIG_HUGETLB_PAGE
136290408d4SNaoya Horiguchi 	if (PageHuge(new))
137290408d4SNaoya Horiguchi 		pte = pte_mkhuge(pte);
1383ef8fd7fSAndi Kleen #endif
13997ee0524SKAMEZAWA Hiroyuki 	flush_cache_page(vma, addr, pte_pfn(pte));
1400697212aSChristoph Lameter 	set_pte_at(mm, addr, ptep, pte);
14104e62a29SChristoph Lameter 
142290408d4SNaoya Horiguchi 	if (PageHuge(new)) {
14304e62a29SChristoph Lameter 		if (PageAnon(new))
144290408d4SNaoya Horiguchi 			hugepage_add_anon_rmap(new, vma, addr);
145290408d4SNaoya Horiguchi 		else
146290408d4SNaoya Horiguchi 			page_dup_rmap(new);
147290408d4SNaoya Horiguchi 	} else if (PageAnon(new))
1480697212aSChristoph Lameter 		page_add_anon_rmap(new, vma, addr);
14904e62a29SChristoph Lameter 	else
15004e62a29SChristoph Lameter 		page_add_file_rmap(new);
15104e62a29SChristoph Lameter 
15204e62a29SChristoph Lameter 	/* No need to invalidate - it was non-present before */
1534b3073e1SRussell King 	update_mmu_cache(vma, addr, ptep);
154e9995ef9SHugh Dickins unlock:
1550697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
156e9995ef9SHugh Dickins out:
157e9995ef9SHugh Dickins 	return SWAP_AGAIN;
1580697212aSChristoph Lameter }
1590697212aSChristoph Lameter 
1600697212aSChristoph Lameter /*
16104e62a29SChristoph Lameter  * Get rid of all migration entries and replace them by
16204e62a29SChristoph Lameter  * references to the indicated page.
16304e62a29SChristoph Lameter  */
16404e62a29SChristoph Lameter static void remove_migration_ptes(struct page *old, struct page *new)
16504e62a29SChristoph Lameter {
166e9995ef9SHugh Dickins 	rmap_walk(new, remove_migration_pte, old);
16704e62a29SChristoph Lameter }
16804e62a29SChristoph Lameter 
16904e62a29SChristoph Lameter /*
1700697212aSChristoph Lameter  * Something used the pte of a page under migration. We need to
1710697212aSChristoph Lameter  * get to the page and wait until migration is finished.
1720697212aSChristoph Lameter  * When we return from this function the fault will be retried.
1730697212aSChristoph Lameter  */
1740697212aSChristoph Lameter void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
1750697212aSChristoph Lameter 				unsigned long address)
1760697212aSChristoph Lameter {
1770697212aSChristoph Lameter 	pte_t *ptep, pte;
1780697212aSChristoph Lameter 	spinlock_t *ptl;
1790697212aSChristoph Lameter 	swp_entry_t entry;
1800697212aSChristoph Lameter 	struct page *page;
1810697212aSChristoph Lameter 
1820697212aSChristoph Lameter 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1830697212aSChristoph Lameter 	pte = *ptep;
1840697212aSChristoph Lameter 	if (!is_swap_pte(pte))
1850697212aSChristoph Lameter 		goto out;
1860697212aSChristoph Lameter 
1870697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
1880697212aSChristoph Lameter 	if (!is_migration_entry(entry))
1890697212aSChristoph Lameter 		goto out;
1900697212aSChristoph Lameter 
1910697212aSChristoph Lameter 	page = migration_entry_to_page(entry);
1920697212aSChristoph Lameter 
193e286781dSNick Piggin 	/*
194e286781dSNick Piggin 	 * Once the radix-tree replacement for page migration has started,
195e286781dSNick Piggin 	 * page_count *must* be zero. And we don't want to call
196e286781dSNick Piggin 	 * wait_on_page_locked() against a page without holding a reference
197e286781dSNick Piggin 	 * from get_page(), so we use get_page_unless_zero() here. Even if
198e286781dSNick Piggin 	 * that fails, the page fault will simply occur again.
199e286781dSNick Piggin 	 */
200e286781dSNick Piggin 	if (!get_page_unless_zero(page))
201e286781dSNick Piggin 		goto out;
2020697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
2030697212aSChristoph Lameter 	wait_on_page_locked(page);
2040697212aSChristoph Lameter 	put_page(page);
2050697212aSChristoph Lameter 	return;
2060697212aSChristoph Lameter out:
2070697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
2080697212aSChristoph Lameter }
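
/*
 * A simplified sketch of how the fault path typically gets here, roughly as
 * in do_swap_page() (hwpoison and error handling omitted):
 *
 *	entry = pte_to_swp_entry(orig_pte);
 *	if (unlikely(non_swap_entry(entry))) {
 *		if (is_migration_entry(entry))
 *			migration_entry_wait(mm, pmd, address);
 *		...
 *		goto out;
 *	}
 */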
2090697212aSChristoph Lameter 
210b969c4abSMel Gorman #ifdef CONFIG_BLOCK
211b969c4abSMel Gorman /* Returns true if all buffers are successfully locked */
212a6bc32b8SMel Gorman static bool buffer_migrate_lock_buffers(struct buffer_head *head,
213a6bc32b8SMel Gorman 							enum migrate_mode mode)
214b969c4abSMel Gorman {
215b969c4abSMel Gorman 	struct buffer_head *bh = head;
216b969c4abSMel Gorman 
217b969c4abSMel Gorman 	/* Simple case, sync compaction */
218a6bc32b8SMel Gorman 	if (mode != MIGRATE_ASYNC) {
219b969c4abSMel Gorman 		do {
220b969c4abSMel Gorman 			get_bh(bh);
221b969c4abSMel Gorman 			lock_buffer(bh);
222b969c4abSMel Gorman 			bh = bh->b_this_page;
223b969c4abSMel Gorman 
224b969c4abSMel Gorman 		} while (bh != head);
225b969c4abSMel Gorman 
226b969c4abSMel Gorman 		return true;
227b969c4abSMel Gorman 	}
228b969c4abSMel Gorman 
229b969c4abSMel Gorman 	/* async case, we cannot block on lock_buffer so use trylock_buffer */
230b969c4abSMel Gorman 	do {
231b969c4abSMel Gorman 		get_bh(bh);
232b969c4abSMel Gorman 		if (!trylock_buffer(bh)) {
233b969c4abSMel Gorman 			/*
234b969c4abSMel Gorman 			 * We failed to lock the buffer and cannot stall in
235b969c4abSMel Gorman 			 * async migration. Release the taken locks
236b969c4abSMel Gorman 			 */
237b969c4abSMel Gorman 			struct buffer_head *failed_bh = bh;
238b969c4abSMel Gorman 			put_bh(failed_bh);
239b969c4abSMel Gorman 			bh = head;
240b969c4abSMel Gorman 			while (bh != failed_bh) {
241b969c4abSMel Gorman 				unlock_buffer(bh);
242b969c4abSMel Gorman 				put_bh(bh);
243b969c4abSMel Gorman 				bh = bh->b_this_page;
244b969c4abSMel Gorman 			}
245b969c4abSMel Gorman 			return false;
246b969c4abSMel Gorman 		}
247b969c4abSMel Gorman 
248b969c4abSMel Gorman 		bh = bh->b_this_page;
249b969c4abSMel Gorman 	} while (bh != head);
250b969c4abSMel Gorman 	return true;
251b969c4abSMel Gorman }
252b969c4abSMel Gorman #else
253b969c4abSMel Gorman static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
254a6bc32b8SMel Gorman 							enum migrate_mode mode)
255b969c4abSMel Gorman {
256b969c4abSMel Gorman 	return true;
257b969c4abSMel Gorman }
258b969c4abSMel Gorman #endif /* CONFIG_BLOCK */
259b969c4abSMel Gorman 
260b20a3503SChristoph Lameter /*
261c3fcf8a5SChristoph Lameter  * Replace the page in the mapping.
2625b5c7120SChristoph Lameter  *
2635b5c7120SChristoph Lameter  * The number of remaining references must be:
2645b5c7120SChristoph Lameter  * 1 for anonymous pages without a mapping
2655b5c7120SChristoph Lameter  * 2 for pages with a mapping
266266cf658SDavid Howells  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
267b20a3503SChristoph Lameter  */
2682d1db3b1SChristoph Lameter static int migrate_page_move_mapping(struct address_space *mapping,
269b969c4abSMel Gorman 		struct page *newpage, struct page *page,
270a6bc32b8SMel Gorman 		struct buffer_head *head, enum migrate_mode mode)
271b20a3503SChristoph Lameter {
272e286781dSNick Piggin 	int expected_count;
2737cf9c2c7SNick Piggin 	void **pslot;
274b20a3503SChristoph Lameter 
2756c5240aeSChristoph Lameter 	if (!mapping) {
2760e8c7d0fSChristoph Lameter 		/* Anonymous page without mapping */
2776c5240aeSChristoph Lameter 		if (page_count(page) != 1)
2786c5240aeSChristoph Lameter 			return -EAGAIN;
279*78bd5209SRafael Aquini 		return MIGRATEPAGE_SUCCESS;
2806c5240aeSChristoph Lameter 	}
2816c5240aeSChristoph Lameter 
28219fd6231SNick Piggin 	spin_lock_irq(&mapping->tree_lock);
283b20a3503SChristoph Lameter 
2847cf9c2c7SNick Piggin 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
285b20a3503SChristoph Lameter  					page_index(page));
286b20a3503SChristoph Lameter 
287edcf4748SJohannes Weiner 	expected_count = 2 + page_has_private(page);
288e286781dSNick Piggin 	if (page_count(page) != expected_count ||
28929c1f677SMel Gorman 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
29019fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
291e23ca00bSChristoph Lameter 		return -EAGAIN;
292b20a3503SChristoph Lameter 	}
293b20a3503SChristoph Lameter 
294e286781dSNick Piggin 	if (!page_freeze_refs(page, expected_count)) {
29519fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
296e286781dSNick Piggin 		return -EAGAIN;
297e286781dSNick Piggin 	}
298e286781dSNick Piggin 
299b20a3503SChristoph Lameter 	/*
300b969c4abSMel Gorman 	 * In the async migration case of moving a page with buffers, lock the
301b969c4abSMel Gorman 	 * buffers using trylock before the mapping is moved. If we moved the
302b969c4abSMel Gorman 	 * mapping first and then failed to lock the buffers, we could not
303b969c4abSMel Gorman 	 * move the mapping back because of the elevated page count and would
304b969c4abSMel Gorman 	 * have to block waiting on other references to be dropped.
305b969c4abSMel Gorman 	 */
306a6bc32b8SMel Gorman 	if (mode == MIGRATE_ASYNC && head &&
307a6bc32b8SMel Gorman 			!buffer_migrate_lock_buffers(head, mode)) {
308b969c4abSMel Gorman 		page_unfreeze_refs(page, expected_count);
309b969c4abSMel Gorman 		spin_unlock_irq(&mapping->tree_lock);
310b969c4abSMel Gorman 		return -EAGAIN;
311b969c4abSMel Gorman 	}
312b969c4abSMel Gorman 
313b969c4abSMel Gorman 	/*
314b20a3503SChristoph Lameter 	 * Now we know that no one else is looking at the page.
315b20a3503SChristoph Lameter 	 */
3167cf9c2c7SNick Piggin 	get_page(newpage);	/* add cache reference */
317b20a3503SChristoph Lameter 	if (PageSwapCache(page)) {
318b20a3503SChristoph Lameter 		SetPageSwapCache(newpage);
319b20a3503SChristoph Lameter 		set_page_private(newpage, page_private(page));
320b20a3503SChristoph Lameter 	}
321b20a3503SChristoph Lameter 
3227cf9c2c7SNick Piggin 	radix_tree_replace_slot(pslot, newpage);
3237cf9c2c7SNick Piggin 
3247cf9c2c7SNick Piggin 	/*
325937a94c9SJacobo Giralt 	 * Drop cache reference from old page by unfreezing
326937a94c9SJacobo Giralt 	 * to one less reference.
3277cf9c2c7SNick Piggin 	 * We know this isn't the last reference.
3287cf9c2c7SNick Piggin 	 */
329937a94c9SJacobo Giralt 	page_unfreeze_refs(page, expected_count - 1);
3307cf9c2c7SNick Piggin 
3310e8c7d0fSChristoph Lameter 	/*
3320e8c7d0fSChristoph Lameter 	 * If moved to a different zone then also account
3330e8c7d0fSChristoph Lameter 	 * the page for that zone. Other VM counters will be
3340e8c7d0fSChristoph Lameter 	 * taken care of when we establish references to the
3350e8c7d0fSChristoph Lameter 	 * new page and drop references to the old page.
3360e8c7d0fSChristoph Lameter 	 *
3370e8c7d0fSChristoph Lameter 	 * Note that anonymous pages are accounted for
3380e8c7d0fSChristoph Lameter 	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
3390e8c7d0fSChristoph Lameter 	 * are mapped to swap space.
3400e8c7d0fSChristoph Lameter 	 */
3410e8c7d0fSChristoph Lameter 	__dec_zone_page_state(page, NR_FILE_PAGES);
3420e8c7d0fSChristoph Lameter 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
34399a15e21SAndrea Arcangeli 	if (!PageSwapCache(page) && PageSwapBacked(page)) {
3444b02108aSKOSAKI Motohiro 		__dec_zone_page_state(page, NR_SHMEM);
3454b02108aSKOSAKI Motohiro 		__inc_zone_page_state(newpage, NR_SHMEM);
3464b02108aSKOSAKI Motohiro 	}
34719fd6231SNick Piggin 	spin_unlock_irq(&mapping->tree_lock);
348b20a3503SChristoph Lameter 
349*78bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
350b20a3503SChristoph Lameter }
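
/*
 * A rough breakdown of expected_count above, assuming the usual convention
 * in this file that the migrating caller holds one reference taken when the
 * page was isolated from the LRU:
 *
 *	anonymous, no mapping:	1 (isolating caller)
 *	page with a mapping:	1 (radix tree) + 1 (isolating caller)
 *	mapping + private:	1 (radix tree) + 1 (caller) + 1 (buffers)
 */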
351b20a3503SChristoph Lameter 
352b20a3503SChristoph Lameter /*
353290408d4SNaoya Horiguchi  * The expected number of remaining references is the same as that
354290408d4SNaoya Horiguchi  * of migrate_page_move_mapping().
355290408d4SNaoya Horiguchi  */
356290408d4SNaoya Horiguchi int migrate_huge_page_move_mapping(struct address_space *mapping,
357290408d4SNaoya Horiguchi 				   struct page *newpage, struct page *page)
358290408d4SNaoya Horiguchi {
359290408d4SNaoya Horiguchi 	int expected_count;
360290408d4SNaoya Horiguchi 	void **pslot;
361290408d4SNaoya Horiguchi 
362290408d4SNaoya Horiguchi 	if (!mapping) {
363290408d4SNaoya Horiguchi 		if (page_count(page) != 1)
364290408d4SNaoya Horiguchi 			return -EAGAIN;
365*78bd5209SRafael Aquini 		return MIGRATEPAGE_SUCCESS;
366290408d4SNaoya Horiguchi 	}
367290408d4SNaoya Horiguchi 
368290408d4SNaoya Horiguchi 	spin_lock_irq(&mapping->tree_lock);
369290408d4SNaoya Horiguchi 
370290408d4SNaoya Horiguchi 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
371290408d4SNaoya Horiguchi 					page_index(page));
372290408d4SNaoya Horiguchi 
373290408d4SNaoya Horiguchi 	expected_count = 2 + page_has_private(page);
374290408d4SNaoya Horiguchi 	if (page_count(page) != expected_count ||
37529c1f677SMel Gorman 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
376290408d4SNaoya Horiguchi 		spin_unlock_irq(&mapping->tree_lock);
377290408d4SNaoya Horiguchi 		return -EAGAIN;
378290408d4SNaoya Horiguchi 	}
379290408d4SNaoya Horiguchi 
380290408d4SNaoya Horiguchi 	if (!page_freeze_refs(page, expected_count)) {
381290408d4SNaoya Horiguchi 		spin_unlock_irq(&mapping->tree_lock);
382290408d4SNaoya Horiguchi 		return -EAGAIN;
383290408d4SNaoya Horiguchi 	}
384290408d4SNaoya Horiguchi 
385290408d4SNaoya Horiguchi 	get_page(newpage);
386290408d4SNaoya Horiguchi 
387290408d4SNaoya Horiguchi 	radix_tree_replace_slot(pslot, newpage);
388290408d4SNaoya Horiguchi 
389937a94c9SJacobo Giralt 	page_unfreeze_refs(page, expected_count - 1);
390290408d4SNaoya Horiguchi 
391290408d4SNaoya Horiguchi 	spin_unlock_irq(&mapping->tree_lock);
392*78bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
393290408d4SNaoya Horiguchi }
394290408d4SNaoya Horiguchi 
395290408d4SNaoya Horiguchi /*
396b20a3503SChristoph Lameter  * Copy the page to its new location
397b20a3503SChristoph Lameter  */
398290408d4SNaoya Horiguchi void migrate_page_copy(struct page *newpage, struct page *page)
399b20a3503SChristoph Lameter {
400290408d4SNaoya Horiguchi 	if (PageHuge(page))
401290408d4SNaoya Horiguchi 		copy_huge_page(newpage, page);
402290408d4SNaoya Horiguchi 	else
403b20a3503SChristoph Lameter 		copy_highpage(newpage, page);
404b20a3503SChristoph Lameter 
405b20a3503SChristoph Lameter 	if (PageError(page))
406b20a3503SChristoph Lameter 		SetPageError(newpage);
407b20a3503SChristoph Lameter 	if (PageReferenced(page))
408b20a3503SChristoph Lameter 		SetPageReferenced(newpage);
409b20a3503SChristoph Lameter 	if (PageUptodate(page))
410b20a3503SChristoph Lameter 		SetPageUptodate(newpage);
411894bc310SLee Schermerhorn 	if (TestClearPageActive(page)) {
412894bc310SLee Schermerhorn 		VM_BUG_ON(PageUnevictable(page));
413b20a3503SChristoph Lameter 		SetPageActive(newpage);
414418b27efSLee Schermerhorn 	} else if (TestClearPageUnevictable(page))
415418b27efSLee Schermerhorn 		SetPageUnevictable(newpage);
416b20a3503SChristoph Lameter 	if (PageChecked(page))
417b20a3503SChristoph Lameter 		SetPageChecked(newpage);
418b20a3503SChristoph Lameter 	if (PageMappedToDisk(page))
419b20a3503SChristoph Lameter 		SetPageMappedToDisk(newpage);
420b20a3503SChristoph Lameter 
421b20a3503SChristoph Lameter 	if (PageDirty(page)) {
422b20a3503SChristoph Lameter 		clear_page_dirty_for_io(page);
4233a902c5fSNick Piggin 		/*
4243a902c5fSNick Piggin 		 * Want to mark the page and the radix tree as dirty, and
4253a902c5fSNick Piggin 		 * redo the accounting that clear_page_dirty_for_io undid,
4263a902c5fSNick Piggin 		 * but we can't use set_page_dirty because that function
4273a902c5fSNick Piggin 		 * is actually a signal that all of the page has become dirty,
42825985edcSLucas De Marchi 		 * whereas only part of our page may be dirty.
4293a902c5fSNick Piggin 		 */
430752dc185SHugh Dickins 		if (PageSwapBacked(page))
431752dc185SHugh Dickins 			SetPageDirty(newpage);
432752dc185SHugh Dickins 		else
4333a902c5fSNick Piggin 			__set_page_dirty_nobuffers(newpage);
434b20a3503SChristoph Lameter  	}
435b20a3503SChristoph Lameter 
436b291f000SNick Piggin 	mlock_migrate_page(newpage, page);
437e9995ef9SHugh Dickins 	ksm_migrate_page(newpage, page);
438b291f000SNick Piggin 
439b20a3503SChristoph Lameter 	ClearPageSwapCache(page);
440b20a3503SChristoph Lameter 	ClearPagePrivate(page);
441b20a3503SChristoph Lameter 	set_page_private(page, 0);
442b20a3503SChristoph Lameter 
443b20a3503SChristoph Lameter 	/*
444b20a3503SChristoph Lameter 	 * If any waiters have accumulated on the new page then
445b20a3503SChristoph Lameter 	 * wake them up.
446b20a3503SChristoph Lameter 	 */
447b20a3503SChristoph Lameter 	if (PageWriteback(newpage))
448b20a3503SChristoph Lameter 		end_page_writeback(newpage);
449b20a3503SChristoph Lameter }
450b20a3503SChristoph Lameter 
4511d8b85ccSChristoph Lameter /************************************************************
4521d8b85ccSChristoph Lameter  *                    Migration functions
4531d8b85ccSChristoph Lameter  ***********************************************************/
4541d8b85ccSChristoph Lameter 
4551d8b85ccSChristoph Lameter /* Always fail migration. Used for mappings that are not movable */
4562d1db3b1SChristoph Lameter int fail_migrate_page(struct address_space *mapping,
4572d1db3b1SChristoph Lameter 			struct page *newpage, struct page *page)
4581d8b85ccSChristoph Lameter {
4591d8b85ccSChristoph Lameter 	return -EIO;
4601d8b85ccSChristoph Lameter }
4611d8b85ccSChristoph Lameter EXPORT_SYMBOL(fail_migrate_page);
4621d8b85ccSChristoph Lameter 
463b20a3503SChristoph Lameter /*
464b20a3503SChristoph Lameter  * Common logic to directly migrate a single page suitable for
465266cf658SDavid Howells  * pages that do not use PagePrivate/PagePrivate2.
466b20a3503SChristoph Lameter  *
467b20a3503SChristoph Lameter  * Pages are locked upon entry and exit.
468b20a3503SChristoph Lameter  */
4692d1db3b1SChristoph Lameter int migrate_page(struct address_space *mapping,
470a6bc32b8SMel Gorman 		struct page *newpage, struct page *page,
471a6bc32b8SMel Gorman 		enum migrate_mode mode)
472b20a3503SChristoph Lameter {
473b20a3503SChristoph Lameter 	int rc;
474b20a3503SChristoph Lameter 
475b20a3503SChristoph Lameter 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
476b20a3503SChristoph Lameter 
477a6bc32b8SMel Gorman 	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
478b20a3503SChristoph Lameter 
479*78bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
480b20a3503SChristoph Lameter 		return rc;
481b20a3503SChristoph Lameter 
482b20a3503SChristoph Lameter 	migrate_page_copy(newpage, page);
483*78bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
484b20a3503SChristoph Lameter }
485b20a3503SChristoph Lameter EXPORT_SYMBOL(migrate_page);
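
/*
 * A minimal sketch of how a filesystem with no special migration needs might
 * wire this up in its address_space_operations ("example_*" names are
 * placeholders for the filesystem's own methods):
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		.migratepage	= migrate_page,
 *	};
 *
 * Filesystems whose pages carry buffer_heads would typically point
 * .migratepage at buffer_migrate_page() (below, under CONFIG_BLOCK) instead.
 */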
486b20a3503SChristoph Lameter 
4879361401eSDavid Howells #ifdef CONFIG_BLOCK
488b20a3503SChristoph Lameter /*
4891d8b85ccSChristoph Lameter  * Migration function for pages with buffers. This function can only be used
4901d8b85ccSChristoph Lameter  * if the underlying filesystem guarantees that no other references to "page"
4911d8b85ccSChristoph Lameter  * exist.
4921d8b85ccSChristoph Lameter  */
4932d1db3b1SChristoph Lameter int buffer_migrate_page(struct address_space *mapping,
494a6bc32b8SMel Gorman 		struct page *newpage, struct page *page, enum migrate_mode mode)
4951d8b85ccSChristoph Lameter {
4961d8b85ccSChristoph Lameter 	struct buffer_head *bh, *head;
4971d8b85ccSChristoph Lameter 	int rc;
4981d8b85ccSChristoph Lameter 
4991d8b85ccSChristoph Lameter 	if (!page_has_buffers(page))
500a6bc32b8SMel Gorman 		return migrate_page(mapping, newpage, page, mode);
5011d8b85ccSChristoph Lameter 
5021d8b85ccSChristoph Lameter 	head = page_buffers(page);
5031d8b85ccSChristoph Lameter 
504a6bc32b8SMel Gorman 	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
5051d8b85ccSChristoph Lameter 
506*78bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS)
5071d8b85ccSChristoph Lameter 		return rc;
5081d8b85ccSChristoph Lameter 
509b969c4abSMel Gorman 	/*
510b969c4abSMel Gorman 	 * In the async case, migrate_page_move_mapping locked the buffers
511b969c4abSMel Gorman 	 * with an IRQ-safe spinlock held. In the sync case, the buffers
512b969c4abSMel Gorman 	 * need to be locked now
513b969c4abSMel Gorman 	 */
514a6bc32b8SMel Gorman 	if (mode != MIGRATE_ASYNC)
515a6bc32b8SMel Gorman 		BUG_ON(!buffer_migrate_lock_buffers(head, mode));
5161d8b85ccSChristoph Lameter 
5171d8b85ccSChristoph Lameter 	ClearPagePrivate(page);
5181d8b85ccSChristoph Lameter 	set_page_private(newpage, page_private(page));
5191d8b85ccSChristoph Lameter 	set_page_private(page, 0);
5201d8b85ccSChristoph Lameter 	put_page(page);
5211d8b85ccSChristoph Lameter 	get_page(newpage);
5221d8b85ccSChristoph Lameter 
5231d8b85ccSChristoph Lameter 	bh = head;
5241d8b85ccSChristoph Lameter 	do {
5251d8b85ccSChristoph Lameter 		set_bh_page(bh, newpage, bh_offset(bh));
5261d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
5271d8b85ccSChristoph Lameter 
5281d8b85ccSChristoph Lameter 	} while (bh != head);
5291d8b85ccSChristoph Lameter 
5301d8b85ccSChristoph Lameter 	SetPagePrivate(newpage);
5311d8b85ccSChristoph Lameter 
5321d8b85ccSChristoph Lameter 	migrate_page_copy(newpage, page);
5331d8b85ccSChristoph Lameter 
5341d8b85ccSChristoph Lameter 	bh = head;
5351d8b85ccSChristoph Lameter 	do {
5361d8b85ccSChristoph Lameter 		unlock_buffer(bh);
5371d8b85ccSChristoph Lameter  		put_bh(bh);
5381d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
5391d8b85ccSChristoph Lameter 
5401d8b85ccSChristoph Lameter 	} while (bh != head);
5411d8b85ccSChristoph Lameter 
542*78bd5209SRafael Aquini 	return MIGRATEPAGE_SUCCESS;
5431d8b85ccSChristoph Lameter }
5441d8b85ccSChristoph Lameter EXPORT_SYMBOL(buffer_migrate_page);
5459361401eSDavid Howells #endif
5461d8b85ccSChristoph Lameter 
54704e62a29SChristoph Lameter /*
54804e62a29SChristoph Lameter  * Writeback a page to clean the dirty state
54904e62a29SChristoph Lameter  */
55004e62a29SChristoph Lameter static int writeout(struct address_space *mapping, struct page *page)
55104e62a29SChristoph Lameter {
55204e62a29SChristoph Lameter 	struct writeback_control wbc = {
55304e62a29SChristoph Lameter 		.sync_mode = WB_SYNC_NONE,
55404e62a29SChristoph Lameter 		.nr_to_write = 1,
55504e62a29SChristoph Lameter 		.range_start = 0,
55604e62a29SChristoph Lameter 		.range_end = LLONG_MAX,
55704e62a29SChristoph Lameter 		.for_reclaim = 1
55804e62a29SChristoph Lameter 	};
55904e62a29SChristoph Lameter 	int rc;
56004e62a29SChristoph Lameter 
56104e62a29SChristoph Lameter 	if (!mapping->a_ops->writepage)
56204e62a29SChristoph Lameter 		/* No write method for the address space */
56304e62a29SChristoph Lameter 		return -EINVAL;
56404e62a29SChristoph Lameter 
56504e62a29SChristoph Lameter 	if (!clear_page_dirty_for_io(page))
56604e62a29SChristoph Lameter 		/* Someone else already triggered a write */
56704e62a29SChristoph Lameter 		return -EAGAIN;
56804e62a29SChristoph Lameter 
56904e62a29SChristoph Lameter 	/*
57004e62a29SChristoph Lameter 	 * A dirty page may imply that the underlying filesystem has
57104e62a29SChristoph Lameter 	 * the page on some queue. So the page must be clean for
57204e62a29SChristoph Lameter 	 * migration. Writeout may mean we lose the lock and the
57304e62a29SChristoph Lameter 	 * page state is no longer what we checked for earlier.
57404e62a29SChristoph Lameter 	 * At this point we know that the migration attempt cannot
57504e62a29SChristoph Lameter 	 * be successful.
57604e62a29SChristoph Lameter 	 */
57704e62a29SChristoph Lameter 	remove_migration_ptes(page, page);
57804e62a29SChristoph Lameter 
57904e62a29SChristoph Lameter 	rc = mapping->a_ops->writepage(page, &wbc);
58004e62a29SChristoph Lameter 
58104e62a29SChristoph Lameter 	if (rc != AOP_WRITEPAGE_ACTIVATE)
58204e62a29SChristoph Lameter 		/* unlocked. Relock */
58304e62a29SChristoph Lameter 		lock_page(page);
58404e62a29SChristoph Lameter 
585bda8550dSHugh Dickins 	return (rc < 0) ? -EIO : -EAGAIN;
58604e62a29SChristoph Lameter }
58704e62a29SChristoph Lameter 
58804e62a29SChristoph Lameter /*
58904e62a29SChristoph Lameter  * Default handling if a filesystem does not provide a migration function.
59004e62a29SChristoph Lameter  */
5918351a6e4SChristoph Lameter static int fallback_migrate_page(struct address_space *mapping,
592a6bc32b8SMel Gorman 	struct page *newpage, struct page *page, enum migrate_mode mode)
5938351a6e4SChristoph Lameter {
594b969c4abSMel Gorman 	if (PageDirty(page)) {
595a6bc32b8SMel Gorman 		/* Only writeback pages in full synchronous migration */
596a6bc32b8SMel Gorman 		if (mode != MIGRATE_SYNC)
597b969c4abSMel Gorman 			return -EBUSY;
59804e62a29SChristoph Lameter 		return writeout(mapping, page);
599b969c4abSMel Gorman 	}
6008351a6e4SChristoph Lameter 
6018351a6e4SChristoph Lameter 	/*
6028351a6e4SChristoph Lameter 	 * Buffers may be managed in a filesystem specific way.
6038351a6e4SChristoph Lameter 	 * We must have no buffers or drop them.
6048351a6e4SChristoph Lameter 	 */
605266cf658SDavid Howells 	if (page_has_private(page) &&
6068351a6e4SChristoph Lameter 	    !try_to_release_page(page, GFP_KERNEL))
6078351a6e4SChristoph Lameter 		return -EAGAIN;
6088351a6e4SChristoph Lameter 
609a6bc32b8SMel Gorman 	return migrate_page(mapping, newpage, page, mode);
6108351a6e4SChristoph Lameter }
6118351a6e4SChristoph Lameter 
6121d8b85ccSChristoph Lameter /*
613e24f0b8fSChristoph Lameter  * Move a page to a newly allocated page.
614e24f0b8fSChristoph Lameter  * The page is locked and all ptes have been successfully removed.
615b20a3503SChristoph Lameter  *
616e24f0b8fSChristoph Lameter  * The new page will have replaced the old page if this function
617e24f0b8fSChristoph Lameter  * is successful.
618894bc310SLee Schermerhorn  *
619894bc310SLee Schermerhorn  * Return value:
620894bc310SLee Schermerhorn  *   < 0 - error code
621*78bd5209SRafael Aquini  *  MIGRATEPAGE_SUCCESS - success
622b20a3503SChristoph Lameter  */
6233fe2011fSMel Gorman static int move_to_new_page(struct page *newpage, struct page *page,
624a6bc32b8SMel Gorman 				int remap_swapcache, enum migrate_mode mode)
625b20a3503SChristoph Lameter {
626e24f0b8fSChristoph Lameter 	struct address_space *mapping;
627b20a3503SChristoph Lameter 	int rc;
628b20a3503SChristoph Lameter 
629b20a3503SChristoph Lameter 	/*
630e24f0b8fSChristoph Lameter 	 * Block others from accessing the page when we get around to
631e24f0b8fSChristoph Lameter 	 * establishing additional references. We are the only one
632e24f0b8fSChristoph Lameter 	 * holding a reference to the new page at this point.
633b20a3503SChristoph Lameter 	 */
634529ae9aaSNick Piggin 	if (!trylock_page(newpage))
635e24f0b8fSChristoph Lameter 		BUG();
636b20a3503SChristoph Lameter 
6372d1db3b1SChristoph Lameter 	/* Prepare mapping for the new page. */
6382d1db3b1SChristoph Lameter 	newpage->index = page->index;
6392d1db3b1SChristoph Lameter 	newpage->mapping = page->mapping;
640b2e18538SRik van Riel 	if (PageSwapBacked(page))
641b2e18538SRik van Riel 		SetPageSwapBacked(newpage);
6422d1db3b1SChristoph Lameter 
643b20a3503SChristoph Lameter 	mapping = page_mapping(page);
644b20a3503SChristoph Lameter 	if (!mapping)
645a6bc32b8SMel Gorman 		rc = migrate_page(mapping, newpage, page, mode);
6466c5240aeSChristoph Lameter 	else if (mapping->a_ops->migratepage)
647b20a3503SChristoph Lameter 		/*
648b969c4abSMel Gorman 		 * Most pages have a mapping and most filesystems provide a
649b969c4abSMel Gorman 		 * migratepage callback. Anonymous pages are part of swap
650b969c4abSMel Gorman 		 * space which also has its own migratepage callback. This
651b969c4abSMel Gorman 		 * is the most common path for page migration.
652b20a3503SChristoph Lameter 		 */
6532d1db3b1SChristoph Lameter 		rc = mapping->a_ops->migratepage(mapping,
654a6bc32b8SMel Gorman 						newpage, page, mode);
6558351a6e4SChristoph Lameter 	else
656a6bc32b8SMel Gorman 		rc = fallback_migrate_page(mapping, newpage, page, mode);
657b20a3503SChristoph Lameter 
658*78bd5209SRafael Aquini 	if (rc != MIGRATEPAGE_SUCCESS) {
659e24f0b8fSChristoph Lameter 		newpage->mapping = NULL;
6603fe2011fSMel Gorman 	} else {
6613fe2011fSMel Gorman 		if (remap_swapcache)
6623fe2011fSMel Gorman 			remove_migration_ptes(page, newpage);
66335512ecaSKonstantin Khlebnikov 		page->mapping = NULL;
6643fe2011fSMel Gorman 	}
6656c5240aeSChristoph Lameter 
666b20a3503SChristoph Lameter 	unlock_page(newpage);
667b20a3503SChristoph Lameter 
668e24f0b8fSChristoph Lameter 	return rc;
669e24f0b8fSChristoph Lameter }
670e24f0b8fSChristoph Lameter 
6710dabec93SMinchan Kim static int __unmap_and_move(struct page *page, struct page *newpage,
672a6bc32b8SMel Gorman 			int force, bool offlining, enum migrate_mode mode)
673e24f0b8fSChristoph Lameter {
6740dabec93SMinchan Kim 	int rc = -EAGAIN;
6753fe2011fSMel Gorman 	int remap_swapcache = 1;
67656039efaSKAMEZAWA Hiroyuki 	struct mem_cgroup *mem;
6773f6c8272SMel Gorman 	struct anon_vma *anon_vma = NULL;
67895a402c3SChristoph Lameter 
679529ae9aaSNick Piggin 	if (!trylock_page(page)) {
680a6bc32b8SMel Gorman 		if (!force || mode == MIGRATE_ASYNC)
6810dabec93SMinchan Kim 			goto out;
6823e7d3449SMel Gorman 
6833e7d3449SMel Gorman 		/*
6843e7d3449SMel Gorman 		 * It's not safe for direct compaction to call lock_page.
6853e7d3449SMel Gorman 		 * For example, during page readahead pages are added locked
6863e7d3449SMel Gorman 		 * to the LRU. Later, when the IO completes the pages are
6873e7d3449SMel Gorman 		 * marked uptodate and unlocked. However, the queueing
6883e7d3449SMel Gorman 		 * could be merging multiple pages for one bio (e.g.
6893e7d3449SMel Gorman 		 * mpage_readpages). If an allocation happens for the
6903e7d3449SMel Gorman 		 * second or third page, the process can end up locking
6913e7d3449SMel Gorman 		 * the same page twice and deadlocking. Rather than
6923e7d3449SMel Gorman 		 * trying to be clever about what pages can be locked,
6933e7d3449SMel Gorman 		 * avoid the use of lock_page for direct compaction
6943e7d3449SMel Gorman 		 * altogether.
6953e7d3449SMel Gorman 		 */
6963e7d3449SMel Gorman 		if (current->flags & PF_MEMALLOC)
6970dabec93SMinchan Kim 			goto out;
6983e7d3449SMel Gorman 
699e24f0b8fSChristoph Lameter 		lock_page(page);
700e24f0b8fSChristoph Lameter 	}
701e24f0b8fSChristoph Lameter 
70262b61f61SHugh Dickins 	/*
70362b61f61SHugh Dickins 	 * Only memory hotplug's offline_pages() caller has locked out KSM,
70462b61f61SHugh Dickins 	 * and can safely migrate a KSM page.  The other cases have skipped
70562b61f61SHugh Dickins 	 * PageKsm along with PageReserved - but it is only now when we have
70662b61f61SHugh Dickins 	 * the page lock that we can be certain it will not go KSM beneath us
70762b61f61SHugh Dickins 	 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
70862b61f61SHugh Dickins 	 * its pagecount raised, but only here do we take the page lock which
70962b61f61SHugh Dickins 	 * serializes that).
71062b61f61SHugh Dickins 	 */
71162b61f61SHugh Dickins 	if (PageKsm(page) && !offlining) {
71262b61f61SHugh Dickins 		rc = -EBUSY;
71362b61f61SHugh Dickins 		goto unlock;
71462b61f61SHugh Dickins 	}
71562b61f61SHugh Dickins 
71601b1ae63SKAMEZAWA Hiroyuki 	/* charge against new page */
7170030f535SJohannes Weiner 	mem_cgroup_prepare_migration(page, newpage, &mem);
71801b1ae63SKAMEZAWA Hiroyuki 
719e24f0b8fSChristoph Lameter 	if (PageWriteback(page)) {
72011bc82d6SAndrea Arcangeli 		/*
721a6bc32b8SMel Gorman 		 * Only in the case of a full synchronous migration is it
722a6bc32b8SMel Gorman 		 * necessary to wait for PageWriteback. In the async case,
723a6bc32b8SMel Gorman 		 * the retry loop is too short and in the sync-light case,
724a6bc32b8SMel Gorman 		 * the overhead of stalling is too much.
72511bc82d6SAndrea Arcangeli 		 */
726a6bc32b8SMel Gorman 		if (mode != MIGRATE_SYNC) {
72711bc82d6SAndrea Arcangeli 			rc = -EBUSY;
72811bc82d6SAndrea Arcangeli 			goto uncharge;
72911bc82d6SAndrea Arcangeli 		}
73011bc82d6SAndrea Arcangeli 		if (!force)
73101b1ae63SKAMEZAWA Hiroyuki 			goto uncharge;
732e24f0b8fSChristoph Lameter 		wait_on_page_writeback(page);
733e24f0b8fSChristoph Lameter 	}
734e24f0b8fSChristoph Lameter 	/*
735dc386d4dSKAMEZAWA Hiroyuki 	 * By try_to_unmap(), page->mapcount goes down to 0 here. In that case
736dc386d4dSKAMEZAWA Hiroyuki 	 * we cannot notice that the anon_vma is freed while we migrate a page.
7371ce82b69SHugh Dickins 	 * This get_anon_vma() delays freeing the anon_vma pointer until the
738dc386d4dSKAMEZAWA Hiroyuki 	 * end of migration. File cache pages are no problem because of
739989f89c5SKAMEZAWA Hiroyuki 	 * page_lock(); file caches may use write_page() or lock_page() in
740989f89c5SKAMEZAWA Hiroyuki 	 * migration, so we only need to care about anonymous pages here.
741e24f0b8fSChristoph Lameter 	 */
742989f89c5SKAMEZAWA Hiroyuki 	if (PageAnon(page)) {
7431ce82b69SHugh Dickins 		/*
7441ce82b69SHugh Dickins 		 * Only page_lock_anon_vma() understands the subtleties of
7451ce82b69SHugh Dickins 		 * getting a hold on an anon_vma from outside one of its mms.
7461ce82b69SHugh Dickins 		 */
747746b18d4SPeter Zijlstra 		anon_vma = page_get_anon_vma(page);
7481ce82b69SHugh Dickins 		if (anon_vma) {
7491ce82b69SHugh Dickins 			/*
750746b18d4SPeter Zijlstra 			 * Anon page
7511ce82b69SHugh Dickins 			 */
7521ce82b69SHugh Dickins 		} else if (PageSwapCache(page)) {
7533fe2011fSMel Gorman 			/*
7543fe2011fSMel Gorman 			 * We cannot be sure that the anon_vma of an unmapped
7553fe2011fSMel Gorman 			 * swapcache page is safe to use because we don't
7563fe2011fSMel Gorman 			 * know in advance if the VMA that this page belonged
7573fe2011fSMel Gorman 			 * to still exists. If the VMA and others sharing the
7583fe2011fSMel Gorman 			 * data have been freed, then the anon_vma could
7593fe2011fSMel Gorman 			 * already be invalid.
7603fe2011fSMel Gorman 			 *
7613fe2011fSMel Gorman 			 * To avoid this possibility, swapcache pages get
7623fe2011fSMel Gorman 			 * migrated but are not remapped when migration
7633fe2011fSMel Gorman 			 * completes
7643fe2011fSMel Gorman 			 */
7653fe2011fSMel Gorman 			remap_swapcache = 0;
7663fe2011fSMel Gorman 		} else {
7671ce82b69SHugh Dickins 			goto uncharge;
768989f89c5SKAMEZAWA Hiroyuki 		}
7693fe2011fSMel Gorman 	}
77062e1c553SShaohua Li 
771dc386d4dSKAMEZAWA Hiroyuki 	/*
77262e1c553SShaohua Li 	 * Corner case handling:
77362e1c553SShaohua Li 	 * 1. When a new swap-cache page is read in, it is added to the LRU
77462e1c553SShaohua Li 	 * and treated as swapcache but it has no rmap yet.
77562e1c553SShaohua Li 	 * Calling try_to_unmap() against a page->mapping==NULL page will
77662e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
77762e1c553SShaohua Li 	 * 2. An orphaned page (see truncate_complete_page) might have
77862e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
77962e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
78062e1c553SShaohua Li 	 * invisible to the vm, so the page cannot be migrated.  So try to
78162e1c553SShaohua Li 	 * free the metadata, so the page can be freed.
782dc386d4dSKAMEZAWA Hiroyuki 	 */
78362e1c553SShaohua Li 	if (!page->mapping) {
7841ce82b69SHugh Dickins 		VM_BUG_ON(PageAnon(page));
7851ce82b69SHugh Dickins 		if (page_has_private(page)) {
78662e1c553SShaohua Li 			try_to_free_buffers(page);
7871ce82b69SHugh Dickins 			goto uncharge;
78862e1c553SShaohua Li 		}
789abfc3488SShaohua Li 		goto skip_unmap;
790abfc3488SShaohua Li 	}
79162e1c553SShaohua Li 
792dc386d4dSKAMEZAWA Hiroyuki 	/* Establish migration ptes or remove ptes */
79314fa31b8SAndi Kleen 	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
794dc386d4dSKAMEZAWA Hiroyuki 
795abfc3488SShaohua Li skip_unmap:
796e24f0b8fSChristoph Lameter 	if (!page_mapped(page))
797a6bc32b8SMel Gorman 		rc = move_to_new_page(newpage, page, remap_swapcache, mode);
798e24f0b8fSChristoph Lameter 
7993fe2011fSMel Gorman 	if (rc && remap_swapcache)
8006c5240aeSChristoph Lameter 		remove_migration_ptes(page, page);
8013f6c8272SMel Gorman 
8023f6c8272SMel Gorman 	/* Drop an anon_vma reference if we took one */
80376545066SRik van Riel 	if (anon_vma)
8049e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
8053f6c8272SMel Gorman 
80601b1ae63SKAMEZAWA Hiroyuki uncharge:
807*78bd5209SRafael Aquini 	mem_cgroup_end_migration(mem, page, newpage, rc == MIGRATEPAGE_SUCCESS);
808e24f0b8fSChristoph Lameter unlock:
809b20a3503SChristoph Lameter 	unlock_page(page);
8100dabec93SMinchan Kim out:
8110dabec93SMinchan Kim 	return rc;
8120dabec93SMinchan Kim }
81395a402c3SChristoph Lameter 
8140dabec93SMinchan Kim /*
8150dabec93SMinchan Kim  * Obtain the lock on page, remove all ptes and migrate the page
8160dabec93SMinchan Kim  * to the newly allocated page in newpage.
8170dabec93SMinchan Kim  */
8180dabec93SMinchan Kim static int unmap_and_move(new_page_t get_new_page, unsigned long private,
819a6bc32b8SMel Gorman 			struct page *page, int force, bool offlining,
820a6bc32b8SMel Gorman 			enum migrate_mode mode)
8210dabec93SMinchan Kim {
8220dabec93SMinchan Kim 	int rc = 0;
8230dabec93SMinchan Kim 	int *result = NULL;
8240dabec93SMinchan Kim 	struct page *newpage = get_new_page(page, private, &result);
8250dabec93SMinchan Kim 
8260dabec93SMinchan Kim 	if (!newpage)
8270dabec93SMinchan Kim 		return -ENOMEM;
8280dabec93SMinchan Kim 
8290dabec93SMinchan Kim 	if (page_count(page) == 1) {
8300dabec93SMinchan Kim 		/* page was freed from under us. So we are done. */
8310dabec93SMinchan Kim 		goto out;
8320dabec93SMinchan Kim 	}
8330dabec93SMinchan Kim 
8340dabec93SMinchan Kim 	if (unlikely(PageTransHuge(page)))
8350dabec93SMinchan Kim 		if (unlikely(split_huge_page(page)))
8360dabec93SMinchan Kim 			goto out;
8370dabec93SMinchan Kim 
838a6bc32b8SMel Gorman 	rc = __unmap_and_move(page, newpage, force, offlining, mode);
8390dabec93SMinchan Kim out:
840e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
841aaa994b3SChristoph Lameter 		/*
842aaa994b3SChristoph Lameter 		 * A page that has been migrated has all references
843aaa994b3SChristoph Lameter 		 * removed and will be freed. A page that has not been
844aaa994b3SChristoph Lameter 		 * migrated will have kept its references and be
845aaa994b3SChristoph Lameter 		 * restored.
846aaa994b3SChristoph Lameter 		 */
847aaa994b3SChristoph Lameter 		list_del(&page->lru);
848a731286dSKOSAKI Motohiro 		dec_zone_page_state(page, NR_ISOLATED_ANON +
8496c0b1351SJohannes Weiner 				page_is_file_cache(page));
850894bc310SLee Schermerhorn 		putback_lru_page(page);
851e24f0b8fSChristoph Lameter 	}
85295a402c3SChristoph Lameter 	/*
85395a402c3SChristoph Lameter 	 * Move the new page to the LRU. If migration was not successful
85495a402c3SChristoph Lameter 	 * then this will free the page.
85595a402c3SChristoph Lameter 	 */
856894bc310SLee Schermerhorn 	putback_lru_page(newpage);
857742755a1SChristoph Lameter 	if (result) {
858742755a1SChristoph Lameter 		if (rc)
859742755a1SChristoph Lameter 			*result = rc;
860742755a1SChristoph Lameter 		else
861742755a1SChristoph Lameter 			*result = page_to_nid(newpage);
862742755a1SChristoph Lameter 	}
863e24f0b8fSChristoph Lameter 	return rc;
864e24f0b8fSChristoph Lameter }
865b20a3503SChristoph Lameter 
866e24f0b8fSChristoph Lameter /*
867290408d4SNaoya Horiguchi  * Counterpart of unmap_and_move() for hugepage migration.
868290408d4SNaoya Horiguchi  *
869290408d4SNaoya Horiguchi  * This function doesn't wait for the completion of hugepage I/O
870290408d4SNaoya Horiguchi  * because there is no race between I/O and migration for hugepage.
871290408d4SNaoya Horiguchi  * Note that currently hugepage I/O occurs only in direct I/O
872290408d4SNaoya Horiguchi  * where no lock is held and PG_writeback is irrelevant,
873290408d4SNaoya Horiguchi  * and the writeback status of all subpages is counted in the reference
874290408d4SNaoya Horiguchi  * count of the head page (i.e. if all subpages of a 2MB hugepage are
875290408d4SNaoya Horiguchi  * under direct I/O, the reference count of the head page is 512 and a bit more).
876290408d4SNaoya Horiguchi  * This means that when we try to migrate a hugepage whose subpages are
877290408d4SNaoya Horiguchi  * doing direct I/O, some references remain after try_to_unmap() and
878290408d4SNaoya Horiguchi  * hugepage migration fails without data corruption.
879290408d4SNaoya Horiguchi  *
880290408d4SNaoya Horiguchi  * There is also no race when direct I/O is issued on the page under migration,
881290408d4SNaoya Horiguchi  * because then pte is replaced with migration swap entry and direct I/O code
882290408d4SNaoya Horiguchi  * will wait in the page fault for migration to complete.
883290408d4SNaoya Horiguchi  */
884290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page,
885290408d4SNaoya Horiguchi 				unsigned long private, struct page *hpage,
886a6bc32b8SMel Gorman 				int force, bool offlining,
887a6bc32b8SMel Gorman 				enum migrate_mode mode)
888290408d4SNaoya Horiguchi {
889290408d4SNaoya Horiguchi 	int rc = 0;
890290408d4SNaoya Horiguchi 	int *result = NULL;
891290408d4SNaoya Horiguchi 	struct page *new_hpage = get_new_page(hpage, private, &result);
892290408d4SNaoya Horiguchi 	struct anon_vma *anon_vma = NULL;
893290408d4SNaoya Horiguchi 
894290408d4SNaoya Horiguchi 	if (!new_hpage)
895290408d4SNaoya Horiguchi 		return -ENOMEM;
896290408d4SNaoya Horiguchi 
897290408d4SNaoya Horiguchi 	rc = -EAGAIN;
898290408d4SNaoya Horiguchi 
899290408d4SNaoya Horiguchi 	if (!trylock_page(hpage)) {
900a6bc32b8SMel Gorman 		if (!force || mode != MIGRATE_SYNC)
901290408d4SNaoya Horiguchi 			goto out;
902290408d4SNaoya Horiguchi 		lock_page(hpage);
903290408d4SNaoya Horiguchi 	}
904290408d4SNaoya Horiguchi 
905746b18d4SPeter Zijlstra 	if (PageAnon(hpage))
906746b18d4SPeter Zijlstra 		anon_vma = page_get_anon_vma(hpage);
907290408d4SNaoya Horiguchi 
908290408d4SNaoya Horiguchi 	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
909290408d4SNaoya Horiguchi 
910290408d4SNaoya Horiguchi 	if (!page_mapped(hpage))
911a6bc32b8SMel Gorman 		rc = move_to_new_page(new_hpage, hpage, 1, mode);
912290408d4SNaoya Horiguchi 
913290408d4SNaoya Horiguchi 	if (rc)
914290408d4SNaoya Horiguchi 		remove_migration_ptes(hpage, hpage);
915290408d4SNaoya Horiguchi 
916fd4a4663SHugh Dickins 	if (anon_vma)
9179e60109fSPeter Zijlstra 		put_anon_vma(anon_vma);
9188e6ac7faSAneesh Kumar K.V 
9198e6ac7faSAneesh Kumar K.V 	if (!rc)
9208e6ac7faSAneesh Kumar K.V 		hugetlb_cgroup_migrate(hpage, new_hpage);
9218e6ac7faSAneesh Kumar K.V 
922290408d4SNaoya Horiguchi 	unlock_page(hpage);
92309761333SHillf Danton out:
924290408d4SNaoya Horiguchi 	put_page(new_hpage);
925290408d4SNaoya Horiguchi 	if (result) {
926290408d4SNaoya Horiguchi 		if (rc)
927290408d4SNaoya Horiguchi 			*result = rc;
928290408d4SNaoya Horiguchi 		else
929290408d4SNaoya Horiguchi 			*result = page_to_nid(new_hpage);
930290408d4SNaoya Horiguchi 	}
931290408d4SNaoya Horiguchi 	return rc;
932290408d4SNaoya Horiguchi }
933290408d4SNaoya Horiguchi 
934290408d4SNaoya Horiguchi /*
935e24f0b8fSChristoph Lameter  * migrate_pages
936e24f0b8fSChristoph Lameter  *
93795a402c3SChristoph Lameter  * The function takes one list of pages to migrate and a function
93895a402c3SChristoph Lameter  * that, from the page to be migrated and the private data, determines
93995a402c3SChristoph Lameter  * the target of the move and allocates the new page.
940e24f0b8fSChristoph Lameter  *
941e24f0b8fSChristoph Lameter  * The function returns after 10 attempts or if no pages
942e24f0b8fSChristoph Lameter  * are movable anymore because the list has become empty
943cf608ac1SMinchan Kim  * or no retryable pages exist anymore.
944cf608ac1SMinchan Kim  * Caller should call putback_lru_pages to return pages to the LRU
94528bd6578SMinchan Kim  * or free list only if ret != 0.
946e24f0b8fSChristoph Lameter  *
94795a402c3SChristoph Lameter  * Return: Number of pages not migrated or error code.
948e24f0b8fSChristoph Lameter  */
94995a402c3SChristoph Lameter int migrate_pages(struct list_head *from,
9507f0f2496SMel Gorman 		new_page_t get_new_page, unsigned long private, bool offlining,
951a6bc32b8SMel Gorman 		enum migrate_mode mode)
952e24f0b8fSChristoph Lameter {
953e24f0b8fSChristoph Lameter 	int retry = 1;
954e24f0b8fSChristoph Lameter 	int nr_failed = 0;
955e24f0b8fSChristoph Lameter 	int pass = 0;
956e24f0b8fSChristoph Lameter 	struct page *page;
957e24f0b8fSChristoph Lameter 	struct page *page2;
958e24f0b8fSChristoph Lameter 	int swapwrite = current->flags & PF_SWAPWRITE;
959e24f0b8fSChristoph Lameter 	int rc;
9602d1db3b1SChristoph Lameter 
961e24f0b8fSChristoph Lameter 	if (!swapwrite)
962e24f0b8fSChristoph Lameter 		current->flags |= PF_SWAPWRITE;
963e24f0b8fSChristoph Lameter 
964e24f0b8fSChristoph Lameter 	for(pass = 0; pass < 10 && retry; pass++) {
965e24f0b8fSChristoph Lameter 		retry = 0;
966e24f0b8fSChristoph Lameter 
967e24f0b8fSChristoph Lameter 		list_for_each_entry_safe(page, page2, from, lru) {
968e24f0b8fSChristoph Lameter 			cond_resched();
969e24f0b8fSChristoph Lameter 
97095a402c3SChristoph Lameter 			rc = unmap_and_move(get_new_page, private,
97177f1fe6bSMel Gorman 						page, pass > 2, offlining,
972a6bc32b8SMel Gorman 						mode);
973e24f0b8fSChristoph Lameter 
974e24f0b8fSChristoph Lameter 			switch(rc) {
97595a402c3SChristoph Lameter 			case -ENOMEM:
97695a402c3SChristoph Lameter 				goto out;
977e24f0b8fSChristoph Lameter 			case -EAGAIN:
978b20a3503SChristoph Lameter 				retry++;
979e24f0b8fSChristoph Lameter 				break;
980*78bd5209SRafael Aquini 			case MIGRATEPAGE_SUCCESS:
981e24f0b8fSChristoph Lameter 				break;
982e24f0b8fSChristoph Lameter 			default:
983b20a3503SChristoph Lameter 				/* Permanent failure */
984b20a3503SChristoph Lameter 				nr_failed++;
985e24f0b8fSChristoph Lameter 				break;
986b20a3503SChristoph Lameter 			}
987b20a3503SChristoph Lameter 		}
988e24f0b8fSChristoph Lameter 	}
989*78bd5209SRafael Aquini 	rc = nr_failed + retry;
99095a402c3SChristoph Lameter out:
991b20a3503SChristoph Lameter 	if (!swapwrite)
992b20a3503SChristoph Lameter 		current->flags &= ~PF_SWAPWRITE;
993b20a3503SChristoph Lameter 
99495a402c3SChristoph Lameter 	return rc;
995b20a3503SChristoph Lameter }
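
/*
 * A minimal sketch of a migrate_pages() caller, modelled on
 * do_move_page_to_node_array() below; "sketch_new_page" and "nid" are
 * illustrative names, and failure handling is reduced to putting the
 * isolated pages back on the LRU:
 *
 *	static struct page *sketch_new_page(struct page *page,
 *					unsigned long private, int **result)
 *	{
 *		return alloc_pages_exact_node((int)private,
 *				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
 *	}
 *
 *	...
 *	err = migrate_pages(&pagelist, sketch_new_page, nid,
 *				false, MIGRATE_SYNC);
 *	if (err)
 *		putback_lru_pages(&pagelist);
 */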
996b20a3503SChristoph Lameter 
997189ebff2SAneesh Kumar K.V int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
998189ebff2SAneesh Kumar K.V 		      unsigned long private, bool offlining,
999a6bc32b8SMel Gorman 		      enum migrate_mode mode)
1000290408d4SNaoya Horiguchi {
1001189ebff2SAneesh Kumar K.V 	int pass, rc;
1002290408d4SNaoya Horiguchi 
1003189ebff2SAneesh Kumar K.V 	for (pass = 0; pass < 10; pass++) {
1004290408d4SNaoya Horiguchi 		rc = unmap_and_move_huge_page(get_new_page,
1005189ebff2SAneesh Kumar K.V 					      private, hpage, pass > 2, offlining,
1006a6bc32b8SMel Gorman 					      mode);
1007290408d4SNaoya Horiguchi 		switch (rc) {
1008290408d4SNaoya Horiguchi 		case -ENOMEM:
1009290408d4SNaoya Horiguchi 			goto out;
1010290408d4SNaoya Horiguchi 		case -EAGAIN:
1011189ebff2SAneesh Kumar K.V 			/* try again */
1012189ebff2SAneesh Kumar K.V 			cond_resched();
1013290408d4SNaoya Horiguchi 			break;
1014*78bd5209SRafael Aquini 		case MIGRATEPAGE_SUCCESS:
1015189ebff2SAneesh Kumar K.V 			goto out;
1016290408d4SNaoya Horiguchi 		default:
1017189ebff2SAneesh Kumar K.V 			rc = -EIO;
1018189ebff2SAneesh Kumar K.V 			goto out;
1019290408d4SNaoya Horiguchi 		}
1020290408d4SNaoya Horiguchi 	}
1021290408d4SNaoya Horiguchi out:
1022290408d4SNaoya Horiguchi 	return rc;
1023290408d4SNaoya Horiguchi }
1024290408d4SNaoya Horiguchi 
1025742755a1SChristoph Lameter #ifdef CONFIG_NUMA
1026742755a1SChristoph Lameter /*
1027742755a1SChristoph Lameter  * Move a list of individual pages
1028742755a1SChristoph Lameter  */
1029742755a1SChristoph Lameter struct page_to_node {
1030742755a1SChristoph Lameter 	unsigned long addr;
1031742755a1SChristoph Lameter 	struct page *page;
1032742755a1SChristoph Lameter 	int node;
1033742755a1SChristoph Lameter 	int status;
1034742755a1SChristoph Lameter };
1035742755a1SChristoph Lameter 
1036742755a1SChristoph Lameter static struct page *new_page_node(struct page *p, unsigned long private,
1037742755a1SChristoph Lameter 		int **result)
1038742755a1SChristoph Lameter {
1039742755a1SChristoph Lameter 	struct page_to_node *pm = (struct page_to_node *)private;
1040742755a1SChristoph Lameter 
1041742755a1SChristoph Lameter 	while (pm->node != MAX_NUMNODES && pm->page != p)
1042742755a1SChristoph Lameter 		pm++;
1043742755a1SChristoph Lameter 
1044742755a1SChristoph Lameter 	if (pm->node == MAX_NUMNODES)
1045742755a1SChristoph Lameter 		return NULL;
1046742755a1SChristoph Lameter 
1047742755a1SChristoph Lameter 	*result = &pm->status;
1048742755a1SChristoph Lameter 
10496484eb3eSMel Gorman 	return alloc_pages_exact_node(pm->node,
1050769848c0SMel Gorman 				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
1051742755a1SChristoph Lameter }
1052742755a1SChristoph Lameter 
1053742755a1SChristoph Lameter /*
1054742755a1SChristoph Lameter  * Move a set of pages as indicated in the pm array. The addr
1055742755a1SChristoph Lameter  * field must be set to the virtual address of the page to be moved
1056742755a1SChristoph Lameter  * and the node number must contain a valid target node.
10575e9a0f02SBrice Goglin  * The pm array ends with node = MAX_NUMNODES.
1058742755a1SChristoph Lameter  */
10595e9a0f02SBrice Goglin static int do_move_page_to_node_array(struct mm_struct *mm,
10605e9a0f02SBrice Goglin 				      struct page_to_node *pm,
1061742755a1SChristoph Lameter 				      int migrate_all)
1062742755a1SChristoph Lameter {
1063742755a1SChristoph Lameter 	int err;
1064742755a1SChristoph Lameter 	struct page_to_node *pp;
1065742755a1SChristoph Lameter 	LIST_HEAD(pagelist);
1066742755a1SChristoph Lameter 
1067742755a1SChristoph Lameter 	down_read(&mm->mmap_sem);
1068742755a1SChristoph Lameter 
1069742755a1SChristoph Lameter 	/*
1070742755a1SChristoph Lameter 	 * Build a list of pages to migrate
1071742755a1SChristoph Lameter 	 */
1072742755a1SChristoph Lameter 	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1073742755a1SChristoph Lameter 		struct vm_area_struct *vma;
1074742755a1SChristoph Lameter 		struct page *page;
1075742755a1SChristoph Lameter 
1076742755a1SChristoph Lameter 		err = -EFAULT;
1077742755a1SChristoph Lameter 		vma = find_vma(mm, pp->addr);
107870384dc6SGleb Natapov 		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1079742755a1SChristoph Lameter 			goto set_status;
1080742755a1SChristoph Lameter 
1081500d65d4SAndrea Arcangeli 		page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
108289f5b7daSLinus Torvalds 
108389f5b7daSLinus Torvalds 		err = PTR_ERR(page);
108489f5b7daSLinus Torvalds 		if (IS_ERR(page))
108589f5b7daSLinus Torvalds 			goto set_status;
108689f5b7daSLinus Torvalds 
1087742755a1SChristoph Lameter 		err = -ENOENT;
1088742755a1SChristoph Lameter 		if (!page)
1089742755a1SChristoph Lameter 			goto set_status;
1090742755a1SChristoph Lameter 
109162b61f61SHugh Dickins 		/* Use PageReserved to check for zero page */
109262b61f61SHugh Dickins 		if (PageReserved(page) || PageKsm(page))
1093742755a1SChristoph Lameter 			goto put_and_set;
1094742755a1SChristoph Lameter 
1095742755a1SChristoph Lameter 		pp->page = page;
1096742755a1SChristoph Lameter 		err = page_to_nid(page);
1097742755a1SChristoph Lameter 
1098742755a1SChristoph Lameter 		if (err == pp->node)
1099742755a1SChristoph Lameter 			/*
1100742755a1SChristoph Lameter 			 * Page is already on the requested node
1101742755a1SChristoph Lameter 			 */
1102742755a1SChristoph Lameter 			goto put_and_set;
1103742755a1SChristoph Lameter 
1104742755a1SChristoph Lameter 		err = -EACCES;
1105742755a1SChristoph Lameter 		if (page_mapcount(page) > 1 &&
1106742755a1SChristoph Lameter 				!migrate_all)
1107742755a1SChristoph Lameter 			goto put_and_set;
1108742755a1SChristoph Lameter 
110962695a84SNick Piggin 		err = isolate_lru_page(page);
11106d9c285aSKOSAKI Motohiro 		if (!err) {
111162695a84SNick Piggin 			list_add_tail(&page->lru, &pagelist);
11126d9c285aSKOSAKI Motohiro 			inc_zone_page_state(page, NR_ISOLATED_ANON +
11136d9c285aSKOSAKI Motohiro 					    page_is_file_cache(page));
11146d9c285aSKOSAKI Motohiro 		}
1115742755a1SChristoph Lameter put_and_set:
1116742755a1SChristoph Lameter 		/*
1117742755a1SChristoph Lameter 		 * Drop the reference taken by follow_page(FOLL_GET): if the
1118742755a1SChristoph Lameter 		 * page was isolated it still holds the reference taken by
1119742755a1SChristoph Lameter 		 * isolate_lru_page(), otherwise no extra reference remains.
1120742755a1SChristoph Lameter 		 */
1121742755a1SChristoph Lameter 		put_page(page);
1122742755a1SChristoph Lameter set_status:
1123742755a1SChristoph Lameter 		pp->status = err;
1124742755a1SChristoph Lameter 	}
1125742755a1SChristoph Lameter 
1126e78bbfa8SBrice Goglin 	err = 0;
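	/*
	 * Migrate whatever was gathered above; any pages still left on
	 * @pagelist after a failed migrate_pages() call are handed back to
	 * the LRU by putback_lru_pages(), which also drops their NR_ISOLATED
	 * accounting.
	 */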
1127cf608ac1SMinchan Kim 	if (!list_empty(&pagelist)) {
1128742755a1SChristoph Lameter 		err = migrate_pages(&pagelist, new_page_node,
1129a6bc32b8SMel Gorman 				(unsigned long)pm, 0, MIGRATE_SYNC);
1130cf608ac1SMinchan Kim 		if (err)
1131cf608ac1SMinchan Kim 			putback_lru_pages(&pagelist);
1132cf608ac1SMinchan Kim 	}
1133742755a1SChristoph Lameter 
1134742755a1SChristoph Lameter 	up_read(&mm->mmap_sem);
1135742755a1SChristoph Lameter 	return err;
1136742755a1SChristoph Lameter }
1137742755a1SChristoph Lameter 
1138742755a1SChristoph Lameter /*
11395e9a0f02SBrice Goglin  * Migrate an array of page addresses onto an array of nodes and fill
11405e9a0f02SBrice Goglin  * in the corresponding array of status values.
11415e9a0f02SBrice Goglin  */
11423268c63eSChristoph Lameter static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
11435e9a0f02SBrice Goglin 			 unsigned long nr_pages,
11445e9a0f02SBrice Goglin 			 const void __user * __user *pages,
11455e9a0f02SBrice Goglin 			 const int __user *nodes,
11465e9a0f02SBrice Goglin 			 int __user *status, int flags)
11475e9a0f02SBrice Goglin {
11483140a227SBrice Goglin 	struct page_to_node *pm;
11493140a227SBrice Goglin 	unsigned long chunk_nr_pages;
11503140a227SBrice Goglin 	unsigned long chunk_start;
11513140a227SBrice Goglin 	int err;
11525e9a0f02SBrice Goglin 
11535e9a0f02SBrice Goglin 	err = -ENOMEM;
11543140a227SBrice Goglin 	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
11553140a227SBrice Goglin 	if (!pm)
11565e9a0f02SBrice Goglin 		goto out;
115735282a2dSBrice Goglin 
115835282a2dSBrice Goglin 	migrate_prep();
115935282a2dSBrice Goglin 
11605e9a0f02SBrice Goglin 	/*
11613140a227SBrice Goglin 	 * Store a chunk of the page_to_node array in a single page,
11623140a227SBrice Goglin 	 * but keep the last entry free for the end marker
11635e9a0f02SBrice Goglin 	 */
11643140a227SBrice Goglin 	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
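	/*
	 * For illustration: with 4 KiB pages and a 24-byte page_to_node (a
	 * typical 64-bit layout, given here only as an assumed example),
	 * 4096 / 24 = 170 entries fit in the page, i.e. 169 pages per chunk
	 * plus the MAX_NUMNODES end marker.
	 */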
11653140a227SBrice Goglin 
11663140a227SBrice Goglin 	for (chunk_start = 0;
11673140a227SBrice Goglin 	     chunk_start < nr_pages;
11683140a227SBrice Goglin 	     chunk_start += chunk_nr_pages) {
11693140a227SBrice Goglin 		int j;
11703140a227SBrice Goglin 
11713140a227SBrice Goglin 		if (chunk_start + chunk_nr_pages > nr_pages)
11723140a227SBrice Goglin 			chunk_nr_pages = nr_pages - chunk_start;
11733140a227SBrice Goglin 
11743140a227SBrice Goglin 		/* fill the chunk pm with addrs and nodes from user-space */
11753140a227SBrice Goglin 		for (j = 0; j < chunk_nr_pages; j++) {
11765e9a0f02SBrice Goglin 			const void __user *p;
11775e9a0f02SBrice Goglin 			int node;
11785e9a0f02SBrice Goglin 
11793140a227SBrice Goglin 			err = -EFAULT;
11803140a227SBrice Goglin 			if (get_user(p, pages + j + chunk_start))
11813140a227SBrice Goglin 				goto out_pm;
11823140a227SBrice Goglin 			pm[j].addr = (unsigned long) p;
11833140a227SBrice Goglin 
11843140a227SBrice Goglin 			if (get_user(node, nodes + j + chunk_start))
11855e9a0f02SBrice Goglin 				goto out_pm;
11865e9a0f02SBrice Goglin 
11875e9a0f02SBrice Goglin 			err = -ENODEV;
11886f5a55f1SLinus Torvalds 			if (node < 0 || node >= MAX_NUMNODES)
11896f5a55f1SLinus Torvalds 				goto out_pm;
11906f5a55f1SLinus Torvalds 
11915e9a0f02SBrice Goglin 			if (!node_state(node, N_HIGH_MEMORY))
11925e9a0f02SBrice Goglin 				goto out_pm;
11935e9a0f02SBrice Goglin 
11945e9a0f02SBrice Goglin 			err = -EACCES;
11955e9a0f02SBrice Goglin 			if (!node_isset(node, task_nodes))
11965e9a0f02SBrice Goglin 				goto out_pm;
11975e9a0f02SBrice Goglin 
11983140a227SBrice Goglin 			pm[j].node = node;
11995e9a0f02SBrice Goglin 		}
12005e9a0f02SBrice Goglin 
12013140a227SBrice Goglin 		/* End marker for this chunk */
12023140a227SBrice Goglin 		pm[chunk_nr_pages].node = MAX_NUMNODES;
12033140a227SBrice Goglin 
12043140a227SBrice Goglin 		/* Migrate this chunk */
12053140a227SBrice Goglin 		err = do_move_page_to_node_array(mm, pm,
12063140a227SBrice Goglin 						 flags & MPOL_MF_MOVE_ALL);
12073140a227SBrice Goglin 		if (err < 0)
12083140a227SBrice Goglin 			goto out_pm;
12093140a227SBrice Goglin 
12105e9a0f02SBrice Goglin 		/* Return status information */
12113140a227SBrice Goglin 		for (j = 0; j < chunk_nr_pages; j++)
12123140a227SBrice Goglin 			if (put_user(pm[j].status, status + j + chunk_start)) {
12135e9a0f02SBrice Goglin 				err = -EFAULT;
12143140a227SBrice Goglin 				goto out_pm;
12153140a227SBrice Goglin 			}
12163140a227SBrice Goglin 	}
12173140a227SBrice Goglin 	err = 0;
12185e9a0f02SBrice Goglin 
12195e9a0f02SBrice Goglin out_pm:
12203140a227SBrice Goglin 	free_page((unsigned long)pm);
12215e9a0f02SBrice Goglin out:
12225e9a0f02SBrice Goglin 	return err;
12235e9a0f02SBrice Goglin }
12245e9a0f02SBrice Goglin 
12255e9a0f02SBrice Goglin /*
12262f007e74SBrice Goglin  * Determine the nodes of an array of pages and store them in an array of status values.
1227742755a1SChristoph Lameter  */
122880bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
122980bba129SBrice Goglin 				const void __user **pages, int *status)
1230742755a1SChristoph Lameter {
12312f007e74SBrice Goglin 	unsigned long i;
1232742755a1SChristoph Lameter 
12332f007e74SBrice Goglin 	down_read(&mm->mmap_sem);
12342f007e74SBrice Goglin 
12352f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
123680bba129SBrice Goglin 		unsigned long addr = (unsigned long)(*pages);
12372f007e74SBrice Goglin 		struct vm_area_struct *vma;
12382f007e74SBrice Goglin 		struct page *page;
1239c095adbcSKOSAKI Motohiro 		int err = -EFAULT;
12402f007e74SBrice Goglin 
12412f007e74SBrice Goglin 		vma = find_vma(mm, addr);
124270384dc6SGleb Natapov 		if (!vma || addr < vma->vm_start)
1243742755a1SChristoph Lameter 			goto set_status;
1244742755a1SChristoph Lameter 
12452f007e74SBrice Goglin 		page = follow_page(vma, addr, 0);
124689f5b7daSLinus Torvalds 
124789f5b7daSLinus Torvalds 		err = PTR_ERR(page);
124889f5b7daSLinus Torvalds 		if (IS_ERR(page))
124989f5b7daSLinus Torvalds 			goto set_status;
125089f5b7daSLinus Torvalds 
1251742755a1SChristoph Lameter 		err = -ENOENT;
1252742755a1SChristoph Lameter 		/* Use PageReserved to check for zero page */
125362b61f61SHugh Dickins 		if (!page || PageReserved(page) || PageKsm(page))
1254742755a1SChristoph Lameter 			goto set_status;
1255742755a1SChristoph Lameter 
1256742755a1SChristoph Lameter 		err = page_to_nid(page);
1257742755a1SChristoph Lameter set_status:
125880bba129SBrice Goglin 		*status = err;
125980bba129SBrice Goglin 
126080bba129SBrice Goglin 		pages++;
126180bba129SBrice Goglin 		status++;
126280bba129SBrice Goglin 	}
126380bba129SBrice Goglin 
126480bba129SBrice Goglin 	up_read(&mm->mmap_sem);
126580bba129SBrice Goglin }
126680bba129SBrice Goglin 
126780bba129SBrice Goglin /*
126880bba129SBrice Goglin  * Determine the nodes of a user array of pages and store them in
126980bba129SBrice Goglin  * a user array of status values.
127080bba129SBrice Goglin  */
127180bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
127280bba129SBrice Goglin 			 const void __user * __user *pages,
127380bba129SBrice Goglin 			 int __user *status)
127480bba129SBrice Goglin {
127580bba129SBrice Goglin #define DO_PAGES_STAT_CHUNK_NR 16
127680bba129SBrice Goglin 	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
127780bba129SBrice Goglin 	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
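	/*
	 * A chunk of 16 keeps the two on-stack arrays above small (16 * 8 +
	 * 16 * 4 = 192 bytes on an assumed 64-bit build) while still
	 * amortizing the user copies over several pages.
	 */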
127880bba129SBrice Goglin 
127987b8d1adSH. Peter Anvin 	while (nr_pages) {
128087b8d1adSH. Peter Anvin 		unsigned long chunk_nr;
128180bba129SBrice Goglin 
128287b8d1adSH. Peter Anvin 		chunk_nr = nr_pages;
128387b8d1adSH. Peter Anvin 		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
128487b8d1adSH. Peter Anvin 			chunk_nr = DO_PAGES_STAT_CHUNK_NR;
128587b8d1adSH. Peter Anvin 
128687b8d1adSH. Peter Anvin 		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
128787b8d1adSH. Peter Anvin 			break;
128880bba129SBrice Goglin 
128980bba129SBrice Goglin 		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
129080bba129SBrice Goglin 
129187b8d1adSH. Peter Anvin 		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
129287b8d1adSH. Peter Anvin 			break;
1293742755a1SChristoph Lameter 
129487b8d1adSH. Peter Anvin 		pages += chunk_nr;
129587b8d1adSH. Peter Anvin 		status += chunk_nr;
129687b8d1adSH. Peter Anvin 		nr_pages -= chunk_nr;
129787b8d1adSH. Peter Anvin 	}
129887b8d1adSH. Peter Anvin 	return nr_pages ? -EFAULT : 0;
1299742755a1SChristoph Lameter }
1300742755a1SChristoph Lameter 
1301742755a1SChristoph Lameter /*
1302742755a1SChristoph Lameter  * Move a list of pages in the address space of the currently executing
1303742755a1SChristoph Lameter  * process.
1304742755a1SChristoph Lameter  */
1305938bb9f5SHeiko Carstens SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1306938bb9f5SHeiko Carstens 		const void __user * __user *, pages,
1307938bb9f5SHeiko Carstens 		const int __user *, nodes,
1308938bb9f5SHeiko Carstens 		int __user *, status, int, flags)
1309742755a1SChristoph Lameter {
1310c69e8d9cSDavid Howells 	const struct cred *cred = current_cred(), *tcred;
1311742755a1SChristoph Lameter 	struct task_struct *task;
1312742755a1SChristoph Lameter 	struct mm_struct *mm;
13135e9a0f02SBrice Goglin 	int err;
13143268c63eSChristoph Lameter 	nodemask_t task_nodes;
1315742755a1SChristoph Lameter 
1316742755a1SChristoph Lameter 	/* Check flags */
1317742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1318742755a1SChristoph Lameter 		return -EINVAL;
1319742755a1SChristoph Lameter 
1320742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1321742755a1SChristoph Lameter 		return -EPERM;
1322742755a1SChristoph Lameter 
1323742755a1SChristoph Lameter 	/* Find the mm_struct */
1324a879bf58SGreg Thelen 	rcu_read_lock();
1325228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
1326742755a1SChristoph Lameter 	if (!task) {
1327a879bf58SGreg Thelen 		rcu_read_unlock();
1328742755a1SChristoph Lameter 		return -ESRCH;
1329742755a1SChristoph Lameter 	}
13303268c63eSChristoph Lameter 	get_task_struct(task);
1331742755a1SChristoph Lameter 
1332742755a1SChristoph Lameter 	/*
1333742755a1SChristoph Lameter 	 * Check if this process has the right to modify the specified
1334742755a1SChristoph Lameter 	 * process. The right exists if the process has administrative
1335742755a1SChristoph Lameter 	 * capabilities, superuser privileges or the same
1336742755a1SChristoph Lameter 	 * userid as the target process.
1337742755a1SChristoph Lameter 	 */
1338c69e8d9cSDavid Howells 	tcred = __task_cred(task);
1339b38a86ebSEric W. Biederman 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1340b38a86ebSEric W. Biederman 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1341742755a1SChristoph Lameter 	    !capable(CAP_SYS_NICE)) {
1342c69e8d9cSDavid Howells 		rcu_read_unlock();
1343742755a1SChristoph Lameter 		err = -EPERM;
13445e9a0f02SBrice Goglin 		goto out;
1345742755a1SChristoph Lameter 	}
1346c69e8d9cSDavid Howells 	rcu_read_unlock();
1347742755a1SChristoph Lameter 
134886c3a764SDavid Quigley 	err = security_task_movememory(task);
134986c3a764SDavid Quigley 	if (err)
1350742755a1SChristoph Lameter 		goto out;
1351742755a1SChristoph Lameter 
13523268c63eSChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
13533268c63eSChristoph Lameter 	mm = get_task_mm(task);
13543268c63eSChristoph Lameter 	put_task_struct(task);
13553268c63eSChristoph Lameter 
13566e8b09eaSSasha Levin 	if (!mm)
13576e8b09eaSSasha Levin 		return -EINVAL;
13586e8b09eaSSasha Levin 
13593268c63eSChristoph Lameter 	if (nodes)
13603268c63eSChristoph Lameter 		err = do_pages_move(mm, task_nodes, nr_pages, pages,
13613268c63eSChristoph Lameter 				    nodes, status, flags);
13623268c63eSChristoph Lameter 	else
13635e9a0f02SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
13643268c63eSChristoph Lameter 
13653268c63eSChristoph Lameter 	mmput(mm);
13663268c63eSChristoph Lameter 	return err;
1367742755a1SChristoph Lameter 
1368742755a1SChristoph Lameter out:
13693268c63eSChristoph Lameter 	put_task_struct(task);
1370742755a1SChristoph Lameter 	return err;
1371742755a1SChristoph Lameter }
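/*
 * Illustrative user-space sketch, not part of the original file: driving the
 * syscall above through the move_pages() wrapper declared in <numaif.h>
 * (link with -lnuma).  The helper name and the single-page setup are
 * assumptions made for this example; it is kept under #if 0 because it is
 * user-space code.
 */
#if 0
#include <numaif.h>		/* move_pages(), MPOL_MF_MOVE */
#include <stdio.h>

static int move_one_page_to_node(void *addr, int node)
{
	void *pages[1] = { addr };
	int nodes[1] = { node };
	int status[1];

	/* pid 0 targets the calling process, matching the pid handling above */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) < 0) {
		perror("move_pages");
		return -1;
	}
	return status[0];	/* node the page now resides on, or -errno */
}
#endif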
1372742755a1SChristoph Lameter 
13737b2259b3SChristoph Lameter /*
13747b2259b3SChristoph Lameter  * Call the migration functions in the vma_ops that may prepare
13757b2259b3SChristoph Lameter  * memory in a vma for migration. These migration functions may perform
13767b2259b3SChristoph Lameter  * the migration for vmas that do not have an underlying page struct.
13777b2259b3SChristoph Lameter  */
13787b2259b3SChristoph Lameter int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
13797b2259b3SChristoph Lameter 	const nodemask_t *from, unsigned long flags)
13807b2259b3SChristoph Lameter {
13817b2259b3SChristoph Lameter 	struct vm_area_struct *vma;
13827b2259b3SChristoph Lameter 	int err = 0;
13837b2259b3SChristoph Lameter 
13841001c9fbSDaisuke Nishimura 	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
13857b2259b3SChristoph Lameter 		if (vma->vm_ops && vma->vm_ops->migrate) {
13867b2259b3SChristoph Lameter 			err = vma->vm_ops->migrate(vma, to, from, flags);
13877b2259b3SChristoph Lameter 			if (err)
13887b2259b3SChristoph Lameter 				break;
13897b2259b3SChristoph Lameter 		}
13907b2259b3SChristoph Lameter 	}
13917b2259b3SChristoph Lameter 	return err;
13927b2259b3SChristoph Lameter }
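/*
 * Illustrative sketch, not part of the original file: the shape of a vma_ops
 * ->migrate hook that migrate_vmas() above would invoke.  The names are made
 * up for the example and the parameter names simply mirror the call site
 * above; the real prototype lives in struct vm_operations_struct.
 */
static int example_vma_migrate(struct vm_area_struct *vma,
			       const nodemask_t *to, const nodemask_t *from,
			       unsigned long flags)
{
	/* nothing backing this vma needs to be moved by hand */
	return 0;
}

static const struct vm_operations_struct example_vm_ops = {
	.migrate	= example_vma_migrate,
};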
139383d1674aSGerald Schaefer #endif
1394