xref: /linux/mm/migrate.c (revision 6f5a55f1a6c5abee15a0e878e5c74d9f1569b8b0)
1b20a3503SChristoph Lameter /*
2b20a3503SChristoph Lameter  * Memory Migration functionality - linux/mm/migration.c
3b20a3503SChristoph Lameter  *
4b20a3503SChristoph Lameter  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5b20a3503SChristoph Lameter  *
6b20a3503SChristoph Lameter  * Page migration was first developed in the context of the memory hotplug
7b20a3503SChristoph Lameter  * project. The main authors of the migration code are:
8b20a3503SChristoph Lameter  *
9b20a3503SChristoph Lameter  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10b20a3503SChristoph Lameter  * Hirokazu Takahashi <taka@valinux.co.jp>
11b20a3503SChristoph Lameter  * Dave Hansen <haveblue@us.ibm.com>
12cde53535SChristoph Lameter  * Christoph Lameter
13b20a3503SChristoph Lameter  */
14b20a3503SChristoph Lameter 
15b20a3503SChristoph Lameter #include <linux/migrate.h>
16b20a3503SChristoph Lameter #include <linux/module.h>
17b20a3503SChristoph Lameter #include <linux/swap.h>
180697212aSChristoph Lameter #include <linux/swapops.h>
19b20a3503SChristoph Lameter #include <linux/pagemap.h>
20e23ca00bSChristoph Lameter #include <linux/buffer_head.h>
21b20a3503SChristoph Lameter #include <linux/mm_inline.h>
22b488893aSPavel Emelyanov #include <linux/nsproxy.h>
23b20a3503SChristoph Lameter #include <linux/pagevec.h>
24e9995ef9SHugh Dickins #include <linux/ksm.h>
25b20a3503SChristoph Lameter #include <linux/rmap.h>
26b20a3503SChristoph Lameter #include <linux/topology.h>
27b20a3503SChristoph Lameter #include <linux/cpu.h>
28b20a3503SChristoph Lameter #include <linux/cpuset.h>
2904e62a29SChristoph Lameter #include <linux/writeback.h>
30742755a1SChristoph Lameter #include <linux/mempolicy.h>
31742755a1SChristoph Lameter #include <linux/vmalloc.h>
3286c3a764SDavid Quigley #include <linux/security.h>
338a9f3ccdSBalbir Singh #include <linux/memcontrol.h>
344f5ca265SAdrian Bunk #include <linux/syscalls.h>
35b20a3503SChristoph Lameter 
36b20a3503SChristoph Lameter #include "internal.h"
37b20a3503SChristoph Lameter 
38b20a3503SChristoph Lameter #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
39b20a3503SChristoph Lameter 
40b20a3503SChristoph Lameter /*
41742755a1SChristoph Lameter  * migrate_prep() needs to be called before we start compiling a list of pages
42742755a1SChristoph Lameter  * to be migrated using isolate_lru_page().
43b20a3503SChristoph Lameter  */
44b20a3503SChristoph Lameter int migrate_prep(void)
45b20a3503SChristoph Lameter {
46b20a3503SChristoph Lameter 	/*
47b20a3503SChristoph Lameter 	 * Clear the LRU lists so pages can be isolated.
48b20a3503SChristoph Lameter 	 * Note that pages may be moved off the LRU after we have
49b20a3503SChristoph Lameter 	 * drained them. Those pages will fail to migrate like other
50b20a3503SChristoph Lameter 	 * pages that may be busy.
51b20a3503SChristoph Lameter 	 */
52b20a3503SChristoph Lameter 	lru_add_drain_all();
53b20a3503SChristoph Lameter 
54b20a3503SChristoph Lameter 	return 0;
55b20a3503SChristoph Lameter }
56b20a3503SChristoph Lameter 
57b20a3503SChristoph Lameter /*
58894bc310SLee Schermerhorn  * Add isolated pages on the list back to the LRU under page lock
59894bc310SLee Schermerhorn  * to avoid leaking evictable pages back onto unevictable list.
60b20a3503SChristoph Lameter  *
61b20a3503SChristoph Lameter  * returns the number of pages put back.
62b20a3503SChristoph Lameter  */
63b20a3503SChristoph Lameter int putback_lru_pages(struct list_head *l)
64b20a3503SChristoph Lameter {
65b20a3503SChristoph Lameter 	struct page *page;
66b20a3503SChristoph Lameter 	struct page *page2;
67b20a3503SChristoph Lameter 	int count = 0;
68b20a3503SChristoph Lameter 
69b20a3503SChristoph Lameter 	list_for_each_entry_safe(page, page2, l, lru) {
70e24f0b8fSChristoph Lameter 		list_del(&page->lru);
71a731286dSKOSAKI Motohiro 		dec_zone_page_state(page, NR_ISOLATED_ANON +
726c0b1351SJohannes Weiner 				page_is_file_cache(page));
73894bc310SLee Schermerhorn 		putback_lru_page(page);
74b20a3503SChristoph Lameter 		count++;
75b20a3503SChristoph Lameter 	}
76b20a3503SChristoph Lameter 	return count;
77b20a3503SChristoph Lameter }
78b20a3503SChristoph Lameter 
790697212aSChristoph Lameter /*
800697212aSChristoph Lameter  * Restore a potential migration pte to a working pte entry
810697212aSChristoph Lameter  */
82e9995ef9SHugh Dickins static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
83e9995ef9SHugh Dickins 				 unsigned long addr, void *old)
840697212aSChristoph Lameter {
850697212aSChristoph Lameter 	struct mm_struct *mm = vma->vm_mm;
860697212aSChristoph Lameter 	swp_entry_t entry;
870697212aSChristoph Lameter  	pgd_t *pgd;
880697212aSChristoph Lameter  	pud_t *pud;
890697212aSChristoph Lameter  	pmd_t *pmd;
900697212aSChristoph Lameter 	pte_t *ptep, pte;
910697212aSChristoph Lameter  	spinlock_t *ptl;
920697212aSChristoph Lameter 
930697212aSChristoph Lameter  	pgd = pgd_offset(mm, addr);
940697212aSChristoph Lameter 	if (!pgd_present(*pgd))
95e9995ef9SHugh Dickins 		goto out;
960697212aSChristoph Lameter 
970697212aSChristoph Lameter 	pud = pud_offset(pgd, addr);
980697212aSChristoph Lameter 	if (!pud_present(*pud))
99e9995ef9SHugh Dickins 		goto out;
1000697212aSChristoph Lameter 
1010697212aSChristoph Lameter 	pmd = pmd_offset(pud, addr);
1020697212aSChristoph Lameter 	if (!pmd_present(*pmd))
103e9995ef9SHugh Dickins 		goto out;
1040697212aSChristoph Lameter 
1050697212aSChristoph Lameter 	ptep = pte_offset_map(pmd, addr);
1060697212aSChristoph Lameter 
1070697212aSChristoph Lameter 	if (!is_swap_pte(*ptep)) {
1080697212aSChristoph Lameter 		pte_unmap(ptep);
109e9995ef9SHugh Dickins 		goto out;
1100697212aSChristoph Lameter  	}
1110697212aSChristoph Lameter 
1120697212aSChristoph Lameter  	ptl = pte_lockptr(mm, pmd);
1130697212aSChristoph Lameter  	spin_lock(ptl);
1140697212aSChristoph Lameter 	pte = *ptep;
1150697212aSChristoph Lameter 	if (!is_swap_pte(pte))
116e9995ef9SHugh Dickins 		goto unlock;
1170697212aSChristoph Lameter 
1180697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
1190697212aSChristoph Lameter 
120e9995ef9SHugh Dickins 	if (!is_migration_entry(entry) ||
121e9995ef9SHugh Dickins 	    migration_entry_to_page(entry) != old)
122e9995ef9SHugh Dickins 		goto unlock;
1230697212aSChristoph Lameter 
1240697212aSChristoph Lameter 	get_page(new);
1250697212aSChristoph Lameter 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
1260697212aSChristoph Lameter 	if (is_write_migration_entry(entry))
1270697212aSChristoph Lameter 		pte = pte_mkwrite(pte);
12897ee0524SKAMEZAWA Hiroyuki 	flush_cache_page(vma, addr, pte_pfn(pte));
1290697212aSChristoph Lameter 	set_pte_at(mm, addr, ptep, pte);
13004e62a29SChristoph Lameter 
13104e62a29SChristoph Lameter 	if (PageAnon(new))
1320697212aSChristoph Lameter 		page_add_anon_rmap(new, vma, addr);
13304e62a29SChristoph Lameter 	else
13404e62a29SChristoph Lameter 		page_add_file_rmap(new);
13504e62a29SChristoph Lameter 
13604e62a29SChristoph Lameter 	/* No need to invalidate - it was non-present before */
13704e62a29SChristoph Lameter 	update_mmu_cache(vma, addr, pte);
138e9995ef9SHugh Dickins unlock:
1390697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
140e9995ef9SHugh Dickins out:
141e9995ef9SHugh Dickins 	return SWAP_AGAIN;
1420697212aSChristoph Lameter }
1430697212aSChristoph Lameter 
1440697212aSChristoph Lameter /*
14504e62a29SChristoph Lameter  * Get rid of all migration entries and replace them by
14604e62a29SChristoph Lameter  * references to the indicated page.
14704e62a29SChristoph Lameter  */
14804e62a29SChristoph Lameter static void remove_migration_ptes(struct page *old, struct page *new)
14904e62a29SChristoph Lameter {
150e9995ef9SHugh Dickins 	rmap_walk(new, remove_migration_pte, old);
15104e62a29SChristoph Lameter }
15204e62a29SChristoph Lameter 
15304e62a29SChristoph Lameter /*
1540697212aSChristoph Lameter  * Something used the pte of a page under migration. We need to
1550697212aSChristoph Lameter  * get to the page and wait until migration is finished.
1560697212aSChristoph Lameter  * When we return from this function the fault will be retried.
1570697212aSChristoph Lameter  *
1580697212aSChristoph Lameter  * This function is called from do_swap_page().
1590697212aSChristoph Lameter  */
1600697212aSChristoph Lameter void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
1610697212aSChristoph Lameter 				unsigned long address)
1620697212aSChristoph Lameter {
1630697212aSChristoph Lameter 	pte_t *ptep, pte;
1640697212aSChristoph Lameter 	spinlock_t *ptl;
1650697212aSChristoph Lameter 	swp_entry_t entry;
1660697212aSChristoph Lameter 	struct page *page;
1670697212aSChristoph Lameter 
1680697212aSChristoph Lameter 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1690697212aSChristoph Lameter 	pte = *ptep;
1700697212aSChristoph Lameter 	if (!is_swap_pte(pte))
1710697212aSChristoph Lameter 		goto out;
1720697212aSChristoph Lameter 
1730697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
1740697212aSChristoph Lameter 	if (!is_migration_entry(entry))
1750697212aSChristoph Lameter 		goto out;
1760697212aSChristoph Lameter 
1770697212aSChristoph Lameter 	page = migration_entry_to_page(entry);
1780697212aSChristoph Lameter 
179e286781dSNick Piggin 	/*
180e286781dSNick Piggin 	 * Once radix-tree replacement of page migration started, page_count
181e286781dSNick Piggin 	 * *must* be zero. And, we don't want to call wait_on_page_locked()
182e286781dSNick Piggin 	 * against a page without get_page().
183e286781dSNick Piggin 	 * So, we use get_page_unless_zero(), here. Even failed, page fault
184e286781dSNick Piggin 	 * will occur again.
185e286781dSNick Piggin 	 */
186e286781dSNick Piggin 	if (!get_page_unless_zero(page))
187e286781dSNick Piggin 		goto out;
1880697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
1890697212aSChristoph Lameter 	wait_on_page_locked(page);
1900697212aSChristoph Lameter 	put_page(page);
1910697212aSChristoph Lameter 	return;
1920697212aSChristoph Lameter out:
1930697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
1940697212aSChristoph Lameter }
1950697212aSChristoph Lameter 
196b20a3503SChristoph Lameter /*
197c3fcf8a5SChristoph Lameter  * Replace the page in the mapping.
1985b5c7120SChristoph Lameter  *
1995b5c7120SChristoph Lameter  * The number of remaining references must be:
2005b5c7120SChristoph Lameter  * 1 for anonymous pages without a mapping
2015b5c7120SChristoph Lameter  * 2 for pages with a mapping
202266cf658SDavid Howells  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
203b20a3503SChristoph Lameter  */
2042d1db3b1SChristoph Lameter static int migrate_page_move_mapping(struct address_space *mapping,
2052d1db3b1SChristoph Lameter 		struct page *newpage, struct page *page)
206b20a3503SChristoph Lameter {
207e286781dSNick Piggin 	int expected_count;
2087cf9c2c7SNick Piggin 	void **pslot;
209b20a3503SChristoph Lameter 
2106c5240aeSChristoph Lameter 	if (!mapping) {
2110e8c7d0fSChristoph Lameter 		/* Anonymous page without mapping */
2126c5240aeSChristoph Lameter 		if (page_count(page) != 1)
2136c5240aeSChristoph Lameter 			return -EAGAIN;
2146c5240aeSChristoph Lameter 		return 0;
2156c5240aeSChristoph Lameter 	}
2166c5240aeSChristoph Lameter 
21719fd6231SNick Piggin 	spin_lock_irq(&mapping->tree_lock);
218b20a3503SChristoph Lameter 
2197cf9c2c7SNick Piggin 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
220b20a3503SChristoph Lameter  					page_index(page));
221b20a3503SChristoph Lameter 
222edcf4748SJohannes Weiner 	expected_count = 2 + page_has_private(page);
223e286781dSNick Piggin 	if (page_count(page) != expected_count ||
2247cf9c2c7SNick Piggin 			(struct page *)radix_tree_deref_slot(pslot) != page) {
22519fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
226e23ca00bSChristoph Lameter 		return -EAGAIN;
227b20a3503SChristoph Lameter 	}
228b20a3503SChristoph Lameter 
229e286781dSNick Piggin 	if (!page_freeze_refs(page, expected_count)) {
23019fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
231e286781dSNick Piggin 		return -EAGAIN;
232e286781dSNick Piggin 	}
233e286781dSNick Piggin 
234b20a3503SChristoph Lameter 	/*
235b20a3503SChristoph Lameter 	 * Now we know that no one else is looking at the page.
236b20a3503SChristoph Lameter 	 */
2377cf9c2c7SNick Piggin 	get_page(newpage);	/* add cache reference */
238b20a3503SChristoph Lameter 	if (PageSwapCache(page)) {
239b20a3503SChristoph Lameter 		SetPageSwapCache(newpage);
240b20a3503SChristoph Lameter 		set_page_private(newpage, page_private(page));
241b20a3503SChristoph Lameter 	}
242b20a3503SChristoph Lameter 
2437cf9c2c7SNick Piggin 	radix_tree_replace_slot(pslot, newpage);
2447cf9c2c7SNick Piggin 
245e286781dSNick Piggin 	page_unfreeze_refs(page, expected_count);
2467cf9c2c7SNick Piggin 	/*
2477cf9c2c7SNick Piggin 	 * Drop cache reference from old page.
2487cf9c2c7SNick Piggin 	 * We know this isn't the last reference.
2497cf9c2c7SNick Piggin 	 */
250b20a3503SChristoph Lameter 	__put_page(page);
2517cf9c2c7SNick Piggin 
2520e8c7d0fSChristoph Lameter 	/*
2530e8c7d0fSChristoph Lameter 	 * If moved to a different zone then also account
2540e8c7d0fSChristoph Lameter 	 * the page for that zone. Other VM counters will be
2550e8c7d0fSChristoph Lameter 	 * taken care of when we establish references to the
2560e8c7d0fSChristoph Lameter 	 * new page and drop references to the old page.
2570e8c7d0fSChristoph Lameter 	 *
2580e8c7d0fSChristoph Lameter 	 * Note that anonymous pages are accounted for
2590e8c7d0fSChristoph Lameter 	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
2600e8c7d0fSChristoph Lameter 	 * are mapped to swap space.
2610e8c7d0fSChristoph Lameter 	 */
2620e8c7d0fSChristoph Lameter 	__dec_zone_page_state(page, NR_FILE_PAGES);
2630e8c7d0fSChristoph Lameter 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
2644b02108aSKOSAKI Motohiro 	if (PageSwapBacked(page)) {
2654b02108aSKOSAKI Motohiro 		__dec_zone_page_state(page, NR_SHMEM);
2664b02108aSKOSAKI Motohiro 		__inc_zone_page_state(newpage, NR_SHMEM);
2674b02108aSKOSAKI Motohiro 	}
26819fd6231SNick Piggin 	spin_unlock_irq(&mapping->tree_lock);
269b20a3503SChristoph Lameter 
270b20a3503SChristoph Lameter 	return 0;
271b20a3503SChristoph Lameter }
272b20a3503SChristoph Lameter 
273b20a3503SChristoph Lameter /*
274b20a3503SChristoph Lameter  * Copy the page to its new location
275b20a3503SChristoph Lameter  */
276e7340f73SChristoph Lameter static void migrate_page_copy(struct page *newpage, struct page *page)
277b20a3503SChristoph Lameter {
278b7abea96SKAMEZAWA Hiroyuki 	int anon;
279b7abea96SKAMEZAWA Hiroyuki 
280b20a3503SChristoph Lameter 	copy_highpage(newpage, page);
281b20a3503SChristoph Lameter 
282b20a3503SChristoph Lameter 	if (PageError(page))
283b20a3503SChristoph Lameter 		SetPageError(newpage);
284b20a3503SChristoph Lameter 	if (PageReferenced(page))
285b20a3503SChristoph Lameter 		SetPageReferenced(newpage);
286b20a3503SChristoph Lameter 	if (PageUptodate(page))
287b20a3503SChristoph Lameter 		SetPageUptodate(newpage);
288894bc310SLee Schermerhorn 	if (TestClearPageActive(page)) {
289894bc310SLee Schermerhorn 		VM_BUG_ON(PageUnevictable(page));
290b20a3503SChristoph Lameter 		SetPageActive(newpage);
291418b27efSLee Schermerhorn 	} else if (TestClearPageUnevictable(page))
292418b27efSLee Schermerhorn 		SetPageUnevictable(newpage);
293b20a3503SChristoph Lameter 	if (PageChecked(page))
294b20a3503SChristoph Lameter 		SetPageChecked(newpage);
295b20a3503SChristoph Lameter 	if (PageMappedToDisk(page))
296b20a3503SChristoph Lameter 		SetPageMappedToDisk(newpage);
297b20a3503SChristoph Lameter 
298b20a3503SChristoph Lameter 	if (PageDirty(page)) {
299b20a3503SChristoph Lameter 		clear_page_dirty_for_io(page);
3003a902c5fSNick Piggin 		/*
3013a902c5fSNick Piggin 		 * Want to mark the page and the radix tree as dirty, and
3023a902c5fSNick Piggin 		 * redo the accounting that clear_page_dirty_for_io undid,
3033a902c5fSNick Piggin 		 * but we can't use set_page_dirty because that function
3043a902c5fSNick Piggin 		 * is actually a signal that all of the page has become dirty.
3053a902c5fSNick Piggin 		 * Wheras only part of our page may be dirty.
3063a902c5fSNick Piggin 		 */
3073a902c5fSNick Piggin 		__set_page_dirty_nobuffers(newpage);
308b20a3503SChristoph Lameter  	}
309b20a3503SChristoph Lameter 
310b291f000SNick Piggin 	mlock_migrate_page(newpage, page);
311e9995ef9SHugh Dickins 	ksm_migrate_page(newpage, page);
312b291f000SNick Piggin 
313b20a3503SChristoph Lameter 	ClearPageSwapCache(page);
314b20a3503SChristoph Lameter 	ClearPagePrivate(page);
315b20a3503SChristoph Lameter 	set_page_private(page, 0);
316b7abea96SKAMEZAWA Hiroyuki 	/* page->mapping contains a flag for PageAnon() */
317b7abea96SKAMEZAWA Hiroyuki 	anon = PageAnon(page);
318b20a3503SChristoph Lameter 	page->mapping = NULL;
319b20a3503SChristoph Lameter 
320b20a3503SChristoph Lameter 	/*
321b20a3503SChristoph Lameter 	 * If any waiters have accumulated on the new page then
322b20a3503SChristoph Lameter 	 * wake them up.
323b20a3503SChristoph Lameter 	 */
324b20a3503SChristoph Lameter 	if (PageWriteback(newpage))
325b20a3503SChristoph Lameter 		end_page_writeback(newpage);
326b20a3503SChristoph Lameter }
327b20a3503SChristoph Lameter 
3281d8b85ccSChristoph Lameter /************************************************************
3291d8b85ccSChristoph Lameter  *                    Migration functions
3301d8b85ccSChristoph Lameter  ***********************************************************/
3311d8b85ccSChristoph Lameter 
3321d8b85ccSChristoph Lameter /* Always fail migration. Used for mappings that are not movable */
3332d1db3b1SChristoph Lameter int fail_migrate_page(struct address_space *mapping,
3342d1db3b1SChristoph Lameter 			struct page *newpage, struct page *page)
3351d8b85ccSChristoph Lameter {
3361d8b85ccSChristoph Lameter 	return -EIO;
3371d8b85ccSChristoph Lameter }
3381d8b85ccSChristoph Lameter EXPORT_SYMBOL(fail_migrate_page);
3391d8b85ccSChristoph Lameter 
340b20a3503SChristoph Lameter /*
341b20a3503SChristoph Lameter  * Common logic to directly migrate a single page suitable for
342266cf658SDavid Howells  * pages that do not use PagePrivate/PagePrivate2.
343b20a3503SChristoph Lameter  *
344b20a3503SChristoph Lameter  * Pages are locked upon entry and exit.
345b20a3503SChristoph Lameter  */
3462d1db3b1SChristoph Lameter int migrate_page(struct address_space *mapping,
3472d1db3b1SChristoph Lameter 		struct page *newpage, struct page *page)
348b20a3503SChristoph Lameter {
349b20a3503SChristoph Lameter 	int rc;
350b20a3503SChristoph Lameter 
351b20a3503SChristoph Lameter 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
352b20a3503SChristoph Lameter 
3532d1db3b1SChristoph Lameter 	rc = migrate_page_move_mapping(mapping, newpage, page);
354b20a3503SChristoph Lameter 
355b20a3503SChristoph Lameter 	if (rc)
356b20a3503SChristoph Lameter 		return rc;
357b20a3503SChristoph Lameter 
358b20a3503SChristoph Lameter 	migrate_page_copy(newpage, page);
359b20a3503SChristoph Lameter 	return 0;
360b20a3503SChristoph Lameter }
361b20a3503SChristoph Lameter EXPORT_SYMBOL(migrate_page);
362b20a3503SChristoph Lameter 
3639361401eSDavid Howells #ifdef CONFIG_BLOCK
364b20a3503SChristoph Lameter /*
3651d8b85ccSChristoph Lameter  * Migration function for pages with buffers. This function can only be used
3661d8b85ccSChristoph Lameter  * if the underlying filesystem guarantees that no other references to "page"
3671d8b85ccSChristoph Lameter  * exist.
3681d8b85ccSChristoph Lameter  */
3692d1db3b1SChristoph Lameter int buffer_migrate_page(struct address_space *mapping,
3702d1db3b1SChristoph Lameter 		struct page *newpage, struct page *page)
3711d8b85ccSChristoph Lameter {
3721d8b85ccSChristoph Lameter 	struct buffer_head *bh, *head;
3731d8b85ccSChristoph Lameter 	int rc;
3741d8b85ccSChristoph Lameter 
3751d8b85ccSChristoph Lameter 	if (!page_has_buffers(page))
3762d1db3b1SChristoph Lameter 		return migrate_page(mapping, newpage, page);
3771d8b85ccSChristoph Lameter 
3781d8b85ccSChristoph Lameter 	head = page_buffers(page);
3791d8b85ccSChristoph Lameter 
3802d1db3b1SChristoph Lameter 	rc = migrate_page_move_mapping(mapping, newpage, page);
3811d8b85ccSChristoph Lameter 
3821d8b85ccSChristoph Lameter 	if (rc)
3831d8b85ccSChristoph Lameter 		return rc;
3841d8b85ccSChristoph Lameter 
3851d8b85ccSChristoph Lameter 	bh = head;
3861d8b85ccSChristoph Lameter 	do {
3871d8b85ccSChristoph Lameter 		get_bh(bh);
3881d8b85ccSChristoph Lameter 		lock_buffer(bh);
3891d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
3901d8b85ccSChristoph Lameter 
3911d8b85ccSChristoph Lameter 	} while (bh != head);
3921d8b85ccSChristoph Lameter 
3931d8b85ccSChristoph Lameter 	ClearPagePrivate(page);
3941d8b85ccSChristoph Lameter 	set_page_private(newpage, page_private(page));
3951d8b85ccSChristoph Lameter 	set_page_private(page, 0);
3961d8b85ccSChristoph Lameter 	put_page(page);
3971d8b85ccSChristoph Lameter 	get_page(newpage);
3981d8b85ccSChristoph Lameter 
3991d8b85ccSChristoph Lameter 	bh = head;
4001d8b85ccSChristoph Lameter 	do {
4011d8b85ccSChristoph Lameter 		set_bh_page(bh, newpage, bh_offset(bh));
4021d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
4031d8b85ccSChristoph Lameter 
4041d8b85ccSChristoph Lameter 	} while (bh != head);
4051d8b85ccSChristoph Lameter 
4061d8b85ccSChristoph Lameter 	SetPagePrivate(newpage);
4071d8b85ccSChristoph Lameter 
4081d8b85ccSChristoph Lameter 	migrate_page_copy(newpage, page);
4091d8b85ccSChristoph Lameter 
4101d8b85ccSChristoph Lameter 	bh = head;
4111d8b85ccSChristoph Lameter 	do {
4121d8b85ccSChristoph Lameter 		unlock_buffer(bh);
4131d8b85ccSChristoph Lameter  		put_bh(bh);
4141d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
4151d8b85ccSChristoph Lameter 
4161d8b85ccSChristoph Lameter 	} while (bh != head);
4171d8b85ccSChristoph Lameter 
4181d8b85ccSChristoph Lameter 	return 0;
4191d8b85ccSChristoph Lameter }
4201d8b85ccSChristoph Lameter EXPORT_SYMBOL(buffer_migrate_page);
4219361401eSDavid Howells #endif
4221d8b85ccSChristoph Lameter 
42304e62a29SChristoph Lameter /*
42404e62a29SChristoph Lameter  * Writeback a page to clean the dirty state
42504e62a29SChristoph Lameter  */
42604e62a29SChristoph Lameter static int writeout(struct address_space *mapping, struct page *page)
42704e62a29SChristoph Lameter {
42804e62a29SChristoph Lameter 	struct writeback_control wbc = {
42904e62a29SChristoph Lameter 		.sync_mode = WB_SYNC_NONE,
43004e62a29SChristoph Lameter 		.nr_to_write = 1,
43104e62a29SChristoph Lameter 		.range_start = 0,
43204e62a29SChristoph Lameter 		.range_end = LLONG_MAX,
43304e62a29SChristoph Lameter 		.nonblocking = 1,
43404e62a29SChristoph Lameter 		.for_reclaim = 1
43504e62a29SChristoph Lameter 	};
43604e62a29SChristoph Lameter 	int rc;
43704e62a29SChristoph Lameter 
43804e62a29SChristoph Lameter 	if (!mapping->a_ops->writepage)
43904e62a29SChristoph Lameter 		/* No write method for the address space */
44004e62a29SChristoph Lameter 		return -EINVAL;
44104e62a29SChristoph Lameter 
44204e62a29SChristoph Lameter 	if (!clear_page_dirty_for_io(page))
44304e62a29SChristoph Lameter 		/* Someone else already triggered a write */
44404e62a29SChristoph Lameter 		return -EAGAIN;
44504e62a29SChristoph Lameter 
44604e62a29SChristoph Lameter 	/*
44704e62a29SChristoph Lameter 	 * A dirty page may imply that the underlying filesystem has
44804e62a29SChristoph Lameter 	 * the page on some queue. So the page must be clean for
44904e62a29SChristoph Lameter 	 * migration. Writeout may mean we loose the lock and the
45004e62a29SChristoph Lameter 	 * page state is no longer what we checked for earlier.
45104e62a29SChristoph Lameter 	 * At this point we know that the migration attempt cannot
45204e62a29SChristoph Lameter 	 * be successful.
45304e62a29SChristoph Lameter 	 */
45404e62a29SChristoph Lameter 	remove_migration_ptes(page, page);
45504e62a29SChristoph Lameter 
45604e62a29SChristoph Lameter 	rc = mapping->a_ops->writepage(page, &wbc);
45704e62a29SChristoph Lameter 
45804e62a29SChristoph Lameter 	if (rc != AOP_WRITEPAGE_ACTIVATE)
45904e62a29SChristoph Lameter 		/* unlocked. Relock */
46004e62a29SChristoph Lameter 		lock_page(page);
46104e62a29SChristoph Lameter 
462bda8550dSHugh Dickins 	return (rc < 0) ? -EIO : -EAGAIN;
46304e62a29SChristoph Lameter }
46404e62a29SChristoph Lameter 
46504e62a29SChristoph Lameter /*
46604e62a29SChristoph Lameter  * Default handling if a filesystem does not provide a migration function.
46704e62a29SChristoph Lameter  */
4688351a6e4SChristoph Lameter static int fallback_migrate_page(struct address_space *mapping,
4698351a6e4SChristoph Lameter 	struct page *newpage, struct page *page)
4708351a6e4SChristoph Lameter {
47104e62a29SChristoph Lameter 	if (PageDirty(page))
47204e62a29SChristoph Lameter 		return writeout(mapping, page);
4738351a6e4SChristoph Lameter 
4748351a6e4SChristoph Lameter 	/*
4758351a6e4SChristoph Lameter 	 * Buffers may be managed in a filesystem specific way.
4768351a6e4SChristoph Lameter 	 * We must have no buffers or drop them.
4778351a6e4SChristoph Lameter 	 */
478266cf658SDavid Howells 	if (page_has_private(page) &&
4798351a6e4SChristoph Lameter 	    !try_to_release_page(page, GFP_KERNEL))
4808351a6e4SChristoph Lameter 		return -EAGAIN;
4818351a6e4SChristoph Lameter 
4828351a6e4SChristoph Lameter 	return migrate_page(mapping, newpage, page);
4838351a6e4SChristoph Lameter }
4848351a6e4SChristoph Lameter 
4851d8b85ccSChristoph Lameter /*
486e24f0b8fSChristoph Lameter  * Move a page to a newly allocated page
487e24f0b8fSChristoph Lameter  * The page is locked and all ptes have been successfully removed.
488b20a3503SChristoph Lameter  *
489e24f0b8fSChristoph Lameter  * The new page will have replaced the old page if this function
490e24f0b8fSChristoph Lameter  * is successful.
491894bc310SLee Schermerhorn  *
492894bc310SLee Schermerhorn  * Return value:
493894bc310SLee Schermerhorn  *   < 0 - error code
494894bc310SLee Schermerhorn  *  == 0 - success
495b20a3503SChristoph Lameter  */
496e24f0b8fSChristoph Lameter static int move_to_new_page(struct page *newpage, struct page *page)
497b20a3503SChristoph Lameter {
498e24f0b8fSChristoph Lameter 	struct address_space *mapping;
499b20a3503SChristoph Lameter 	int rc;
500b20a3503SChristoph Lameter 
501b20a3503SChristoph Lameter 	/*
502e24f0b8fSChristoph Lameter 	 * Block others from accessing the page when we get around to
503e24f0b8fSChristoph Lameter 	 * establishing additional references. We are the only one
504e24f0b8fSChristoph Lameter 	 * holding a reference to the new page at this point.
505b20a3503SChristoph Lameter 	 */
506529ae9aaSNick Piggin 	if (!trylock_page(newpage))
507e24f0b8fSChristoph Lameter 		BUG();
508b20a3503SChristoph Lameter 
5092d1db3b1SChristoph Lameter 	/* Prepare mapping for the new page.*/
5102d1db3b1SChristoph Lameter 	newpage->index = page->index;
5112d1db3b1SChristoph Lameter 	newpage->mapping = page->mapping;
512b2e18538SRik van Riel 	if (PageSwapBacked(page))
513b2e18538SRik van Riel 		SetPageSwapBacked(newpage);
5142d1db3b1SChristoph Lameter 
515b20a3503SChristoph Lameter 	mapping = page_mapping(page);
516b20a3503SChristoph Lameter 	if (!mapping)
5176c5240aeSChristoph Lameter 		rc = migrate_page(mapping, newpage, page);
5186c5240aeSChristoph Lameter 	else if (mapping->a_ops->migratepage)
519b20a3503SChristoph Lameter 		/*
520b20a3503SChristoph Lameter 		 * Most pages have a mapping and most filesystems
521b20a3503SChristoph Lameter 		 * should provide a migration function. Anonymous
522b20a3503SChristoph Lameter 		 * pages are part of swap space which also has its
523b20a3503SChristoph Lameter 		 * own migration function. This is the most common
524b20a3503SChristoph Lameter 		 * path for page migration.
525b20a3503SChristoph Lameter 		 */
5262d1db3b1SChristoph Lameter 		rc = mapping->a_ops->migratepage(mapping,
5272d1db3b1SChristoph Lameter 						newpage, page);
5288351a6e4SChristoph Lameter 	else
5298351a6e4SChristoph Lameter 		rc = fallback_migrate_page(mapping, newpage, page);
530b20a3503SChristoph Lameter 
531e9995ef9SHugh Dickins 	if (!rc)
5326c5240aeSChristoph Lameter 		remove_migration_ptes(page, newpage);
533e9995ef9SHugh Dickins 	else
534e24f0b8fSChristoph Lameter 		newpage->mapping = NULL;
5356c5240aeSChristoph Lameter 
536b20a3503SChristoph Lameter 	unlock_page(newpage);
537b20a3503SChristoph Lameter 
538e24f0b8fSChristoph Lameter 	return rc;
539e24f0b8fSChristoph Lameter }
540e24f0b8fSChristoph Lameter 
541e24f0b8fSChristoph Lameter /*
542e24f0b8fSChristoph Lameter  * Obtain the lock on page, remove all ptes and migrate the page
543e24f0b8fSChristoph Lameter  * to the newly allocated page in newpage.
544e24f0b8fSChristoph Lameter  */
54595a402c3SChristoph Lameter static int unmap_and_move(new_page_t get_new_page, unsigned long private,
54662b61f61SHugh Dickins 			struct page *page, int force, int offlining)
547e24f0b8fSChristoph Lameter {
548e24f0b8fSChristoph Lameter 	int rc = 0;
549742755a1SChristoph Lameter 	int *result = NULL;
550742755a1SChristoph Lameter 	struct page *newpage = get_new_page(page, private, &result);
551989f89c5SKAMEZAWA Hiroyuki 	int rcu_locked = 0;
552ae41be37SKAMEZAWA Hiroyuki 	int charge = 0;
553e00e4316SKAMEZAWA Hiroyuki 	struct mem_cgroup *mem = NULL;
55495a402c3SChristoph Lameter 
55595a402c3SChristoph Lameter 	if (!newpage)
55695a402c3SChristoph Lameter 		return -ENOMEM;
557e24f0b8fSChristoph Lameter 
558894bc310SLee Schermerhorn 	if (page_count(page) == 1) {
559e24f0b8fSChristoph Lameter 		/* page was freed from under us. So we are done. */
56095a402c3SChristoph Lameter 		goto move_newpage;
561894bc310SLee Schermerhorn 	}
562e24f0b8fSChristoph Lameter 
563e8589cc1SKAMEZAWA Hiroyuki 	/* prepare cgroup just returns 0 or -ENOMEM */
564e24f0b8fSChristoph Lameter 	rc = -EAGAIN;
56501b1ae63SKAMEZAWA Hiroyuki 
566529ae9aaSNick Piggin 	if (!trylock_page(page)) {
567e24f0b8fSChristoph Lameter 		if (!force)
56895a402c3SChristoph Lameter 			goto move_newpage;
569e24f0b8fSChristoph Lameter 		lock_page(page);
570e24f0b8fSChristoph Lameter 	}
571e24f0b8fSChristoph Lameter 
57262b61f61SHugh Dickins 	/*
57362b61f61SHugh Dickins 	 * Only memory hotplug's offline_pages() caller has locked out KSM,
57462b61f61SHugh Dickins 	 * and can safely migrate a KSM page.  The other cases have skipped
57562b61f61SHugh Dickins 	 * PageKsm along with PageReserved - but it is only now when we have
57662b61f61SHugh Dickins 	 * the page lock that we can be certain it will not go KSM beneath us
57762b61f61SHugh Dickins 	 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
57862b61f61SHugh Dickins 	 * its pagecount raised, but only here do we take the page lock which
57962b61f61SHugh Dickins 	 * serializes that).
58062b61f61SHugh Dickins 	 */
58162b61f61SHugh Dickins 	if (PageKsm(page) && !offlining) {
58262b61f61SHugh Dickins 		rc = -EBUSY;
58362b61f61SHugh Dickins 		goto unlock;
58462b61f61SHugh Dickins 	}
58562b61f61SHugh Dickins 
58601b1ae63SKAMEZAWA Hiroyuki 	/* charge against new page */
58701b1ae63SKAMEZAWA Hiroyuki 	charge = mem_cgroup_prepare_migration(page, &mem);
58801b1ae63SKAMEZAWA Hiroyuki 	if (charge == -ENOMEM) {
58901b1ae63SKAMEZAWA Hiroyuki 		rc = -ENOMEM;
59001b1ae63SKAMEZAWA Hiroyuki 		goto unlock;
59101b1ae63SKAMEZAWA Hiroyuki 	}
59201b1ae63SKAMEZAWA Hiroyuki 	BUG_ON(charge);
59301b1ae63SKAMEZAWA Hiroyuki 
594e24f0b8fSChristoph Lameter 	if (PageWriteback(page)) {
595e24f0b8fSChristoph Lameter 		if (!force)
59601b1ae63SKAMEZAWA Hiroyuki 			goto uncharge;
597e24f0b8fSChristoph Lameter 		wait_on_page_writeback(page);
598e24f0b8fSChristoph Lameter 	}
599e24f0b8fSChristoph Lameter 	/*
600dc386d4dSKAMEZAWA Hiroyuki 	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
601dc386d4dSKAMEZAWA Hiroyuki 	 * we cannot notice that anon_vma is freed while we migrates a page.
602dc386d4dSKAMEZAWA Hiroyuki 	 * This rcu_read_lock() delays freeing anon_vma pointer until the end
603dc386d4dSKAMEZAWA Hiroyuki 	 * of migration. File cache pages are no problem because of page_lock()
604989f89c5SKAMEZAWA Hiroyuki 	 * File Caches may use write_page() or lock_page() in migration, then,
605989f89c5SKAMEZAWA Hiroyuki 	 * just care Anon page here.
606e24f0b8fSChristoph Lameter 	 */
607989f89c5SKAMEZAWA Hiroyuki 	if (PageAnon(page)) {
608dc386d4dSKAMEZAWA Hiroyuki 		rcu_read_lock();
609989f89c5SKAMEZAWA Hiroyuki 		rcu_locked = 1;
610989f89c5SKAMEZAWA Hiroyuki 	}
61162e1c553SShaohua Li 
612dc386d4dSKAMEZAWA Hiroyuki 	/*
61362e1c553SShaohua Li 	 * Corner case handling:
61462e1c553SShaohua Li 	 * 1. When a new swap-cache page is read into, it is added to the LRU
61562e1c553SShaohua Li 	 * and treated as swapcache but it has no rmap yet.
61662e1c553SShaohua Li 	 * Calling try_to_unmap() against a page->mapping==NULL page will
61762e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
61862e1c553SShaohua Li 	 * 2. An orphaned page (see truncate_complete_page) might have
61962e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
62062e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
62162e1c553SShaohua Li 	 * invisible to the vm, so the page can not be migrated.  So try to
62262e1c553SShaohua Li 	 * free the metadata, so the page can be freed.
623dc386d4dSKAMEZAWA Hiroyuki 	 */
62462e1c553SShaohua Li 	if (!page->mapping) {
625266cf658SDavid Howells 		if (!PageAnon(page) && page_has_private(page)) {
62662e1c553SShaohua Li 			/*
62762e1c553SShaohua Li 			 * Go direct to try_to_free_buffers() here because
62862e1c553SShaohua Li 			 * a) that's what try_to_release_page() would do anyway
62962e1c553SShaohua Li 			 * b) we may be under rcu_read_lock() here, so we can't
63062e1c553SShaohua Li 			 *    use GFP_KERNEL which is what try_to_release_page()
63162e1c553SShaohua Li 			 *    needs to be effective.
63262e1c553SShaohua Li 			 */
63362e1c553SShaohua Li 			try_to_free_buffers(page);
634dc386d4dSKAMEZAWA Hiroyuki 			goto rcu_unlock;
63562e1c553SShaohua Li 		}
636abfc3488SShaohua Li 		goto skip_unmap;
637abfc3488SShaohua Li 	}
63862e1c553SShaohua Li 
639dc386d4dSKAMEZAWA Hiroyuki 	/* Establish migration ptes or remove ptes */
64014fa31b8SAndi Kleen 	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
641dc386d4dSKAMEZAWA Hiroyuki 
642abfc3488SShaohua Li skip_unmap:
643e24f0b8fSChristoph Lameter 	if (!page_mapped(page))
644e24f0b8fSChristoph Lameter 		rc = move_to_new_page(newpage, page);
645e24f0b8fSChristoph Lameter 
646e8589cc1SKAMEZAWA Hiroyuki 	if (rc)
6476c5240aeSChristoph Lameter 		remove_migration_ptes(page, page);
648dc386d4dSKAMEZAWA Hiroyuki rcu_unlock:
649989f89c5SKAMEZAWA Hiroyuki 	if (rcu_locked)
650dc386d4dSKAMEZAWA Hiroyuki 		rcu_read_unlock();
65101b1ae63SKAMEZAWA Hiroyuki uncharge:
65201b1ae63SKAMEZAWA Hiroyuki 	if (!charge)
65301b1ae63SKAMEZAWA Hiroyuki 		mem_cgroup_end_migration(mem, page, newpage);
654e24f0b8fSChristoph Lameter unlock:
655b20a3503SChristoph Lameter 	unlock_page(page);
65695a402c3SChristoph Lameter 
657e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
658aaa994b3SChristoph Lameter  		/*
659aaa994b3SChristoph Lameter  		 * A page that has been migrated has all references
660aaa994b3SChristoph Lameter  		 * removed and will be freed. A page that has not been
661aaa994b3SChristoph Lameter  		 * migrated will have kepts its references and be
662aaa994b3SChristoph Lameter  		 * restored.
663aaa994b3SChristoph Lameter  		 */
664aaa994b3SChristoph Lameter  		list_del(&page->lru);
665a731286dSKOSAKI Motohiro 		dec_zone_page_state(page, NR_ISOLATED_ANON +
6666c0b1351SJohannes Weiner 				page_is_file_cache(page));
667894bc310SLee Schermerhorn 		putback_lru_page(page);
668e24f0b8fSChristoph Lameter 	}
66995a402c3SChristoph Lameter 
67095a402c3SChristoph Lameter move_newpage:
671894bc310SLee Schermerhorn 
67295a402c3SChristoph Lameter 	/*
67395a402c3SChristoph Lameter 	 * Move the new page to the LRU. If migration was not successful
67495a402c3SChristoph Lameter 	 * then this will free the page.
67595a402c3SChristoph Lameter 	 */
676894bc310SLee Schermerhorn 	putback_lru_page(newpage);
677894bc310SLee Schermerhorn 
678742755a1SChristoph Lameter 	if (result) {
679742755a1SChristoph Lameter 		if (rc)
680742755a1SChristoph Lameter 			*result = rc;
681742755a1SChristoph Lameter 		else
682742755a1SChristoph Lameter 			*result = page_to_nid(newpage);
683742755a1SChristoph Lameter 	}
684e24f0b8fSChristoph Lameter 	return rc;
685e24f0b8fSChristoph Lameter }
686b20a3503SChristoph Lameter 
687e24f0b8fSChristoph Lameter /*
688e24f0b8fSChristoph Lameter  * migrate_pages
689e24f0b8fSChristoph Lameter  *
69095a402c3SChristoph Lameter  * The function takes one list of pages to migrate and a function
69195a402c3SChristoph Lameter  * that determines from the page to be migrated and the private data
69295a402c3SChristoph Lameter  * the target of the move and allocates the page.
693e24f0b8fSChristoph Lameter  *
694e24f0b8fSChristoph Lameter  * The function returns after 10 attempts or if no pages
695e24f0b8fSChristoph Lameter  * are movable anymore because to has become empty
696aaa994b3SChristoph Lameter  * or no retryable pages exist anymore. All pages will be
697e9534b3fSGabriel Craciunescu  * returned to the LRU or freed.
698e24f0b8fSChristoph Lameter  *
69995a402c3SChristoph Lameter  * Return: Number of pages not migrated or error code.
700e24f0b8fSChristoph Lameter  */
70195a402c3SChristoph Lameter int migrate_pages(struct list_head *from,
70262b61f61SHugh Dickins 		new_page_t get_new_page, unsigned long private, int offlining)
703e24f0b8fSChristoph Lameter {
704e24f0b8fSChristoph Lameter 	int retry = 1;
705e24f0b8fSChristoph Lameter 	int nr_failed = 0;
706e24f0b8fSChristoph Lameter 	int pass = 0;
707e24f0b8fSChristoph Lameter 	struct page *page;
708e24f0b8fSChristoph Lameter 	struct page *page2;
709e24f0b8fSChristoph Lameter 	int swapwrite = current->flags & PF_SWAPWRITE;
710e24f0b8fSChristoph Lameter 	int rc;
7112d1db3b1SChristoph Lameter 
712e24f0b8fSChristoph Lameter 	if (!swapwrite)
713e24f0b8fSChristoph Lameter 		current->flags |= PF_SWAPWRITE;
714e24f0b8fSChristoph Lameter 
715e24f0b8fSChristoph Lameter 	for(pass = 0; pass < 10 && retry; pass++) {
716e24f0b8fSChristoph Lameter 		retry = 0;
717e24f0b8fSChristoph Lameter 
718e24f0b8fSChristoph Lameter 		list_for_each_entry_safe(page, page2, from, lru) {
719e24f0b8fSChristoph Lameter 			cond_resched();
720e24f0b8fSChristoph Lameter 
72195a402c3SChristoph Lameter 			rc = unmap_and_move(get_new_page, private,
72262b61f61SHugh Dickins 						page, pass > 2, offlining);
723e24f0b8fSChristoph Lameter 
724e24f0b8fSChristoph Lameter 			switch(rc) {
72595a402c3SChristoph Lameter 			case -ENOMEM:
72695a402c3SChristoph Lameter 				goto out;
727e24f0b8fSChristoph Lameter 			case -EAGAIN:
728b20a3503SChristoph Lameter 				retry++;
729e24f0b8fSChristoph Lameter 				break;
730e24f0b8fSChristoph Lameter 			case 0:
731e24f0b8fSChristoph Lameter 				break;
732e24f0b8fSChristoph Lameter 			default:
733b20a3503SChristoph Lameter 				/* Permanent failure */
734b20a3503SChristoph Lameter 				nr_failed++;
735e24f0b8fSChristoph Lameter 				break;
736b20a3503SChristoph Lameter 			}
737b20a3503SChristoph Lameter 		}
738e24f0b8fSChristoph Lameter 	}
73995a402c3SChristoph Lameter 	rc = 0;
74095a402c3SChristoph Lameter out:
741b20a3503SChristoph Lameter 	if (!swapwrite)
742b20a3503SChristoph Lameter 		current->flags &= ~PF_SWAPWRITE;
743b20a3503SChristoph Lameter 
744aaa994b3SChristoph Lameter 	putback_lru_pages(from);
74595a402c3SChristoph Lameter 
74695a402c3SChristoph Lameter 	if (rc)
74795a402c3SChristoph Lameter 		return rc;
74895a402c3SChristoph Lameter 
749b20a3503SChristoph Lameter 	return nr_failed + retry;
750b20a3503SChristoph Lameter }
751b20a3503SChristoph Lameter 
752742755a1SChristoph Lameter #ifdef CONFIG_NUMA
753742755a1SChristoph Lameter /*
754742755a1SChristoph Lameter  * Move a list of individual pages
755742755a1SChristoph Lameter  */
756742755a1SChristoph Lameter struct page_to_node {
757742755a1SChristoph Lameter 	unsigned long addr;
758742755a1SChristoph Lameter 	struct page *page;
759742755a1SChristoph Lameter 	int node;
760742755a1SChristoph Lameter 	int status;
761742755a1SChristoph Lameter };
762742755a1SChristoph Lameter 
763742755a1SChristoph Lameter static struct page *new_page_node(struct page *p, unsigned long private,
764742755a1SChristoph Lameter 		int **result)
765742755a1SChristoph Lameter {
766742755a1SChristoph Lameter 	struct page_to_node *pm = (struct page_to_node *)private;
767742755a1SChristoph Lameter 
768742755a1SChristoph Lameter 	while (pm->node != MAX_NUMNODES && pm->page != p)
769742755a1SChristoph Lameter 		pm++;
770742755a1SChristoph Lameter 
771742755a1SChristoph Lameter 	if (pm->node == MAX_NUMNODES)
772742755a1SChristoph Lameter 		return NULL;
773742755a1SChristoph Lameter 
774742755a1SChristoph Lameter 	*result = &pm->status;
775742755a1SChristoph Lameter 
7766484eb3eSMel Gorman 	return alloc_pages_exact_node(pm->node,
777769848c0SMel Gorman 				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
778742755a1SChristoph Lameter }
779742755a1SChristoph Lameter 
780742755a1SChristoph Lameter /*
781742755a1SChristoph Lameter  * Move a set of pages as indicated in the pm array. The addr
782742755a1SChristoph Lameter  * field must be set to the virtual address of the page to be moved
783742755a1SChristoph Lameter  * and the node number must contain a valid target node.
7845e9a0f02SBrice Goglin  * The pm array ends with node = MAX_NUMNODES.
785742755a1SChristoph Lameter  */
7865e9a0f02SBrice Goglin static int do_move_page_to_node_array(struct mm_struct *mm,
7875e9a0f02SBrice Goglin 				      struct page_to_node *pm,
788742755a1SChristoph Lameter 				      int migrate_all)
789742755a1SChristoph Lameter {
790742755a1SChristoph Lameter 	int err;
791742755a1SChristoph Lameter 	struct page_to_node *pp;
792742755a1SChristoph Lameter 	LIST_HEAD(pagelist);
793742755a1SChristoph Lameter 
794742755a1SChristoph Lameter 	down_read(&mm->mmap_sem);
795742755a1SChristoph Lameter 
796742755a1SChristoph Lameter 	/*
797742755a1SChristoph Lameter 	 * Build a list of pages to migrate
798742755a1SChristoph Lameter 	 */
799742755a1SChristoph Lameter 	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
800742755a1SChristoph Lameter 		struct vm_area_struct *vma;
801742755a1SChristoph Lameter 		struct page *page;
802742755a1SChristoph Lameter 
803742755a1SChristoph Lameter 		err = -EFAULT;
804742755a1SChristoph Lameter 		vma = find_vma(mm, pp->addr);
8050dc952dcSChristoph Lameter 		if (!vma || !vma_migratable(vma))
806742755a1SChristoph Lameter 			goto set_status;
807742755a1SChristoph Lameter 
808742755a1SChristoph Lameter 		page = follow_page(vma, pp->addr, FOLL_GET);
80989f5b7daSLinus Torvalds 
81089f5b7daSLinus Torvalds 		err = PTR_ERR(page);
81189f5b7daSLinus Torvalds 		if (IS_ERR(page))
81289f5b7daSLinus Torvalds 			goto set_status;
81389f5b7daSLinus Torvalds 
814742755a1SChristoph Lameter 		err = -ENOENT;
815742755a1SChristoph Lameter 		if (!page)
816742755a1SChristoph Lameter 			goto set_status;
817742755a1SChristoph Lameter 
81862b61f61SHugh Dickins 		/* Use PageReserved to check for zero page */
81962b61f61SHugh Dickins 		if (PageReserved(page) || PageKsm(page))
820742755a1SChristoph Lameter 			goto put_and_set;
821742755a1SChristoph Lameter 
822742755a1SChristoph Lameter 		pp->page = page;
823742755a1SChristoph Lameter 		err = page_to_nid(page);
824742755a1SChristoph Lameter 
825742755a1SChristoph Lameter 		if (err == pp->node)
826742755a1SChristoph Lameter 			/*
827742755a1SChristoph Lameter 			 * Node already in the right place
828742755a1SChristoph Lameter 			 */
829742755a1SChristoph Lameter 			goto put_and_set;
830742755a1SChristoph Lameter 
831742755a1SChristoph Lameter 		err = -EACCES;
832742755a1SChristoph Lameter 		if (page_mapcount(page) > 1 &&
833742755a1SChristoph Lameter 				!migrate_all)
834742755a1SChristoph Lameter 			goto put_and_set;
835742755a1SChristoph Lameter 
83662695a84SNick Piggin 		err = isolate_lru_page(page);
8376d9c285aSKOSAKI Motohiro 		if (!err) {
83862695a84SNick Piggin 			list_add_tail(&page->lru, &pagelist);
8396d9c285aSKOSAKI Motohiro 			inc_zone_page_state(page, NR_ISOLATED_ANON +
8406d9c285aSKOSAKI Motohiro 					    page_is_file_cache(page));
8416d9c285aSKOSAKI Motohiro 		}
842742755a1SChristoph Lameter put_and_set:
843742755a1SChristoph Lameter 		/*
844742755a1SChristoph Lameter 		 * Either remove the duplicate refcount from
845742755a1SChristoph Lameter 		 * isolate_lru_page() or drop the page ref if it was
846742755a1SChristoph Lameter 		 * not isolated.
847742755a1SChristoph Lameter 		 */
848742755a1SChristoph Lameter 		put_page(page);
849742755a1SChristoph Lameter set_status:
850742755a1SChristoph Lameter 		pp->status = err;
851742755a1SChristoph Lameter 	}
852742755a1SChristoph Lameter 
853e78bbfa8SBrice Goglin 	err = 0;
854742755a1SChristoph Lameter 	if (!list_empty(&pagelist))
855742755a1SChristoph Lameter 		err = migrate_pages(&pagelist, new_page_node,
85662b61f61SHugh Dickins 				(unsigned long)pm, 0);
857742755a1SChristoph Lameter 
858742755a1SChristoph Lameter 	up_read(&mm->mmap_sem);
859742755a1SChristoph Lameter 	return err;
860742755a1SChristoph Lameter }
861742755a1SChristoph Lameter 
862742755a1SChristoph Lameter /*
8635e9a0f02SBrice Goglin  * Migrate an array of page address onto an array of nodes and fill
8645e9a0f02SBrice Goglin  * the corresponding array of status.
8655e9a0f02SBrice Goglin  */
8665e9a0f02SBrice Goglin static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
8675e9a0f02SBrice Goglin 			 unsigned long nr_pages,
8685e9a0f02SBrice Goglin 			 const void __user * __user *pages,
8695e9a0f02SBrice Goglin 			 const int __user *nodes,
8705e9a0f02SBrice Goglin 			 int __user *status, int flags)
8715e9a0f02SBrice Goglin {
8723140a227SBrice Goglin 	struct page_to_node *pm;
8735e9a0f02SBrice Goglin 	nodemask_t task_nodes;
8743140a227SBrice Goglin 	unsigned long chunk_nr_pages;
8753140a227SBrice Goglin 	unsigned long chunk_start;
8763140a227SBrice Goglin 	int err;
8775e9a0f02SBrice Goglin 
8785e9a0f02SBrice Goglin 	task_nodes = cpuset_mems_allowed(task);
8795e9a0f02SBrice Goglin 
8805e9a0f02SBrice Goglin 	err = -ENOMEM;
8813140a227SBrice Goglin 	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
8823140a227SBrice Goglin 	if (!pm)
8835e9a0f02SBrice Goglin 		goto out;
88435282a2dSBrice Goglin 
88535282a2dSBrice Goglin 	migrate_prep();
88635282a2dSBrice Goglin 
8875e9a0f02SBrice Goglin 	/*
8883140a227SBrice Goglin 	 * Store a chunk of page_to_node array in a page,
8893140a227SBrice Goglin 	 * but keep the last one as a marker
8905e9a0f02SBrice Goglin 	 */
8913140a227SBrice Goglin 	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
8923140a227SBrice Goglin 
8933140a227SBrice Goglin 	for (chunk_start = 0;
8943140a227SBrice Goglin 	     chunk_start < nr_pages;
8953140a227SBrice Goglin 	     chunk_start += chunk_nr_pages) {
8963140a227SBrice Goglin 		int j;
8973140a227SBrice Goglin 
8983140a227SBrice Goglin 		if (chunk_start + chunk_nr_pages > nr_pages)
8993140a227SBrice Goglin 			chunk_nr_pages = nr_pages - chunk_start;
9003140a227SBrice Goglin 
9013140a227SBrice Goglin 		/* fill the chunk pm with addrs and nodes from user-space */
9023140a227SBrice Goglin 		for (j = 0; j < chunk_nr_pages; j++) {
9035e9a0f02SBrice Goglin 			const void __user *p;
9045e9a0f02SBrice Goglin 			int node;
9055e9a0f02SBrice Goglin 
9063140a227SBrice Goglin 			err = -EFAULT;
9073140a227SBrice Goglin 			if (get_user(p, pages + j + chunk_start))
9083140a227SBrice Goglin 				goto out_pm;
9093140a227SBrice Goglin 			pm[j].addr = (unsigned long) p;
9103140a227SBrice Goglin 
9113140a227SBrice Goglin 			if (get_user(node, nodes + j + chunk_start))
9125e9a0f02SBrice Goglin 				goto out_pm;
9135e9a0f02SBrice Goglin 
9145e9a0f02SBrice Goglin 			err = -ENODEV;
915*6f5a55f1SLinus Torvalds 			if (node < 0 || node >= MAX_NUMNODES)
916*6f5a55f1SLinus Torvalds 				goto out_pm;
917*6f5a55f1SLinus Torvalds 
9185e9a0f02SBrice Goglin 			if (!node_state(node, N_HIGH_MEMORY))
9195e9a0f02SBrice Goglin 				goto out_pm;
9205e9a0f02SBrice Goglin 
9215e9a0f02SBrice Goglin 			err = -EACCES;
9225e9a0f02SBrice Goglin 			if (!node_isset(node, task_nodes))
9235e9a0f02SBrice Goglin 				goto out_pm;
9245e9a0f02SBrice Goglin 
9253140a227SBrice Goglin 			pm[j].node = node;
9265e9a0f02SBrice Goglin 		}
9275e9a0f02SBrice Goglin 
9283140a227SBrice Goglin 		/* End marker for this chunk */
9293140a227SBrice Goglin 		pm[chunk_nr_pages].node = MAX_NUMNODES;
9303140a227SBrice Goglin 
9313140a227SBrice Goglin 		/* Migrate this chunk */
9323140a227SBrice Goglin 		err = do_move_page_to_node_array(mm, pm,
9333140a227SBrice Goglin 						 flags & MPOL_MF_MOVE_ALL);
9343140a227SBrice Goglin 		if (err < 0)
9353140a227SBrice Goglin 			goto out_pm;
9363140a227SBrice Goglin 
9375e9a0f02SBrice Goglin 		/* Return status information */
9383140a227SBrice Goglin 		for (j = 0; j < chunk_nr_pages; j++)
9393140a227SBrice Goglin 			if (put_user(pm[j].status, status + j + chunk_start)) {
9405e9a0f02SBrice Goglin 				err = -EFAULT;
9413140a227SBrice Goglin 				goto out_pm;
9423140a227SBrice Goglin 			}
9433140a227SBrice Goglin 	}
9443140a227SBrice Goglin 	err = 0;
9455e9a0f02SBrice Goglin 
9465e9a0f02SBrice Goglin out_pm:
9473140a227SBrice Goglin 	free_page((unsigned long)pm);
9485e9a0f02SBrice Goglin out:
9495e9a0f02SBrice Goglin 	return err;
9505e9a0f02SBrice Goglin }
9515e9a0f02SBrice Goglin 
9525e9a0f02SBrice Goglin /*
9532f007e74SBrice Goglin  * Determine the nodes of an array of pages and store it in an array of status.
954742755a1SChristoph Lameter  */
95580bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
95680bba129SBrice Goglin 				const void __user **pages, int *status)
957742755a1SChristoph Lameter {
9582f007e74SBrice Goglin 	unsigned long i;
959742755a1SChristoph Lameter 
9602f007e74SBrice Goglin 	down_read(&mm->mmap_sem);
9612f007e74SBrice Goglin 
9622f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
96380bba129SBrice Goglin 		unsigned long addr = (unsigned long)(*pages);
9642f007e74SBrice Goglin 		struct vm_area_struct *vma;
9652f007e74SBrice Goglin 		struct page *page;
966c095adbcSKOSAKI Motohiro 		int err = -EFAULT;
9672f007e74SBrice Goglin 
9682f007e74SBrice Goglin 		vma = find_vma(mm, addr);
969742755a1SChristoph Lameter 		if (!vma)
970742755a1SChristoph Lameter 			goto set_status;
971742755a1SChristoph Lameter 
9722f007e74SBrice Goglin 		page = follow_page(vma, addr, 0);
97389f5b7daSLinus Torvalds 
97489f5b7daSLinus Torvalds 		err = PTR_ERR(page);
97589f5b7daSLinus Torvalds 		if (IS_ERR(page))
97689f5b7daSLinus Torvalds 			goto set_status;
97789f5b7daSLinus Torvalds 
978742755a1SChristoph Lameter 		err = -ENOENT;
979742755a1SChristoph Lameter 		/* Use PageReserved to check for zero page */
98062b61f61SHugh Dickins 		if (!page || PageReserved(page) || PageKsm(page))
981742755a1SChristoph Lameter 			goto set_status;
982742755a1SChristoph Lameter 
983742755a1SChristoph Lameter 		err = page_to_nid(page);
984742755a1SChristoph Lameter set_status:
98580bba129SBrice Goglin 		*status = err;
98680bba129SBrice Goglin 
98780bba129SBrice Goglin 		pages++;
98880bba129SBrice Goglin 		status++;
98980bba129SBrice Goglin 	}
99080bba129SBrice Goglin 
99180bba129SBrice Goglin 	up_read(&mm->mmap_sem);
99280bba129SBrice Goglin }
99380bba129SBrice Goglin 
99480bba129SBrice Goglin /*
99580bba129SBrice Goglin  * Determine the nodes of a user array of pages and store it in
99680bba129SBrice Goglin  * a user array of status.
99780bba129SBrice Goglin  */
99880bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
99980bba129SBrice Goglin 			 const void __user * __user *pages,
100080bba129SBrice Goglin 			 int __user *status)
100180bba129SBrice Goglin {
100280bba129SBrice Goglin #define DO_PAGES_STAT_CHUNK_NR 16
100380bba129SBrice Goglin 	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
100480bba129SBrice Goglin 	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
100580bba129SBrice Goglin 	unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
100680bba129SBrice Goglin 	int err;
100780bba129SBrice Goglin 
100880bba129SBrice Goglin 	for (i = 0; i < nr_pages; i += chunk_nr) {
1009b9255850SH. Peter Anvin 		if (chunk_nr > nr_pages - i)
101080bba129SBrice Goglin 			chunk_nr = nr_pages - i;
101180bba129SBrice Goglin 
101280bba129SBrice Goglin 		err = copy_from_user(chunk_pages, &pages[i],
101380bba129SBrice Goglin 				     chunk_nr * sizeof(*chunk_pages));
101480bba129SBrice Goglin 		if (err) {
101580bba129SBrice Goglin 			err = -EFAULT;
101680bba129SBrice Goglin 			goto out;
101780bba129SBrice Goglin 		}
101880bba129SBrice Goglin 
101980bba129SBrice Goglin 		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
102080bba129SBrice Goglin 
102180bba129SBrice Goglin 		err = copy_to_user(&status[i], chunk_status,
102280bba129SBrice Goglin 				   chunk_nr * sizeof(*chunk_status));
102380bba129SBrice Goglin 		if (err) {
102480bba129SBrice Goglin 			err = -EFAULT;
102580bba129SBrice Goglin 			goto out;
102680bba129SBrice Goglin 		}
1027742755a1SChristoph Lameter 	}
10282f007e74SBrice Goglin 	err = 0;
1029742755a1SChristoph Lameter 
10302f007e74SBrice Goglin out:
10312f007e74SBrice Goglin 	return err;
1032742755a1SChristoph Lameter }
1033742755a1SChristoph Lameter 
1034742755a1SChristoph Lameter /*
1035742755a1SChristoph Lameter  * Move a list of pages in the address space of the currently executing
1036742755a1SChristoph Lameter  * process.
1037742755a1SChristoph Lameter  */
1038938bb9f5SHeiko Carstens SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1039938bb9f5SHeiko Carstens 		const void __user * __user *, pages,
1040938bb9f5SHeiko Carstens 		const int __user *, nodes,
1041938bb9f5SHeiko Carstens 		int __user *, status, int, flags)
1042742755a1SChristoph Lameter {
1043c69e8d9cSDavid Howells 	const struct cred *cred = current_cred(), *tcred;
1044742755a1SChristoph Lameter 	struct task_struct *task;
1045742755a1SChristoph Lameter 	struct mm_struct *mm;
10465e9a0f02SBrice Goglin 	int err;
1047742755a1SChristoph Lameter 
1048742755a1SChristoph Lameter 	/* Check flags */
1049742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1050742755a1SChristoph Lameter 		return -EINVAL;
1051742755a1SChristoph Lameter 
1052742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1053742755a1SChristoph Lameter 		return -EPERM;
1054742755a1SChristoph Lameter 
1055742755a1SChristoph Lameter 	/* Find the mm_struct */
1056742755a1SChristoph Lameter 	read_lock(&tasklist_lock);
1057228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
1058742755a1SChristoph Lameter 	if (!task) {
1059742755a1SChristoph Lameter 		read_unlock(&tasklist_lock);
1060742755a1SChristoph Lameter 		return -ESRCH;
1061742755a1SChristoph Lameter 	}
1062742755a1SChristoph Lameter 	mm = get_task_mm(task);
1063742755a1SChristoph Lameter 	read_unlock(&tasklist_lock);
1064742755a1SChristoph Lameter 
1065742755a1SChristoph Lameter 	if (!mm)
1066742755a1SChristoph Lameter 		return -EINVAL;
1067742755a1SChristoph Lameter 
1068742755a1SChristoph Lameter 	/*
1069742755a1SChristoph Lameter 	 * Check if this process has the right to modify the specified
1070742755a1SChristoph Lameter 	 * process. The right exists if the process has administrative
1071742755a1SChristoph Lameter 	 * capabilities, superuser privileges or the same
1072742755a1SChristoph Lameter 	 * userid as the target process.
1073742755a1SChristoph Lameter 	 */
1074c69e8d9cSDavid Howells 	rcu_read_lock();
1075c69e8d9cSDavid Howells 	tcred = __task_cred(task);
1076b6dff3ecSDavid Howells 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1077b6dff3ecSDavid Howells 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1078742755a1SChristoph Lameter 	    !capable(CAP_SYS_NICE)) {
1079c69e8d9cSDavid Howells 		rcu_read_unlock();
1080742755a1SChristoph Lameter 		err = -EPERM;
10815e9a0f02SBrice Goglin 		goto out;
1082742755a1SChristoph Lameter 	}
1083c69e8d9cSDavid Howells 	rcu_read_unlock();
1084742755a1SChristoph Lameter 
108586c3a764SDavid Quigley  	err = security_task_movememory(task);
108686c3a764SDavid Quigley  	if (err)
1087742755a1SChristoph Lameter 		goto out;
1088742755a1SChristoph Lameter 
1089742755a1SChristoph Lameter 	if (nodes) {
10905e9a0f02SBrice Goglin 		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
10915e9a0f02SBrice Goglin 				    flags);
10925e9a0f02SBrice Goglin 	} else {
10935e9a0f02SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
1094742755a1SChristoph Lameter 	}
1095742755a1SChristoph Lameter 
1096742755a1SChristoph Lameter out:
1097742755a1SChristoph Lameter 	mmput(mm);
1098742755a1SChristoph Lameter 	return err;
1099742755a1SChristoph Lameter }
1100742755a1SChristoph Lameter 
11017b2259b3SChristoph Lameter /*
11027b2259b3SChristoph Lameter  * Call migration functions in the vma_ops that may prepare
11037b2259b3SChristoph Lameter  * memory in a vm for migration. migration functions may perform
11047b2259b3SChristoph Lameter  * the migration for vmas that do not have an underlying page struct.
11057b2259b3SChristoph Lameter  */
11067b2259b3SChristoph Lameter int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
11077b2259b3SChristoph Lameter 	const nodemask_t *from, unsigned long flags)
11087b2259b3SChristoph Lameter {
11097b2259b3SChristoph Lameter  	struct vm_area_struct *vma;
11107b2259b3SChristoph Lameter  	int err = 0;
11117b2259b3SChristoph Lameter 
11121001c9fbSDaisuke Nishimura 	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
11137b2259b3SChristoph Lameter  		if (vma->vm_ops && vma->vm_ops->migrate) {
11147b2259b3SChristoph Lameter  			err = vma->vm_ops->migrate(vma, to, from, flags);
11157b2259b3SChristoph Lameter  			if (err)
11167b2259b3SChristoph Lameter  				break;
11177b2259b3SChristoph Lameter  		}
11187b2259b3SChristoph Lameter  	}
11197b2259b3SChristoph Lameter  	return err;
11207b2259b3SChristoph Lameter }
112183d1674aSGerald Schaefer #endif
1122