xref: /linux/mm/migrate.c (revision 2f007e74bb85b9fc4eab28524052161703300f1a)
1b20a3503SChristoph Lameter /*
2b20a3503SChristoph Lameter  * Memory Migration functionality - linux/mm/migrate.c
3b20a3503SChristoph Lameter  *
4b20a3503SChristoph Lameter  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5b20a3503SChristoph Lameter  *
6b20a3503SChristoph Lameter  * Page migration was first developed in the context of the memory hotplug
7b20a3503SChristoph Lameter  * project. The main authors of the migration code are:
8b20a3503SChristoph Lameter  *
9b20a3503SChristoph Lameter  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10b20a3503SChristoph Lameter  * Hirokazu Takahashi <taka@valinux.co.jp>
11b20a3503SChristoph Lameter  * Dave Hansen <haveblue@us.ibm.com>
12cde53535SChristoph Lameter  * Christoph Lameter
13b20a3503SChristoph Lameter  */
14b20a3503SChristoph Lameter 
15b20a3503SChristoph Lameter #include <linux/migrate.h>
16b20a3503SChristoph Lameter #include <linux/module.h>
17b20a3503SChristoph Lameter #include <linux/swap.h>
180697212aSChristoph Lameter #include <linux/swapops.h>
19b20a3503SChristoph Lameter #include <linux/pagemap.h>
20e23ca00bSChristoph Lameter #include <linux/buffer_head.h>
21b20a3503SChristoph Lameter #include <linux/mm_inline.h>
22b488893aSPavel Emelyanov #include <linux/nsproxy.h>
23b20a3503SChristoph Lameter #include <linux/pagevec.h>
24b20a3503SChristoph Lameter #include <linux/rmap.h>
25b20a3503SChristoph Lameter #include <linux/topology.h>
26b20a3503SChristoph Lameter #include <linux/cpu.h>
27b20a3503SChristoph Lameter #include <linux/cpuset.h>
2804e62a29SChristoph Lameter #include <linux/writeback.h>
29742755a1SChristoph Lameter #include <linux/mempolicy.h>
30742755a1SChristoph Lameter #include <linux/vmalloc.h>
3186c3a764SDavid Quigley #include <linux/security.h>
328a9f3ccdSBalbir Singh #include <linux/memcontrol.h>
334f5ca265SAdrian Bunk #include <linux/syscalls.h>
34b20a3503SChristoph Lameter 
35b20a3503SChristoph Lameter #include "internal.h"
36b20a3503SChristoph Lameter 
37b20a3503SChristoph Lameter #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
38b20a3503SChristoph Lameter 
39b20a3503SChristoph Lameter /*
40742755a1SChristoph Lameter  * migrate_prep() needs to be called before we start compiling a list of pages
41742755a1SChristoph Lameter  * to be migrated using isolate_lru_page().
42b20a3503SChristoph Lameter  */
43b20a3503SChristoph Lameter int migrate_prep(void)
44b20a3503SChristoph Lameter {
45b20a3503SChristoph Lameter 	/*
46b20a3503SChristoph Lameter 	 * Clear the LRU lists so pages can be isolated.
47b20a3503SChristoph Lameter 	 * Note that pages may be moved off the LRU after we have
48b20a3503SChristoph Lameter 	 * drained them. Those pages will fail to migrate like other
49b20a3503SChristoph Lameter 	 * pages that may be busy.
50b20a3503SChristoph Lameter 	 */
51b20a3503SChristoph Lameter 	lru_add_drain_all();
52b20a3503SChristoph Lameter 
53b20a3503SChristoph Lameter 	return 0;
54b20a3503SChristoph Lameter }
55b20a3503SChristoph Lameter 
56b20a3503SChristoph Lameter /*
57894bc310SLee Schermerhorn  * Add isolated pages on the list back to the LRU under page lock
58894bc310SLee Schermerhorn  * to avoid leaking evictable pages back onto the unevictable list.
59b20a3503SChristoph Lameter  *
60b20a3503SChristoph Lameter  * returns the number of pages put back.
61b20a3503SChristoph Lameter  */
62b20a3503SChristoph Lameter int putback_lru_pages(struct list_head *l)
63b20a3503SChristoph Lameter {
64b20a3503SChristoph Lameter 	struct page *page;
65b20a3503SChristoph Lameter 	struct page *page2;
66b20a3503SChristoph Lameter 	int count = 0;
67b20a3503SChristoph Lameter 
68b20a3503SChristoph Lameter 	list_for_each_entry_safe(page, page2, l, lru) {
69e24f0b8fSChristoph Lameter 		list_del(&page->lru);
70894bc310SLee Schermerhorn 		putback_lru_page(page);
71b20a3503SChristoph Lameter 		count++;
72b20a3503SChristoph Lameter 	}
73b20a3503SChristoph Lameter 	return count;
74b20a3503SChristoph Lameter }
75b20a3503SChristoph Lameter 
760697212aSChristoph Lameter /*
770697212aSChristoph Lameter  * Restore a potential migration pte to a working pte entry
780697212aSChristoph Lameter  */
7904e62a29SChristoph Lameter static void remove_migration_pte(struct vm_area_struct *vma,
800697212aSChristoph Lameter 		struct page *old, struct page *new)
810697212aSChristoph Lameter {
820697212aSChristoph Lameter 	struct mm_struct *mm = vma->vm_mm;
830697212aSChristoph Lameter 	swp_entry_t entry;
840697212aSChristoph Lameter  	pgd_t *pgd;
850697212aSChristoph Lameter  	pud_t *pud;
860697212aSChristoph Lameter  	pmd_t *pmd;
870697212aSChristoph Lameter 	pte_t *ptep, pte;
880697212aSChristoph Lameter  	spinlock_t *ptl;
8904e62a29SChristoph Lameter 	unsigned long addr = page_address_in_vma(new, vma);
9004e62a29SChristoph Lameter 
9104e62a29SChristoph Lameter 	if (addr == -EFAULT)
9204e62a29SChristoph Lameter 		return;
930697212aSChristoph Lameter 
940697212aSChristoph Lameter  	pgd = pgd_offset(mm, addr);
950697212aSChristoph Lameter 	if (!pgd_present(*pgd))
960697212aSChristoph Lameter 		return;
970697212aSChristoph Lameter 
980697212aSChristoph Lameter 	pud = pud_offset(pgd, addr);
990697212aSChristoph Lameter 	if (!pud_present(*pud))
1000697212aSChristoph Lameter 		return;
1010697212aSChristoph Lameter 
1020697212aSChristoph Lameter 	pmd = pmd_offset(pud, addr);
1030697212aSChristoph Lameter 	if (!pmd_present(*pmd))
1040697212aSChristoph Lameter 		return;
1050697212aSChristoph Lameter 
1060697212aSChristoph Lameter 	ptep = pte_offset_map(pmd, addr);
1070697212aSChristoph Lameter 
1080697212aSChristoph Lameter 	if (!is_swap_pte(*ptep)) {
1090697212aSChristoph Lameter 		pte_unmap(ptep);
1100697212aSChristoph Lameter  		return;
1110697212aSChristoph Lameter  	}
1120697212aSChristoph Lameter 
1130697212aSChristoph Lameter  	ptl = pte_lockptr(mm, pmd);
1140697212aSChristoph Lameter  	spin_lock(ptl);
1150697212aSChristoph Lameter 	pte = *ptep;
1160697212aSChristoph Lameter 	if (!is_swap_pte(pte))
1170697212aSChristoph Lameter 		goto out;
1180697212aSChristoph Lameter 
1190697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
1200697212aSChristoph Lameter 
1210697212aSChristoph Lameter 	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
1220697212aSChristoph Lameter 		goto out;
1230697212aSChristoph Lameter 
12498837c7fSHugh Dickins 	/*
12598837c7fSHugh Dickins 	 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
12698837c7fSHugh Dickins 	 * Failure is not an option here: we're now expected to remove every
12798837c7fSHugh Dickins 	 * migration pte, and will cause crashes otherwise.  Normally this
12898837c7fSHugh Dickins 	 * migration pte, and failure here would cause crashes.  Normally this
12998837c7fSHugh Dickins 	 * page_cgroup count for safety, that's now attached to the new page,
13098837c7fSHugh Dickins 	 * so this charge should just be another incrementation of the count,
13198837c7fSHugh Dickins 	 * so this charge should just be another increment of the count,
13298837c7fSHugh Dickins 	 * there's been a force_empty, those reference counts may no longer
13398837c7fSHugh Dickins 	 * be reliable, and this charge can actually fail: oh well, we don't
13498837c7fSHugh Dickins 	 * make the situation any worse by proceeding as if it had succeeded.
13598837c7fSHugh Dickins 	 */
13698837c7fSHugh Dickins 	mem_cgroup_charge(new, mm, GFP_ATOMIC);
13798837c7fSHugh Dickins 
1380697212aSChristoph Lameter 	get_page(new);
1390697212aSChristoph Lameter 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
1400697212aSChristoph Lameter 	if (is_write_migration_entry(entry))
1410697212aSChristoph Lameter 		pte = pte_mkwrite(pte);
14297ee0524SKAMEZAWA Hiroyuki 	flush_cache_page(vma, addr, pte_pfn(pte));
1430697212aSChristoph Lameter 	set_pte_at(mm, addr, ptep, pte);
14404e62a29SChristoph Lameter 
14504e62a29SChristoph Lameter 	if (PageAnon(new))
1460697212aSChristoph Lameter 		page_add_anon_rmap(new, vma, addr);
14704e62a29SChristoph Lameter 	else
14804e62a29SChristoph Lameter 		page_add_file_rmap(new);
14904e62a29SChristoph Lameter 
15004e62a29SChristoph Lameter 	/* No need to invalidate - it was non-present before */
15104e62a29SChristoph Lameter 	update_mmu_cache(vma, addr, pte);
15204e62a29SChristoph Lameter 
1530697212aSChristoph Lameter out:
1540697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
1550697212aSChristoph Lameter }
1560697212aSChristoph Lameter 
1570697212aSChristoph Lameter /*
15804e62a29SChristoph Lameter  * Note that remove_file_migration_ptes will only work on regular mappings;
15904e62a29SChristoph Lameter  * nonlinear mappings do not use migration entries.
16004e62a29SChristoph Lameter  */
16104e62a29SChristoph Lameter static void remove_file_migration_ptes(struct page *old, struct page *new)
16204e62a29SChristoph Lameter {
16304e62a29SChristoph Lameter 	struct vm_area_struct *vma;
16404e62a29SChristoph Lameter 	struct address_space *mapping = page_mapping(new);
16504e62a29SChristoph Lameter 	struct prio_tree_iter iter;
16604e62a29SChristoph Lameter 	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
16704e62a29SChristoph Lameter 
16804e62a29SChristoph Lameter 	if (!mapping)
16904e62a29SChristoph Lameter 		return;
17004e62a29SChristoph Lameter 
17104e62a29SChristoph Lameter 	spin_lock(&mapping->i_mmap_lock);
17204e62a29SChristoph Lameter 
17304e62a29SChristoph Lameter 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
17404e62a29SChristoph Lameter 		remove_migration_pte(vma, old, new);
17504e62a29SChristoph Lameter 
17604e62a29SChristoph Lameter 	spin_unlock(&mapping->i_mmap_lock);
17704e62a29SChristoph Lameter }
17804e62a29SChristoph Lameter 
17904e62a29SChristoph Lameter /*
1800697212aSChristoph Lameter  * Must hold mmap_sem lock on at least one of the vmas containing
1810697212aSChristoph Lameter  * the page so that the anon_vma cannot vanish.
1820697212aSChristoph Lameter  */
18304e62a29SChristoph Lameter static void remove_anon_migration_ptes(struct page *old, struct page *new)
1840697212aSChristoph Lameter {
1850697212aSChristoph Lameter 	struct anon_vma *anon_vma;
1860697212aSChristoph Lameter 	struct vm_area_struct *vma;
1870697212aSChristoph Lameter 	unsigned long mapping;
1880697212aSChristoph Lameter 
1890697212aSChristoph Lameter 	mapping = (unsigned long)new->mapping;
1900697212aSChristoph Lameter 
1910697212aSChristoph Lameter 	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
1920697212aSChristoph Lameter 		return;
1930697212aSChristoph Lameter 
1940697212aSChristoph Lameter 	/*
1950697212aSChristoph Lameter 	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
1960697212aSChristoph Lameter 	 */
1970697212aSChristoph Lameter 	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
1980697212aSChristoph Lameter 	spin_lock(&anon_vma->lock);
1990697212aSChristoph Lameter 
2000697212aSChristoph Lameter 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
20104e62a29SChristoph Lameter 		remove_migration_pte(vma, old, new);
2020697212aSChristoph Lameter 
2030697212aSChristoph Lameter 	spin_unlock(&anon_vma->lock);
2040697212aSChristoph Lameter }
2050697212aSChristoph Lameter 
2060697212aSChristoph Lameter /*
20704e62a29SChristoph Lameter  * Get rid of all migration entries and replace them with
20804e62a29SChristoph Lameter  * references to the indicated page.
20904e62a29SChristoph Lameter  */
21004e62a29SChristoph Lameter static void remove_migration_ptes(struct page *old, struct page *new)
21104e62a29SChristoph Lameter {
21204e62a29SChristoph Lameter 	if (PageAnon(new))
21304e62a29SChristoph Lameter 		remove_anon_migration_ptes(old, new);
21404e62a29SChristoph Lameter 	else
21504e62a29SChristoph Lameter 		remove_file_migration_ptes(old, new);
21604e62a29SChristoph Lameter }
21704e62a29SChristoph Lameter 
21804e62a29SChristoph Lameter /*
2190697212aSChristoph Lameter  * Something used the pte of a page under migration. We need to
2200697212aSChristoph Lameter  * get to the page and wait until migration is finished.
2210697212aSChristoph Lameter  * When we return from this function the fault will be retried.
2220697212aSChristoph Lameter  *
2230697212aSChristoph Lameter  * This function is called from do_swap_page().
2240697212aSChristoph Lameter  */
2250697212aSChristoph Lameter void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
2260697212aSChristoph Lameter 				unsigned long address)
2270697212aSChristoph Lameter {
2280697212aSChristoph Lameter 	pte_t *ptep, pte;
2290697212aSChristoph Lameter 	spinlock_t *ptl;
2300697212aSChristoph Lameter 	swp_entry_t entry;
2310697212aSChristoph Lameter 	struct page *page;
2320697212aSChristoph Lameter 
2330697212aSChristoph Lameter 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2340697212aSChristoph Lameter 	pte = *ptep;
2350697212aSChristoph Lameter 	if (!is_swap_pte(pte))
2360697212aSChristoph Lameter 		goto out;
2370697212aSChristoph Lameter 
2380697212aSChristoph Lameter 	entry = pte_to_swp_entry(pte);
2390697212aSChristoph Lameter 	if (!is_migration_entry(entry))
2400697212aSChristoph Lameter 		goto out;
2410697212aSChristoph Lameter 
2420697212aSChristoph Lameter 	page = migration_entry_to_page(entry);
2430697212aSChristoph Lameter 
244e286781dSNick Piggin 	/*
245e286781dSNick Piggin 	 * Once the radix-tree replacement done by page migration has started,
246e286781dSNick Piggin 	 * page_count *must* be zero. And we don't want to call
247e286781dSNick Piggin 	 * wait_on_page_locked() against a page without first taking a
248e286781dSNick Piggin 	 * reference with get_page(), so we use get_page_unless_zero() here.
249e286781dSNick Piggin 	 * Even if that fails, the page fault will simply occur again.
250e286781dSNick Piggin 	 */
251e286781dSNick Piggin 	if (!get_page_unless_zero(page))
252e286781dSNick Piggin 		goto out;
2530697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
2540697212aSChristoph Lameter 	wait_on_page_locked(page);
2550697212aSChristoph Lameter 	put_page(page);
2560697212aSChristoph Lameter 	return;
2570697212aSChristoph Lameter out:
2580697212aSChristoph Lameter 	pte_unmap_unlock(ptep, ptl);
2590697212aSChristoph Lameter }
2600697212aSChristoph Lameter 
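/*
 * Illustrative sketch (not part of the original file): the call site in
 * do_swap_page() is expected to look roughly like the lines below, which
 * detect the migration entry and wait before letting the fault be retried.
 * The snippet is abridged from mm/memory.c and may differ in detail.
 *
 *	entry = pte_to_swp_entry(orig_pte);
 *	if (is_migration_entry(entry)) {
 *		migration_entry_wait(mm, pmd, address);
 *		goto out;
 *	}
 */
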
261b20a3503SChristoph Lameter /*
262c3fcf8a5SChristoph Lameter  * Replace the page in the mapping.
2635b5c7120SChristoph Lameter  *
2645b5c7120SChristoph Lameter  * The number of remaining references must be:
2655b5c7120SChristoph Lameter  * 1 for anonymous pages without a mapping
2665b5c7120SChristoph Lameter  * 2 for pages with a mapping
2675b5c7120SChristoph Lameter  * 3 for pages with a mapping and PagePrivate set.
268b20a3503SChristoph Lameter  */
2692d1db3b1SChristoph Lameter static int migrate_page_move_mapping(struct address_space *mapping,
2702d1db3b1SChristoph Lameter 		struct page *newpage, struct page *page)
271b20a3503SChristoph Lameter {
272e286781dSNick Piggin 	int expected_count;
2737cf9c2c7SNick Piggin 	void **pslot;
274b20a3503SChristoph Lameter 
2756c5240aeSChristoph Lameter 	if (!mapping) {
2760e8c7d0fSChristoph Lameter 		/* Anonymous page without mapping */
2776c5240aeSChristoph Lameter 		if (page_count(page) != 1)
2786c5240aeSChristoph Lameter 			return -EAGAIN;
2796c5240aeSChristoph Lameter 		return 0;
2806c5240aeSChristoph Lameter 	}
2816c5240aeSChristoph Lameter 
28219fd6231SNick Piggin 	spin_lock_irq(&mapping->tree_lock);
283b20a3503SChristoph Lameter 
2847cf9c2c7SNick Piggin 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
285b20a3503SChristoph Lameter  					page_index(page));
286b20a3503SChristoph Lameter 
287e286781dSNick Piggin 	expected_count = 2 + !!PagePrivate(page);
288e286781dSNick Piggin 	if (page_count(page) != expected_count ||
2897cf9c2c7SNick Piggin 			(struct page *)radix_tree_deref_slot(pslot) != page) {
29019fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
291e23ca00bSChristoph Lameter 		return -EAGAIN;
292b20a3503SChristoph Lameter 	}
293b20a3503SChristoph Lameter 
294e286781dSNick Piggin 	if (!page_freeze_refs(page, expected_count)) {
29519fd6231SNick Piggin 		spin_unlock_irq(&mapping->tree_lock);
296e286781dSNick Piggin 		return -EAGAIN;
297e286781dSNick Piggin 	}
298e286781dSNick Piggin 
299b20a3503SChristoph Lameter 	/*
300b20a3503SChristoph Lameter 	 * Now we know that no one else is looking at the page.
301b20a3503SChristoph Lameter 	 */
3027cf9c2c7SNick Piggin 	get_page(newpage);	/* add cache reference */
3036c5240aeSChristoph Lameter #ifdef CONFIG_SWAP
304b20a3503SChristoph Lameter 	if (PageSwapCache(page)) {
305b20a3503SChristoph Lameter 		SetPageSwapCache(newpage);
306b20a3503SChristoph Lameter 		set_page_private(newpage, page_private(page));
307b20a3503SChristoph Lameter 	}
3086c5240aeSChristoph Lameter #endif
309b20a3503SChristoph Lameter 
3107cf9c2c7SNick Piggin 	radix_tree_replace_slot(pslot, newpage);
3117cf9c2c7SNick Piggin 
312e286781dSNick Piggin 	page_unfreeze_refs(page, expected_count);
3137cf9c2c7SNick Piggin 	/*
3147cf9c2c7SNick Piggin 	 * Drop cache reference from old page.
3157cf9c2c7SNick Piggin 	 * We know this isn't the last reference.
3167cf9c2c7SNick Piggin 	 */
317b20a3503SChristoph Lameter 	__put_page(page);
3187cf9c2c7SNick Piggin 
3190e8c7d0fSChristoph Lameter 	/*
3200e8c7d0fSChristoph Lameter 	 * If moved to a different zone then also account
3210e8c7d0fSChristoph Lameter 	 * the page for that zone. Other VM counters will be
3220e8c7d0fSChristoph Lameter 	 * taken care of when we establish references to the
3230e8c7d0fSChristoph Lameter 	 * new page and drop references to the old page.
3240e8c7d0fSChristoph Lameter 	 *
3250e8c7d0fSChristoph Lameter 	 * Note that anonymous pages are accounted for
3260e8c7d0fSChristoph Lameter 	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
3270e8c7d0fSChristoph Lameter 	 * are mapped to swap space.
3280e8c7d0fSChristoph Lameter 	 */
3290e8c7d0fSChristoph Lameter 	__dec_zone_page_state(page, NR_FILE_PAGES);
3300e8c7d0fSChristoph Lameter 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
3310e8c7d0fSChristoph Lameter 
33219fd6231SNick Piggin 	spin_unlock_irq(&mapping->tree_lock);
33319fd6231SNick Piggin 	if (!PageSwapCache(newpage))
33469029cd5SKAMEZAWA Hiroyuki 		mem_cgroup_uncharge_cache_page(page);
335b20a3503SChristoph Lameter 
336b20a3503SChristoph Lameter 	return 0;
337b20a3503SChristoph Lameter }
338b20a3503SChristoph Lameter 
339b20a3503SChristoph Lameter /*
340b20a3503SChristoph Lameter  * Copy the page to its new location
341b20a3503SChristoph Lameter  */
342e7340f73SChristoph Lameter static void migrate_page_copy(struct page *newpage, struct page *page)
343b20a3503SChristoph Lameter {
344b20a3503SChristoph Lameter 	copy_highpage(newpage, page);
345b20a3503SChristoph Lameter 
346b20a3503SChristoph Lameter 	if (PageError(page))
347b20a3503SChristoph Lameter 		SetPageError(newpage);
348b20a3503SChristoph Lameter 	if (PageReferenced(page))
349b20a3503SChristoph Lameter 		SetPageReferenced(newpage);
350b20a3503SChristoph Lameter 	if (PageUptodate(page))
351b20a3503SChristoph Lameter 		SetPageUptodate(newpage);
352894bc310SLee Schermerhorn 	if (TestClearPageActive(page)) {
353894bc310SLee Schermerhorn 		VM_BUG_ON(PageUnevictable(page));
354b20a3503SChristoph Lameter 		SetPageActive(newpage);
355894bc310SLee Schermerhorn 	} else
356894bc310SLee Schermerhorn 		unevictable_migrate_page(newpage, page);
357b20a3503SChristoph Lameter 	if (PageChecked(page))
358b20a3503SChristoph Lameter 		SetPageChecked(newpage);
359b20a3503SChristoph Lameter 	if (PageMappedToDisk(page))
360b20a3503SChristoph Lameter 		SetPageMappedToDisk(newpage);
361b20a3503SChristoph Lameter 
362b20a3503SChristoph Lameter 	if (PageDirty(page)) {
363b20a3503SChristoph Lameter 		clear_page_dirty_for_io(page);
3643a902c5fSNick Piggin 		/*
3653a902c5fSNick Piggin 		 * Want to mark the page and the radix tree as dirty, and
3663a902c5fSNick Piggin 		 * redo the accounting that clear_page_dirty_for_io undid,
3673a902c5fSNick Piggin 		 * but we can't use set_page_dirty because that function
3683a902c5fSNick Piggin 		 * is actually a signal that all of the page has become dirty,
3693a902c5fSNick Piggin 		 * whereas only part of our page may be dirty.
3703a902c5fSNick Piggin 		 */
3713a902c5fSNick Piggin 		__set_page_dirty_nobuffers(newpage);
372b20a3503SChristoph Lameter  	}
373b20a3503SChristoph Lameter 
374b291f000SNick Piggin 	mlock_migrate_page(newpage, page);
375b291f000SNick Piggin 
3766c5240aeSChristoph Lameter #ifdef CONFIG_SWAP
377b20a3503SChristoph Lameter 	ClearPageSwapCache(page);
3786c5240aeSChristoph Lameter #endif
379b20a3503SChristoph Lameter 	ClearPagePrivate(page);
380b20a3503SChristoph Lameter 	set_page_private(page, 0);
381b20a3503SChristoph Lameter 	page->mapping = NULL;
382b20a3503SChristoph Lameter 
383b20a3503SChristoph Lameter 	/*
384b20a3503SChristoph Lameter 	 * If any waiters have accumulated on the new page then
385b20a3503SChristoph Lameter 	 * wake them up.
386b20a3503SChristoph Lameter 	 */
387b20a3503SChristoph Lameter 	if (PageWriteback(newpage))
388b20a3503SChristoph Lameter 		end_page_writeback(newpage);
389b20a3503SChristoph Lameter }
390b20a3503SChristoph Lameter 
3911d8b85ccSChristoph Lameter /************************************************************
3921d8b85ccSChristoph Lameter  *                    Migration functions
3931d8b85ccSChristoph Lameter  ***********************************************************/
3941d8b85ccSChristoph Lameter 
3951d8b85ccSChristoph Lameter /* Always fail migration. Used for mappings that are not movable */
3962d1db3b1SChristoph Lameter int fail_migrate_page(struct address_space *mapping,
3972d1db3b1SChristoph Lameter 			struct page *newpage, struct page *page)
3981d8b85ccSChristoph Lameter {
3991d8b85ccSChristoph Lameter 	return -EIO;
4001d8b85ccSChristoph Lameter }
4011d8b85ccSChristoph Lameter EXPORT_SYMBOL(fail_migrate_page);
4021d8b85ccSChristoph Lameter 
403b20a3503SChristoph Lameter /*
404b20a3503SChristoph Lameter  * Common logic to directly migrate a single page suitable for
405b20a3503SChristoph Lameter  * pages that do not use PagePrivate.
406b20a3503SChristoph Lameter  *
407b20a3503SChristoph Lameter  * Pages are locked upon entry and exit.
408b20a3503SChristoph Lameter  */
4092d1db3b1SChristoph Lameter int migrate_page(struct address_space *mapping,
4102d1db3b1SChristoph Lameter 		struct page *newpage, struct page *page)
411b20a3503SChristoph Lameter {
412b20a3503SChristoph Lameter 	int rc;
413b20a3503SChristoph Lameter 
414b20a3503SChristoph Lameter 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
415b20a3503SChristoph Lameter 
4162d1db3b1SChristoph Lameter 	rc = migrate_page_move_mapping(mapping, newpage, page);
417b20a3503SChristoph Lameter 
418b20a3503SChristoph Lameter 	if (rc)
419b20a3503SChristoph Lameter 		return rc;
420b20a3503SChristoph Lameter 
421b20a3503SChristoph Lameter 	migrate_page_copy(newpage, page);
422b20a3503SChristoph Lameter 	return 0;
423b20a3503SChristoph Lameter }
424b20a3503SChristoph Lameter EXPORT_SYMBOL(migrate_page);
425b20a3503SChristoph Lameter 
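/*
 * Illustrative sketch (not part of the original file): a filesystem whose
 * pages carry no fs-private state can typically point its
 * address_space_operations at migrate_page() directly.  The names
 * example_aops, example_readpage and example_writepage are hypothetical;
 * only the .migratepage wiring is the point of the example.
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		.migratepage	= migrate_page,
 *	};
 */
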
4269361401eSDavid Howells #ifdef CONFIG_BLOCK
427b20a3503SChristoph Lameter /*
4281d8b85ccSChristoph Lameter  * Migration function for pages with buffers. This function can only be used
4291d8b85ccSChristoph Lameter  * if the underlying filesystem guarantees that no other references to "page"
4301d8b85ccSChristoph Lameter  * exist.
4311d8b85ccSChristoph Lameter  */
4322d1db3b1SChristoph Lameter int buffer_migrate_page(struct address_space *mapping,
4332d1db3b1SChristoph Lameter 		struct page *newpage, struct page *page)
4341d8b85ccSChristoph Lameter {
4351d8b85ccSChristoph Lameter 	struct buffer_head *bh, *head;
4361d8b85ccSChristoph Lameter 	int rc;
4371d8b85ccSChristoph Lameter 
4381d8b85ccSChristoph Lameter 	if (!page_has_buffers(page))
4392d1db3b1SChristoph Lameter 		return migrate_page(mapping, newpage, page);
4401d8b85ccSChristoph Lameter 
4411d8b85ccSChristoph Lameter 	head = page_buffers(page);
4421d8b85ccSChristoph Lameter 
4432d1db3b1SChristoph Lameter 	rc = migrate_page_move_mapping(mapping, newpage, page);
4441d8b85ccSChristoph Lameter 
4451d8b85ccSChristoph Lameter 	if (rc)
4461d8b85ccSChristoph Lameter 		return rc;
4471d8b85ccSChristoph Lameter 
4481d8b85ccSChristoph Lameter 	bh = head;
4491d8b85ccSChristoph Lameter 	do {
4501d8b85ccSChristoph Lameter 		get_bh(bh);
4511d8b85ccSChristoph Lameter 		lock_buffer(bh);
4521d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
4531d8b85ccSChristoph Lameter 
4541d8b85ccSChristoph Lameter 	} while (bh != head);
4551d8b85ccSChristoph Lameter 
4561d8b85ccSChristoph Lameter 	ClearPagePrivate(page);
4571d8b85ccSChristoph Lameter 	set_page_private(newpage, page_private(page));
4581d8b85ccSChristoph Lameter 	set_page_private(page, 0);
4591d8b85ccSChristoph Lameter 	put_page(page);
4601d8b85ccSChristoph Lameter 	get_page(newpage);
4611d8b85ccSChristoph Lameter 
4621d8b85ccSChristoph Lameter 	bh = head;
4631d8b85ccSChristoph Lameter 	do {
4641d8b85ccSChristoph Lameter 		set_bh_page(bh, newpage, bh_offset(bh));
4651d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
4661d8b85ccSChristoph Lameter 
4671d8b85ccSChristoph Lameter 	} while (bh != head);
4681d8b85ccSChristoph Lameter 
4691d8b85ccSChristoph Lameter 	SetPagePrivate(newpage);
4701d8b85ccSChristoph Lameter 
4711d8b85ccSChristoph Lameter 	migrate_page_copy(newpage, page);
4721d8b85ccSChristoph Lameter 
4731d8b85ccSChristoph Lameter 	bh = head;
4741d8b85ccSChristoph Lameter 	do {
4751d8b85ccSChristoph Lameter 		unlock_buffer(bh);
4761d8b85ccSChristoph Lameter  		put_bh(bh);
4771d8b85ccSChristoph Lameter 		bh = bh->b_this_page;
4781d8b85ccSChristoph Lameter 
4791d8b85ccSChristoph Lameter 	} while (bh != head);
4801d8b85ccSChristoph Lameter 
4811d8b85ccSChristoph Lameter 	return 0;
4821d8b85ccSChristoph Lameter }
4831d8b85ccSChristoph Lameter EXPORT_SYMBOL(buffer_migrate_page);
4849361401eSDavid Howells #endif
4851d8b85ccSChristoph Lameter 
48604e62a29SChristoph Lameter /*
48704e62a29SChristoph Lameter  * Write back a page to clean its dirty state
48804e62a29SChristoph Lameter  */
48904e62a29SChristoph Lameter static int writeout(struct address_space *mapping, struct page *page)
49004e62a29SChristoph Lameter {
49104e62a29SChristoph Lameter 	struct writeback_control wbc = {
49204e62a29SChristoph Lameter 		.sync_mode = WB_SYNC_NONE,
49304e62a29SChristoph Lameter 		.nr_to_write = 1,
49404e62a29SChristoph Lameter 		.range_start = 0,
49504e62a29SChristoph Lameter 		.range_end = LLONG_MAX,
49604e62a29SChristoph Lameter 		.nonblocking = 1,
49704e62a29SChristoph Lameter 		.for_reclaim = 1
49804e62a29SChristoph Lameter 	};
49904e62a29SChristoph Lameter 	int rc;
50004e62a29SChristoph Lameter 
50104e62a29SChristoph Lameter 	if (!mapping->a_ops->writepage)
50204e62a29SChristoph Lameter 		/* No write method for the address space */
50304e62a29SChristoph Lameter 		return -EINVAL;
50404e62a29SChristoph Lameter 
50504e62a29SChristoph Lameter 	if (!clear_page_dirty_for_io(page))
50604e62a29SChristoph Lameter 		/* Someone else already triggered a write */
50704e62a29SChristoph Lameter 		return -EAGAIN;
50804e62a29SChristoph Lameter 
50904e62a29SChristoph Lameter 	/*
51004e62a29SChristoph Lameter 	 * A dirty page may imply that the underlying filesystem has
51104e62a29SChristoph Lameter 	 * the page on some queue. So the page must be clean for
51204e62a29SChristoph Lameter 	 * migration. Writeout may mean we lose the lock and the
51304e62a29SChristoph Lameter 	 * page state is no longer what we checked for earlier.
51404e62a29SChristoph Lameter 	 * At this point we know that the migration attempt cannot
51504e62a29SChristoph Lameter 	 * be successful.
51604e62a29SChristoph Lameter 	 */
51704e62a29SChristoph Lameter 	remove_migration_ptes(page, page);
51804e62a29SChristoph Lameter 
51904e62a29SChristoph Lameter 	rc = mapping->a_ops->writepage(page, &wbc);
52004e62a29SChristoph Lameter 	if (rc < 0)
52104e62a29SChristoph Lameter 		/* I/O Error writing */
52204e62a29SChristoph Lameter 		return -EIO;
52304e62a29SChristoph Lameter 
52404e62a29SChristoph Lameter 	if (rc != AOP_WRITEPAGE_ACTIVATE)
52504e62a29SChristoph Lameter 		/* unlocked. Relock */
52604e62a29SChristoph Lameter 		lock_page(page);
52704e62a29SChristoph Lameter 
52804e62a29SChristoph Lameter 	return -EAGAIN;
52904e62a29SChristoph Lameter }
53004e62a29SChristoph Lameter 
53104e62a29SChristoph Lameter /*
53204e62a29SChristoph Lameter  * Default handling if a filesystem does not provide a migration function.
53304e62a29SChristoph Lameter  */
5348351a6e4SChristoph Lameter static int fallback_migrate_page(struct address_space *mapping,
5358351a6e4SChristoph Lameter 	struct page *newpage, struct page *page)
5368351a6e4SChristoph Lameter {
53704e62a29SChristoph Lameter 	if (PageDirty(page))
53804e62a29SChristoph Lameter 		return writeout(mapping, page);
5398351a6e4SChristoph Lameter 
5408351a6e4SChristoph Lameter 	/*
5418351a6e4SChristoph Lameter 	 * Buffers may be managed in a filesystem specific way.
5428351a6e4SChristoph Lameter 	 * We must have no buffers or drop them.
5438351a6e4SChristoph Lameter 	 */
544b398f6bfSDavid Howells 	if (PagePrivate(page) &&
5458351a6e4SChristoph Lameter 	    !try_to_release_page(page, GFP_KERNEL))
5468351a6e4SChristoph Lameter 		return -EAGAIN;
5478351a6e4SChristoph Lameter 
5488351a6e4SChristoph Lameter 	return migrate_page(mapping, newpage, page);
5498351a6e4SChristoph Lameter }
5508351a6e4SChristoph Lameter 
5511d8b85ccSChristoph Lameter /*
552e24f0b8fSChristoph Lameter  * Move a page to a newly allocated page.
553e24f0b8fSChristoph Lameter  * The page is locked and all ptes have been successfully removed.
554b20a3503SChristoph Lameter  *
555e24f0b8fSChristoph Lameter  * The new page will have replaced the old page if this function
556e24f0b8fSChristoph Lameter  * is successful.
557894bc310SLee Schermerhorn  *
558894bc310SLee Schermerhorn  * Return value:
559894bc310SLee Schermerhorn  *   < 0 - error code
560894bc310SLee Schermerhorn  *  == 0 - success
561b20a3503SChristoph Lameter  */
562e24f0b8fSChristoph Lameter static int move_to_new_page(struct page *newpage, struct page *page)
563b20a3503SChristoph Lameter {
564e24f0b8fSChristoph Lameter 	struct address_space *mapping;
565b20a3503SChristoph Lameter 	int rc;
566b20a3503SChristoph Lameter 
567b20a3503SChristoph Lameter 	/*
568e24f0b8fSChristoph Lameter 	 * Block others from accessing the page when we get around to
569e24f0b8fSChristoph Lameter 	 * establishing additional references. We are the only one
570e24f0b8fSChristoph Lameter 	 * holding a reference to the new page at this point.
571b20a3503SChristoph Lameter 	 */
572529ae9aaSNick Piggin 	if (!trylock_page(newpage))
573e24f0b8fSChristoph Lameter 		BUG();
574b20a3503SChristoph Lameter 
5752d1db3b1SChristoph Lameter 	/* Prepare mapping for the new page.*/
5762d1db3b1SChristoph Lameter 	newpage->index = page->index;
5772d1db3b1SChristoph Lameter 	newpage->mapping = page->mapping;
578b2e18538SRik van Riel 	if (PageSwapBacked(page))
579b2e18538SRik van Riel 		SetPageSwapBacked(newpage);
5802d1db3b1SChristoph Lameter 
581b20a3503SChristoph Lameter 	mapping = page_mapping(page);
582b20a3503SChristoph Lameter 	if (!mapping)
5836c5240aeSChristoph Lameter 		rc = migrate_page(mapping, newpage, page);
5846c5240aeSChristoph Lameter 	else if (mapping->a_ops->migratepage)
585b20a3503SChristoph Lameter 		/*
586b20a3503SChristoph Lameter 		 * Most pages have a mapping and most filesystems
587b20a3503SChristoph Lameter 		 * should provide a migration function. Anonymous
588b20a3503SChristoph Lameter 		 * pages are part of swap space which also has its
589b20a3503SChristoph Lameter 		 * own migration function. This is the most common
590b20a3503SChristoph Lameter 		 * path for page migration.
591b20a3503SChristoph Lameter 		 */
5922d1db3b1SChristoph Lameter 		rc = mapping->a_ops->migratepage(mapping,
5932d1db3b1SChristoph Lameter 						newpage, page);
5948351a6e4SChristoph Lameter 	else
5958351a6e4SChristoph Lameter 		rc = fallback_migrate_page(mapping, newpage, page);
596b20a3503SChristoph Lameter 
597ae41be37SKAMEZAWA Hiroyuki 	if (!rc) {
5986c5240aeSChristoph Lameter 		remove_migration_ptes(page, newpage);
599ae41be37SKAMEZAWA Hiroyuki 	} else
600e24f0b8fSChristoph Lameter 		newpage->mapping = NULL;
6016c5240aeSChristoph Lameter 
602b20a3503SChristoph Lameter 	unlock_page(newpage);
603b20a3503SChristoph Lameter 
604e24f0b8fSChristoph Lameter 	return rc;
605e24f0b8fSChristoph Lameter }
606e24f0b8fSChristoph Lameter 
607e24f0b8fSChristoph Lameter /*
608e24f0b8fSChristoph Lameter  * Obtain the lock on page, remove all ptes and migrate the page
609e24f0b8fSChristoph Lameter  * to the newly allocated page in newpage.
610e24f0b8fSChristoph Lameter  */
61195a402c3SChristoph Lameter static int unmap_and_move(new_page_t get_new_page, unsigned long private,
61295a402c3SChristoph Lameter 			struct page *page, int force)
613e24f0b8fSChristoph Lameter {
614e24f0b8fSChristoph Lameter 	int rc = 0;
615742755a1SChristoph Lameter 	int *result = NULL;
616742755a1SChristoph Lameter 	struct page *newpage = get_new_page(page, private, &result);
617989f89c5SKAMEZAWA Hiroyuki 	int rcu_locked = 0;
618ae41be37SKAMEZAWA Hiroyuki 	int charge = 0;
61995a402c3SChristoph Lameter 
62095a402c3SChristoph Lameter 	if (!newpage)
62195a402c3SChristoph Lameter 		return -ENOMEM;
622e24f0b8fSChristoph Lameter 
623894bc310SLee Schermerhorn 	if (page_count(page) == 1) {
624e24f0b8fSChristoph Lameter 		/* page was freed from under us. So we are done. */
62595a402c3SChristoph Lameter 		goto move_newpage;
626894bc310SLee Schermerhorn 	}
627e24f0b8fSChristoph Lameter 
628e8589cc1SKAMEZAWA Hiroyuki 	charge = mem_cgroup_prepare_migration(page, newpage);
629e8589cc1SKAMEZAWA Hiroyuki 	if (charge == -ENOMEM) {
630e8589cc1SKAMEZAWA Hiroyuki 		rc = -ENOMEM;
631e8589cc1SKAMEZAWA Hiroyuki 		goto move_newpage;
632e8589cc1SKAMEZAWA Hiroyuki 	}
633e8589cc1SKAMEZAWA Hiroyuki 	/* prepare cgroup just returns 0 or -ENOMEM */
634e8589cc1SKAMEZAWA Hiroyuki 	BUG_ON(charge);
635e8589cc1SKAMEZAWA Hiroyuki 
636e24f0b8fSChristoph Lameter 	rc = -EAGAIN;
637529ae9aaSNick Piggin 	if (!trylock_page(page)) {
638e24f0b8fSChristoph Lameter 		if (!force)
63995a402c3SChristoph Lameter 			goto move_newpage;
640e24f0b8fSChristoph Lameter 		lock_page(page);
641e24f0b8fSChristoph Lameter 	}
642e24f0b8fSChristoph Lameter 
643e24f0b8fSChristoph Lameter 	if (PageWriteback(page)) {
644e24f0b8fSChristoph Lameter 		if (!force)
645e24f0b8fSChristoph Lameter 			goto unlock;
646e24f0b8fSChristoph Lameter 		wait_on_page_writeback(page);
647e24f0b8fSChristoph Lameter 	}
648e24f0b8fSChristoph Lameter 	/*
649dc386d4dSKAMEZAWA Hiroyuki 	 * try_to_unmap() drops page->mapcount to 0 here, after which we could
650dc386d4dSKAMEZAWA Hiroyuki 	 * not notice if the anon_vma were freed while we migrate the page.
651dc386d4dSKAMEZAWA Hiroyuki 	 * This rcu_read_lock() delays freeing of the anon_vma until the end
652dc386d4dSKAMEZAWA Hiroyuki 	 * of migration. File cache pages are not a problem: their migration
653989f89c5SKAMEZAWA Hiroyuki 	 * runs under the page lock (and may use writepage()), so only
654989f89c5SKAMEZAWA Hiroyuki 	 * anonymous pages need this care here.
655e24f0b8fSChristoph Lameter 	 */
656989f89c5SKAMEZAWA Hiroyuki 	if (PageAnon(page)) {
657dc386d4dSKAMEZAWA Hiroyuki 		rcu_read_lock();
658989f89c5SKAMEZAWA Hiroyuki 		rcu_locked = 1;
659989f89c5SKAMEZAWA Hiroyuki 	}
66062e1c553SShaohua Li 
661dc386d4dSKAMEZAWA Hiroyuki 	/*
66262e1c553SShaohua Li 	 * Corner case handling:
66362e1c553SShaohua Li 	 * 1. When a new swap-cache page is read in, it is added to the LRU
66462e1c553SShaohua Li 	 * and treated as swapcache but it has no rmap yet.
66562e1c553SShaohua Li 	 * Calling try_to_unmap() against a page->mapping==NULL page will
66662e1c553SShaohua Li 	 * trigger a BUG.  So handle it here.
66762e1c553SShaohua Li 	 * 2. An orphaned page (see truncate_complete_page) might have
66862e1c553SShaohua Li 	 * fs-private metadata. The page can be picked up due to memory
66962e1c553SShaohua Li 	 * offlining.  Everywhere else except page reclaim, the page is
67062e1c553SShaohua Li 	 * invisible to the vm, so the page can not be migrated.  So try to
67162e1c553SShaohua Li 	 * free the metadata, so the page can be freed.
672dc386d4dSKAMEZAWA Hiroyuki 	 */
67362e1c553SShaohua Li 	if (!page->mapping) {
67462e1c553SShaohua Li 		if (!PageAnon(page) && PagePrivate(page)) {
67562e1c553SShaohua Li 			/*
67662e1c553SShaohua Li 			 * Go direct to try_to_free_buffers() here because
67762e1c553SShaohua Li 			 * a) that's what try_to_release_page() would do anyway
67862e1c553SShaohua Li 			 * b) we may be under rcu_read_lock() here, so we can't
67962e1c553SShaohua Li 			 *    use GFP_KERNEL which is what try_to_release_page()
68062e1c553SShaohua Li 			 *    needs to be effective.
68162e1c553SShaohua Li 			 */
68262e1c553SShaohua Li 			try_to_free_buffers(page);
68362e1c553SShaohua Li 		}
684dc386d4dSKAMEZAWA Hiroyuki 		goto rcu_unlock;
68562e1c553SShaohua Li 	}
68662e1c553SShaohua Li 
687dc386d4dSKAMEZAWA Hiroyuki 	/* Establish migration ptes or remove ptes */
688e6a1530dSChristoph Lameter 	try_to_unmap(page, 1);
689dc386d4dSKAMEZAWA Hiroyuki 
690e24f0b8fSChristoph Lameter 	if (!page_mapped(page))
691e24f0b8fSChristoph Lameter 		rc = move_to_new_page(newpage, page);
692e24f0b8fSChristoph Lameter 
693e8589cc1SKAMEZAWA Hiroyuki 	if (rc)
6946c5240aeSChristoph Lameter 		remove_migration_ptes(page, page);
695dc386d4dSKAMEZAWA Hiroyuki rcu_unlock:
696989f89c5SKAMEZAWA Hiroyuki 	if (rcu_locked)
697dc386d4dSKAMEZAWA Hiroyuki 		rcu_read_unlock();
698e6a1530dSChristoph Lameter 
699e24f0b8fSChristoph Lameter unlock:
700b20a3503SChristoph Lameter 	unlock_page(page);
70195a402c3SChristoph Lameter 
702e24f0b8fSChristoph Lameter 	if (rc != -EAGAIN) {
703aaa994b3SChristoph Lameter  		/*
704aaa994b3SChristoph Lameter  		 * A page that has been migrated has all references
705aaa994b3SChristoph Lameter  		 * removed and will be freed. A page that has not been
706aaa994b3SChristoph Lameter  		 * migrated will have kept its references and be
707aaa994b3SChristoph Lameter  		 * restored.
708aaa994b3SChristoph Lameter  		 */
709aaa994b3SChristoph Lameter  		list_del(&page->lru);
710894bc310SLee Schermerhorn 		putback_lru_page(page);
711e24f0b8fSChristoph Lameter 	}
71295a402c3SChristoph Lameter 
71395a402c3SChristoph Lameter move_newpage:
714e8589cc1SKAMEZAWA Hiroyuki 	if (!charge)
715e8589cc1SKAMEZAWA Hiroyuki 		mem_cgroup_end_migration(newpage);
716894bc310SLee Schermerhorn 
71795a402c3SChristoph Lameter 	/*
71895a402c3SChristoph Lameter 	 * Move the new page to the LRU. If migration was not successful
71995a402c3SChristoph Lameter 	 * then this will free the page.
72095a402c3SChristoph Lameter 	 */
721894bc310SLee Schermerhorn 	putback_lru_page(newpage);
722894bc310SLee Schermerhorn 
723742755a1SChristoph Lameter 	if (result) {
724742755a1SChristoph Lameter 		if (rc)
725742755a1SChristoph Lameter 			*result = rc;
726742755a1SChristoph Lameter 		else
727742755a1SChristoph Lameter 			*result = page_to_nid(newpage);
728742755a1SChristoph Lameter 	}
729e24f0b8fSChristoph Lameter 	return rc;
730e24f0b8fSChristoph Lameter }
731b20a3503SChristoph Lameter 
732e24f0b8fSChristoph Lameter /*
733e24f0b8fSChristoph Lameter  * migrate_pages
734e24f0b8fSChristoph Lameter  *
73595a402c3SChristoph Lameter  * The function takes one list of pages to migrate and a callback that,
73695a402c3SChristoph Lameter  * given a page to be migrated and the private data, determines the
73795a402c3SChristoph Lameter  * target of the move and allocates the new page.
738e24f0b8fSChristoph Lameter  *
739e24f0b8fSChristoph Lameter  * The function returns after 10 attempts or if no pages
740e24f0b8fSChristoph Lameter  * are movable anymore because the list has become empty
741aaa994b3SChristoph Lameter  * or no retryable pages exist anymore. All pages will be
742e9534b3fSGabriel Craciunescu  * returned to the LRU or freed.
743e24f0b8fSChristoph Lameter  *
74495a402c3SChristoph Lameter  * Return: Number of pages not migrated or error code.
745e24f0b8fSChristoph Lameter  */
74695a402c3SChristoph Lameter int migrate_pages(struct list_head *from,
74795a402c3SChristoph Lameter 		new_page_t get_new_page, unsigned long private)
748e24f0b8fSChristoph Lameter {
749e24f0b8fSChristoph Lameter 	int retry = 1;
750e24f0b8fSChristoph Lameter 	int nr_failed = 0;
751e24f0b8fSChristoph Lameter 	int pass = 0;
752e24f0b8fSChristoph Lameter 	struct page *page;
753e24f0b8fSChristoph Lameter 	struct page *page2;
754e24f0b8fSChristoph Lameter 	int swapwrite = current->flags & PF_SWAPWRITE;
755e24f0b8fSChristoph Lameter 	int rc;
7562d1db3b1SChristoph Lameter 
757e24f0b8fSChristoph Lameter 	if (!swapwrite)
758e24f0b8fSChristoph Lameter 		current->flags |= PF_SWAPWRITE;
759e24f0b8fSChristoph Lameter 
760e24f0b8fSChristoph Lameter 	for(pass = 0; pass < 10 && retry; pass++) {
761e24f0b8fSChristoph Lameter 		retry = 0;
762e24f0b8fSChristoph Lameter 
763e24f0b8fSChristoph Lameter 		list_for_each_entry_safe(page, page2, from, lru) {
764e24f0b8fSChristoph Lameter 			cond_resched();
765e24f0b8fSChristoph Lameter 
76695a402c3SChristoph Lameter 			rc = unmap_and_move(get_new_page, private,
76795a402c3SChristoph Lameter 						page, pass > 2);
768e24f0b8fSChristoph Lameter 
769e24f0b8fSChristoph Lameter 			switch(rc) {
77095a402c3SChristoph Lameter 			case -ENOMEM:
77195a402c3SChristoph Lameter 				goto out;
772e24f0b8fSChristoph Lameter 			case -EAGAIN:
773b20a3503SChristoph Lameter 				retry++;
774e24f0b8fSChristoph Lameter 				break;
775e24f0b8fSChristoph Lameter 			case 0:
776e24f0b8fSChristoph Lameter 				break;
777e24f0b8fSChristoph Lameter 			default:
778b20a3503SChristoph Lameter 				/* Permanent failure */
779b20a3503SChristoph Lameter 				nr_failed++;
780e24f0b8fSChristoph Lameter 				break;
781b20a3503SChristoph Lameter 			}
782b20a3503SChristoph Lameter 		}
783e24f0b8fSChristoph Lameter 	}
78495a402c3SChristoph Lameter 	rc = 0;
78595a402c3SChristoph Lameter out:
786b20a3503SChristoph Lameter 	if (!swapwrite)
787b20a3503SChristoph Lameter 		current->flags &= ~PF_SWAPWRITE;
788b20a3503SChristoph Lameter 
789aaa994b3SChristoph Lameter 	putback_lru_pages(from);
79095a402c3SChristoph Lameter 
79195a402c3SChristoph Lameter 	if (rc)
79295a402c3SChristoph Lameter 		return rc;
79395a402c3SChristoph Lameter 
794b20a3503SChristoph Lameter 	return nr_failed + retry;
795b20a3503SChristoph Lameter }
796b20a3503SChristoph Lameter 
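/*
 * Illustrative sketch (not part of the original file): a typical caller
 * drains the LRU with migrate_prep(), isolates the pages it wants to move
 * onto a private list with isolate_lru_page(), and then hands that list to
 * migrate_pages() together with an allocation callback, much as
 * do_move_pages() below does.  The "example_" names and target_nid are
 * hypothetical.
 *
 *	static struct page *example_new_page(struct page *page,
 *					unsigned long private, int **result)
 *	{
 *		return alloc_pages_node((int)private,
 *					GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 *	migrate_prep();
 *	(isolate_lru_page() each candidate and list_add_tail() it to pagelist)
 *	nr_failed = migrate_pages(&pagelist, example_new_page,
 *					(unsigned long)target_nid);
 *
 * On return the list is empty: migrated pages have been freed and any
 * failures have been put back on the LRU by putback_lru_pages().
 */
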
797742755a1SChristoph Lameter #ifdef CONFIG_NUMA
798742755a1SChristoph Lameter /*
799742755a1SChristoph Lameter  * Move a list of individual pages
800742755a1SChristoph Lameter  */
801742755a1SChristoph Lameter struct page_to_node {
802742755a1SChristoph Lameter 	unsigned long addr;
803742755a1SChristoph Lameter 	struct page *page;
804742755a1SChristoph Lameter 	int node;
805742755a1SChristoph Lameter 	int status;
806742755a1SChristoph Lameter };
807742755a1SChristoph Lameter 
808742755a1SChristoph Lameter static struct page *new_page_node(struct page *p, unsigned long private,
809742755a1SChristoph Lameter 		int **result)
810742755a1SChristoph Lameter {
811742755a1SChristoph Lameter 	struct page_to_node *pm = (struct page_to_node *)private;
812742755a1SChristoph Lameter 
813742755a1SChristoph Lameter 	while (pm->node != MAX_NUMNODES && pm->page != p)
814742755a1SChristoph Lameter 		pm++;
815742755a1SChristoph Lameter 
816742755a1SChristoph Lameter 	if (pm->node == MAX_NUMNODES)
817742755a1SChristoph Lameter 		return NULL;
818742755a1SChristoph Lameter 
819742755a1SChristoph Lameter 	*result = &pm->status;
820742755a1SChristoph Lameter 
821769848c0SMel Gorman 	return alloc_pages_node(pm->node,
822769848c0SMel Gorman 				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
823742755a1SChristoph Lameter }
824742755a1SChristoph Lameter 
825742755a1SChristoph Lameter /*
826742755a1SChristoph Lameter  * Move a set of pages as indicated in the pm array. The addr
827742755a1SChristoph Lameter  * field must be set to the virtual address of the page to be moved
828742755a1SChristoph Lameter  * and the node number must contain a valid target node.
829742755a1SChristoph Lameter  */
830742755a1SChristoph Lameter static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
831742755a1SChristoph Lameter 				int migrate_all)
832742755a1SChristoph Lameter {
833742755a1SChristoph Lameter 	int err;
834742755a1SChristoph Lameter 	struct page_to_node *pp;
835742755a1SChristoph Lameter 	LIST_HEAD(pagelist);
836742755a1SChristoph Lameter 
837742755a1SChristoph Lameter 	down_read(&mm->mmap_sem);
838742755a1SChristoph Lameter 
839742755a1SChristoph Lameter 	/*
840742755a1SChristoph Lameter 	 * Build a list of pages to migrate
841742755a1SChristoph Lameter 	 */
842742755a1SChristoph Lameter 	migrate_prep();
843742755a1SChristoph Lameter 	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
844742755a1SChristoph Lameter 		struct vm_area_struct *vma;
845742755a1SChristoph Lameter 		struct page *page;
846742755a1SChristoph Lameter 
847742755a1SChristoph Lameter 		/*
848742755a1SChristoph Lameter 		 * A valid page pointer that will not match any of the
849742755a1SChristoph Lameter 		 * pages that will be moved.
850742755a1SChristoph Lameter 		 */
851742755a1SChristoph Lameter 		pp->page = ZERO_PAGE(0);
852742755a1SChristoph Lameter 
853742755a1SChristoph Lameter 		err = -EFAULT;
854742755a1SChristoph Lameter 		vma = find_vma(mm, pp->addr);
8550dc952dcSChristoph Lameter 		if (!vma || !vma_migratable(vma))
856742755a1SChristoph Lameter 			goto set_status;
857742755a1SChristoph Lameter 
858742755a1SChristoph Lameter 		page = follow_page(vma, pp->addr, FOLL_GET);
85989f5b7daSLinus Torvalds 
86089f5b7daSLinus Torvalds 		err = PTR_ERR(page);
86189f5b7daSLinus Torvalds 		if (IS_ERR(page))
86289f5b7daSLinus Torvalds 			goto set_status;
86389f5b7daSLinus Torvalds 
864742755a1SChristoph Lameter 		err = -ENOENT;
865742755a1SChristoph Lameter 		if (!page)
866742755a1SChristoph Lameter 			goto set_status;
867742755a1SChristoph Lameter 
868742755a1SChristoph Lameter 		if (PageReserved(page))		/* Check for zero page */
869742755a1SChristoph Lameter 			goto put_and_set;
870742755a1SChristoph Lameter 
871742755a1SChristoph Lameter 		pp->page = page;
872742755a1SChristoph Lameter 		err = page_to_nid(page);
873742755a1SChristoph Lameter 
874742755a1SChristoph Lameter 		if (err == pp->node)
875742755a1SChristoph Lameter 			/*
876742755a1SChristoph Lameter 			 * Page is already on the requested node
877742755a1SChristoph Lameter 			 */
878742755a1SChristoph Lameter 			goto put_and_set;
879742755a1SChristoph Lameter 
880742755a1SChristoph Lameter 		err = -EACCES;
881742755a1SChristoph Lameter 		if (page_mapcount(page) > 1 &&
882742755a1SChristoph Lameter 				!migrate_all)
883742755a1SChristoph Lameter 			goto put_and_set;
884742755a1SChristoph Lameter 
88562695a84SNick Piggin 		err = isolate_lru_page(page);
88662695a84SNick Piggin 		if (!err)
88762695a84SNick Piggin 			list_add_tail(&page->lru, &pagelist);
888742755a1SChristoph Lameter put_and_set:
889742755a1SChristoph Lameter 		/*
890742755a1SChristoph Lameter 		 * Either remove the duplicate refcount from
891742755a1SChristoph Lameter 		 * isolate_lru_page() or drop the page ref if it was
892742755a1SChristoph Lameter 		 * not isolated.
893742755a1SChristoph Lameter 		 */
894742755a1SChristoph Lameter 		put_page(page);
895742755a1SChristoph Lameter set_status:
896742755a1SChristoph Lameter 		pp->status = err;
897742755a1SChristoph Lameter 	}
898742755a1SChristoph Lameter 
899e78bbfa8SBrice Goglin 	err = 0;
900742755a1SChristoph Lameter 	if (!list_empty(&pagelist))
901742755a1SChristoph Lameter 		err = migrate_pages(&pagelist, new_page_node,
902742755a1SChristoph Lameter 				(unsigned long)pm);
903742755a1SChristoph Lameter 
904742755a1SChristoph Lameter 	up_read(&mm->mmap_sem);
905742755a1SChristoph Lameter 	return err;
906742755a1SChristoph Lameter }
907742755a1SChristoph Lameter 
908742755a1SChristoph Lameter /*
909*2f007e74SBrice Goglin  * Determine the nodes of an array of pages and store them in the status array.
910742755a1SChristoph Lameter  */
911*2f007e74SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
912*2f007e74SBrice Goglin 			 const void __user * __user *pages,
913*2f007e74SBrice Goglin 			 int __user *status)
914742755a1SChristoph Lameter {
915*2f007e74SBrice Goglin 	unsigned long i;
916742755a1SChristoph Lameter 	int err;
917742755a1SChristoph Lameter 
918*2f007e74SBrice Goglin 	down_read(&mm->mmap_sem);
919*2f007e74SBrice Goglin 
920*2f007e74SBrice Goglin 	for (i = 0; i < nr_pages; i++) {
921*2f007e74SBrice Goglin 		const void __user *p;
922*2f007e74SBrice Goglin 		unsigned long addr;
923*2f007e74SBrice Goglin 		struct vm_area_struct *vma;
924*2f007e74SBrice Goglin 		struct page *page;
925*2f007e74SBrice Goglin 
926742755a1SChristoph Lameter 		err = -EFAULT;
927*2f007e74SBrice Goglin 		if (get_user(p, pages+i))
928*2f007e74SBrice Goglin 			goto out;
929*2f007e74SBrice Goglin 		addr = (unsigned long) p;
930*2f007e74SBrice Goglin 
931*2f007e74SBrice Goglin 		vma = find_vma(mm, addr);
932742755a1SChristoph Lameter 		if (!vma)
933742755a1SChristoph Lameter 			goto set_status;
934742755a1SChristoph Lameter 
935*2f007e74SBrice Goglin 		page = follow_page(vma, addr, 0);
93689f5b7daSLinus Torvalds 
93789f5b7daSLinus Torvalds 		err = PTR_ERR(page);
93889f5b7daSLinus Torvalds 		if (IS_ERR(page))
93989f5b7daSLinus Torvalds 			goto set_status;
94089f5b7daSLinus Torvalds 
941742755a1SChristoph Lameter 		err = -ENOENT;
942742755a1SChristoph Lameter 		/* Use PageReserved to check for zero page */
943742755a1SChristoph Lameter 		if (!page || PageReserved(page))
944742755a1SChristoph Lameter 			goto set_status;
945742755a1SChristoph Lameter 
946742755a1SChristoph Lameter 		err = page_to_nid(page);
947742755a1SChristoph Lameter set_status:
948*2f007e74SBrice Goglin 		put_user(err, status+i);
949742755a1SChristoph Lameter 	}
950*2f007e74SBrice Goglin 	err = 0;
951742755a1SChristoph Lameter 
952*2f007e74SBrice Goglin out:
953742755a1SChristoph Lameter 	up_read(&mm->mmap_sem);
954*2f007e74SBrice Goglin 	return err;
955742755a1SChristoph Lameter }
956742755a1SChristoph Lameter 
957742755a1SChristoph Lameter /*
958742755a1SChristoph Lameter  * Move a list of pages in the address space of the currently executing
959742755a1SChristoph Lameter  * process.
960742755a1SChristoph Lameter  */
961742755a1SChristoph Lameter asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
962742755a1SChristoph Lameter 			const void __user * __user *pages,
963742755a1SChristoph Lameter 			const int __user *nodes,
964742755a1SChristoph Lameter 			int __user *status, int flags)
965742755a1SChristoph Lameter {
966742755a1SChristoph Lameter 	int err = 0;
967742755a1SChristoph Lameter 	int i;
968742755a1SChristoph Lameter 	struct task_struct *task;
969742755a1SChristoph Lameter 	nodemask_t task_nodes;
970742755a1SChristoph Lameter 	struct mm_struct *mm;
971742755a1SChristoph Lameter 	struct page_to_node *pm = NULL;
972742755a1SChristoph Lameter 
973742755a1SChristoph Lameter 	/* Check flags */
974742755a1SChristoph Lameter 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
975742755a1SChristoph Lameter 		return -EINVAL;
976742755a1SChristoph Lameter 
977742755a1SChristoph Lameter 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
978742755a1SChristoph Lameter 		return -EPERM;
979742755a1SChristoph Lameter 
980742755a1SChristoph Lameter 	/* Find the mm_struct */
981742755a1SChristoph Lameter 	read_lock(&tasklist_lock);
982228ebcbeSPavel Emelyanov 	task = pid ? find_task_by_vpid(pid) : current;
983742755a1SChristoph Lameter 	if (!task) {
984742755a1SChristoph Lameter 		read_unlock(&tasklist_lock);
985742755a1SChristoph Lameter 		return -ESRCH;
986742755a1SChristoph Lameter 	}
987742755a1SChristoph Lameter 	mm = get_task_mm(task);
988742755a1SChristoph Lameter 	read_unlock(&tasklist_lock);
989742755a1SChristoph Lameter 
990742755a1SChristoph Lameter 	if (!mm)
991742755a1SChristoph Lameter 		return -EINVAL;
992742755a1SChristoph Lameter 
993742755a1SChristoph Lameter 	/*
994742755a1SChristoph Lameter 	 * Check if this process has the right to modify the specified
995742755a1SChristoph Lameter 	 * process. The right exists if the process has administrative
996742755a1SChristoph Lameter 	 * capabilities, superuser privileges or the same
997742755a1SChristoph Lameter 	 * userid as the target process.
998742755a1SChristoph Lameter 	 */
999742755a1SChristoph Lameter 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
1000742755a1SChristoph Lameter 	    (current->uid != task->suid) && (current->uid != task->uid) &&
1001742755a1SChristoph Lameter 	    !capable(CAP_SYS_NICE)) {
1002742755a1SChristoph Lameter 		err = -EPERM;
1003742755a1SChristoph Lameter 		goto out2;
1004742755a1SChristoph Lameter 	}
1005742755a1SChristoph Lameter 
100686c3a764SDavid Quigley  	err = security_task_movememory(task);
100786c3a764SDavid Quigley  	if (err)
100886c3a764SDavid Quigley  		goto out2;
100986c3a764SDavid Quigley 
1010*2f007e74SBrice Goglin 	if (!nodes) {
1011*2f007e74SBrice Goglin 		err = do_pages_stat(mm, nr_pages, pages, status);
1012*2f007e74SBrice Goglin 		goto out2;
1013*2f007e74SBrice Goglin 	}
101486c3a764SDavid Quigley 
1015742755a1SChristoph Lameter 	task_nodes = cpuset_mems_allowed(task);
1016742755a1SChristoph Lameter 
1017742755a1SChristoph Lameter 	/* Limit nr_pages so that the multiplication cannot overflow */
1018742755a1SChristoph Lameter 	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
1019742755a1SChristoph Lameter 		err = -E2BIG;
1020742755a1SChristoph Lameter 		goto out2;
1021742755a1SChristoph Lameter 	}
1022742755a1SChristoph Lameter 
1023742755a1SChristoph Lameter 	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
1024742755a1SChristoph Lameter 	if (!pm) {
1025742755a1SChristoph Lameter 		err = -ENOMEM;
1026742755a1SChristoph Lameter 		goto out2;
1027742755a1SChristoph Lameter 	}
1028742755a1SChristoph Lameter 
1029742755a1SChristoph Lameter 	/*
1030742755a1SChristoph Lameter 	 * Get parameters from user space and initialize the pm
1031742755a1SChristoph Lameter 	 * array. Return various errors if the user did something wrong.
1032742755a1SChristoph Lameter 	 */
1033742755a1SChristoph Lameter 	for (i = 0; i < nr_pages; i++) {
10349d966d49SAl Viro 		const void __user *p;
1035742755a1SChristoph Lameter 
1036742755a1SChristoph Lameter 		err = -EFAULT;
1037742755a1SChristoph Lameter 		if (get_user(p, pages + i))
1038742755a1SChristoph Lameter 			goto out;
1039742755a1SChristoph Lameter 
1040742755a1SChristoph Lameter 		pm[i].addr = (unsigned long)p;
1041742755a1SChristoph Lameter 		if (nodes) {
1042742755a1SChristoph Lameter 			int node;
1043742755a1SChristoph Lameter 
1044742755a1SChristoph Lameter 			if (get_user(node, nodes + i))
1045742755a1SChristoph Lameter 				goto out;
1046742755a1SChristoph Lameter 
1047742755a1SChristoph Lameter 			err = -ENODEV;
104856bbd65dSChristoph Lameter 			if (!node_state(node, N_HIGH_MEMORY))
1049742755a1SChristoph Lameter 				goto out;
1050742755a1SChristoph Lameter 
1051742755a1SChristoph Lameter 			err = -EACCES;
1052742755a1SChristoph Lameter 			if (!node_isset(node, task_nodes))
1053742755a1SChristoph Lameter 				goto out;
1054742755a1SChristoph Lameter 
1055742755a1SChristoph Lameter 			pm[i].node = node;
10568ce08464SStephen Rothwell 		} else
10578ce08464SStephen Rothwell 			pm[i].node = 0;	/* anything to not match MAX_NUMNODES */
1058742755a1SChristoph Lameter 	}
1059742755a1SChristoph Lameter 	/* End marker */
1060742755a1SChristoph Lameter 	pm[nr_pages].node = MAX_NUMNODES;
1061742755a1SChristoph Lameter 
1062742755a1SChristoph Lameter 	err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
1063742755a1SChristoph Lameter 	if (err >= 0)
1064742755a1SChristoph Lameter 		/* Return status information */
1065742755a1SChristoph Lameter 		for (i = 0; i < nr_pages; i++)
1066742755a1SChristoph Lameter 			if (put_user(pm[i].status, status + i))
1067742755a1SChristoph Lameter 				err = -EFAULT;
1068742755a1SChristoph Lameter 
1069742755a1SChristoph Lameter out:
1070742755a1SChristoph Lameter 	vfree(pm);
1071742755a1SChristoph Lameter out2:
1072742755a1SChristoph Lameter 	mmput(mm);
1073742755a1SChristoph Lameter 	return err;
1074742755a1SChristoph Lameter }
1075742755a1SChristoph Lameter 
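/*
 * Illustrative sketch (not part of the original file): from user space this
 * syscall is normally reached through a move_pages() wrapper, assumed here
 * to be the one in libnuma's <numaif.h>.  A minimal caller asking for one
 * of its own pages (at the hypothetical address addr) to be moved to node 1,
 * with error handling omitted:
 *
 *	#include <numaif.h>
 *
 *	void *pages[1] = { addr };
 *	int nodes[1] = { 1 };
 *	int status[1];
 *	long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *
 * Passing nodes == NULL instead reports the current node of each page via
 * do_pages_stat() above.
 */
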
10767b2259b3SChristoph Lameter /*
10777b2259b3SChristoph Lameter  * Call the migration functions in the vm_ops that may prepare
10787b2259b3SChristoph Lameter  * memory in a vma for migration. Such migration functions may perform
10797b2259b3SChristoph Lameter  * the migration for vmas that do not have an underlying struct page.
10807b2259b3SChristoph Lameter  */
10817b2259b3SChristoph Lameter int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
10827b2259b3SChristoph Lameter 	const nodemask_t *from, unsigned long flags)
10837b2259b3SChristoph Lameter {
10847b2259b3SChristoph Lameter  	struct vm_area_struct *vma;
10857b2259b3SChristoph Lameter  	int err = 0;
10867b2259b3SChristoph Lameter 
10877b2259b3SChristoph Lameter  	for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
10887b2259b3SChristoph Lameter  		if (vma->vm_ops && vma->vm_ops->migrate) {
10897b2259b3SChristoph Lameter  			err = vma->vm_ops->migrate(vma, to, from, flags);
10907b2259b3SChristoph Lameter  			if (err)
10917b2259b3SChristoph Lameter  				break;
10927b2259b3SChristoph Lameter  		}
10937b2259b3SChristoph Lameter  	}
10947b2259b3SChristoph Lameter  	return err;
10957b2259b3SChristoph Lameter }
109683d1674aSGerald Schaefer #endif
1097