/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>

#include "internal.h"

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page().
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}
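
/*
 * Overview of the typical call sequence (a sketch assembled from the
 * functions in this file, not a verbatim caller):
 *
 *	migrate_prep();
 *	// isolate_lru_page() each candidate and collect it on a list
 *	err = migrate_pages(&pagelist, get_new_page, private, 0);
 *	// pages that could not be migrated are put back by migrate_pages()
 *
 * get_new_page is a caller-supplied new_page_t callback that allocates
 * the target page (see new_page_node() below for one example).
 */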

/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto the unevictable list.
 *
 * Returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;
	int count = 0;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		putback_lru_page(page);
		count++;
	}
	return count;
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
				unsigned long addr, void *old)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map(pmd, addr);

	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		goto out;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto unlock;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) ||
	    migration_entry_to_page(entry) != old)
		goto unlock;

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
	flush_cache_page(vma, addr, pte_pfn(pte));
	set_pte_at(mm, addr, ptep, pte);

	if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, pte);
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return SWAP_AGAIN;
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	rmap_walk(new, remove_migration_pte, old);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	/*
	 * Once the radix-tree replacement step of page migration has
	 * started, page_count *must* be zero. And, we don't want to call
	 * wait_on_page_locked() against a page without get_page().
	 * So we use get_page_unless_zero() here. Even if it fails, the
	 * page fault will simply occur again.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}
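
/*
 * Lifecycle sketch of a migration entry (assembled from the swapops.h
 * helpers used above; illustrative, not a verbatim unmap path):
 *
 *	swp_entry_t entry = make_migration_entry(page, pte_write(pteval));
 *	set_pte_at(mm, addr, ptep, swp_entry_to_pte(entry));
 *	// ... faulting threads now block in migration_entry_wait() ...
 *	// ... until remove_migration_pte() installs the new page
 */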

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return 0;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + page_has_private(page);
	if (page_count(page) != expected_count ||
			(struct page *)radix_tree_deref_slot(pslot) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);	/* add cache reference */
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}

	radix_tree_replace_slot(pslot, newpage);

	page_unfreeze_refs(page, expected_count);
	/*
	 * Drop cache reference from old page.
	 * We know this isn't the last reference.
	 */
	__put_page(page);

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
	 * are mapped to swap space.
	 */
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__inc_zone_page_state(newpage, NR_FILE_PAGES);
	if (PageSwapBacked(page)) {
		__dec_zone_page_state(page, NR_SHMEM);
		__inc_zone_page_state(newpage, NR_SHMEM);
	}
	spin_unlock_irq(&mapping->tree_lock);

	return 0;
}
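
/*
 * Worked example of the expected_count check above: for a clean
 * pagecache page the radix tree holds one reference and the migration
 * caller (which took a reference when isolating the page) holds
 * another, so expected_count == 2; a page with buffers or other
 * private data adds one more, giving 3. Any other count means a
 * concurrent user exists and the move is retried with -EAGAIN.
 */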

/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
	int anon;

	copy_highpage(newpage, page);

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		SetPageActive(newpage);
	} else if (TestClearPageUnevictable(page))
		SetPageUnevictable(newpage);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		/*
		 * Want to mark the page and the radix tree as dirty, and
		 * redo the accounting that clear_page_dirty_for_io undid,
		 * but we can't use set_page_dirty because that function
		 * is actually a signal that all of the page has become dirty.
		 * Whereas only part of our page may be dirty.
		 */
		__set_page_dirty_nobuffers(newpage);
	}

	mlock_migrate_page(newpage, page);
	ksm_migrate_page(newpage, page);

	ClearPageSwapCache(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	/* page->mapping contains a flag for PageAnon() */
	anon = PageAnon(page);
	page->mapping = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}

/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	migrate_page_copy(newpage, page);
	return 0;
}
EXPORT_SYMBOL(migrate_page);
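
/*
 * Illustrative wiring (a sketch, not code from this file): a filesystem
 * opts in to migration by pointing its address_space_operations at one
 * of the exported helpers, e.g.
 *
 *	static const struct address_space_operations example_aops = {
 *		...
 *		.migratepage	= buffer_migrate_page,
 *	};
 *
 * example_aops is a hypothetical name; mappings that must never be
 * migrated can set .migratepage = fail_migrate_page instead.
 */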

#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.nonblocking = 1,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page)
{
	if (PageDirty(page))
		return writeout(mapping, page);

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  == 0 - success
 */
static int move_to_new_page(struct page *newpage, struct page *page)
{
	struct address_space *mapping;
	int rc;

	/*
	 * Block others from accessing the page when we get around to
	 * establishing additional references. We are the only one
	 * holding a reference to the new page at this point.
	 */
	if (!trylock_page(newpage))
		BUG();

	/* Prepare mapping for the new page. */
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	if (PageSwapBacked(page))
		SetPageSwapBacked(newpage);

	mapping = page_mapping(page);
	if (!mapping)
		rc = migrate_page(mapping, newpage, page);
	else if (mapping->a_ops->migratepage)
		/*
		 * Most pages have a mapping and most filesystems
		 * should provide a migration function. Anonymous
		 * pages are part of swap space which also has its
		 * own migration function. This is the most common
		 * path for page migration.
		 */
		rc = mapping->a_ops->migratepage(mapping,
						newpage, page);
	else
		rc = fallback_migrate_page(mapping, newpage, page);

	if (!rc)
		remove_migration_ptes(page, newpage);
	else
		newpage->mapping = NULL;

	unlock_page(newpage);

	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
			struct page *page, int force, int offlining)
{
	int rc = 0;
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);
	int rcu_locked = 0;
	int charge = 0;
	struct mem_cgroup *mem = NULL;

	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		goto move_newpage;
	}

	/* prepare cgroup just returns 0 or -ENOMEM */
	rc = -EAGAIN;

	if (!trylock_page(page)) {
		if (!force)
			goto move_newpage;
		lock_page(page);
	}

	/*
	 * Only memory hotplug's offline_pages() caller has locked out KSM,
	 * and can safely migrate a KSM page.  The other cases have skipped
	 * PageKsm along with PageReserved - but it is only now when we have
	 * the page lock that we can be certain it will not go KSM beneath us
	 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
	 * its pagecount raised, but only here do we take the page lock which
	 * serializes that).
	 */
	if (PageKsm(page) && !offlining) {
		rc = -EBUSY;
		goto unlock;
	}

	/* charge against new page */
	charge = mem_cgroup_prepare_migration(page, &mem);
	if (charge == -ENOMEM) {
		rc = -ENOMEM;
		goto unlock;
	}
	BUG_ON(charge);

	if (PageWriteback(page)) {
		if (!force)
			goto uncharge;
		wait_on_page_writeback(page);
	}
	/*
	 * By try_to_unmap(), page->mapcount goes down to 0 here. In that
	 * case we cannot notice that the anon_vma is freed while we migrate
	 * the page. This rcu_read_lock() delays freeing the anon_vma pointer
	 * until the end of migration. File cache pages are no problem
	 * because of the page lock: they may use write_page() or lock_page()
	 * in migration, so only anon pages need care here.
	 */
	if (PageAnon(page)) {
		rcu_read_lock();
		rcu_locked = 1;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read into, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		if (!PageAnon(page) && page_has_private(page)) {
			/*
			 * Go direct to try_to_free_buffers() here because
			 * a) that's what try_to_release_page() would do anyway
			 * b) we may be under rcu_read_lock() here, so we can't
			 *    use GFP_KERNEL which is what try_to_release_page()
			 *    needs to be effective.
			 */
			try_to_free_buffers(page);
			goto rcu_unlock;
		}
		goto skip_unmap;
	}

	/* Establish migration ptes or remove ptes */
	try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

skip_unmap:
	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page);

	if (rc)
		remove_migration_ptes(page, page);
rcu_unlock:
	if (rcu_locked)
		rcu_read_unlock();
uncharge:
	if (!charge)
		mem_cgroup_end_migration(mem, page, newpage);
unlock:
	unlock_page(page);

	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be
		 * restored.
		 */
		list_del(&page->lru);
		dec_zone_page_state(page, NR_ISOLATED_ANON +
				page_is_file_cache(page));
		putback_lru_page(page);
	}

move_newpage:

	/*
	 * Move the new page to the LRU. If migration was not successful
	 * then this will free the page.
	 */
	putback_lru_page(newpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(newpage);
	}
	return rc;
}

/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a function
 * that determines from the page to be migrated and the private data
 * the target of the move and allocates the page.
 *
 * The function returns after 10 attempts or if no pages
 * are movable anymore because the list has become empty
 * or no retryable pages exist anymore. All pages will be
 * returned to the LRU or freed.
 *
 * Return: Number of pages not migrated or error code.
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private, int offlining)
{
	int retry = 1;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for (pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			rc = unmap_and_move(get_new_page, private,
						page, pass > 2, offlining);

			switch (rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case 0:
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = 0;
out:
	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	putback_lru_pages(from);

	if (rc)
		return rc;

	return nr_failed + retry;
}
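
/*
 * Caller sketch (hypothetical, for illustration only): moving a list of
 * isolated pages to a given node with a minimal allocation callback.
 *
 *	static struct page *new_node_page(struct page *p,
 *					  unsigned long private, int **x)
 *	{
 *		return alloc_pages_node((int)private,
 *					GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 *	migrate_prep();
 *	// isolate_lru_page() each candidate onto &pagelist ...
 *	nr_failed = migrate_pages(&pagelist, new_node_page, nid, 0);
 *
 * new_node_page and nid are assumed names; compare new_page_node()
 * below for the real callback used by the move_pages() syscall.
 */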

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
	unsigned long addr;
	struct page *page;
	int node;
	int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
		int **result)
{
	struct page_to_node *pm = (struct page_to_node *)private;

	while (pm->node != MAX_NUMNODES && pm->page != p)
		pm++;

	if (pm->node == MAX_NUMNODES)
		return NULL;

	*result = &pm->status;

	return alloc_pages_exact_node(pm->node,
				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 * The pm array ends with node = MAX_NUMNODES.
 */
static int do_move_page_to_node_array(struct mm_struct *mm,
				      struct page_to_node *pm,
				      int migrate_all)
{
	int err;
	struct page_to_node *pp;
	LIST_HEAD(pagelist);

	down_read(&mm->mmap_sem);

	/*
	 * Build a list of pages to migrate
	 */
	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
		struct vm_area_struct *vma;
		struct page *page;

		err = -EFAULT;
		vma = find_vma(mm, pp->addr);
		if (!vma || !vma_migratable(vma))
			goto set_status;

		page = follow_page(vma, pp->addr, FOLL_GET);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		if (!page)
			goto set_status;

		/* Use PageReserved to check for the zero page */
		if (PageReserved(page) || PageKsm(page))
			goto put_and_set;

		pp->page = page;
		err = page_to_nid(page);

		if (err == pp->node)
			/*
			 * Node already in the right place
			 */
			goto put_and_set;

		err = -EACCES;
		if (page_mapcount(page) > 1 &&
				!migrate_all)
			goto put_and_set;

		err = isolate_lru_page(page);
		if (!err) {
			list_add_tail(&page->lru, &pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
put_and_set:
		/*
		 * Either remove the duplicate refcount from
		 * isolate_lru_page() or drop the page ref if it was
		 * not isolated.
		 */
		put_page(page);
set_status:
		pp->status = err;
	}

	err = 0;
	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_page_node,
				(unsigned long)pm, 0);

	up_read(&mm->mmap_sem);
	return err;
}
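
/*
 * Layout sketch of the pm array consumed above (addresses a0..a2 are
 * placeholders for illustration): three pages destined for nodes 1 and
 * 2 would be passed as
 *
 *	{ .addr = a0, .node = 1 }, { .addr = a1, .node = 2 },
 *	{ .addr = a2, .node = 1 }, { .node = MAX_NUMNODES }  // end marker
 *
 * new_page_node() walks this same array to find the target node for
 * each isolated page, and per-page results come back in .status.
 */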

/*
 * Migrate an array of page addresses onto an array of nodes and fill
 * the corresponding array of status.
 */
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	struct page_to_node *pm;
	nodemask_t task_nodes;
	unsigned long chunk_nr_pages;
	unsigned long chunk_start;
	int err;

	task_nodes = cpuset_mems_allowed(task);

	err = -ENOMEM;
	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
	if (!pm)
		goto out;

	migrate_prep();

	/*
	 * Store a chunk of the page_to_node array in a page,
	 * but keep the last entry free as an end marker
	 */
	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;

	for (chunk_start = 0;
	     chunk_start < nr_pages;
	     chunk_start += chunk_nr_pages) {
		int j;

		if (chunk_start + chunk_nr_pages > nr_pages)
			chunk_nr_pages = nr_pages - chunk_start;

		/* fill the chunk pm with addrs and nodes from user space */
		for (j = 0; j < chunk_nr_pages; j++) {
			const void __user *p;
			int node;

			err = -EFAULT;
			if (get_user(p, pages + j + chunk_start))
				goto out_pm;
			pm[j].addr = (unsigned long) p;

			if (get_user(node, nodes + j + chunk_start))
				goto out_pm;

			err = -ENODEV;
			if (node < 0 || node >= MAX_NUMNODES)
				goto out_pm;

			if (!node_state(node, N_HIGH_MEMORY))
				goto out_pm;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out_pm;

			pm[j].node = node;
		}

		/* End marker for this chunk */
		pm[chunk_nr_pages].node = MAX_NUMNODES;

		/* Migrate this chunk */
		err = do_move_page_to_node_array(mm, pm,
						 flags & MPOL_MF_MOVE_ALL);
		if (err < 0)
			goto out_pm;

		/* Return status information */
		for (j = 0; j < chunk_nr_pages; j++)
			if (put_user(pm[j].status, status + j + chunk_start)) {
				err = -EFAULT;
				goto out_pm;
			}
	}
	err = 0;

out_pm:
	free_page((unsigned long)pm);
out:
	return err;
}
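
/*
 * Worked example of the chunking above (assuming a 64-bit build with
 * 4 KiB pages, where sizeof(struct page_to_node) == 24): one page
 * holds 4096 / 24 = 170 entries, one of which is reserved for the
 * MAX_NUMNODES end marker, so each pass migrates up to 169 pages.
 */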

/*
 * Determine the nodes of an array of pages and store them in an array
 * of status.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	down_read(&mm->mmap_sem);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct page *page;
		int err = -EFAULT;

		vma = find_vma(mm, addr);
		if (!vma)
			goto set_status;

		page = follow_page(vma, addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for the zero page */
		if (!page || PageReserved(page) || PageKsm(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		*status = err;

		pages++;
		status++;
	}

	up_read(&mm->mmap_sem);
}

/*
 * Determine the nodes of a user array of pages and store them in
 * a user array of status.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
	unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
	int err;

	for (i = 0; i < nr_pages; i += chunk_nr) {
		if (chunk_nr > nr_pages - i)
			chunk_nr = nr_pages - i;

		err = copy_from_user(chunk_pages, &pages[i],
				     chunk_nr * sizeof(*chunk_pages));
		if (err) {
			err = -EFAULT;
			goto out;
		}

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		err = copy_to_user(&status[i], chunk_status,
				   chunk_nr * sizeof(*chunk_status));
		if (err) {
			err = -EFAULT;
			goto out;
		}
	}
	err = 0;

out:
	return err;
}
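
/*
 * Illustrative userspace view of the syscall defined below (a
 * hypothetical snippet using the move_pages() wrapper declared in
 * <numaif.h>; buf is an assumed page-aligned allocation):
 *
 *	void *pages[1] = { buf };
 *	int nodes[1]   = { 1 };
 *	int status[1];
 *	long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *	// status[0] holds the new node, or a negative errno per page
 *
 * Passing nodes == NULL queries the current node of each page instead
 * of moving it (the do_pages_stat() path above).
 */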

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	const struct cred *cred = current_cred(), *tcred;
	struct task_struct *task;
	struct mm_struct *mm;
	int err;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	rcu_read_lock();
	tcred = __task_cred(task);
	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out;
	}
	rcu_read_unlock();

	err = security_task_movememory(task);
	if (err)
		goto out;

	if (nodes) {
		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
				    flags);
	} else {
		err = do_pages_stat(mm, nr_pages, pages, status);
	}

out:
	mmput(mm);
	return err;
}

/*
 * Call migration functions in the vma_ops that may prepare
 * memory in a vm for migration. Migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}
#endif