/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>

#include "internal.h"

#define lru_to_page(_head)	(list_entry((_head)->prev, struct page, lru))

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page().
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}

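/*
 * For orientation: a typical caller strings the helpers in this file
 * together roughly as follows (an illustrative sketch in the spirit of
 * do_move_pages() below, not a quote of any one caller; "pagelist" is a
 * local LIST_HEAD and get_new_page/private are the caller's):
 *
 *	migrate_prep();
 *	// for each candidate page:
 *	if (!isolate_lru_page(page))
 *		list_add_tail(&page->lru, &pagelist);
 *	err = migrate_pages(&pagelist, get_new_page, private);
 *	// migrate_pages() itself returns anything left on the list
 *	// to the LRU
 */
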
/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto the unevictable list.
 *
 * returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;
	int count = 0;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		putback_lru_page(page);
		count++;
	}
	return count;
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static void remove_migration_pte(struct vm_area_struct *vma,
		struct page *old, struct page *new)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	unsigned long addr = page_address_in_vma(new, vma);

	if (addr == -EFAULT)
		return;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return;

	ptep = pte_offset_map(pmd, addr);

	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		return;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
		goto out;

	/*
	 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
	 * Failure is not an option here: we're now expected to remove every
	 * migration pte, and would cause crashes otherwise. Normally this
	 * is not an issue: mem_cgroup_prepare_migration bumped up the old
	 * page_cgroup count for safety, that's now attached to the new page,
	 * so this charge should just be another increment of the count, to
	 * keep in balance with rmap.c's mem_cgroup uncharging. But if
	 * there's been a force_empty, those reference counts may no longer
	 * be reliable, and this charge can actually fail: oh well, we don't
	 * make the situation any worse by proceeding as if it had succeeded.
	 */
	mem_cgroup_charge(new, mm, GFP_ATOMIC);

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
	flush_cache_page(vma, addr, pte_pfn(pte));
	set_pte_at(mm, addr, ptep, pte);

	if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, pte);

out:
	pte_unmap_unlock(ptep, ptl);
}

/*
 * Note that remove_file_migration_ptes will only work on regular mappings;
 * nonlinear mappings do not use migration entries.
 */
static void remove_file_migration_ptes(struct page *old, struct page *new)
{
	struct vm_area_struct *vma;
	struct address_space *mapping = page_mapping(new);
	struct prio_tree_iter iter;
	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

	if (!mapping)
		return;

	spin_lock(&mapping->i_mmap_lock);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
		remove_migration_pte(vma, old, new);

	spin_unlock(&mapping->i_mmap_lock);
}

/*
 * Must hold mmap_sem lock on at least one of the vmas containing
 * the page so that the anon_vma cannot vanish.
 */
static void remove_anon_migration_ptes(struct page *old, struct page *new)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	unsigned long mapping;

	mapping = (unsigned long)new->mapping;

	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
		return;

	/*
	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
	 */
	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
		remove_migration_pte(vma, old, new);

	spin_unlock(&anon_vma->lock);
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	if (PageAnon(new))
		remove_anon_migration_ptes(old, new);
	else
		remove_file_migration_ptes(old, new);
}

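/*
 * For reference: the forward transformation is not in this file but in
 * rmap.c. try_to_unmap_one() replaces a present pte with a migration
 * entry, in essence (a simplified sketch):
 *
 *	entry = make_migration_entry(page, pte_write(pteval));
 *	set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 *
 * remove_migration_pte() above performs the inverse, which is why it
 * checks both is_migration_entry() and that the entry's page is "old".
 */
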
/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	/*
	 * Once radix-tree replacement of the page has started, page_count
	 * *must* be zero; and we don't want to call wait_on_page_locked()
	 * against a page we hold no reference on. So we use
	 * get_page_unless_zero() here. Even if it fails, the page fault
	 * will simply occur again.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 *   1 for anonymous pages without a mapping
 *   2 for pages with a mapping
 *   3 for pages with a mapping and PagePrivate set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return 0;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + !!PagePrivate(page);
	if (page_count(page) != expected_count ||
			(struct page *)radix_tree_deref_slot(pslot) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);	/* add cache reference */
#ifdef CONFIG_SWAP
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}
#endif

	radix_tree_replace_slot(pslot, newpage);

	page_unfreeze_refs(page, expected_count);
	/*
	 * Drop cache reference from old page.
	 * We know this isn't the last reference.
	 */
	__put_page(page);

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
	 * are mapped to swap space.
	 */
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__inc_zone_page_state(newpage, NR_FILE_PAGES);

	spin_unlock_irq(&mapping->tree_lock);
	if (!PageSwapCache(newpage))
		mem_cgroup_uncharge_cache_page(page);

	return 0;
}

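/*
 * A worked example for the reference counting above (descriptive only):
 * for a file-backed page the radix tree holds one reference and the
 * migration path holds a second from isolating the page, giving the
 * expected count of 2; PagePrivate (e.g. buffer heads) accounts for the
 * third. Mapped ptes contribute nothing here because try_to_unmap()
 * has already replaced them with migration entries, dropping their
 * references, before this function runs.
 */
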
/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
	copy_highpage(newpage, page);

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		SetPageActive(newpage);
	} else
		unevictable_migrate_page(newpage, page);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		/*
		 * Want to mark the page and the radix tree as dirty, and
		 * redo the accounting that clear_page_dirty_for_io undid,
		 * but we can't use set_page_dirty because that function
		 * is actually a signal that all of the page has become
		 * dirty, whereas only part of our page may be dirty.
		 */
		__set_page_dirty_nobuffers(newpage);
	}

	mlock_migrate_page(newpage, page);

#ifdef CONFIG_SWAP
	ClearPageSwapCache(page);
#endif
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page->mapping = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}

/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	migrate_page_copy(newpage, page);
	return 0;
}
EXPORT_SYMBOL(migrate_page);

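/*
 * For illustration: a filesystem whose pages carry no fs-private state
 * can opt in to migration by wiring the helper above into its
 * address_space_operations (a hypothetical aops table, not from this
 * file):
 *
 *	static const struct address_space_operations example_aops = {
 *		.writepage	= example_writepage,
 *		.readpage	= example_readpage,
 *		.migratepage	= migrate_page,
 *	};
 *
 * Filesystems whose pages carry buffer heads would point .migratepage
 * at buffer_migrate_page() below instead.
 */
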
#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif

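/*
 * A note on the buffer dance above: every buffer_head is locked before
 * the page contents and the private pointer move, and unlocked only
 * after migrate_page_copy(), so no I/O can start against a buffer while
 * it still points at the old page. set_bh_page() is what actually
 * redirects each bh to newpage.
 */
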
/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.nonblocking = 1,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);
	if (rc < 0)
		/* I/O Error writing */
		return -EIO;

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return -EAGAIN;
}

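/*
 * The writeback_control above is deliberately minimal: WB_SYNC_NONE and
 * nr_to_write == 1 ask ->writepage() to start I/O on just this page
 * without waiting for it to complete, and .for_reclaim marks the call
 * as coming from a reclaim-like context. The migration attempt itself
 * is then simply retried later via the -EAGAIN return.
 */
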
/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page)
{
	if (PageDirty(page))
		return writeout(mapping, page);

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (PagePrivate(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  == 0 - success
 */
static int move_to_new_page(struct page *newpage, struct page *page)
{
	struct address_space *mapping;
	int rc;

	/*
	 * Block others from accessing the page when we get around to
	 * establishing additional references. We are the only one
	 * holding a reference to the new page at this point.
	 */
	if (!trylock_page(newpage))
		BUG();

	/* Prepare mapping for the new page. */
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	if (PageSwapBacked(page))
		SetPageSwapBacked(newpage);

	mapping = page_mapping(page);
	if (!mapping)
		rc = migrate_page(mapping, newpage, page);
	else if (mapping->a_ops->migratepage)
		/*
		 * Most pages have a mapping and most filesystems
		 * should provide a migration function. Anonymous
		 * pages are part of swap space which also has its
		 * own migration function. This is the most common
		 * path for page migration.
		 */
		rc = mapping->a_ops->migratepage(mapping,
						newpage, page);
	else
		rc = fallback_migrate_page(mapping, newpage, page);

	if (!rc)
		remove_migration_ptes(page, newpage);
	else
		newpage->mapping = NULL;

	unlock_page(newpage);

	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
			struct page *page, int force)
{
	int rc = 0;
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);
	int rcu_locked = 0;
	int charge = 0;

	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		goto move_newpage;
	}

	charge = mem_cgroup_prepare_migration(page, newpage);
	if (charge == -ENOMEM) {
		rc = -ENOMEM;
		goto move_newpage;
	}
	/* prepare cgroup just returns 0 or -ENOMEM */
	BUG_ON(charge);

	rc = -EAGAIN;
	if (!trylock_page(page)) {
		if (!force)
			goto move_newpage;
		lock_page(page);
	}

	if (PageWriteback(page)) {
		if (!force)
			goto unlock;
		wait_on_page_writeback(page);
	}
	/*
	 * try_to_unmap() drops page->mapcount to 0 here, and once that
	 * happens we cannot notice if the anon_vma is freed while we
	 * migrate the page. This rcu_read_lock() delays freeing of the
	 * anon_vma pointer until the end of migration. File cache pages
	 * are no problem because they are protected by the page lock:
	 * file caches use writepage() or lock_page() during migration,
	 * so only anon pages need care here.
	 */
	if (PageAnon(page)) {
		rcu_read_lock();
		rcu_locked = 1;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read in, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG. So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining. Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated. So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		if (!PageAnon(page) && PagePrivate(page)) {
			/*
			 * Go direct to try_to_free_buffers() here because
			 * a) that's what try_to_release_page() would do anyway
			 * b) we may be under rcu_read_lock() here, so we can't
			 *    use GFP_KERNEL which is what try_to_release_page()
			 *    needs to be effective.
			 */
			try_to_free_buffers(page);
		}
		goto rcu_unlock;
	}

	/* Establish migration ptes or remove ptes */
	try_to_unmap(page, 1);

	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page);

	if (rc)
		remove_migration_ptes(page, page);
rcu_unlock:
	if (rcu_locked)
		rcu_read_unlock();

unlock:
	unlock_page(page);

	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be
		 * restored.
		 */
		list_del(&page->lru);
		putback_lru_page(page);
	}

move_newpage:
	if (!charge)
		mem_cgroup_end_migration(newpage);

	/*
	 * Move the new page to the LRU. If migration was not successful
	 * then this will free the page.
	 */
	putback_lru_page(newpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(newpage);
	}
	return rc;
}

/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a callback that,
 * given a page to be migrated and the private data, determines the
 * target of the move and allocates the new page.
 *
 * The function returns after 10 attempts or if no pages are movable
 * any more because the 'from' list has become empty or no retryable
 * pages exist any more. All pages will be returned to the LRU or freed.
 *
 * Return: Number of pages not migrated or error code.
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private)
{
	int retry = 1;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for (pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			rc = unmap_and_move(get_new_page, private,
						page, pass > 2);

			switch (rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case 0:
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = 0;
out:
	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	putback_lru_pages(from);

	if (rc)
		return rc;

	return nr_failed + retry;
}

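/*
 * A minimal (hypothetical) migrate_pages() caller, in the style of the
 * move_pages() path below; new_node_page and target_nid are made-up
 * names for illustration:
 *
 *	static struct page *new_node_page(struct page *page,
 *					unsigned long target_nid, int **x)
 *	{
 *		return alloc_pages_node((int)target_nid,
 *					GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 *	err = migrate_pages(&pagelist, new_node_page, target_nid);
 */
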
#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
	unsigned long addr;
	struct page *page;
	int node;
	int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
		int **result)
{
	struct page_to_node *pm = (struct page_to_node *)private;

	while (pm->node != MAX_NUMNODES && pm->page != p)
		pm++;

	if (pm->node == MAX_NUMNODES)
		return NULL;

	*result = &pm->status;

	return alloc_pages_node(pm->node,
				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 */
static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
				int migrate_all)
{
	int err;
	struct page_to_node *pp;
	LIST_HEAD(pagelist);

	down_read(&mm->mmap_sem);

	/*
	 * Build a list of pages to migrate
	 */
	migrate_prep();
	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
		struct vm_area_struct *vma;
		struct page *page;

		/*
		 * A valid page pointer that will not match any of the
		 * pages that will be moved.
		 */
		pp->page = ZERO_PAGE(0);

		err = -EFAULT;
		vma = find_vma(mm, pp->addr);
		if (!vma || !vma_migratable(vma))
			goto set_status;

		page = follow_page(vma, pp->addr, FOLL_GET);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		if (!page)
			goto set_status;

		if (PageReserved(page))		/* Check for zero page */
			goto put_and_set;

		pp->page = page;
		err = page_to_nid(page);

		if (err == pp->node)
			/*
			 * Node already in the right place
			 */
			goto put_and_set;

		err = -EACCES;
		if (page_mapcount(page) > 1 &&
				!migrate_all)
			goto put_and_set;

		err = isolate_lru_page(page);
		if (!err)
			list_add_tail(&page->lru, &pagelist);
put_and_set:
		/*
		 * Either remove the duplicate refcount from
		 * isolate_lru_page() or drop the page ref if it was
		 * not isolated.
		 */
		put_page(page);
set_status:
		pp->status = err;
	}

	err = 0;
	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_page_node,
				(unsigned long)pm);

	up_read(&mm->mmap_sem);
	return err;
}

/*
 * Determine the nodes of an array of pages and store them in an array
 * of status values.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
	unsigned long i;
	int err;

	down_read(&mm->mmap_sem);

	for (i = 0; i < nr_pages; i++) {
		const void __user *p;
		unsigned long addr;
		struct vm_area_struct *vma;
		struct page *page;

		err = -EFAULT;
		if (get_user(p, pages+i))
			goto out;
		addr = (unsigned long) p;

		vma = find_vma(mm, addr);
		if (!vma)
			goto set_status;

		page = follow_page(vma, addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for zero page */
		if (!page || PageReserved(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		put_user(err, status+i);
	}
	err = 0;

out:
	up_read(&mm->mmap_sem);
	return err;
}

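/*
 * From userspace, do_pages_stat() is reached by calling move_pages(2)
 * with a NULL "nodes" array; an illustrative (not normative) fragment:
 *
 *	void *pages[1] = { some_addr };
 *	int status[1];
 *	long rc = syscall(__NR_move_pages, 0, 1, pages, NULL, status, 0);
 *	// on success, status[0] holds the node id backing some_addr
 *
 * A non-NULL nodes array takes the do_move_pages() path above instead.
 */
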
/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
			const void __user * __user *pages,
			const int __user *nodes,
			int __user *status, int flags)
{
	int err = 0;
	int i;
	struct task_struct *task;
	nodemask_t task_nodes;
	struct mm_struct *mm;
	struct page_to_node *pm = NULL;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out2;
	}

	err = security_task_movememory(task);
	if (err)
		goto out2;

	if (!nodes) {
		err = do_pages_stat(mm, nr_pages, pages, status);
		goto out2;
	}

	task_nodes = cpuset_mems_allowed(task);

	/* Limit nr_pages so that the multiplication may not overflow */
	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
		err = -E2BIG;
		goto out2;
	}

	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
	if (!pm) {
		err = -ENOMEM;
		goto out2;
	}

	/*
	 * Get parameters from user space and initialize the pm
	 * array. Return various errors if the user did something wrong.
	 */
	for (i = 0; i < nr_pages; i++) {
		const void __user *p;

		err = -EFAULT;
		if (get_user(p, pages + i))
			goto out;

		pm[i].addr = (unsigned long)p;
		if (nodes) {
			int node;

			if (get_user(node, nodes + i))
				goto out;

			err = -ENODEV;
			if (!node_state(node, N_HIGH_MEMORY))
				goto out;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out;

			pm[i].node = node;
		} else
			pm[i].node = 0;	/* anything to not match MAX_NUMNODES */
	}
	/* End marker */
	pm[nr_pages].node = MAX_NUMNODES;

	err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
	if (err >= 0)
		/* Return status information */
		for (i = 0; i < nr_pages; i++)
			if (put_user(pm[i].status, status + i))
				err = -EFAULT;

out:
	vfree(pm);
out2:
	mmput(mm);
	return err;
}

/*
 * Call the migration functions in vm_ops that may prepare
 * memory in a vma for migration. Migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}
#endif