1b20a3503SChristoph Lameter /* 2b20a3503SChristoph Lameter * Memory Migration functionality - linux/mm/migration.c 3b20a3503SChristoph Lameter * 4b20a3503SChristoph Lameter * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter 5b20a3503SChristoph Lameter * 6b20a3503SChristoph Lameter * Page migration was first developed in the context of the memory hotplug 7b20a3503SChristoph Lameter * project. The main authors of the migration code are: 8b20a3503SChristoph Lameter * 9b20a3503SChristoph Lameter * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> 10b20a3503SChristoph Lameter * Hirokazu Takahashi <taka@valinux.co.jp> 11b20a3503SChristoph Lameter * Dave Hansen <haveblue@us.ibm.com> 12cde53535SChristoph Lameter * Christoph Lameter 13b20a3503SChristoph Lameter */ 14b20a3503SChristoph Lameter 15b20a3503SChristoph Lameter #include <linux/migrate.h> 16b95f1b31SPaul Gortmaker #include <linux/export.h> 17b20a3503SChristoph Lameter #include <linux/swap.h> 180697212aSChristoph Lameter #include <linux/swapops.h> 19b20a3503SChristoph Lameter #include <linux/pagemap.h> 20e23ca00bSChristoph Lameter #include <linux/buffer_head.h> 21b20a3503SChristoph Lameter #include <linux/mm_inline.h> 22b488893aSPavel Emelyanov #include <linux/nsproxy.h> 23b20a3503SChristoph Lameter #include <linux/pagevec.h> 24e9995ef9SHugh Dickins #include <linux/ksm.h> 25b20a3503SChristoph Lameter #include <linux/rmap.h> 26b20a3503SChristoph Lameter #include <linux/topology.h> 27b20a3503SChristoph Lameter #include <linux/cpu.h> 28b20a3503SChristoph Lameter #include <linux/cpuset.h> 2904e62a29SChristoph Lameter #include <linux/writeback.h> 30742755a1SChristoph Lameter #include <linux/mempolicy.h> 31742755a1SChristoph Lameter #include <linux/vmalloc.h> 3286c3a764SDavid Quigley #include <linux/security.h> 338a9f3ccdSBalbir Singh #include <linux/memcontrol.h> 344f5ca265SAdrian Bunk #include <linux/syscalls.h> 35290408d4SNaoya Horiguchi #include <linux/hugetlb.h> 368e6ac7faSAneesh Kumar K.V #include <linux/hugetlb_cgroup.h> 375a0e3ad6STejun Heo #include <linux/gfp.h> 38b20a3503SChristoph Lameter 390d1836c3SMichal Nazarewicz #include <asm/tlbflush.h> 400d1836c3SMichal Nazarewicz 41b20a3503SChristoph Lameter #include "internal.h" 42b20a3503SChristoph Lameter 43b20a3503SChristoph Lameter /* 44742755a1SChristoph Lameter * migrate_prep() needs to be called before we start compiling a list of pages 45748446bbSMel Gorman * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is 46748446bbSMel Gorman * undesirable, use migrate_prep_local() 47b20a3503SChristoph Lameter */ 48b20a3503SChristoph Lameter int migrate_prep(void) 49b20a3503SChristoph Lameter { 50b20a3503SChristoph Lameter /* 51b20a3503SChristoph Lameter * Clear the LRU lists so pages can be isolated. 52b20a3503SChristoph Lameter * Note that pages may be moved off the LRU after we have 53b20a3503SChristoph Lameter * drained them. Those pages will fail to migrate like other 54b20a3503SChristoph Lameter * pages that may be busy. 
55b20a3503SChristoph Lameter */ 56b20a3503SChristoph Lameter lru_add_drain_all(); 57b20a3503SChristoph Lameter 58b20a3503SChristoph Lameter return 0; 59b20a3503SChristoph Lameter } 60b20a3503SChristoph Lameter 61748446bbSMel Gorman /* Do the necessary work of migrate_prep but not if it involves other CPUs */ 62748446bbSMel Gorman int migrate_prep_local(void) 63748446bbSMel Gorman { 64748446bbSMel Gorman lru_add_drain(); 65748446bbSMel Gorman 66748446bbSMel Gorman return 0; 67748446bbSMel Gorman } 68748446bbSMel Gorman 69b20a3503SChristoph Lameter /* 70894bc310SLee Schermerhorn * Add isolated pages on the list back to the LRU under page lock 71894bc310SLee Schermerhorn * to avoid leaking evictable pages back onto unevictable list. 72b20a3503SChristoph Lameter */ 73e13861d8SMinchan Kim void putback_lru_pages(struct list_head *l) 74b20a3503SChristoph Lameter { 75b20a3503SChristoph Lameter struct page *page; 76b20a3503SChristoph Lameter struct page *page2; 77b20a3503SChristoph Lameter 78b20a3503SChristoph Lameter list_for_each_entry_safe(page, page2, l, lru) { 79e24f0b8fSChristoph Lameter list_del(&page->lru); 80a731286dSKOSAKI Motohiro dec_zone_page_state(page, NR_ISOLATED_ANON + 816c0b1351SJohannes Weiner page_is_file_cache(page)); 82894bc310SLee Schermerhorn putback_lru_page(page); 83b20a3503SChristoph Lameter } 84b20a3503SChristoph Lameter } 85b20a3503SChristoph Lameter 860697212aSChristoph Lameter /* 870697212aSChristoph Lameter * Restore a potential migration pte to a working pte entry 880697212aSChristoph Lameter */ 89e9995ef9SHugh Dickins static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, 90e9995ef9SHugh Dickins unsigned long addr, void *old) 910697212aSChristoph Lameter { 920697212aSChristoph Lameter struct mm_struct *mm = vma->vm_mm; 930697212aSChristoph Lameter swp_entry_t entry; 940697212aSChristoph Lameter pmd_t *pmd; 950697212aSChristoph Lameter pte_t *ptep, pte; 960697212aSChristoph Lameter spinlock_t *ptl; 970697212aSChristoph Lameter 98290408d4SNaoya Horiguchi if (unlikely(PageHuge(new))) { 99290408d4SNaoya Horiguchi ptep = huge_pte_offset(mm, addr); 100290408d4SNaoya Horiguchi if (!ptep) 101290408d4SNaoya Horiguchi goto out; 102290408d4SNaoya Horiguchi ptl = &mm->page_table_lock; 103290408d4SNaoya Horiguchi } else { 1046219049aSBob Liu pmd = mm_find_pmd(mm, addr); 1056219049aSBob Liu if (!pmd) 106e9995ef9SHugh Dickins goto out; 107500d65d4SAndrea Arcangeli if (pmd_trans_huge(*pmd)) 108500d65d4SAndrea Arcangeli goto out; 1090697212aSChristoph Lameter 1100697212aSChristoph Lameter ptep = pte_offset_map(pmd, addr); 1110697212aSChristoph Lameter 112486cf46fSHugh Dickins /* 113486cf46fSHugh Dickins * Peek to check is_swap_pte() before taking ptlock? No, we 114486cf46fSHugh Dickins * can race mremap's move_ptes(), which skips anon_vma lock. 
115486cf46fSHugh Dickins */ 1160697212aSChristoph Lameter 1170697212aSChristoph Lameter ptl = pte_lockptr(mm, pmd); 118290408d4SNaoya Horiguchi } 119290408d4SNaoya Horiguchi 1200697212aSChristoph Lameter spin_lock(ptl); 1210697212aSChristoph Lameter pte = *ptep; 1220697212aSChristoph Lameter if (!is_swap_pte(pte)) 123e9995ef9SHugh Dickins goto unlock; 1240697212aSChristoph Lameter 1250697212aSChristoph Lameter entry = pte_to_swp_entry(pte); 1260697212aSChristoph Lameter 127e9995ef9SHugh Dickins if (!is_migration_entry(entry) || 128e9995ef9SHugh Dickins migration_entry_to_page(entry) != old) 129e9995ef9SHugh Dickins goto unlock; 1300697212aSChristoph Lameter 1310697212aSChristoph Lameter get_page(new); 1320697212aSChristoph Lameter pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 1330697212aSChristoph Lameter if (is_write_migration_entry(entry)) 1340697212aSChristoph Lameter pte = pte_mkwrite(pte); 1353ef8fd7fSAndi Kleen #ifdef CONFIG_HUGETLB_PAGE 136290408d4SNaoya Horiguchi if (PageHuge(new)) 137290408d4SNaoya Horiguchi pte = pte_mkhuge(pte); 1383ef8fd7fSAndi Kleen #endif 13997ee0524SKAMEZAWA Hiroyuki flush_cache_page(vma, addr, pte_pfn(pte)); 1400697212aSChristoph Lameter set_pte_at(mm, addr, ptep, pte); 14104e62a29SChristoph Lameter 142290408d4SNaoya Horiguchi if (PageHuge(new)) { 14304e62a29SChristoph Lameter if (PageAnon(new)) 144290408d4SNaoya Horiguchi hugepage_add_anon_rmap(new, vma, addr); 145290408d4SNaoya Horiguchi else 146290408d4SNaoya Horiguchi page_dup_rmap(new); 147290408d4SNaoya Horiguchi } else if (PageAnon(new)) 1480697212aSChristoph Lameter page_add_anon_rmap(new, vma, addr); 14904e62a29SChristoph Lameter else 15004e62a29SChristoph Lameter page_add_file_rmap(new); 15104e62a29SChristoph Lameter 15204e62a29SChristoph Lameter /* No need to invalidate - it was non-present before */ 1534b3073e1SRussell King update_mmu_cache(vma, addr, ptep); 154e9995ef9SHugh Dickins unlock: 1550697212aSChristoph Lameter pte_unmap_unlock(ptep, ptl); 156e9995ef9SHugh Dickins out: 157e9995ef9SHugh Dickins return SWAP_AGAIN; 1580697212aSChristoph Lameter } 1590697212aSChristoph Lameter 1600697212aSChristoph Lameter /* 16104e62a29SChristoph Lameter * Get rid of all migration entries and replace them by 16204e62a29SChristoph Lameter * references to the indicated page. 16304e62a29SChristoph Lameter */ 16404e62a29SChristoph Lameter static void remove_migration_ptes(struct page *old, struct page *new) 16504e62a29SChristoph Lameter { 166e9995ef9SHugh Dickins rmap_walk(new, remove_migration_pte, old); 16704e62a29SChristoph Lameter } 16804e62a29SChristoph Lameter 16904e62a29SChristoph Lameter /* 1700697212aSChristoph Lameter * Something used the pte of a page under migration. We need to 1710697212aSChristoph Lameter * get to the page and wait until migration is finished. 1720697212aSChristoph Lameter * When we return from this function the fault will be retried. 
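 * An illustrative caller sketch (names follow the do_swap_page() fault
 * path in mm/memory.c; this is an assumption-laden example added for
 * clarity, not a definition of that code):
 *
 *	entry = pte_to_swp_entry(orig_pte);
 *	if (is_migration_entry(entry))
 *		migration_entry_wait(mm, pmd, address);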
1730697212aSChristoph Lameter */ 1740697212aSChristoph Lameter void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 1750697212aSChristoph Lameter unsigned long address) 1760697212aSChristoph Lameter { 1770697212aSChristoph Lameter pte_t *ptep, pte; 1780697212aSChristoph Lameter spinlock_t *ptl; 1790697212aSChristoph Lameter swp_entry_t entry; 1800697212aSChristoph Lameter struct page *page; 1810697212aSChristoph Lameter 1820697212aSChristoph Lameter ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1830697212aSChristoph Lameter pte = *ptep; 1840697212aSChristoph Lameter if (!is_swap_pte(pte)) 1850697212aSChristoph Lameter goto out; 1860697212aSChristoph Lameter 1870697212aSChristoph Lameter entry = pte_to_swp_entry(pte); 1880697212aSChristoph Lameter if (!is_migration_entry(entry)) 1890697212aSChristoph Lameter goto out; 1900697212aSChristoph Lameter 1910697212aSChristoph Lameter page = migration_entry_to_page(entry); 1920697212aSChristoph Lameter 193e286781dSNick Piggin /* 194e286781dSNick Piggin * Once radix-tree replacement of page migration started, page_count 195e286781dSNick Piggin * *must* be zero. And, we don't want to call wait_on_page_locked() 196e286781dSNick Piggin * against a page without get_page(). 197e286781dSNick Piggin * So, we use get_page_unless_zero(), here. Even failed, page fault 198e286781dSNick Piggin * will occur again. 199e286781dSNick Piggin */ 200e286781dSNick Piggin if (!get_page_unless_zero(page)) 201e286781dSNick Piggin goto out; 2020697212aSChristoph Lameter pte_unmap_unlock(ptep, ptl); 2030697212aSChristoph Lameter wait_on_page_locked(page); 2040697212aSChristoph Lameter put_page(page); 2050697212aSChristoph Lameter return; 2060697212aSChristoph Lameter out: 2070697212aSChristoph Lameter pte_unmap_unlock(ptep, ptl); 2080697212aSChristoph Lameter } 2090697212aSChristoph Lameter 210b969c4abSMel Gorman #ifdef CONFIG_BLOCK 211b969c4abSMel Gorman /* Returns true if all buffers are successfully locked */ 212a6bc32b8SMel Gorman static bool buffer_migrate_lock_buffers(struct buffer_head *head, 213a6bc32b8SMel Gorman enum migrate_mode mode) 214b969c4abSMel Gorman { 215b969c4abSMel Gorman struct buffer_head *bh = head; 216b969c4abSMel Gorman 217b969c4abSMel Gorman /* Simple case, sync compaction */ 218a6bc32b8SMel Gorman if (mode != MIGRATE_ASYNC) { 219b969c4abSMel Gorman do { 220b969c4abSMel Gorman get_bh(bh); 221b969c4abSMel Gorman lock_buffer(bh); 222b969c4abSMel Gorman bh = bh->b_this_page; 223b969c4abSMel Gorman 224b969c4abSMel Gorman } while (bh != head); 225b969c4abSMel Gorman 226b969c4abSMel Gorman return true; 227b969c4abSMel Gorman } 228b969c4abSMel Gorman 229b969c4abSMel Gorman /* async case, we cannot block on lock_buffer so use trylock_buffer */ 230b969c4abSMel Gorman do { 231b969c4abSMel Gorman get_bh(bh); 232b969c4abSMel Gorman if (!trylock_buffer(bh)) { 233b969c4abSMel Gorman /* 234b969c4abSMel Gorman * We failed to lock the buffer and cannot stall in 235b969c4abSMel Gorman * async migration. 
Release the taken locks 236b969c4abSMel Gorman */ 237b969c4abSMel Gorman struct buffer_head *failed_bh = bh; 238b969c4abSMel Gorman put_bh(failed_bh); 239b969c4abSMel Gorman bh = head; 240b969c4abSMel Gorman while (bh != failed_bh) { 241b969c4abSMel Gorman unlock_buffer(bh); 242b969c4abSMel Gorman put_bh(bh); 243b969c4abSMel Gorman bh = bh->b_this_page; 244b969c4abSMel Gorman } 245b969c4abSMel Gorman return false; 246b969c4abSMel Gorman } 247b969c4abSMel Gorman 248b969c4abSMel Gorman bh = bh->b_this_page; 249b969c4abSMel Gorman } while (bh != head); 250b969c4abSMel Gorman return true; 251b969c4abSMel Gorman } 252b969c4abSMel Gorman #else 253b969c4abSMel Gorman static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, 254a6bc32b8SMel Gorman enum migrate_mode mode) 255b969c4abSMel Gorman { 256b969c4abSMel Gorman return true; 257b969c4abSMel Gorman } 258b969c4abSMel Gorman #endif /* CONFIG_BLOCK */ 259b969c4abSMel Gorman 260b20a3503SChristoph Lameter /* 261c3fcf8a5SChristoph Lameter * Replace the page in the mapping. 2625b5c7120SChristoph Lameter * 2635b5c7120SChristoph Lameter * The number of remaining references must be: 2645b5c7120SChristoph Lameter * 1 for anonymous pages without a mapping 2655b5c7120SChristoph Lameter * 2 for pages with a mapping 266266cf658SDavid Howells * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 267b20a3503SChristoph Lameter */ 2682d1db3b1SChristoph Lameter static int migrate_page_move_mapping(struct address_space *mapping, 269b969c4abSMel Gorman struct page *newpage, struct page *page, 270a6bc32b8SMel Gorman struct buffer_head *head, enum migrate_mode mode) 271b20a3503SChristoph Lameter { 272e286781dSNick Piggin int expected_count; 2737cf9c2c7SNick Piggin void **pslot; 274b20a3503SChristoph Lameter 2756c5240aeSChristoph Lameter if (!mapping) { 2760e8c7d0fSChristoph Lameter /* Anonymous page without mapping */ 2776c5240aeSChristoph Lameter if (page_count(page) != 1) 2786c5240aeSChristoph Lameter return -EAGAIN; 279*78bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 2806c5240aeSChristoph Lameter } 2816c5240aeSChristoph Lameter 28219fd6231SNick Piggin spin_lock_irq(&mapping->tree_lock); 283b20a3503SChristoph Lameter 2847cf9c2c7SNick Piggin pslot = radix_tree_lookup_slot(&mapping->page_tree, 285b20a3503SChristoph Lameter page_index(page)); 286b20a3503SChristoph Lameter 287edcf4748SJohannes Weiner expected_count = 2 + page_has_private(page); 288e286781dSNick Piggin if (page_count(page) != expected_count || 28929c1f677SMel Gorman radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { 29019fd6231SNick Piggin spin_unlock_irq(&mapping->tree_lock); 291e23ca00bSChristoph Lameter return -EAGAIN; 292b20a3503SChristoph Lameter } 293b20a3503SChristoph Lameter 294e286781dSNick Piggin if (!page_freeze_refs(page, expected_count)) { 29519fd6231SNick Piggin spin_unlock_irq(&mapping->tree_lock); 296e286781dSNick Piggin return -EAGAIN; 297e286781dSNick Piggin } 298e286781dSNick Piggin 299b20a3503SChristoph Lameter /* 300b969c4abSMel Gorman * In the async migration case of moving a page with buffers, lock the 301b969c4abSMel Gorman * buffers using trylock before the mapping is moved. If the mapping 302b969c4abSMel Gorman * was moved, we later failed to lock the buffers and could not move 303b969c4abSMel Gorman * the mapping back due to an elevated page count, we would have to 304b969c4abSMel Gorman * block waiting on other references to be dropped. 
305b969c4abSMel Gorman */ 306a6bc32b8SMel Gorman if (mode == MIGRATE_ASYNC && head && 307a6bc32b8SMel Gorman !buffer_migrate_lock_buffers(head, mode)) { 308b969c4abSMel Gorman page_unfreeze_refs(page, expected_count); 309b969c4abSMel Gorman spin_unlock_irq(&mapping->tree_lock); 310b969c4abSMel Gorman return -EAGAIN; 311b969c4abSMel Gorman } 312b969c4abSMel Gorman 313b969c4abSMel Gorman /* 314b20a3503SChristoph Lameter * Now we know that no one else is looking at the page. 315b20a3503SChristoph Lameter */ 3167cf9c2c7SNick Piggin get_page(newpage); /* add cache reference */ 317b20a3503SChristoph Lameter if (PageSwapCache(page)) { 318b20a3503SChristoph Lameter SetPageSwapCache(newpage); 319b20a3503SChristoph Lameter set_page_private(newpage, page_private(page)); 320b20a3503SChristoph Lameter } 321b20a3503SChristoph Lameter 3227cf9c2c7SNick Piggin radix_tree_replace_slot(pslot, newpage); 3237cf9c2c7SNick Piggin 3247cf9c2c7SNick Piggin /* 325937a94c9SJacobo Giralt * Drop cache reference from old page by unfreezing 326937a94c9SJacobo Giralt * to one less reference. 3277cf9c2c7SNick Piggin * We know this isn't the last reference. 3287cf9c2c7SNick Piggin */ 329937a94c9SJacobo Giralt page_unfreeze_refs(page, expected_count - 1); 3307cf9c2c7SNick Piggin 3310e8c7d0fSChristoph Lameter /* 3320e8c7d0fSChristoph Lameter * If moved to a different zone then also account 3330e8c7d0fSChristoph Lameter * the page for that zone. Other VM counters will be 3340e8c7d0fSChristoph Lameter * taken care of when we establish references to the 3350e8c7d0fSChristoph Lameter * new page and drop references to the old page. 3360e8c7d0fSChristoph Lameter * 3370e8c7d0fSChristoph Lameter * Note that anonymous pages are accounted for 3380e8c7d0fSChristoph Lameter * via NR_FILE_PAGES and NR_ANON_PAGES if they 3390e8c7d0fSChristoph Lameter * are mapped to swap space. 3400e8c7d0fSChristoph Lameter */ 3410e8c7d0fSChristoph Lameter __dec_zone_page_state(page, NR_FILE_PAGES); 3420e8c7d0fSChristoph Lameter __inc_zone_page_state(newpage, NR_FILE_PAGES); 34399a15e21SAndrea Arcangeli if (!PageSwapCache(page) && PageSwapBacked(page)) { 3444b02108aSKOSAKI Motohiro __dec_zone_page_state(page, NR_SHMEM); 3454b02108aSKOSAKI Motohiro __inc_zone_page_state(newpage, NR_SHMEM); 3464b02108aSKOSAKI Motohiro } 34719fd6231SNick Piggin spin_unlock_irq(&mapping->tree_lock); 348b20a3503SChristoph Lameter 349*78bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 350b20a3503SChristoph Lameter } 351b20a3503SChristoph Lameter 352b20a3503SChristoph Lameter /* 353290408d4SNaoya Horiguchi * The expected number of remaining references is the same as that 354290408d4SNaoya Horiguchi * of migrate_page_move_mapping(). 
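 * (Gloss added for clarity: both helpers check expected_count =
 * 2 + page_has_private(page), i.e. one reference held by the mapping's
 * radix tree and one held by the caller that isolated the page, plus
 * one more when PagePrivate buffers are attached; see the check in
 * migrate_page_move_mapping() above.)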
355290408d4SNaoya Horiguchi */ 356290408d4SNaoya Horiguchi int migrate_huge_page_move_mapping(struct address_space *mapping, 357290408d4SNaoya Horiguchi struct page *newpage, struct page *page) 358290408d4SNaoya Horiguchi { 359290408d4SNaoya Horiguchi int expected_count; 360290408d4SNaoya Horiguchi void **pslot; 361290408d4SNaoya Horiguchi 362290408d4SNaoya Horiguchi if (!mapping) { 363290408d4SNaoya Horiguchi if (page_count(page) != 1) 364290408d4SNaoya Horiguchi return -EAGAIN; 365*78bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 366290408d4SNaoya Horiguchi } 367290408d4SNaoya Horiguchi 368290408d4SNaoya Horiguchi spin_lock_irq(&mapping->tree_lock); 369290408d4SNaoya Horiguchi 370290408d4SNaoya Horiguchi pslot = radix_tree_lookup_slot(&mapping->page_tree, 371290408d4SNaoya Horiguchi page_index(page)); 372290408d4SNaoya Horiguchi 373290408d4SNaoya Horiguchi expected_count = 2 + page_has_private(page); 374290408d4SNaoya Horiguchi if (page_count(page) != expected_count || 37529c1f677SMel Gorman radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { 376290408d4SNaoya Horiguchi spin_unlock_irq(&mapping->tree_lock); 377290408d4SNaoya Horiguchi return -EAGAIN; 378290408d4SNaoya Horiguchi } 379290408d4SNaoya Horiguchi 380290408d4SNaoya Horiguchi if (!page_freeze_refs(page, expected_count)) { 381290408d4SNaoya Horiguchi spin_unlock_irq(&mapping->tree_lock); 382290408d4SNaoya Horiguchi return -EAGAIN; 383290408d4SNaoya Horiguchi } 384290408d4SNaoya Horiguchi 385290408d4SNaoya Horiguchi get_page(newpage); 386290408d4SNaoya Horiguchi 387290408d4SNaoya Horiguchi radix_tree_replace_slot(pslot, newpage); 388290408d4SNaoya Horiguchi 389937a94c9SJacobo Giralt page_unfreeze_refs(page, expected_count - 1); 390290408d4SNaoya Horiguchi 391290408d4SNaoya Horiguchi spin_unlock_irq(&mapping->tree_lock); 392*78bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 393290408d4SNaoya Horiguchi } 394290408d4SNaoya Horiguchi 395290408d4SNaoya Horiguchi /* 396b20a3503SChristoph Lameter * Copy the page to its new location 397b20a3503SChristoph Lameter */ 398290408d4SNaoya Horiguchi void migrate_page_copy(struct page *newpage, struct page *page) 399b20a3503SChristoph Lameter { 400290408d4SNaoya Horiguchi if (PageHuge(page)) 401290408d4SNaoya Horiguchi copy_huge_page(newpage, page); 402290408d4SNaoya Horiguchi else 403b20a3503SChristoph Lameter copy_highpage(newpage, page); 404b20a3503SChristoph Lameter 405b20a3503SChristoph Lameter if (PageError(page)) 406b20a3503SChristoph Lameter SetPageError(newpage); 407b20a3503SChristoph Lameter if (PageReferenced(page)) 408b20a3503SChristoph Lameter SetPageReferenced(newpage); 409b20a3503SChristoph Lameter if (PageUptodate(page)) 410b20a3503SChristoph Lameter SetPageUptodate(newpage); 411894bc310SLee Schermerhorn if (TestClearPageActive(page)) { 412894bc310SLee Schermerhorn VM_BUG_ON(PageUnevictable(page)); 413b20a3503SChristoph Lameter SetPageActive(newpage); 414418b27efSLee Schermerhorn } else if (TestClearPageUnevictable(page)) 415418b27efSLee Schermerhorn SetPageUnevictable(newpage); 416b20a3503SChristoph Lameter if (PageChecked(page)) 417b20a3503SChristoph Lameter SetPageChecked(newpage); 418b20a3503SChristoph Lameter if (PageMappedToDisk(page)) 419b20a3503SChristoph Lameter SetPageMappedToDisk(newpage); 420b20a3503SChristoph Lameter 421b20a3503SChristoph Lameter if (PageDirty(page)) { 422b20a3503SChristoph Lameter clear_page_dirty_for_io(page); 4233a902c5fSNick Piggin /* 4243a902c5fSNick Piggin * Want to mark the page and the radix tree as dirty, and 
4253a902c5fSNick Piggin * redo the accounting that clear_page_dirty_for_io undid, 4263a902c5fSNick Piggin * but we can't use set_page_dirty because that function 4273a902c5fSNick Piggin * is actually a signal that all of the page has become dirty. 42825985edcSLucas De Marchi * Whereas only part of our page may be dirty. 4293a902c5fSNick Piggin */ 430752dc185SHugh Dickins if (PageSwapBacked(page)) 431752dc185SHugh Dickins SetPageDirty(newpage); 432752dc185SHugh Dickins else 4333a902c5fSNick Piggin __set_page_dirty_nobuffers(newpage); 434b20a3503SChristoph Lameter } 435b20a3503SChristoph Lameter 436b291f000SNick Piggin mlock_migrate_page(newpage, page); 437e9995ef9SHugh Dickins ksm_migrate_page(newpage, page); 438b291f000SNick Piggin 439b20a3503SChristoph Lameter ClearPageSwapCache(page); 440b20a3503SChristoph Lameter ClearPagePrivate(page); 441b20a3503SChristoph Lameter set_page_private(page, 0); 442b20a3503SChristoph Lameter 443b20a3503SChristoph Lameter /* 444b20a3503SChristoph Lameter * If any waiters have accumulated on the new page then 445b20a3503SChristoph Lameter * wake them up. 446b20a3503SChristoph Lameter */ 447b20a3503SChristoph Lameter if (PageWriteback(newpage)) 448b20a3503SChristoph Lameter end_page_writeback(newpage); 449b20a3503SChristoph Lameter } 450b20a3503SChristoph Lameter 4511d8b85ccSChristoph Lameter /************************************************************ 4521d8b85ccSChristoph Lameter * Migration functions 4531d8b85ccSChristoph Lameter ***********************************************************/ 4541d8b85ccSChristoph Lameter 4551d8b85ccSChristoph Lameter /* Always fail migration. Used for mappings that are not movable */ 4562d1db3b1SChristoph Lameter int fail_migrate_page(struct address_space *mapping, 4572d1db3b1SChristoph Lameter struct page *newpage, struct page *page) 4581d8b85ccSChristoph Lameter { 4591d8b85ccSChristoph Lameter return -EIO; 4601d8b85ccSChristoph Lameter } 4611d8b85ccSChristoph Lameter EXPORT_SYMBOL(fail_migrate_page); 4621d8b85ccSChristoph Lameter 463b20a3503SChristoph Lameter /* 464b20a3503SChristoph Lameter * Common logic to directly migrate a single page suitable for 465266cf658SDavid Howells * pages that do not use PagePrivate/PagePrivate2. 466b20a3503SChristoph Lameter * 467b20a3503SChristoph Lameter * Pages are locked upon entry and exit. 468b20a3503SChristoph Lameter */ 4692d1db3b1SChristoph Lameter int migrate_page(struct address_space *mapping, 470a6bc32b8SMel Gorman struct page *newpage, struct page *page, 471a6bc32b8SMel Gorman enum migrate_mode mode) 472b20a3503SChristoph Lameter { 473b20a3503SChristoph Lameter int rc; 474b20a3503SChristoph Lameter 475b20a3503SChristoph Lameter BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 476b20a3503SChristoph Lameter 477a6bc32b8SMel Gorman rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); 478b20a3503SChristoph Lameter 479*78bd5209SRafael Aquini if (rc != MIGRATEPAGE_SUCCESS) 480b20a3503SChristoph Lameter return rc; 481b20a3503SChristoph Lameter 482b20a3503SChristoph Lameter migrate_page_copy(newpage, page); 483*78bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 484b20a3503SChristoph Lameter } 485b20a3503SChristoph Lameter EXPORT_SYMBOL(migrate_page); 486b20a3503SChristoph Lameter 4879361401eSDavid Howells #ifdef CONFIG_BLOCK 488b20a3503SChristoph Lameter /* 4891d8b85ccSChristoph Lameter * Migration function for pages with buffers. 
This function can only be used 4901d8b85ccSChristoph Lameter * if the underlying filesystem guarantees that no other references to "page" 4911d8b85ccSChristoph Lameter * exist. 4921d8b85ccSChristoph Lameter */ 4932d1db3b1SChristoph Lameter int buffer_migrate_page(struct address_space *mapping, 494a6bc32b8SMel Gorman struct page *newpage, struct page *page, enum migrate_mode mode) 4951d8b85ccSChristoph Lameter { 4961d8b85ccSChristoph Lameter struct buffer_head *bh, *head; 4971d8b85ccSChristoph Lameter int rc; 4981d8b85ccSChristoph Lameter 4991d8b85ccSChristoph Lameter if (!page_has_buffers(page)) 500a6bc32b8SMel Gorman return migrate_page(mapping, newpage, page, mode); 5011d8b85ccSChristoph Lameter 5021d8b85ccSChristoph Lameter head = page_buffers(page); 5031d8b85ccSChristoph Lameter 504a6bc32b8SMel Gorman rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); 5051d8b85ccSChristoph Lameter 506*78bd5209SRafael Aquini if (rc != MIGRATEPAGE_SUCCESS) 5071d8b85ccSChristoph Lameter return rc; 5081d8b85ccSChristoph Lameter 509b969c4abSMel Gorman /* 510b969c4abSMel Gorman * In the async case, migrate_page_move_mapping locked the buffers 511b969c4abSMel Gorman * with an IRQ-safe spinlock held. In the sync case, the buffers 512b969c4abSMel Gorman * need to be locked now 513b969c4abSMel Gorman */ 514a6bc32b8SMel Gorman if (mode != MIGRATE_ASYNC) 515a6bc32b8SMel Gorman BUG_ON(!buffer_migrate_lock_buffers(head, mode)); 5161d8b85ccSChristoph Lameter 5171d8b85ccSChristoph Lameter ClearPagePrivate(page); 5181d8b85ccSChristoph Lameter set_page_private(newpage, page_private(page)); 5191d8b85ccSChristoph Lameter set_page_private(page, 0); 5201d8b85ccSChristoph Lameter put_page(page); 5211d8b85ccSChristoph Lameter get_page(newpage); 5221d8b85ccSChristoph Lameter 5231d8b85ccSChristoph Lameter bh = head; 5241d8b85ccSChristoph Lameter do { 5251d8b85ccSChristoph Lameter set_bh_page(bh, newpage, bh_offset(bh)); 5261d8b85ccSChristoph Lameter bh = bh->b_this_page; 5271d8b85ccSChristoph Lameter 5281d8b85ccSChristoph Lameter } while (bh != head); 5291d8b85ccSChristoph Lameter 5301d8b85ccSChristoph Lameter SetPagePrivate(newpage); 5311d8b85ccSChristoph Lameter 5321d8b85ccSChristoph Lameter migrate_page_copy(newpage, page); 5331d8b85ccSChristoph Lameter 5341d8b85ccSChristoph Lameter bh = head; 5351d8b85ccSChristoph Lameter do { 5361d8b85ccSChristoph Lameter unlock_buffer(bh); 5371d8b85ccSChristoph Lameter put_bh(bh); 5381d8b85ccSChristoph Lameter bh = bh->b_this_page; 5391d8b85ccSChristoph Lameter 5401d8b85ccSChristoph Lameter } while (bh != head); 5411d8b85ccSChristoph Lameter 542*78bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 5431d8b85ccSChristoph Lameter } 5441d8b85ccSChristoph Lameter EXPORT_SYMBOL(buffer_migrate_page); 5459361401eSDavid Howells #endif 5461d8b85ccSChristoph Lameter 54704e62a29SChristoph Lameter /* 54804e62a29SChristoph Lameter * Writeback a page to clean the dirty state 54904e62a29SChristoph Lameter */ 55004e62a29SChristoph Lameter static int writeout(struct address_space *mapping, struct page *page) 55104e62a29SChristoph Lameter { 55204e62a29SChristoph Lameter struct writeback_control wbc = { 55304e62a29SChristoph Lameter .sync_mode = WB_SYNC_NONE, 55404e62a29SChristoph Lameter .nr_to_write = 1, 55504e62a29SChristoph Lameter .range_start = 0, 55604e62a29SChristoph Lameter .range_end = LLONG_MAX, 55704e62a29SChristoph Lameter .for_reclaim = 1 55804e62a29SChristoph Lameter }; 55904e62a29SChristoph Lameter int rc; 56004e62a29SChristoph Lameter 56104e62a29SChristoph Lameter if 
(!mapping->a_ops->writepage)
56204e62a29SChristoph Lameter /* No write method for the address space */
56304e62a29SChristoph Lameter return -EINVAL;
56404e62a29SChristoph Lameter
56504e62a29SChristoph Lameter if (!clear_page_dirty_for_io(page))
56604e62a29SChristoph Lameter /* Someone else already triggered a write */
56704e62a29SChristoph Lameter return -EAGAIN;
56804e62a29SChristoph Lameter
56904e62a29SChristoph Lameter /*
57004e62a29SChristoph Lameter * A dirty page may imply that the underlying filesystem has
57104e62a29SChristoph Lameter * the page on some queue. So the page must be clean for
57204e62a29SChristoph Lameter * migration. Writeout may mean we lose the lock and the
57304e62a29SChristoph Lameter * page state is no longer what we checked for earlier.
57404e62a29SChristoph Lameter * At this point we know that the migration attempt cannot
57504e62a29SChristoph Lameter * be successful.
57604e62a29SChristoph Lameter */
57704e62a29SChristoph Lameter remove_migration_ptes(page, page);
57804e62a29SChristoph Lameter
57904e62a29SChristoph Lameter rc = mapping->a_ops->writepage(page, &wbc);
58004e62a29SChristoph Lameter
58104e62a29SChristoph Lameter if (rc != AOP_WRITEPAGE_ACTIVATE)
58204e62a29SChristoph Lameter /* unlocked. Relock */
58304e62a29SChristoph Lameter lock_page(page);
58404e62a29SChristoph Lameter
585bda8550dSHugh Dickins return (rc < 0) ? -EIO : -EAGAIN;
58604e62a29SChristoph Lameter }
58704e62a29SChristoph Lameter
58804e62a29SChristoph Lameter /*
58904e62a29SChristoph Lameter * Default handling if a filesystem does not provide a migration function.
59004e62a29SChristoph Lameter */
5918351a6e4SChristoph Lameter static int fallback_migrate_page(struct address_space *mapping,
592a6bc32b8SMel Gorman struct page *newpage, struct page *page, enum migrate_mode mode)
5938351a6e4SChristoph Lameter {
594b969c4abSMel Gorman if (PageDirty(page)) {
595a6bc32b8SMel Gorman /* Only writeback pages in full synchronous migration */
596a6bc32b8SMel Gorman if (mode != MIGRATE_SYNC)
597b969c4abSMel Gorman return -EBUSY;
59804e62a29SChristoph Lameter return writeout(mapping, page);
599b969c4abSMel Gorman }
6008351a6e4SChristoph Lameter
6018351a6e4SChristoph Lameter /*
6028351a6e4SChristoph Lameter * Buffers may be managed in a filesystem specific way.
6038351a6e4SChristoph Lameter * We must have no buffers or drop them.
6048351a6e4SChristoph Lameter */
605266cf658SDavid Howells if (page_has_private(page) &&
6068351a6e4SChristoph Lameter !try_to_release_page(page, GFP_KERNEL))
6078351a6e4SChristoph Lameter return -EAGAIN;
6088351a6e4SChristoph Lameter
609a6bc32b8SMel Gorman return migrate_page(mapping, newpage, page, mode);
6108351a6e4SChristoph Lameter }
6118351a6e4SChristoph Lameter
6121d8b85ccSChristoph Lameter /*
613e24f0b8fSChristoph Lameter * Move a page to a newly allocated page.
614e24f0b8fSChristoph Lameter * The page is locked and all ptes have been successfully removed.
615b20a3503SChristoph Lameter *
616e24f0b8fSChristoph Lameter * The new page will have replaced the old page if this function
617e24f0b8fSChristoph Lameter * is successful.
618894bc310SLee Schermerhorn * 619894bc310SLee Schermerhorn * Return value: 620894bc310SLee Schermerhorn * < 0 - error code 621*78bd5209SRafael Aquini * MIGRATEPAGE_SUCCESS - success 622b20a3503SChristoph Lameter */ 6233fe2011fSMel Gorman static int move_to_new_page(struct page *newpage, struct page *page, 624a6bc32b8SMel Gorman int remap_swapcache, enum migrate_mode mode) 625b20a3503SChristoph Lameter { 626e24f0b8fSChristoph Lameter struct address_space *mapping; 627b20a3503SChristoph Lameter int rc; 628b20a3503SChristoph Lameter 629b20a3503SChristoph Lameter /* 630e24f0b8fSChristoph Lameter * Block others from accessing the page when we get around to 631e24f0b8fSChristoph Lameter * establishing additional references. We are the only one 632e24f0b8fSChristoph Lameter * holding a reference to the new page at this point. 633b20a3503SChristoph Lameter */ 634529ae9aaSNick Piggin if (!trylock_page(newpage)) 635e24f0b8fSChristoph Lameter BUG(); 636b20a3503SChristoph Lameter 6372d1db3b1SChristoph Lameter /* Prepare mapping for the new page.*/ 6382d1db3b1SChristoph Lameter newpage->index = page->index; 6392d1db3b1SChristoph Lameter newpage->mapping = page->mapping; 640b2e18538SRik van Riel if (PageSwapBacked(page)) 641b2e18538SRik van Riel SetPageSwapBacked(newpage); 6422d1db3b1SChristoph Lameter 643b20a3503SChristoph Lameter mapping = page_mapping(page); 644b20a3503SChristoph Lameter if (!mapping) 645a6bc32b8SMel Gorman rc = migrate_page(mapping, newpage, page, mode); 6466c5240aeSChristoph Lameter else if (mapping->a_ops->migratepage) 647b20a3503SChristoph Lameter /* 648b969c4abSMel Gorman * Most pages have a mapping and most filesystems provide a 649b969c4abSMel Gorman * migratepage callback. Anonymous pages are part of swap 650b969c4abSMel Gorman * space which also has its own migratepage callback. This 651b969c4abSMel Gorman * is the most common path for page migration. 652b20a3503SChristoph Lameter */ 6532d1db3b1SChristoph Lameter rc = mapping->a_ops->migratepage(mapping, 654a6bc32b8SMel Gorman newpage, page, mode); 6558351a6e4SChristoph Lameter else 656a6bc32b8SMel Gorman rc = fallback_migrate_page(mapping, newpage, page, mode); 657b20a3503SChristoph Lameter 658*78bd5209SRafael Aquini if (rc != MIGRATEPAGE_SUCCESS) { 659e24f0b8fSChristoph Lameter newpage->mapping = NULL; 6603fe2011fSMel Gorman } else { 6613fe2011fSMel Gorman if (remap_swapcache) 6623fe2011fSMel Gorman remove_migration_ptes(page, newpage); 66335512ecaSKonstantin Khlebnikov page->mapping = NULL; 6643fe2011fSMel Gorman } 6656c5240aeSChristoph Lameter 666b20a3503SChristoph Lameter unlock_page(newpage); 667b20a3503SChristoph Lameter 668e24f0b8fSChristoph Lameter return rc; 669e24f0b8fSChristoph Lameter } 670e24f0b8fSChristoph Lameter 6710dabec93SMinchan Kim static int __unmap_and_move(struct page *page, struct page *newpage, 672a6bc32b8SMel Gorman int force, bool offlining, enum migrate_mode mode) 673e24f0b8fSChristoph Lameter { 6740dabec93SMinchan Kim int rc = -EAGAIN; 6753fe2011fSMel Gorman int remap_swapcache = 1; 67656039efaSKAMEZAWA Hiroyuki struct mem_cgroup *mem; 6773f6c8272SMel Gorman struct anon_vma *anon_vma = NULL; 67895a402c3SChristoph Lameter 679529ae9aaSNick Piggin if (!trylock_page(page)) { 680a6bc32b8SMel Gorman if (!force || mode == MIGRATE_ASYNC) 6810dabec93SMinchan Kim goto out; 6823e7d3449SMel Gorman 6833e7d3449SMel Gorman /* 6843e7d3449SMel Gorman * It's not safe for direct compaction to call lock_page. 
6853e7d3449SMel Gorman * For example, during page readahead pages are added locked
6863e7d3449SMel Gorman * to the LRU. Later, when the IO completes the pages are
6873e7d3449SMel Gorman * marked uptodate and unlocked. However, the queueing
6883e7d3449SMel Gorman * could be merging multiple pages for one bio (e.g.
6893e7d3449SMel Gorman * mpage_readpages). If an allocation happens for the
6903e7d3449SMel Gorman * second or third page, the process can end up locking
6913e7d3449SMel Gorman * the same page twice and deadlocking. Rather than
6923e7d3449SMel Gorman * trying to be clever about what pages can be locked,
6933e7d3449SMel Gorman * avoid the use of lock_page for direct compaction
6943e7d3449SMel Gorman * altogether.
6953e7d3449SMel Gorman */
6963e7d3449SMel Gorman if (current->flags & PF_MEMALLOC)
6970dabec93SMinchan Kim goto out;
6983e7d3449SMel Gorman
699e24f0b8fSChristoph Lameter lock_page(page);
700e24f0b8fSChristoph Lameter }
701e24f0b8fSChristoph Lameter
70262b61f61SHugh Dickins /*
70362b61f61SHugh Dickins * Only memory hotplug's offline_pages() caller has locked out KSM,
70462b61f61SHugh Dickins * and can safely migrate a KSM page. The other cases have skipped
70562b61f61SHugh Dickins * PageKsm along with PageReserved - but it is only now when we have
70662b61f61SHugh Dickins * the page lock that we can be certain it will not go KSM beneath us
70762b61f61SHugh Dickins * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
70862b61f61SHugh Dickins * its pagecount raised, but only here do we take the page lock which
70962b61f61SHugh Dickins * serializes that).
71062b61f61SHugh Dickins */
71162b61f61SHugh Dickins if (PageKsm(page) && !offlining) {
71262b61f61SHugh Dickins rc = -EBUSY;
71362b61f61SHugh Dickins goto unlock;
71462b61f61SHugh Dickins }
71562b61f61SHugh Dickins
71601b1ae63SKAMEZAWA Hiroyuki /* charge against new page */
7170030f535SJohannes Weiner mem_cgroup_prepare_migration(page, newpage, &mem);
71801b1ae63SKAMEZAWA Hiroyuki
719e24f0b8fSChristoph Lameter if (PageWriteback(page)) {
72011bc82d6SAndrea Arcangeli /*
721a6bc32b8SMel Gorman * Only in the case of a full synchronous migration is it
722a6bc32b8SMel Gorman * necessary to wait for PageWriteback. In the async case,
723a6bc32b8SMel Gorman * the retry loop is too short and in the sync-light case,
724a6bc32b8SMel Gorman * the overhead of stalling is too much.
72511bc82d6SAndrea Arcangeli */
726a6bc32b8SMel Gorman if (mode != MIGRATE_SYNC) {
72711bc82d6SAndrea Arcangeli rc = -EBUSY;
72811bc82d6SAndrea Arcangeli goto uncharge;
72911bc82d6SAndrea Arcangeli }
73011bc82d6SAndrea Arcangeli if (!force)
73101b1ae63SKAMEZAWA Hiroyuki goto uncharge;
732e24f0b8fSChristoph Lameter wait_on_page_writeback(page);
733e24f0b8fSChristoph Lameter }
734e24f0b8fSChristoph Lameter /*
735dc386d4dSKAMEZAWA Hiroyuki * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
736dc386d4dSKAMEZAWA Hiroyuki * we cannot notice that anon_vma is freed while we migrate a page.
7371ce82b69SHugh Dickins * This get_anon_vma() delays freeing anon_vma pointer until the end
738dc386d4dSKAMEZAWA Hiroyuki * of migration. File cache pages are no problem because of page_lock():
739989f89c5SKAMEZAWA Hiroyuki * file caches may use write_page() or lock_page() during migration, so
740989f89c5SKAMEZAWA Hiroyuki * only anon pages need this care here.
741e24f0b8fSChristoph Lameter */ 742989f89c5SKAMEZAWA Hiroyuki if (PageAnon(page)) { 7431ce82b69SHugh Dickins /* 7441ce82b69SHugh Dickins * Only page_lock_anon_vma() understands the subtleties of 7451ce82b69SHugh Dickins * getting a hold on an anon_vma from outside one of its mms. 7461ce82b69SHugh Dickins */ 747746b18d4SPeter Zijlstra anon_vma = page_get_anon_vma(page); 7481ce82b69SHugh Dickins if (anon_vma) { 7491ce82b69SHugh Dickins /* 750746b18d4SPeter Zijlstra * Anon page 7511ce82b69SHugh Dickins */ 7521ce82b69SHugh Dickins } else if (PageSwapCache(page)) { 7533fe2011fSMel Gorman /* 7543fe2011fSMel Gorman * We cannot be sure that the anon_vma of an unmapped 7553fe2011fSMel Gorman * swapcache page is safe to use because we don't 7563fe2011fSMel Gorman * know in advance if the VMA that this page belonged 7573fe2011fSMel Gorman * to still exists. If the VMA and others sharing the 7583fe2011fSMel Gorman * data have been freed, then the anon_vma could 7593fe2011fSMel Gorman * already be invalid. 7603fe2011fSMel Gorman * 7613fe2011fSMel Gorman * To avoid this possibility, swapcache pages get 7623fe2011fSMel Gorman * migrated but are not remapped when migration 7633fe2011fSMel Gorman * completes 7643fe2011fSMel Gorman */ 7653fe2011fSMel Gorman remap_swapcache = 0; 7663fe2011fSMel Gorman } else { 7671ce82b69SHugh Dickins goto uncharge; 768989f89c5SKAMEZAWA Hiroyuki } 7693fe2011fSMel Gorman } 77062e1c553SShaohua Li 771dc386d4dSKAMEZAWA Hiroyuki /* 77262e1c553SShaohua Li * Corner case handling: 77362e1c553SShaohua Li * 1. When a new swap-cache page is read into, it is added to the LRU 77462e1c553SShaohua Li * and treated as swapcache but it has no rmap yet. 77562e1c553SShaohua Li * Calling try_to_unmap() against a page->mapping==NULL page will 77662e1c553SShaohua Li * trigger a BUG. So handle it here. 77762e1c553SShaohua Li * 2. An orphaned page (see truncate_complete_page) might have 77862e1c553SShaohua Li * fs-private metadata. The page can be picked up due to memory 77962e1c553SShaohua Li * offlining. Everywhere else except page reclaim, the page is 78062e1c553SShaohua Li * invisible to the vm, so the page can not be migrated. So try to 78162e1c553SShaohua Li * free the metadata, so the page can be freed. 
782dc386d4dSKAMEZAWA Hiroyuki */
78362e1c553SShaohua Li if (!page->mapping) {
7841ce82b69SHugh Dickins VM_BUG_ON(PageAnon(page));
7851ce82b69SHugh Dickins if (page_has_private(page)) {
78662e1c553SShaohua Li try_to_free_buffers(page);
7871ce82b69SHugh Dickins goto uncharge;
78862e1c553SShaohua Li }
789abfc3488SShaohua Li goto skip_unmap;
790abfc3488SShaohua Li }
79162e1c553SShaohua Li
792dc386d4dSKAMEZAWA Hiroyuki /* Establish migration ptes or remove ptes */
79314fa31b8SAndi Kleen try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
794dc386d4dSKAMEZAWA Hiroyuki
795abfc3488SShaohua Li skip_unmap:
796e24f0b8fSChristoph Lameter if (!page_mapped(page))
797a6bc32b8SMel Gorman rc = move_to_new_page(newpage, page, remap_swapcache, mode);
798e24f0b8fSChristoph Lameter
7993fe2011fSMel Gorman if (rc && remap_swapcache)
8006c5240aeSChristoph Lameter remove_migration_ptes(page, page);
8013f6c8272SMel Gorman
8023f6c8272SMel Gorman /* Drop an anon_vma reference if we took one */
80376545066SRik van Riel if (anon_vma)
8049e60109fSPeter Zijlstra put_anon_vma(anon_vma);
8053f6c8272SMel Gorman
80601b1ae63SKAMEZAWA Hiroyuki uncharge:
807*78bd5209SRafael Aquini mem_cgroup_end_migration(mem, page, newpage, rc == MIGRATEPAGE_SUCCESS);
808e24f0b8fSChristoph Lameter unlock:
809b20a3503SChristoph Lameter unlock_page(page);
8100dabec93SMinchan Kim out:
8110dabec93SMinchan Kim return rc;
8120dabec93SMinchan Kim }
81395a402c3SChristoph Lameter
8140dabec93SMinchan Kim /*
8150dabec93SMinchan Kim * Obtain the lock on page, remove all ptes and migrate the page
8160dabec93SMinchan Kim * to the newly allocated page in newpage.
8170dabec93SMinchan Kim */
8180dabec93SMinchan Kim static int unmap_and_move(new_page_t get_new_page, unsigned long private,
819a6bc32b8SMel Gorman struct page *page, int force, bool offlining,
820a6bc32b8SMel Gorman enum migrate_mode mode)
8210dabec93SMinchan Kim {
8220dabec93SMinchan Kim int rc = 0;
8230dabec93SMinchan Kim int *result = NULL;
8240dabec93SMinchan Kim struct page *newpage = get_new_page(page, private, &result);
8250dabec93SMinchan Kim
8260dabec93SMinchan Kim if (!newpage)
8270dabec93SMinchan Kim return -ENOMEM;
8280dabec93SMinchan Kim
8290dabec93SMinchan Kim if (page_count(page) == 1) {
8300dabec93SMinchan Kim /* page was freed from under us. So we are done. */
8310dabec93SMinchan Kim goto out;
8320dabec93SMinchan Kim }
8330dabec93SMinchan Kim
8340dabec93SMinchan Kim if (unlikely(PageTransHuge(page)))
8350dabec93SMinchan Kim if (unlikely(split_huge_page(page)))
8360dabec93SMinchan Kim goto out;
8370dabec93SMinchan Kim
838a6bc32b8SMel Gorman rc = __unmap_and_move(page, newpage, force, offlining, mode);
8390dabec93SMinchan Kim out:
840e24f0b8fSChristoph Lameter if (rc != -EAGAIN) {
841aaa994b3SChristoph Lameter /*
842aaa994b3SChristoph Lameter * A page that has been migrated has all references
843aaa994b3SChristoph Lameter * removed and will be freed. A page that has not been
844aaa994b3SChristoph Lameter * migrated will have kept its references and be
845aaa994b3SChristoph Lameter * restored.
846aaa994b3SChristoph Lameter */
847aaa994b3SChristoph Lameter list_del(&page->lru);
848a731286dSKOSAKI Motohiro dec_zone_page_state(page, NR_ISOLATED_ANON +
8496c0b1351SJohannes Weiner page_is_file_cache(page));
850894bc310SLee Schermerhorn putback_lru_page(page);
851e24f0b8fSChristoph Lameter }
85295a402c3SChristoph Lameter /*
85395a402c3SChristoph Lameter * Move the new page to the LRU.
If migration was not successful 85495a402c3SChristoph Lameter * then this will free the page. 85595a402c3SChristoph Lameter */ 856894bc310SLee Schermerhorn putback_lru_page(newpage); 857742755a1SChristoph Lameter if (result) { 858742755a1SChristoph Lameter if (rc) 859742755a1SChristoph Lameter *result = rc; 860742755a1SChristoph Lameter else 861742755a1SChristoph Lameter *result = page_to_nid(newpage); 862742755a1SChristoph Lameter } 863e24f0b8fSChristoph Lameter return rc; 864e24f0b8fSChristoph Lameter } 865b20a3503SChristoph Lameter 866e24f0b8fSChristoph Lameter /* 867290408d4SNaoya Horiguchi * Counterpart of unmap_and_move_page() for hugepage migration. 868290408d4SNaoya Horiguchi * 869290408d4SNaoya Horiguchi * This function doesn't wait the completion of hugepage I/O 870290408d4SNaoya Horiguchi * because there is no race between I/O and migration for hugepage. 871290408d4SNaoya Horiguchi * Note that currently hugepage I/O occurs only in direct I/O 872290408d4SNaoya Horiguchi * where no lock is held and PG_writeback is irrelevant, 873290408d4SNaoya Horiguchi * and writeback status of all subpages are counted in the reference 874290408d4SNaoya Horiguchi * count of the head page (i.e. if all subpages of a 2MB hugepage are 875290408d4SNaoya Horiguchi * under direct I/O, the reference of the head page is 512 and a bit more.) 876290408d4SNaoya Horiguchi * This means that when we try to migrate hugepage whose subpages are 877290408d4SNaoya Horiguchi * doing direct I/O, some references remain after try_to_unmap() and 878290408d4SNaoya Horiguchi * hugepage migration fails without data corruption. 879290408d4SNaoya Horiguchi * 880290408d4SNaoya Horiguchi * There is also no race when direct I/O is issued on the page under migration, 881290408d4SNaoya Horiguchi * because then pte is replaced with migration swap entry and direct I/O code 882290408d4SNaoya Horiguchi * will wait in the page fault for migration to complete. 
883290408d4SNaoya Horiguchi */
884290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page,
885290408d4SNaoya Horiguchi unsigned long private, struct page *hpage,
886a6bc32b8SMel Gorman int force, bool offlining,
887a6bc32b8SMel Gorman enum migrate_mode mode)
888290408d4SNaoya Horiguchi {
889290408d4SNaoya Horiguchi int rc = 0;
890290408d4SNaoya Horiguchi int *result = NULL;
891290408d4SNaoya Horiguchi struct page *new_hpage = get_new_page(hpage, private, &result);
892290408d4SNaoya Horiguchi struct anon_vma *anon_vma = NULL;
893290408d4SNaoya Horiguchi
894290408d4SNaoya Horiguchi if (!new_hpage)
895290408d4SNaoya Horiguchi return -ENOMEM;
896290408d4SNaoya Horiguchi
897290408d4SNaoya Horiguchi rc = -EAGAIN;
898290408d4SNaoya Horiguchi
899290408d4SNaoya Horiguchi if (!trylock_page(hpage)) {
900a6bc32b8SMel Gorman if (!force || mode != MIGRATE_SYNC)
901290408d4SNaoya Horiguchi goto out;
902290408d4SNaoya Horiguchi lock_page(hpage);
903290408d4SNaoya Horiguchi }
904290408d4SNaoya Horiguchi
905746b18d4SPeter Zijlstra if (PageAnon(hpage))
906746b18d4SPeter Zijlstra anon_vma = page_get_anon_vma(hpage);
907290408d4SNaoya Horiguchi
908290408d4SNaoya Horiguchi try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
909290408d4SNaoya Horiguchi
910290408d4SNaoya Horiguchi if (!page_mapped(hpage))
911a6bc32b8SMel Gorman rc = move_to_new_page(new_hpage, hpage, 1, mode);
912290408d4SNaoya Horiguchi
913290408d4SNaoya Horiguchi if (rc)
914290408d4SNaoya Horiguchi remove_migration_ptes(hpage, hpage);
915290408d4SNaoya Horiguchi
916fd4a4663SHugh Dickins if (anon_vma)
9179e60109fSPeter Zijlstra put_anon_vma(anon_vma);
9188e6ac7faSAneesh Kumar K.V
9198e6ac7faSAneesh Kumar K.V if (!rc)
9208e6ac7faSAneesh Kumar K.V hugetlb_cgroup_migrate(hpage, new_hpage);
9218e6ac7faSAneesh Kumar K.V
922290408d4SNaoya Horiguchi unlock_page(hpage);
92309761333SHillf Danton out:
924290408d4SNaoya Horiguchi put_page(new_hpage);
925290408d4SNaoya Horiguchi if (result) {
926290408d4SNaoya Horiguchi if (rc)
927290408d4SNaoya Horiguchi *result = rc;
928290408d4SNaoya Horiguchi else
929290408d4SNaoya Horiguchi *result = page_to_nid(new_hpage);
930290408d4SNaoya Horiguchi }
931290408d4SNaoya Horiguchi return rc;
932290408d4SNaoya Horiguchi }
933290408d4SNaoya Horiguchi
934290408d4SNaoya Horiguchi /*
935e24f0b8fSChristoph Lameter * migrate_pages
936e24f0b8fSChristoph Lameter *
93795a402c3SChristoph Lameter * The function takes one list of pages to migrate and a callback
93895a402c3SChristoph Lameter * that, given the page to be migrated and the private data,
93995a402c3SChristoph Lameter * chooses the target of the move and allocates the new page.
940e24f0b8fSChristoph Lameter *
941e24f0b8fSChristoph Lameter * The function returns after 10 attempts or if no pages
942e24f0b8fSChristoph Lameter * are movable anymore because the list has become empty
943cf608ac1SMinchan Kim * or no retryable pages exist anymore.
944cf608ac1SMinchan Kim * The caller should call putback_lru_pages() to return pages to the LRU
94528bd6578SMinchan Kim * or free list, but only if ret != 0.
946e24f0b8fSChristoph Lameter *
94795a402c3SChristoph Lameter * Return: Number of pages not migrated or error code.
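 * A minimal caller sketch (modeled on do_move_page_to_node_array()
 * further down in this file; illustrative only, error handling and
 * NR_ISOLATED_* accounting elided):
 *
 *	LIST_HEAD(pagelist);
 *	if (!isolate_lru_page(page))
 *		list_add_tail(&page->lru, &pagelist);
 *	err = migrate_pages(&pagelist, new_page_node,
 *			(unsigned long)pm, false, MIGRATE_SYNC);
 *	if (err)
 *		putback_lru_pages(&pagelist);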
948e24f0b8fSChristoph Lameter */ 94995a402c3SChristoph Lameter int migrate_pages(struct list_head *from, 9507f0f2496SMel Gorman new_page_t get_new_page, unsigned long private, bool offlining, 951a6bc32b8SMel Gorman enum migrate_mode mode) 952e24f0b8fSChristoph Lameter { 953e24f0b8fSChristoph Lameter int retry = 1; 954e24f0b8fSChristoph Lameter int nr_failed = 0; 955e24f0b8fSChristoph Lameter int pass = 0; 956e24f0b8fSChristoph Lameter struct page *page; 957e24f0b8fSChristoph Lameter struct page *page2; 958e24f0b8fSChristoph Lameter int swapwrite = current->flags & PF_SWAPWRITE; 959e24f0b8fSChristoph Lameter int rc; 9602d1db3b1SChristoph Lameter 961e24f0b8fSChristoph Lameter if (!swapwrite) 962e24f0b8fSChristoph Lameter current->flags |= PF_SWAPWRITE; 963e24f0b8fSChristoph Lameter 964e24f0b8fSChristoph Lameter for(pass = 0; pass < 10 && retry; pass++) { 965e24f0b8fSChristoph Lameter retry = 0; 966e24f0b8fSChristoph Lameter 967e24f0b8fSChristoph Lameter list_for_each_entry_safe(page, page2, from, lru) { 968e24f0b8fSChristoph Lameter cond_resched(); 969e24f0b8fSChristoph Lameter 97095a402c3SChristoph Lameter rc = unmap_and_move(get_new_page, private, 97177f1fe6bSMel Gorman page, pass > 2, offlining, 972a6bc32b8SMel Gorman mode); 973e24f0b8fSChristoph Lameter 974e24f0b8fSChristoph Lameter switch(rc) { 97595a402c3SChristoph Lameter case -ENOMEM: 97695a402c3SChristoph Lameter goto out; 977e24f0b8fSChristoph Lameter case -EAGAIN: 978b20a3503SChristoph Lameter retry++; 979e24f0b8fSChristoph Lameter break; 980*78bd5209SRafael Aquini case MIGRATEPAGE_SUCCESS: 981e24f0b8fSChristoph Lameter break; 982e24f0b8fSChristoph Lameter default: 983b20a3503SChristoph Lameter /* Permanent failure */ 984b20a3503SChristoph Lameter nr_failed++; 985e24f0b8fSChristoph Lameter break; 986b20a3503SChristoph Lameter } 987b20a3503SChristoph Lameter } 988e24f0b8fSChristoph Lameter } 989*78bd5209SRafael Aquini rc = nr_failed + retry; 99095a402c3SChristoph Lameter out: 991b20a3503SChristoph Lameter if (!swapwrite) 992b20a3503SChristoph Lameter current->flags &= ~PF_SWAPWRITE; 993b20a3503SChristoph Lameter 99495a402c3SChristoph Lameter return rc; 995b20a3503SChristoph Lameter } 996b20a3503SChristoph Lameter 997189ebff2SAneesh Kumar K.V int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 998189ebff2SAneesh Kumar K.V unsigned long private, bool offlining, 999a6bc32b8SMel Gorman enum migrate_mode mode) 1000290408d4SNaoya Horiguchi { 1001189ebff2SAneesh Kumar K.V int pass, rc; 1002290408d4SNaoya Horiguchi 1003189ebff2SAneesh Kumar K.V for (pass = 0; pass < 10; pass++) { 1004290408d4SNaoya Horiguchi rc = unmap_and_move_huge_page(get_new_page, 1005189ebff2SAneesh Kumar K.V private, hpage, pass > 2, offlining, 1006a6bc32b8SMel Gorman mode); 1007290408d4SNaoya Horiguchi switch (rc) { 1008290408d4SNaoya Horiguchi case -ENOMEM: 1009290408d4SNaoya Horiguchi goto out; 1010290408d4SNaoya Horiguchi case -EAGAIN: 1011189ebff2SAneesh Kumar K.V /* try again */ 1012189ebff2SAneesh Kumar K.V cond_resched(); 1013290408d4SNaoya Horiguchi break; 1014*78bd5209SRafael Aquini case MIGRATEPAGE_SUCCESS: 1015189ebff2SAneesh Kumar K.V goto out; 1016290408d4SNaoya Horiguchi default: 1017189ebff2SAneesh Kumar K.V rc = -EIO; 1018189ebff2SAneesh Kumar K.V goto out; 1019290408d4SNaoya Horiguchi } 1020290408d4SNaoya Horiguchi } 1021290408d4SNaoya Horiguchi out: 1022290408d4SNaoya Horiguchi return rc; 1023290408d4SNaoya Horiguchi } 1024290408d4SNaoya Horiguchi 1025742755a1SChristoph Lameter #ifdef CONFIG_NUMA 1026742755a1SChristoph Lameter /* 
1027742755a1SChristoph Lameter * Move a list of individual pages 1028742755a1SChristoph Lameter */ 1029742755a1SChristoph Lameter struct page_to_node { 1030742755a1SChristoph Lameter unsigned long addr; 1031742755a1SChristoph Lameter struct page *page; 1032742755a1SChristoph Lameter int node; 1033742755a1SChristoph Lameter int status; 1034742755a1SChristoph Lameter }; 1035742755a1SChristoph Lameter 1036742755a1SChristoph Lameter static struct page *new_page_node(struct page *p, unsigned long private, 1037742755a1SChristoph Lameter int **result) 1038742755a1SChristoph Lameter { 1039742755a1SChristoph Lameter struct page_to_node *pm = (struct page_to_node *)private; 1040742755a1SChristoph Lameter 1041742755a1SChristoph Lameter while (pm->node != MAX_NUMNODES && pm->page != p) 1042742755a1SChristoph Lameter pm++; 1043742755a1SChristoph Lameter 1044742755a1SChristoph Lameter if (pm->node == MAX_NUMNODES) 1045742755a1SChristoph Lameter return NULL; 1046742755a1SChristoph Lameter 1047742755a1SChristoph Lameter *result = &pm->status; 1048742755a1SChristoph Lameter 10496484eb3eSMel Gorman return alloc_pages_exact_node(pm->node, 1050769848c0SMel Gorman GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 1051742755a1SChristoph Lameter } 1052742755a1SChristoph Lameter 1053742755a1SChristoph Lameter /* 1054742755a1SChristoph Lameter * Move a set of pages as indicated in the pm array. The addr 1055742755a1SChristoph Lameter * field must be set to the virtual address of the page to be moved 1056742755a1SChristoph Lameter * and the node number must contain a valid target node. 10575e9a0f02SBrice Goglin * The pm array ends with node = MAX_NUMNODES. 1058742755a1SChristoph Lameter */ 10595e9a0f02SBrice Goglin static int do_move_page_to_node_array(struct mm_struct *mm, 10605e9a0f02SBrice Goglin struct page_to_node *pm, 1061742755a1SChristoph Lameter int migrate_all) 1062742755a1SChristoph Lameter { 1063742755a1SChristoph Lameter int err; 1064742755a1SChristoph Lameter struct page_to_node *pp; 1065742755a1SChristoph Lameter LIST_HEAD(pagelist); 1066742755a1SChristoph Lameter 1067742755a1SChristoph Lameter down_read(&mm->mmap_sem); 1068742755a1SChristoph Lameter 1069742755a1SChristoph Lameter /* 1070742755a1SChristoph Lameter * Build a list of pages to migrate 1071742755a1SChristoph Lameter */ 1072742755a1SChristoph Lameter for (pp = pm; pp->node != MAX_NUMNODES; pp++) { 1073742755a1SChristoph Lameter struct vm_area_struct *vma; 1074742755a1SChristoph Lameter struct page *page; 1075742755a1SChristoph Lameter 1076742755a1SChristoph Lameter err = -EFAULT; 1077742755a1SChristoph Lameter vma = find_vma(mm, pp->addr); 107870384dc6SGleb Natapov if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) 1079742755a1SChristoph Lameter goto set_status; 1080742755a1SChristoph Lameter 1081500d65d4SAndrea Arcangeli page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); 108289f5b7daSLinus Torvalds 108389f5b7daSLinus Torvalds err = PTR_ERR(page); 108489f5b7daSLinus Torvalds if (IS_ERR(page)) 108589f5b7daSLinus Torvalds goto set_status; 108689f5b7daSLinus Torvalds 1087742755a1SChristoph Lameter err = -ENOENT; 1088742755a1SChristoph Lameter if (!page) 1089742755a1SChristoph Lameter goto set_status; 1090742755a1SChristoph Lameter 109162b61f61SHugh Dickins /* Use PageReserved to check for zero page */ 109262b61f61SHugh Dickins if (PageReserved(page) || PageKsm(page)) 1093742755a1SChristoph Lameter goto put_and_set; 1094742755a1SChristoph Lameter 1095742755a1SChristoph Lameter pp->page = page; 1096742755a1SChristoph Lameter err = 
page_to_nid(page);
1097742755a1SChristoph Lameter
1098742755a1SChristoph Lameter if (err == pp->node)
1099742755a1SChristoph Lameter /*
1100742755a1SChristoph Lameter * Node already in the right place
1101742755a1SChristoph Lameter */
1102742755a1SChristoph Lameter goto put_and_set;
1103742755a1SChristoph Lameter
1104742755a1SChristoph Lameter err = -EACCES;
1105742755a1SChristoph Lameter if (page_mapcount(page) > 1 &&
1106742755a1SChristoph Lameter !migrate_all)
1107742755a1SChristoph Lameter goto put_and_set;
1108742755a1SChristoph Lameter
110962695a84SNick Piggin err = isolate_lru_page(page);
11106d9c285aSKOSAKI Motohiro if (!err) {
111162695a84SNick Piggin list_add_tail(&page->lru, &pagelist);
11126d9c285aSKOSAKI Motohiro inc_zone_page_state(page, NR_ISOLATED_ANON +
11136d9c285aSKOSAKI Motohiro page_is_file_cache(page));
11146d9c285aSKOSAKI Motohiro }
1115742755a1SChristoph Lameter put_and_set:
1116742755a1SChristoph Lameter /*
1117742755a1SChristoph Lameter * Either remove the duplicate refcount from
1118742755a1SChristoph Lameter * isolate_lru_page() or drop the page ref if it was
1119742755a1SChristoph Lameter * not isolated.
1120742755a1SChristoph Lameter */
1121742755a1SChristoph Lameter put_page(page);
1122742755a1SChristoph Lameter set_status:
1123742755a1SChristoph Lameter pp->status = err;
1124742755a1SChristoph Lameter }
1125742755a1SChristoph Lameter
1126e78bbfa8SBrice Goglin err = 0;
1127cf608ac1SMinchan Kim if (!list_empty(&pagelist)) {
1128742755a1SChristoph Lameter err = migrate_pages(&pagelist, new_page_node,
1129a6bc32b8SMel Gorman (unsigned long)pm, 0, MIGRATE_SYNC);
1130cf608ac1SMinchan Kim if (err)
1131cf608ac1SMinchan Kim putback_lru_pages(&pagelist);
1132cf608ac1SMinchan Kim }
1133742755a1SChristoph Lameter
1134742755a1SChristoph Lameter up_read(&mm->mmap_sem);
1135742755a1SChristoph Lameter return err;
1136742755a1SChristoph Lameter }
1137742755a1SChristoph Lameter
1138742755a1SChristoph Lameter /*
11395e9a0f02SBrice Goglin * Migrate an array of page addresses onto an array of nodes and fill
11405e9a0f02SBrice Goglin * the corresponding array of status.
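 * This backs the move_pages(2) system call. A hedged userspace sketch
 * (using the libnuma move_pages() wrapper and MPOL_MF_MOVE from the
 * numaif.h interface, which are assumptions outside this file):
 *
 *	void *pages[1] = { some_addr };
 *	int nodes[1] = { 1 };
 *	int status[1];
 *	long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *	(status[0] then holds the node the page ended up on, or -errno.)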
/*
 * Migrate an array of page addresses onto an array of nodes and fill
 * in the corresponding array of status values.
 */
static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	struct page_to_node *pm;
	unsigned long chunk_nr_pages;
	unsigned long chunk_start;
	int err;

	err = -ENOMEM;
	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
	if (!pm)
		goto out;

	migrate_prep();

	/*
	 * Store a chunk of the page_to_node array in a single page,
	 * reserving the last entry as the end-of-chunk marker.
	 */
	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;

	for (chunk_start = 0;
	     chunk_start < nr_pages;
	     chunk_start += chunk_nr_pages) {
		int j;

		if (chunk_start + chunk_nr_pages > nr_pages)
			chunk_nr_pages = nr_pages - chunk_start;

		/* fill the chunk pm with addrs and nodes from user-space */
		for (j = 0; j < chunk_nr_pages; j++) {
			const void __user *p;
			int node;

			err = -EFAULT;
			if (get_user(p, pages + j + chunk_start))
				goto out_pm;
			pm[j].addr = (unsigned long) p;

			if (get_user(node, nodes + j + chunk_start))
				goto out_pm;

			err = -ENODEV;
			if (node < 0 || node >= MAX_NUMNODES)
				goto out_pm;

			if (!node_state(node, N_HIGH_MEMORY))
				goto out_pm;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out_pm;

			pm[j].node = node;
		}

		/* End marker for this chunk */
		pm[chunk_nr_pages].node = MAX_NUMNODES;

		/* Migrate this chunk */
		err = do_move_page_to_node_array(mm, pm,
						 flags & MPOL_MF_MOVE_ALL);
		if (err < 0)
			goto out_pm;

		/* Return status information */
		for (j = 0; j < chunk_nr_pages; j++)
			if (put_user(pm[j].status, status + j + chunk_start)) {
				err = -EFAULT;
				goto out_pm;
			}
	}

	err = 0;

out_pm:
	free_page((unsigned long)pm);
out:
	return err;
}
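/*
 * do_pages_move() batches the request so that one chunk of page_to_node
 * entries plus the MAX_NUMNODES terminator fits into the single page
 * obtained from __get_free_page(). The user-space sketch below redoes the
 * same arithmetic with a stand-in struct of the same field layout; it only
 * illustrates the chunk size and is not part of this file.
 */
#include <stdio.h>
#include <unistd.h>

/* Stand-in mirroring the fields of the kernel's struct page_to_node. */
struct page_to_node {
	unsigned long addr;
	void *page;		/* struct page * in the kernel */
	int node;
	int status;
};

int main(void)
{
	unsigned long page_size = (unsigned long)sysconf(_SC_PAGESIZE);
	unsigned long per_chunk = page_size / sizeof(struct page_to_node) - 1;
	unsigned long nr_pages = 100000;	/* arbitrary request size */
	unsigned long chunk_start, nchunks = 0;

	for (chunk_start = 0; chunk_start < nr_pages; chunk_start += per_chunk)
		nchunks++;

	printf("%lu entries per chunk -> %lu chunk(s) for %lu pages\n",
	       per_chunk, nchunks, nr_pages);
	return 0;
}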
/*
 * Determine the node each page in an array resides on and store the
 * result in a parallel status array.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	down_read(&mm->mmap_sem);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct page *page;
		int err = -EFAULT;

		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start)
			goto set_status;

		page = follow_page(vma, addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for zero page */
		if (!page || PageReserved(page) || PageKsm(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		*status = err;

		pages++;
		status++;
	}

	up_read(&mm->mmap_sem);
}
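/*
 * do_pages_stat_array() above, together with its do_pages_stat() wrapper
 * below, backs the query-only mode of move_pages(2): when the nodes
 * argument is NULL, nothing is migrated and status[] simply reports the
 * node each page currently resides on. Minimal user-space sketch, again
 * assuming libnuma's <numaif.h> (build with -lnuma); not part of this file.
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *buf;
	void *pages[1];
	int status[1];

	if (posix_memalign(&buf, page_size, page_size))
		return 1;
	memset(buf, 0, page_size);	/* make sure a page is actually mapped */
	pages[0] = buf;

	/* nodes == NULL: report locations only, move nothing */
	if (move_pages(0, 1, pages, NULL, status, 0) < 0) {
		perror("move_pages");
		return 1;
	}

	if (status[0] < 0)
		printf("no page there: %s\n", strerror(-status[0]));
	else
		printf("page resides on node %d\n", status[0]);

	free(buf);
	return 0;
}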
/*
 * Determine the node of each page in a user-supplied array of pages and
 * store the results in a user-supplied status array.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];

	while (nr_pages) {
		unsigned long chunk_nr;

		chunk_nr = nr_pages;
		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
			chunk_nr = DO_PAGES_STAT_CHUNK_NR;

		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
			break;

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
			break;

		pages += chunk_nr;
		status += chunk_nr;
		nr_pages -= chunk_nr;
	}
	return nr_pages ? -EFAULT : 0;
}

/*
 * Move a list of pages in the address space of the process identified
 * by @pid (the calling process when @pid is 0).
 */
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
		const void __user * __user *, pages,
		const int __user *, nodes,
		int __user *, status, int, flags)
{
	const struct cred *cred = current_cred(), *tcred;
	struct task_struct *task;
	struct mm_struct *mm;
	int err;
	nodemask_t task_nodes;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		return -ESRCH;
	}
	get_task_struct(task);

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	tcred = __task_cred(task);
	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out;
	}
	rcu_read_unlock();

	err = security_task_movememory(task);
	if (err)
		goto out;

	task_nodes = cpuset_mems_allowed(task);
	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm)
		return -EINVAL;

	if (nodes)
		err = do_pages_move(mm, task_nodes, nr_pages, pages,
				    nodes, status, flags);
	else
		err = do_pages_stat(mm, nr_pages, pages, status);

	mmput(mm);
	return err;

out:
	put_task_struct(task);
	return err;
}
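/*
 * sys_move_pages() can act on another process: the caller must share a
 * real or effective uid with the target or hold CAP_SYS_NICE, and the
 * MPOL_MF_MOVE_ALL flag (move even pages mapped by several processes)
 * always requires CAP_SYS_NICE. A hedged user-space sketch of invoking it
 * against another pid follows; the pid, address and node are command-line
 * placeholders, the move_pages() wrapper comes from libnuma's <numaif.h>
 * (build with -lnuma), and none of this is part of this file.
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	pid_t pid;
	void *pages[1];
	int nodes[1], status[1];

	if (argc != 4) {
		fprintf(stderr, "usage: %s <pid> <hex-addr> <node>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)strtol(argv[1], NULL, 0);
	pages[0] = (void *)strtoul(argv[2], NULL, 16);
	nodes[0] = atoi(argv[3]);

	/* MPOL_MF_MOVE_ALL requires CAP_SYS_NICE in the target kernel */
	if (move_pages(pid, 1, pages, nodes, status, MPOL_MF_MOVE_ALL) < 0) {
		perror("move_pages");
		return 1;
	}
	printf("status[0] = %d\n", status[0]);
	return 0;
}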
/*
 * Call the migration functions provided in vma->vm_ops. These may prepare
 * memory in a VMA for migration, and may even perform the migration for
 * VMAs whose memory is not backed by struct pages.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}
#endif