1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0 2b20a3503SChristoph Lameter /* 314e0f9bcSHugh Dickins * Memory Migration functionality - linux/mm/migrate.c 4b20a3503SChristoph Lameter * 5b20a3503SChristoph Lameter * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter 6b20a3503SChristoph Lameter * 7b20a3503SChristoph Lameter * Page migration was first developed in the context of the memory hotplug 8b20a3503SChristoph Lameter * project. The main authors of the migration code are: 9b20a3503SChristoph Lameter * 10b20a3503SChristoph Lameter * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> 11b20a3503SChristoph Lameter * Hirokazu Takahashi <taka@valinux.co.jp> 12b20a3503SChristoph Lameter * Dave Hansen <haveblue@us.ibm.com> 13cde53535SChristoph Lameter * Christoph Lameter 14b20a3503SChristoph Lameter */ 15b20a3503SChristoph Lameter 16b20a3503SChristoph Lameter #include <linux/migrate.h> 17b95f1b31SPaul Gortmaker #include <linux/export.h> 18b20a3503SChristoph Lameter #include <linux/swap.h> 190697212aSChristoph Lameter #include <linux/swapops.h> 20b20a3503SChristoph Lameter #include <linux/pagemap.h> 21e23ca00bSChristoph Lameter #include <linux/buffer_head.h> 22b20a3503SChristoph Lameter #include <linux/mm_inline.h> 23b488893aSPavel Emelyanov #include <linux/nsproxy.h> 24b20a3503SChristoph Lameter #include <linux/pagevec.h> 25e9995ef9SHugh Dickins #include <linux/ksm.h> 26b20a3503SChristoph Lameter #include <linux/rmap.h> 27b20a3503SChristoph Lameter #include <linux/topology.h> 28b20a3503SChristoph Lameter #include <linux/cpu.h> 29b20a3503SChristoph Lameter #include <linux/cpuset.h> 3004e62a29SChristoph Lameter #include <linux/writeback.h> 31742755a1SChristoph Lameter #include <linux/mempolicy.h> 32742755a1SChristoph Lameter #include <linux/vmalloc.h> 3386c3a764SDavid Quigley #include <linux/security.h> 3442cb14b1SHugh Dickins #include <linux/backing-dev.h> 35bda807d4SMinchan Kim #include <linux/compaction.h> 364f5ca265SAdrian Bunk #include <linux/syscalls.h> 377addf443SDominik Brodowski #include <linux/compat.h> 38290408d4SNaoya Horiguchi #include <linux/hugetlb.h> 398e6ac7faSAneesh Kumar K.V #include <linux/hugetlb_cgroup.h> 405a0e3ad6STejun Heo #include <linux/gfp.h> 41a520110eSChristoph Hellwig #include <linux/pagewalk.h> 42df6ad698SJérôme Glisse #include <linux/pfn_t.h> 43a5430ddaSJérôme Glisse #include <linux/memremap.h> 448315ada7SJérôme Glisse #include <linux/userfaultfd_k.h> 45bf6bddf1SRafael Aquini #include <linux/balloon_compaction.h> 46f714f4f2SMel Gorman #include <linux/mmu_notifier.h> 4733c3fc71SVladimir Davydov #include <linux/page_idle.h> 48d435edcaSVlastimil Babka #include <linux/page_owner.h> 496e84f315SIngo Molnar #include <linux/sched/mm.h> 50197e7e52SLinus Torvalds #include <linux/ptrace.h> 5134290e2cSRalph Campbell #include <linux/oom.h> 52b20a3503SChristoph Lameter 530d1836c3SMichal Nazarewicz #include <asm/tlbflush.h> 540d1836c3SMichal Nazarewicz 557b2a2d4aSMel Gorman #define CREATE_TRACE_POINTS 567b2a2d4aSMel Gorman #include <trace/events/migrate.h> 577b2a2d4aSMel Gorman 58b20a3503SChristoph Lameter #include "internal.h" 59b20a3503SChristoph Lameter 60b20a3503SChristoph Lameter /* 61742755a1SChristoph Lameter * migrate_prep() needs to be called before we start compiling a list of pages 62748446bbSMel Gorman * to be migrated using isolate_lru_page(). 
If scheduling work on other CPUs is
63748446bbSMel Gorman * undesirable, use migrate_prep_local()
64b20a3503SChristoph Lameter */
65b20a3503SChristoph Lameter int migrate_prep(void)
66b20a3503SChristoph Lameter {
67b20a3503SChristoph Lameter /*
68b20a3503SChristoph Lameter * Clear the LRU lists so pages can be isolated.
69b20a3503SChristoph Lameter * Note that pages may be moved off the LRU after we have
70b20a3503SChristoph Lameter * drained them. Those pages will fail to migrate like other
71b20a3503SChristoph Lameter * pages that may be busy.
72b20a3503SChristoph Lameter */
73b20a3503SChristoph Lameter lru_add_drain_all();
74b20a3503SChristoph Lameter
75b20a3503SChristoph Lameter return 0;
76b20a3503SChristoph Lameter }
77b20a3503SChristoph Lameter
78748446bbSMel Gorman /* Do the necessary work of migrate_prep but not if it involves other CPUs */
79748446bbSMel Gorman int migrate_prep_local(void)
80748446bbSMel Gorman {
81748446bbSMel Gorman lru_add_drain();
82748446bbSMel Gorman
83748446bbSMel Gorman return 0;
84748446bbSMel Gorman }
85748446bbSMel Gorman
869e5bcd61SYisheng Xie int isolate_movable_page(struct page *page, isolate_mode_t mode)
87bda807d4SMinchan Kim {
88bda807d4SMinchan Kim struct address_space *mapping;
89bda807d4SMinchan Kim
90bda807d4SMinchan Kim /*
91bda807d4SMinchan Kim * Avoid burning cycles with pages that are yet under __free_pages(),
92bda807d4SMinchan Kim * or just got freed under us.
93bda807d4SMinchan Kim *
94bda807d4SMinchan Kim * In case we 'win' a race for a movable page being freed under us and
95bda807d4SMinchan Kim * raise its refcount preventing __free_pages() from doing its job,
96bda807d4SMinchan Kim * the put_page() at the end of this block will take care of
97bda807d4SMinchan Kim * releasing this page, thus avoiding a nasty leakage.
98bda807d4SMinchan Kim */
99bda807d4SMinchan Kim if (unlikely(!get_page_unless_zero(page)))
100bda807d4SMinchan Kim goto out;
101bda807d4SMinchan Kim
102bda807d4SMinchan Kim /*
103bda807d4SMinchan Kim * Check PageMovable before holding a PG_lock because page's owner
104bda807d4SMinchan Kim * assumes that nobody touches the PG_lock of a newly allocated page,
1058bb4e7a2SWei Yang * so unconditionally grabbing the lock would break the owner's assumptions.
106bda807d4SMinchan Kim */
107bda807d4SMinchan Kim if (unlikely(!__PageMovable(page)))
108bda807d4SMinchan Kim goto out_putpage;
109bda807d4SMinchan Kim /*
110bda807d4SMinchan Kim * As movable pages are not isolated from LRU lists, concurrent
111bda807d4SMinchan Kim * compaction threads can race against page migration functions
112bda807d4SMinchan Kim * as well as against the release of a page.
113bda807d4SMinchan Kim *
114bda807d4SMinchan Kim * In order to avoid having an already isolated movable page
115bda807d4SMinchan Kim * being (wrongly) re-isolated while it is under migration,
116bda807d4SMinchan Kim * or to avoid attempting to isolate pages being released,
117bda807d4SMinchan Kim * let's be sure we have the page lock
118bda807d4SMinchan Kim * before proceeding with the movable page isolation steps.
119bda807d4SMinchan Kim */ 120bda807d4SMinchan Kim if (unlikely(!trylock_page(page))) 121bda807d4SMinchan Kim goto out_putpage; 122bda807d4SMinchan Kim 123bda807d4SMinchan Kim if (!PageMovable(page) || PageIsolated(page)) 124bda807d4SMinchan Kim goto out_no_isolated; 125bda807d4SMinchan Kim 126bda807d4SMinchan Kim mapping = page_mapping(page); 127bda807d4SMinchan Kim VM_BUG_ON_PAGE(!mapping, page); 128bda807d4SMinchan Kim 129bda807d4SMinchan Kim if (!mapping->a_ops->isolate_page(page, mode)) 130bda807d4SMinchan Kim goto out_no_isolated; 131bda807d4SMinchan Kim 132bda807d4SMinchan Kim /* Driver shouldn't use PG_isolated bit of page->flags */ 133bda807d4SMinchan Kim WARN_ON_ONCE(PageIsolated(page)); 134bda807d4SMinchan Kim __SetPageIsolated(page); 135bda807d4SMinchan Kim unlock_page(page); 136bda807d4SMinchan Kim 1379e5bcd61SYisheng Xie return 0; 138bda807d4SMinchan Kim 139bda807d4SMinchan Kim out_no_isolated: 140bda807d4SMinchan Kim unlock_page(page); 141bda807d4SMinchan Kim out_putpage: 142bda807d4SMinchan Kim put_page(page); 143bda807d4SMinchan Kim out: 1449e5bcd61SYisheng Xie return -EBUSY; 145bda807d4SMinchan Kim } 146bda807d4SMinchan Kim 147bda807d4SMinchan Kim /* It should be called on page which is PG_movable */ 148bda807d4SMinchan Kim void putback_movable_page(struct page *page) 149bda807d4SMinchan Kim { 150bda807d4SMinchan Kim struct address_space *mapping; 151bda807d4SMinchan Kim 152bda807d4SMinchan Kim VM_BUG_ON_PAGE(!PageLocked(page), page); 153bda807d4SMinchan Kim VM_BUG_ON_PAGE(!PageMovable(page), page); 154bda807d4SMinchan Kim VM_BUG_ON_PAGE(!PageIsolated(page), page); 155bda807d4SMinchan Kim 156bda807d4SMinchan Kim mapping = page_mapping(page); 157bda807d4SMinchan Kim mapping->a_ops->putback_page(page); 158bda807d4SMinchan Kim __ClearPageIsolated(page); 159bda807d4SMinchan Kim } 160bda807d4SMinchan Kim 161b20a3503SChristoph Lameter /* 1625733c7d1SRafael Aquini * Put previously isolated pages back onto the appropriate lists 1635733c7d1SRafael Aquini * from where they were once taken off for compaction/migration. 1645733c7d1SRafael Aquini * 16559c82b70SJoonsoo Kim * This function shall be used whenever the isolated pageset has been 16659c82b70SJoonsoo Kim * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() 16759c82b70SJoonsoo Kim * and isolate_huge_page(). 1685733c7d1SRafael Aquini */ 1695733c7d1SRafael Aquini void putback_movable_pages(struct list_head *l) 1705733c7d1SRafael Aquini { 1715733c7d1SRafael Aquini struct page *page; 1725733c7d1SRafael Aquini struct page *page2; 1735733c7d1SRafael Aquini 1745733c7d1SRafael Aquini list_for_each_entry_safe(page, page2, l, lru) { 17531caf665SNaoya Horiguchi if (unlikely(PageHuge(page))) { 17631caf665SNaoya Horiguchi putback_active_hugepage(page); 17731caf665SNaoya Horiguchi continue; 17831caf665SNaoya Horiguchi } 1795733c7d1SRafael Aquini list_del(&page->lru); 180bda807d4SMinchan Kim /* 181bda807d4SMinchan Kim * We isolated non-lru movable page so here we can use 182bda807d4SMinchan Kim * __PageMovable because LRU page's mapping cannot have 183bda807d4SMinchan Kim * PAGE_MAPPING_MOVABLE. 
184bda807d4SMinchan Kim */ 185b1123ea6SMinchan Kim if (unlikely(__PageMovable(page))) { 186bda807d4SMinchan Kim VM_BUG_ON_PAGE(!PageIsolated(page), page); 187bda807d4SMinchan Kim lock_page(page); 188bda807d4SMinchan Kim if (PageMovable(page)) 189bda807d4SMinchan Kim putback_movable_page(page); 190bf6bddf1SRafael Aquini else 191bda807d4SMinchan Kim __ClearPageIsolated(page); 192bda807d4SMinchan Kim unlock_page(page); 193bda807d4SMinchan Kim put_page(page); 194bda807d4SMinchan Kim } else { 195e8db67ebSNaoya Horiguchi mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + 196e8db67ebSNaoya Horiguchi page_is_file_cache(page), -hpage_nr_pages(page)); 197fc280fe8SRabin Vincent putback_lru_page(page); 198b20a3503SChristoph Lameter } 199b20a3503SChristoph Lameter } 200bda807d4SMinchan Kim } 201b20a3503SChristoph Lameter 2020697212aSChristoph Lameter /* 2030697212aSChristoph Lameter * Restore a potential migration pte to a working pte entry 2040697212aSChristoph Lameter */ 205e4b82222SMinchan Kim static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, 206e9995ef9SHugh Dickins unsigned long addr, void *old) 2070697212aSChristoph Lameter { 2083fe87967SKirill A. Shutemov struct page_vma_mapped_walk pvmw = { 2093fe87967SKirill A. Shutemov .page = old, 2103fe87967SKirill A. Shutemov .vma = vma, 2113fe87967SKirill A. Shutemov .address = addr, 2123fe87967SKirill A. Shutemov .flags = PVMW_SYNC | PVMW_MIGRATION, 2133fe87967SKirill A. Shutemov }; 2143fe87967SKirill A. Shutemov struct page *new; 2153fe87967SKirill A. Shutemov pte_t pte; 2160697212aSChristoph Lameter swp_entry_t entry; 2170697212aSChristoph Lameter 2183fe87967SKirill A. Shutemov VM_BUG_ON_PAGE(PageTail(page), page); 2193fe87967SKirill A. Shutemov while (page_vma_mapped_walk(&pvmw)) { 2204b0ece6fSNaoya Horiguchi if (PageKsm(page)) 2214b0ece6fSNaoya Horiguchi new = page; 2224b0ece6fSNaoya Horiguchi else 2233fe87967SKirill A. Shutemov new = page - pvmw.page->index + 2243fe87967SKirill A. Shutemov linear_page_index(vma, pvmw.address); 2250697212aSChristoph Lameter 226616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 227616b8371SZi Yan /* PMD-mapped THP migration entry */ 228616b8371SZi Yan if (!pvmw.pte) { 229616b8371SZi Yan VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); 230616b8371SZi Yan remove_migration_pmd(&pvmw, new); 231616b8371SZi Yan continue; 232616b8371SZi Yan } 233616b8371SZi Yan #endif 234616b8371SZi Yan 2350697212aSChristoph Lameter get_page(new); 2366d2329f8SAndrea Arcangeli pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); 2373fe87967SKirill A. Shutemov if (pte_swp_soft_dirty(*pvmw.pte)) 238c3d16e16SCyrill Gorcunov pte = pte_mksoft_dirty(pte); 239d3cb8bf6SMel Gorman 2403fe87967SKirill A. Shutemov /* 2413fe87967SKirill A. Shutemov * Recheck VMA as permissions can change since migration started 2423fe87967SKirill A. Shutemov */ 2433fe87967SKirill A. 
Shutemov entry = pte_to_swp_entry(*pvmw.pte); 2440697212aSChristoph Lameter if (is_write_migration_entry(entry)) 245d3cb8bf6SMel Gorman pte = maybe_mkwrite(pte, vma); 246d3cb8bf6SMel Gorman 247df6ad698SJérôme Glisse if (unlikely(is_zone_device_page(new))) { 248df6ad698SJérôme Glisse if (is_device_private_page(new)) { 249a5430ddaSJérôme Glisse entry = make_device_private_entry(new, pte_write(pte)); 250a5430ddaSJérôme Glisse pte = swp_entry_to_pte(entry); 251df6ad698SJérôme Glisse } 252d2b2c6ddSLars Persson } 253a5430ddaSJérôme Glisse 2543ef8fd7fSAndi Kleen #ifdef CONFIG_HUGETLB_PAGE 255be7517d6STony Lu if (PageHuge(new)) { 256290408d4SNaoya Horiguchi pte = pte_mkhuge(pte); 257be7517d6STony Lu pte = arch_make_huge_pte(pte, vma, new, 0); 258383321abSAneesh Kumar K.V set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 25904e62a29SChristoph Lameter if (PageAnon(new)) 2603fe87967SKirill A. Shutemov hugepage_add_anon_rmap(new, vma, pvmw.address); 261290408d4SNaoya Horiguchi else 26253f9263bSKirill A. Shutemov page_dup_rmap(new, true); 263383321abSAneesh Kumar K.V } else 264383321abSAneesh Kumar K.V #endif 265383321abSAneesh Kumar K.V { 266383321abSAneesh Kumar K.V set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 267383321abSAneesh Kumar K.V 268383321abSAneesh Kumar K.V if (PageAnon(new)) 2693fe87967SKirill A. Shutemov page_add_anon_rmap(new, vma, pvmw.address, false); 27004e62a29SChristoph Lameter else 271dd78feddSKirill A. Shutemov page_add_file_rmap(new, false); 272383321abSAneesh Kumar K.V } 273e388466dSKirill A. Shutemov if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) 27451afb12bSHugh Dickins mlock_vma_page(new); 27551afb12bSHugh Dickins 276e125fe40SKirill A. Shutemov if (PageTransHuge(page) && PageMlocked(page)) 277e125fe40SKirill A. Shutemov clear_page_mlock(page); 278e125fe40SKirill A. Shutemov 27904e62a29SChristoph Lameter /* No need to invalidate - it was non-present before */ 2803fe87967SKirill A. Shutemov update_mmu_cache(vma, pvmw.address, pvmw.pte); 2813fe87967SKirill A. Shutemov } 2823fe87967SKirill A. Shutemov 283e4b82222SMinchan Kim return true; 2840697212aSChristoph Lameter } 2850697212aSChristoph Lameter 2860697212aSChristoph Lameter /* 28704e62a29SChristoph Lameter * Get rid of all migration entries and replace them by 28804e62a29SChristoph Lameter * references to the indicated page. 28904e62a29SChristoph Lameter */ 290e388466dSKirill A. Shutemov void remove_migration_ptes(struct page *old, struct page *new, bool locked) 29104e62a29SChristoph Lameter { 292051ac83aSJoonsoo Kim struct rmap_walk_control rwc = { 293051ac83aSJoonsoo Kim .rmap_one = remove_migration_pte, 294051ac83aSJoonsoo Kim .arg = old, 295051ac83aSJoonsoo Kim }; 296051ac83aSJoonsoo Kim 297e388466dSKirill A. Shutemov if (locked) 298e388466dSKirill A. Shutemov rmap_walk_locked(new, &rwc); 299e388466dSKirill A. Shutemov else 300051ac83aSJoonsoo Kim rmap_walk(new, &rwc); 30104e62a29SChristoph Lameter } 30204e62a29SChristoph Lameter 30304e62a29SChristoph Lameter /* 3040697212aSChristoph Lameter * Something used the pte of a page under migration. We need to 3050697212aSChristoph Lameter * get to the page and wait until migration is finished. 3060697212aSChristoph Lameter * When we return from this function the fault will be retried. 
3070697212aSChristoph Lameter */ 308e66f17ffSNaoya Horiguchi void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, 30930dad309SNaoya Horiguchi spinlock_t *ptl) 3100697212aSChristoph Lameter { 31130dad309SNaoya Horiguchi pte_t pte; 3120697212aSChristoph Lameter swp_entry_t entry; 3130697212aSChristoph Lameter struct page *page; 3140697212aSChristoph Lameter 31530dad309SNaoya Horiguchi spin_lock(ptl); 3160697212aSChristoph Lameter pte = *ptep; 3170697212aSChristoph Lameter if (!is_swap_pte(pte)) 3180697212aSChristoph Lameter goto out; 3190697212aSChristoph Lameter 3200697212aSChristoph Lameter entry = pte_to_swp_entry(pte); 3210697212aSChristoph Lameter if (!is_migration_entry(entry)) 3220697212aSChristoph Lameter goto out; 3230697212aSChristoph Lameter 3240697212aSChristoph Lameter page = migration_entry_to_page(entry); 3250697212aSChristoph Lameter 326e286781dSNick Piggin /* 32789eb946aSMatthew Wilcox * Once page cache replacement of page migration started, page_count 3289a1ea439SHugh Dickins * is zero; but we must not call put_and_wait_on_page_locked() without 3299a1ea439SHugh Dickins * a ref. Use get_page_unless_zero(), and just fault again if it fails. 330e286781dSNick Piggin */ 331e286781dSNick Piggin if (!get_page_unless_zero(page)) 332e286781dSNick Piggin goto out; 3330697212aSChristoph Lameter pte_unmap_unlock(ptep, ptl); 3349a1ea439SHugh Dickins put_and_wait_on_page_locked(page); 3350697212aSChristoph Lameter return; 3360697212aSChristoph Lameter out: 3370697212aSChristoph Lameter pte_unmap_unlock(ptep, ptl); 3380697212aSChristoph Lameter } 3390697212aSChristoph Lameter 34030dad309SNaoya Horiguchi void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 34130dad309SNaoya Horiguchi unsigned long address) 34230dad309SNaoya Horiguchi { 34330dad309SNaoya Horiguchi spinlock_t *ptl = pte_lockptr(mm, pmd); 34430dad309SNaoya Horiguchi pte_t *ptep = pte_offset_map(pmd, address); 34530dad309SNaoya Horiguchi __migration_entry_wait(mm, ptep, ptl); 34630dad309SNaoya Horiguchi } 34730dad309SNaoya Horiguchi 348cb900f41SKirill A. Shutemov void migration_entry_wait_huge(struct vm_area_struct *vma, 349cb900f41SKirill A. Shutemov struct mm_struct *mm, pte_t *pte) 35030dad309SNaoya Horiguchi { 351cb900f41SKirill A. 
Shutemov spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); 35230dad309SNaoya Horiguchi __migration_entry_wait(mm, pte, ptl); 35330dad309SNaoya Horiguchi } 35430dad309SNaoya Horiguchi 355616b8371SZi Yan #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 356616b8371SZi Yan void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) 357616b8371SZi Yan { 358616b8371SZi Yan spinlock_t *ptl; 359616b8371SZi Yan struct page *page; 360616b8371SZi Yan 361616b8371SZi Yan ptl = pmd_lock(mm, pmd); 362616b8371SZi Yan if (!is_pmd_migration_entry(*pmd)) 363616b8371SZi Yan goto unlock; 364616b8371SZi Yan page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); 365616b8371SZi Yan if (!get_page_unless_zero(page)) 366616b8371SZi Yan goto unlock; 367616b8371SZi Yan spin_unlock(ptl); 3689a1ea439SHugh Dickins put_and_wait_on_page_locked(page); 369616b8371SZi Yan return; 370616b8371SZi Yan unlock: 371616b8371SZi Yan spin_unlock(ptl); 372616b8371SZi Yan } 373616b8371SZi Yan #endif 374616b8371SZi Yan 375f900482dSJan Kara static int expected_page_refs(struct address_space *mapping, struct page *page) 3760b3901b3SJan Kara { 3770b3901b3SJan Kara int expected_count = 1; 3780b3901b3SJan Kara 3790b3901b3SJan Kara /* 3800b3901b3SJan Kara * Device public or private pages have an extra refcount as they are 3810b3901b3SJan Kara * ZONE_DEVICE pages. 3820b3901b3SJan Kara */ 3830b3901b3SJan Kara expected_count += is_device_private_page(page); 384f900482dSJan Kara if (mapping) 3850b3901b3SJan Kara expected_count += hpage_nr_pages(page) + page_has_private(page); 3860b3901b3SJan Kara 3870b3901b3SJan Kara return expected_count; 3880b3901b3SJan Kara } 3890b3901b3SJan Kara 390b20a3503SChristoph Lameter /* 391c3fcf8a5SChristoph Lameter * Replace the page in the mapping. 3925b5c7120SChristoph Lameter * 3935b5c7120SChristoph Lameter * The number of remaining references must be: 3945b5c7120SChristoph Lameter * 1 for anonymous pages without a mapping 3955b5c7120SChristoph Lameter * 2 for pages with a mapping 396266cf658SDavid Howells * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 
397b20a3503SChristoph Lameter */ 39836bc08ccSGu Zheng int migrate_page_move_mapping(struct address_space *mapping, 39937109694SKeith Busch struct page *newpage, struct page *page, int extra_count) 400b20a3503SChristoph Lameter { 40189eb946aSMatthew Wilcox XA_STATE(xas, &mapping->i_pages, page_index(page)); 40242cb14b1SHugh Dickins struct zone *oldzone, *newzone; 40342cb14b1SHugh Dickins int dirty; 404f900482dSJan Kara int expected_count = expected_page_refs(mapping, page) + extra_count; 4058763cb45SJérôme Glisse 4066c5240aeSChristoph Lameter if (!mapping) { 4070e8c7d0fSChristoph Lameter /* Anonymous page without mapping */ 4088e321fefSBenjamin LaHaise if (page_count(page) != expected_count) 4096c5240aeSChristoph Lameter return -EAGAIN; 410cf4b769aSHugh Dickins 411cf4b769aSHugh Dickins /* No turning back from here */ 412cf4b769aSHugh Dickins newpage->index = page->index; 413cf4b769aSHugh Dickins newpage->mapping = page->mapping; 414cf4b769aSHugh Dickins if (PageSwapBacked(page)) 415fa9949daSHugh Dickins __SetPageSwapBacked(newpage); 416cf4b769aSHugh Dickins 41778bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 4186c5240aeSChristoph Lameter } 4196c5240aeSChristoph Lameter 42042cb14b1SHugh Dickins oldzone = page_zone(page); 42142cb14b1SHugh Dickins newzone = page_zone(newpage); 42242cb14b1SHugh Dickins 42389eb946aSMatthew Wilcox xas_lock_irq(&xas); 42489eb946aSMatthew Wilcox if (page_count(page) != expected_count || xas_load(&xas) != page) { 42589eb946aSMatthew Wilcox xas_unlock_irq(&xas); 426e23ca00bSChristoph Lameter return -EAGAIN; 427b20a3503SChristoph Lameter } 428b20a3503SChristoph Lameter 429fe896d18SJoonsoo Kim if (!page_ref_freeze(page, expected_count)) { 43089eb946aSMatthew Wilcox xas_unlock_irq(&xas); 431e286781dSNick Piggin return -EAGAIN; 432e286781dSNick Piggin } 433e286781dSNick Piggin 434b20a3503SChristoph Lameter /* 435cf4b769aSHugh Dickins * Now we know that no one else is looking at the page: 436cf4b769aSHugh Dickins * no turning back from here. 
437b20a3503SChristoph Lameter */ 438cf4b769aSHugh Dickins newpage->index = page->index; 439cf4b769aSHugh Dickins newpage->mapping = page->mapping; 440e71769aeSNaoya Horiguchi page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */ 4416326fec1SNicholas Piggin if (PageSwapBacked(page)) { 4426326fec1SNicholas Piggin __SetPageSwapBacked(newpage); 443b20a3503SChristoph Lameter if (PageSwapCache(page)) { 444b20a3503SChristoph Lameter SetPageSwapCache(newpage); 445b20a3503SChristoph Lameter set_page_private(newpage, page_private(page)); 446b20a3503SChristoph Lameter } 4476326fec1SNicholas Piggin } else { 4486326fec1SNicholas Piggin VM_BUG_ON_PAGE(PageSwapCache(page), page); 4496326fec1SNicholas Piggin } 450b20a3503SChristoph Lameter 45142cb14b1SHugh Dickins /* Move dirty while page refs frozen and newpage not yet exposed */ 45242cb14b1SHugh Dickins dirty = PageDirty(page); 45342cb14b1SHugh Dickins if (dirty) { 45442cb14b1SHugh Dickins ClearPageDirty(page); 45542cb14b1SHugh Dickins SetPageDirty(newpage); 45642cb14b1SHugh Dickins } 45742cb14b1SHugh Dickins 45889eb946aSMatthew Wilcox xas_store(&xas, newpage); 459e71769aeSNaoya Horiguchi if (PageTransHuge(page)) { 460e71769aeSNaoya Horiguchi int i; 461e71769aeSNaoya Horiguchi 462013567beSNaoya Horiguchi for (i = 1; i < HPAGE_PMD_NR; i++) { 46389eb946aSMatthew Wilcox xas_next(&xas); 4644101196bSMatthew Wilcox (Oracle) xas_store(&xas, newpage); 465e71769aeSNaoya Horiguchi } 466e71769aeSNaoya Horiguchi } 4677cf9c2c7SNick Piggin 4687cf9c2c7SNick Piggin /* 469937a94c9SJacobo Giralt * Drop cache reference from old page by unfreezing 470937a94c9SJacobo Giralt * to one less reference. 4717cf9c2c7SNick Piggin * We know this isn't the last reference. 4727cf9c2c7SNick Piggin */ 473e71769aeSNaoya Horiguchi page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); 4747cf9c2c7SNick Piggin 47589eb946aSMatthew Wilcox xas_unlock(&xas); 47642cb14b1SHugh Dickins /* Leave irq disabled to prevent preemption while updating stats */ 47742cb14b1SHugh Dickins 4780e8c7d0fSChristoph Lameter /* 4790e8c7d0fSChristoph Lameter * If moved to a different zone then also account 4800e8c7d0fSChristoph Lameter * the page for that zone. Other VM counters will be 4810e8c7d0fSChristoph Lameter * taken care of when we establish references to the 4820e8c7d0fSChristoph Lameter * new page and drop references to the old page. 4830e8c7d0fSChristoph Lameter * 4840e8c7d0fSChristoph Lameter * Note that anonymous pages are accounted for 4854b9d0fabSMel Gorman * via NR_FILE_PAGES and NR_ANON_MAPPED if they 4860e8c7d0fSChristoph Lameter * are mapped to swap space. 
4870e8c7d0fSChristoph Lameter */ 48842cb14b1SHugh Dickins if (newzone != oldzone) { 48911fb9989SMel Gorman __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); 49011fb9989SMel Gorman __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); 49142cb14b1SHugh Dickins if (PageSwapBacked(page) && !PageSwapCache(page)) { 49211fb9989SMel Gorman __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); 49311fb9989SMel Gorman __inc_node_state(newzone->zone_pgdat, NR_SHMEM); 4944b02108aSKOSAKI Motohiro } 49542cb14b1SHugh Dickins if (dirty && mapping_cap_account_dirty(mapping)) { 49611fb9989SMel Gorman __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); 4975a1c84b4SMel Gorman __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); 49811fb9989SMel Gorman __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); 4995a1c84b4SMel Gorman __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); 50042cb14b1SHugh Dickins } 50142cb14b1SHugh Dickins } 50242cb14b1SHugh Dickins local_irq_enable(); 503b20a3503SChristoph Lameter 50478bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 505b20a3503SChristoph Lameter } 5061118dce7SRichard Weinberger EXPORT_SYMBOL(migrate_page_move_mapping); 507b20a3503SChristoph Lameter 508b20a3503SChristoph Lameter /* 509290408d4SNaoya Horiguchi * The expected number of remaining references is the same as that 510290408d4SNaoya Horiguchi * of migrate_page_move_mapping(). 511290408d4SNaoya Horiguchi */ 512290408d4SNaoya Horiguchi int migrate_huge_page_move_mapping(struct address_space *mapping, 513290408d4SNaoya Horiguchi struct page *newpage, struct page *page) 514290408d4SNaoya Horiguchi { 51589eb946aSMatthew Wilcox XA_STATE(xas, &mapping->i_pages, page_index(page)); 516290408d4SNaoya Horiguchi int expected_count; 517290408d4SNaoya Horiguchi 51889eb946aSMatthew Wilcox xas_lock_irq(&xas); 519290408d4SNaoya Horiguchi expected_count = 2 + page_has_private(page); 52089eb946aSMatthew Wilcox if (page_count(page) != expected_count || xas_load(&xas) != page) { 52189eb946aSMatthew Wilcox xas_unlock_irq(&xas); 522290408d4SNaoya Horiguchi return -EAGAIN; 523290408d4SNaoya Horiguchi } 524290408d4SNaoya Horiguchi 525fe896d18SJoonsoo Kim if (!page_ref_freeze(page, expected_count)) { 52689eb946aSMatthew Wilcox xas_unlock_irq(&xas); 527290408d4SNaoya Horiguchi return -EAGAIN; 528290408d4SNaoya Horiguchi } 529290408d4SNaoya Horiguchi 530cf4b769aSHugh Dickins newpage->index = page->index; 531cf4b769aSHugh Dickins newpage->mapping = page->mapping; 5326a93ca8fSJohannes Weiner 533290408d4SNaoya Horiguchi get_page(newpage); 534290408d4SNaoya Horiguchi 53589eb946aSMatthew Wilcox xas_store(&xas, newpage); 536290408d4SNaoya Horiguchi 537fe896d18SJoonsoo Kim page_ref_unfreeze(page, expected_count - 1); 538290408d4SNaoya Horiguchi 53989eb946aSMatthew Wilcox xas_unlock_irq(&xas); 5406a93ca8fSJohannes Weiner 54178bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 542290408d4SNaoya Horiguchi } 543290408d4SNaoya Horiguchi 544290408d4SNaoya Horiguchi /* 54530b0a105SDave Hansen * Gigantic pages are so large that we do not guarantee that page++ pointer 54630b0a105SDave Hansen * arithmetic will work across the entire page. We need something more 54730b0a105SDave Hansen * specialized. 
54830b0a105SDave Hansen */ 54930b0a105SDave Hansen static void __copy_gigantic_page(struct page *dst, struct page *src, 55030b0a105SDave Hansen int nr_pages) 55130b0a105SDave Hansen { 55230b0a105SDave Hansen int i; 55330b0a105SDave Hansen struct page *dst_base = dst; 55430b0a105SDave Hansen struct page *src_base = src; 55530b0a105SDave Hansen 55630b0a105SDave Hansen for (i = 0; i < nr_pages; ) { 55730b0a105SDave Hansen cond_resched(); 55830b0a105SDave Hansen copy_highpage(dst, src); 55930b0a105SDave Hansen 56030b0a105SDave Hansen i++; 56130b0a105SDave Hansen dst = mem_map_next(dst, dst_base, i); 56230b0a105SDave Hansen src = mem_map_next(src, src_base, i); 56330b0a105SDave Hansen } 56430b0a105SDave Hansen } 56530b0a105SDave Hansen 56630b0a105SDave Hansen static void copy_huge_page(struct page *dst, struct page *src) 56730b0a105SDave Hansen { 56830b0a105SDave Hansen int i; 56930b0a105SDave Hansen int nr_pages; 57030b0a105SDave Hansen 57130b0a105SDave Hansen if (PageHuge(src)) { 57230b0a105SDave Hansen /* hugetlbfs page */ 57330b0a105SDave Hansen struct hstate *h = page_hstate(src); 57430b0a105SDave Hansen nr_pages = pages_per_huge_page(h); 57530b0a105SDave Hansen 57630b0a105SDave Hansen if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { 57730b0a105SDave Hansen __copy_gigantic_page(dst, src, nr_pages); 57830b0a105SDave Hansen return; 57930b0a105SDave Hansen } 58030b0a105SDave Hansen } else { 58130b0a105SDave Hansen /* thp page */ 58230b0a105SDave Hansen BUG_ON(!PageTransHuge(src)); 58330b0a105SDave Hansen nr_pages = hpage_nr_pages(src); 58430b0a105SDave Hansen } 58530b0a105SDave Hansen 58630b0a105SDave Hansen for (i = 0; i < nr_pages; i++) { 58730b0a105SDave Hansen cond_resched(); 58830b0a105SDave Hansen copy_highpage(dst + i, src + i); 58930b0a105SDave Hansen } 59030b0a105SDave Hansen } 59130b0a105SDave Hansen 59230b0a105SDave Hansen /* 593b20a3503SChristoph Lameter * Copy the page to its new location 594b20a3503SChristoph Lameter */ 5952916ecc0SJérôme Glisse void migrate_page_states(struct page *newpage, struct page *page) 596b20a3503SChristoph Lameter { 5977851a45cSRik van Riel int cpupid; 5987851a45cSRik van Riel 599b20a3503SChristoph Lameter if (PageError(page)) 600b20a3503SChristoph Lameter SetPageError(newpage); 601b20a3503SChristoph Lameter if (PageReferenced(page)) 602b20a3503SChristoph Lameter SetPageReferenced(newpage); 603b20a3503SChristoph Lameter if (PageUptodate(page)) 604b20a3503SChristoph Lameter SetPageUptodate(newpage); 605894bc310SLee Schermerhorn if (TestClearPageActive(page)) { 606309381feSSasha Levin VM_BUG_ON_PAGE(PageUnevictable(page), page); 607b20a3503SChristoph Lameter SetPageActive(newpage); 608418b27efSLee Schermerhorn } else if (TestClearPageUnevictable(page)) 609418b27efSLee Schermerhorn SetPageUnevictable(newpage); 6101899ad18SJohannes Weiner if (PageWorkingset(page)) 6111899ad18SJohannes Weiner SetPageWorkingset(newpage); 612b20a3503SChristoph Lameter if (PageChecked(page)) 613b20a3503SChristoph Lameter SetPageChecked(newpage); 614b20a3503SChristoph Lameter if (PageMappedToDisk(page)) 615b20a3503SChristoph Lameter SetPageMappedToDisk(newpage); 616b20a3503SChristoph Lameter 61742cb14b1SHugh Dickins /* Move dirty on pages not done by migrate_page_move_mapping() */ 61842cb14b1SHugh Dickins if (PageDirty(page)) 619752dc185SHugh Dickins SetPageDirty(newpage); 620b20a3503SChristoph Lameter 62133c3fc71SVladimir Davydov if (page_is_young(page)) 62233c3fc71SVladimir Davydov set_page_young(newpage); 62333c3fc71SVladimir Davydov if (page_is_idle(page)) 62433c3fc71SVladimir 
Davydov set_page_idle(newpage); 62533c3fc71SVladimir Davydov 6267851a45cSRik van Riel /* 6277851a45cSRik van Riel * Copy NUMA information to the new page, to prevent over-eager 6287851a45cSRik van Riel * future migrations of this same page. 6297851a45cSRik van Riel */ 6307851a45cSRik van Riel cpupid = page_cpupid_xchg_last(page, -1); 6317851a45cSRik van Riel page_cpupid_xchg_last(newpage, cpupid); 6327851a45cSRik van Riel 633e9995ef9SHugh Dickins ksm_migrate_page(newpage, page); 634c8d6553bSHugh Dickins /* 635c8d6553bSHugh Dickins * Please do not reorder this without considering how mm/ksm.c's 636c8d6553bSHugh Dickins * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 637c8d6553bSHugh Dickins */ 638b3b3a99cSNaoya Horiguchi if (PageSwapCache(page)) 639b20a3503SChristoph Lameter ClearPageSwapCache(page); 640b20a3503SChristoph Lameter ClearPagePrivate(page); 641b20a3503SChristoph Lameter set_page_private(page, 0); 642b20a3503SChristoph Lameter 643b20a3503SChristoph Lameter /* 644b20a3503SChristoph Lameter * If any waiters have accumulated on the new page then 645b20a3503SChristoph Lameter * wake them up. 646b20a3503SChristoph Lameter */ 647b20a3503SChristoph Lameter if (PageWriteback(newpage)) 648b20a3503SChristoph Lameter end_page_writeback(newpage); 649d435edcaSVlastimil Babka 650d435edcaSVlastimil Babka copy_page_owner(page, newpage); 65174485cf2SJohannes Weiner 65274485cf2SJohannes Weiner mem_cgroup_migrate(page, newpage); 653b20a3503SChristoph Lameter } 6542916ecc0SJérôme Glisse EXPORT_SYMBOL(migrate_page_states); 6552916ecc0SJérôme Glisse 6562916ecc0SJérôme Glisse void migrate_page_copy(struct page *newpage, struct page *page) 6572916ecc0SJérôme Glisse { 6582916ecc0SJérôme Glisse if (PageHuge(page) || PageTransHuge(page)) 6592916ecc0SJérôme Glisse copy_huge_page(newpage, page); 6602916ecc0SJérôme Glisse else 6612916ecc0SJérôme Glisse copy_highpage(newpage, page); 6622916ecc0SJérôme Glisse 6632916ecc0SJérôme Glisse migrate_page_states(newpage, page); 6642916ecc0SJérôme Glisse } 6651118dce7SRichard Weinberger EXPORT_SYMBOL(migrate_page_copy); 666b20a3503SChristoph Lameter 6671d8b85ccSChristoph Lameter /************************************************************ 6681d8b85ccSChristoph Lameter * Migration functions 6691d8b85ccSChristoph Lameter ***********************************************************/ 6701d8b85ccSChristoph Lameter 671b20a3503SChristoph Lameter /* 672bda807d4SMinchan Kim * Common logic to directly migrate a single LRU page suitable for 673266cf658SDavid Howells * pages that do not use PagePrivate/PagePrivate2. 674b20a3503SChristoph Lameter * 675b20a3503SChristoph Lameter * Pages are locked upon entry and exit. 
676b20a3503SChristoph Lameter */ 6772d1db3b1SChristoph Lameter int migrate_page(struct address_space *mapping, 678a6bc32b8SMel Gorman struct page *newpage, struct page *page, 679a6bc32b8SMel Gorman enum migrate_mode mode) 680b20a3503SChristoph Lameter { 681b20a3503SChristoph Lameter int rc; 682b20a3503SChristoph Lameter 683b20a3503SChristoph Lameter BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 684b20a3503SChristoph Lameter 68537109694SKeith Busch rc = migrate_page_move_mapping(mapping, newpage, page, 0); 686b20a3503SChristoph Lameter 68778bd5209SRafael Aquini if (rc != MIGRATEPAGE_SUCCESS) 688b20a3503SChristoph Lameter return rc; 689b20a3503SChristoph Lameter 6902916ecc0SJérôme Glisse if (mode != MIGRATE_SYNC_NO_COPY) 691b20a3503SChristoph Lameter migrate_page_copy(newpage, page); 6922916ecc0SJérôme Glisse else 6932916ecc0SJérôme Glisse migrate_page_states(newpage, page); 69478bd5209SRafael Aquini return MIGRATEPAGE_SUCCESS; 695b20a3503SChristoph Lameter } 696b20a3503SChristoph Lameter EXPORT_SYMBOL(migrate_page); 697b20a3503SChristoph Lameter 6989361401eSDavid Howells #ifdef CONFIG_BLOCK 69984ade7c1SJan Kara /* Returns true if all buffers are successfully locked */ 70084ade7c1SJan Kara static bool buffer_migrate_lock_buffers(struct buffer_head *head, 70184ade7c1SJan Kara enum migrate_mode mode) 70284ade7c1SJan Kara { 70384ade7c1SJan Kara struct buffer_head *bh = head; 70484ade7c1SJan Kara 70584ade7c1SJan Kara /* Simple case, sync compaction */ 70684ade7c1SJan Kara if (mode != MIGRATE_ASYNC) { 70784ade7c1SJan Kara do { 70884ade7c1SJan Kara lock_buffer(bh); 70984ade7c1SJan Kara bh = bh->b_this_page; 71084ade7c1SJan Kara 71184ade7c1SJan Kara } while (bh != head); 71284ade7c1SJan Kara 71384ade7c1SJan Kara return true; 71484ade7c1SJan Kara } 71584ade7c1SJan Kara 71684ade7c1SJan Kara /* async case, we cannot block on lock_buffer so use trylock_buffer */ 71784ade7c1SJan Kara do { 71884ade7c1SJan Kara if (!trylock_buffer(bh)) { 71984ade7c1SJan Kara /* 72084ade7c1SJan Kara * We failed to lock the buffer and cannot stall in 72184ade7c1SJan Kara * async migration. 
Release the taken locks 72284ade7c1SJan Kara */ 72384ade7c1SJan Kara struct buffer_head *failed_bh = bh; 72484ade7c1SJan Kara bh = head; 72584ade7c1SJan Kara while (bh != failed_bh) { 72684ade7c1SJan Kara unlock_buffer(bh); 72784ade7c1SJan Kara bh = bh->b_this_page; 72884ade7c1SJan Kara } 72984ade7c1SJan Kara return false; 73084ade7c1SJan Kara } 73184ade7c1SJan Kara 73284ade7c1SJan Kara bh = bh->b_this_page; 73384ade7c1SJan Kara } while (bh != head); 73484ade7c1SJan Kara return true; 73584ade7c1SJan Kara } 73684ade7c1SJan Kara 73789cb0888SJan Kara static int __buffer_migrate_page(struct address_space *mapping, 73889cb0888SJan Kara struct page *newpage, struct page *page, enum migrate_mode mode, 73989cb0888SJan Kara bool check_refs) 7401d8b85ccSChristoph Lameter { 7411d8b85ccSChristoph Lameter struct buffer_head *bh, *head; 7421d8b85ccSChristoph Lameter int rc; 743cc4f11e6SJan Kara int expected_count; 7441d8b85ccSChristoph Lameter 7451d8b85ccSChristoph Lameter if (!page_has_buffers(page)) 746a6bc32b8SMel Gorman return migrate_page(mapping, newpage, page, mode); 7471d8b85ccSChristoph Lameter 748cc4f11e6SJan Kara /* Check whether page does not have extra refs before we do more work */ 749f900482dSJan Kara expected_count = expected_page_refs(mapping, page); 750cc4f11e6SJan Kara if (page_count(page) != expected_count) 751cc4f11e6SJan Kara return -EAGAIN; 752cc4f11e6SJan Kara 7531d8b85ccSChristoph Lameter head = page_buffers(page); 754cc4f11e6SJan Kara if (!buffer_migrate_lock_buffers(head, mode)) 755cc4f11e6SJan Kara return -EAGAIN; 7561d8b85ccSChristoph Lameter 75789cb0888SJan Kara if (check_refs) { 75889cb0888SJan Kara bool busy; 75989cb0888SJan Kara bool invalidated = false; 76089cb0888SJan Kara 76189cb0888SJan Kara recheck_buffers: 76289cb0888SJan Kara busy = false; 76389cb0888SJan Kara spin_lock(&mapping->private_lock); 76489cb0888SJan Kara bh = head; 76589cb0888SJan Kara do { 76689cb0888SJan Kara if (atomic_read(&bh->b_count)) { 76789cb0888SJan Kara busy = true; 76889cb0888SJan Kara break; 76989cb0888SJan Kara } 77089cb0888SJan Kara bh = bh->b_this_page; 77189cb0888SJan Kara } while (bh != head); 77289cb0888SJan Kara if (busy) { 77389cb0888SJan Kara if (invalidated) { 77489cb0888SJan Kara rc = -EAGAIN; 77589cb0888SJan Kara goto unlock_buffers; 77689cb0888SJan Kara } 777ebdf4de5SJan Kara spin_unlock(&mapping->private_lock); 77889cb0888SJan Kara invalidate_bh_lrus(); 77989cb0888SJan Kara invalidated = true; 78089cb0888SJan Kara goto recheck_buffers; 78189cb0888SJan Kara } 78289cb0888SJan Kara } 78389cb0888SJan Kara 78437109694SKeith Busch rc = migrate_page_move_mapping(mapping, newpage, page, 0); 78578bd5209SRafael Aquini if (rc != MIGRATEPAGE_SUCCESS) 786cc4f11e6SJan Kara goto unlock_buffers; 7871d8b85ccSChristoph Lameter 7881d8b85ccSChristoph Lameter ClearPagePrivate(page); 7891d8b85ccSChristoph Lameter set_page_private(newpage, page_private(page)); 7901d8b85ccSChristoph Lameter set_page_private(page, 0); 7911d8b85ccSChristoph Lameter put_page(page); 7921d8b85ccSChristoph Lameter get_page(newpage); 7931d8b85ccSChristoph Lameter 7941d8b85ccSChristoph Lameter bh = head; 7951d8b85ccSChristoph Lameter do { 7961d8b85ccSChristoph Lameter set_bh_page(bh, newpage, bh_offset(bh)); 7971d8b85ccSChristoph Lameter bh = bh->b_this_page; 7981d8b85ccSChristoph Lameter 7991d8b85ccSChristoph Lameter } while (bh != head); 8001d8b85ccSChristoph Lameter 8011d8b85ccSChristoph Lameter SetPagePrivate(newpage); 8021d8b85ccSChristoph Lameter 8032916ecc0SJérôme Glisse if (mode != MIGRATE_SYNC_NO_COPY) 
8041d8b85ccSChristoph Lameter migrate_page_copy(newpage, page); 8052916ecc0SJérôme Glisse else 8062916ecc0SJérôme Glisse migrate_page_states(newpage, page); 8071d8b85ccSChristoph Lameter 808cc4f11e6SJan Kara rc = MIGRATEPAGE_SUCCESS; 809cc4f11e6SJan Kara unlock_buffers: 810ebdf4de5SJan Kara if (check_refs) 811ebdf4de5SJan Kara spin_unlock(&mapping->private_lock); 8121d8b85ccSChristoph Lameter bh = head; 8131d8b85ccSChristoph Lameter do { 8141d8b85ccSChristoph Lameter unlock_buffer(bh); 8151d8b85ccSChristoph Lameter bh = bh->b_this_page; 8161d8b85ccSChristoph Lameter 8171d8b85ccSChristoph Lameter } while (bh != head); 8181d8b85ccSChristoph Lameter 819cc4f11e6SJan Kara return rc; 8201d8b85ccSChristoph Lameter } 82189cb0888SJan Kara 82289cb0888SJan Kara /* 82389cb0888SJan Kara * Migration function for pages with buffers. This function can only be used 82489cb0888SJan Kara * if the underlying filesystem guarantees that no other references to "page" 82589cb0888SJan Kara * exist. For example attached buffer heads are accessed only under page lock. 82689cb0888SJan Kara */ 82789cb0888SJan Kara int buffer_migrate_page(struct address_space *mapping, 82889cb0888SJan Kara struct page *newpage, struct page *page, enum migrate_mode mode) 82989cb0888SJan Kara { 83089cb0888SJan Kara return __buffer_migrate_page(mapping, newpage, page, mode, false); 83189cb0888SJan Kara } 8321d8b85ccSChristoph Lameter EXPORT_SYMBOL(buffer_migrate_page); 83389cb0888SJan Kara 83489cb0888SJan Kara /* 83589cb0888SJan Kara * Same as above except that this variant is more careful and checks that there 83689cb0888SJan Kara * are also no buffer head references. This function is the right one for 83789cb0888SJan Kara * mappings where buffer heads are directly looked up and referenced (such as 83889cb0888SJan Kara * block device mappings). 83989cb0888SJan Kara */ 84089cb0888SJan Kara int buffer_migrate_page_norefs(struct address_space *mapping, 84189cb0888SJan Kara struct page *newpage, struct page *page, enum migrate_mode mode) 84289cb0888SJan Kara { 84389cb0888SJan Kara return __buffer_migrate_page(mapping, newpage, page, mode, true); 84489cb0888SJan Kara } 8459361401eSDavid Howells #endif 8461d8b85ccSChristoph Lameter 84704e62a29SChristoph Lameter /* 84804e62a29SChristoph Lameter * Writeback a page to clean the dirty state 84904e62a29SChristoph Lameter */ 85004e62a29SChristoph Lameter static int writeout(struct address_space *mapping, struct page *page) 85104e62a29SChristoph Lameter { 85204e62a29SChristoph Lameter struct writeback_control wbc = { 85304e62a29SChristoph Lameter .sync_mode = WB_SYNC_NONE, 85404e62a29SChristoph Lameter .nr_to_write = 1, 85504e62a29SChristoph Lameter .range_start = 0, 85604e62a29SChristoph Lameter .range_end = LLONG_MAX, 85704e62a29SChristoph Lameter .for_reclaim = 1 85804e62a29SChristoph Lameter }; 85904e62a29SChristoph Lameter int rc; 86004e62a29SChristoph Lameter 86104e62a29SChristoph Lameter if (!mapping->a_ops->writepage) 86204e62a29SChristoph Lameter /* No write method for the address space */ 86304e62a29SChristoph Lameter return -EINVAL; 86404e62a29SChristoph Lameter 86504e62a29SChristoph Lameter if (!clear_page_dirty_for_io(page)) 86604e62a29SChristoph Lameter /* Someone else already triggered a write */ 86704e62a29SChristoph Lameter return -EAGAIN; 86804e62a29SChristoph Lameter 86904e62a29SChristoph Lameter /* 87004e62a29SChristoph Lameter * A dirty page may imply that the underlying filesystem has 87104e62a29SChristoph Lameter * the page on some queue. 
So the page must be clean for
87204e62a29SChristoph Lameter * migration. Writeout may mean we lose the lock and the
87304e62a29SChristoph Lameter * page state is no longer what we checked for earlier.
87404e62a29SChristoph Lameter * At this point we know that the migration attempt cannot
87504e62a29SChristoph Lameter * be successful.
87604e62a29SChristoph Lameter */
877e388466dSKirill A. Shutemov remove_migration_ptes(page, page, false);
87804e62a29SChristoph Lameter
87904e62a29SChristoph Lameter rc = mapping->a_ops->writepage(page, &wbc);
88004e62a29SChristoph Lameter
88104e62a29SChristoph Lameter if (rc != AOP_WRITEPAGE_ACTIVATE)
88204e62a29SChristoph Lameter /* unlocked. Relock */
88304e62a29SChristoph Lameter lock_page(page);
88404e62a29SChristoph Lameter
885bda8550dSHugh Dickins return (rc < 0) ? -EIO : -EAGAIN;
88604e62a29SChristoph Lameter }
88704e62a29SChristoph Lameter
88804e62a29SChristoph Lameter /*
88904e62a29SChristoph Lameter * Default handling if a filesystem does not provide a migration function.
89004e62a29SChristoph Lameter */
8918351a6e4SChristoph Lameter static int fallback_migrate_page(struct address_space *mapping,
892a6bc32b8SMel Gorman struct page *newpage, struct page *page, enum migrate_mode mode)
8938351a6e4SChristoph Lameter {
894b969c4abSMel Gorman if (PageDirty(page)) {
895a6bc32b8SMel Gorman /* Only writeback pages in full synchronous migration */
8962916ecc0SJérôme Glisse switch (mode) {
8972916ecc0SJérôme Glisse case MIGRATE_SYNC:
8982916ecc0SJérôme Glisse case MIGRATE_SYNC_NO_COPY:
8992916ecc0SJérôme Glisse break;
9002916ecc0SJérôme Glisse default:
901b969c4abSMel Gorman return -EBUSY;
9022916ecc0SJérôme Glisse }
90304e62a29SChristoph Lameter return writeout(mapping, page);
904b969c4abSMel Gorman }
9058351a6e4SChristoph Lameter
9068351a6e4SChristoph Lameter /*
9078351a6e4SChristoph Lameter * Buffers may be managed in a filesystem specific way.
9088351a6e4SChristoph Lameter * We must have no buffers or drop them.
9098351a6e4SChristoph Lameter */
910266cf658SDavid Howells if (page_has_private(page) &&
9118351a6e4SChristoph Lameter !try_to_release_page(page, GFP_KERNEL))
912806031bbSMel Gorman return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
9138351a6e4SChristoph Lameter
914a6bc32b8SMel Gorman return migrate_page(mapping, newpage, page, mode);
9158351a6e4SChristoph Lameter }
9168351a6e4SChristoph Lameter
9171d8b85ccSChristoph Lameter /*
918e24f0b8fSChristoph Lameter * Move a page to a newly allocated page.
919e24f0b8fSChristoph Lameter * The page is locked and all ptes have been successfully removed.
920b20a3503SChristoph Lameter *
921e24f0b8fSChristoph Lameter * The new page will have replaced the old page if this function
922e24f0b8fSChristoph Lameter * is successful.
923894bc310SLee Schermerhorn * 924894bc310SLee Schermerhorn * Return value: 925894bc310SLee Schermerhorn * < 0 - error code 92678bd5209SRafael Aquini * MIGRATEPAGE_SUCCESS - success 927b20a3503SChristoph Lameter */ 9283fe2011fSMel Gorman static int move_to_new_page(struct page *newpage, struct page *page, 9295c3f9a67SHugh Dickins enum migrate_mode mode) 930b20a3503SChristoph Lameter { 931e24f0b8fSChristoph Lameter struct address_space *mapping; 932bda807d4SMinchan Kim int rc = -EAGAIN; 933bda807d4SMinchan Kim bool is_lru = !__PageMovable(page); 934b20a3503SChristoph Lameter 9357db7671fSHugh Dickins VM_BUG_ON_PAGE(!PageLocked(page), page); 9367db7671fSHugh Dickins VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 937b20a3503SChristoph Lameter 938b20a3503SChristoph Lameter mapping = page_mapping(page); 939bda807d4SMinchan Kim 940bda807d4SMinchan Kim if (likely(is_lru)) { 941b20a3503SChristoph Lameter if (!mapping) 942a6bc32b8SMel Gorman rc = migrate_page(mapping, newpage, page, mode); 9436c5240aeSChristoph Lameter else if (mapping->a_ops->migratepage) 944b20a3503SChristoph Lameter /* 945bda807d4SMinchan Kim * Most pages have a mapping and most filesystems 946bda807d4SMinchan Kim * provide a migratepage callback. Anonymous pages 947bda807d4SMinchan Kim * are part of swap space which also has its own 948bda807d4SMinchan Kim * migratepage callback. This is the most common path 949bda807d4SMinchan Kim * for page migration. 950b20a3503SChristoph Lameter */ 951bda807d4SMinchan Kim rc = mapping->a_ops->migratepage(mapping, newpage, 952bda807d4SMinchan Kim page, mode); 9538351a6e4SChristoph Lameter else 954bda807d4SMinchan Kim rc = fallback_migrate_page(mapping, newpage, 955bda807d4SMinchan Kim page, mode); 956bda807d4SMinchan Kim } else { 957bda807d4SMinchan Kim /* 958bda807d4SMinchan Kim * In case of non-lru page, it could be released after 959bda807d4SMinchan Kim * isolation step. In that case, we shouldn't try migration. 960bda807d4SMinchan Kim */ 961bda807d4SMinchan Kim VM_BUG_ON_PAGE(!PageIsolated(page), page); 962bda807d4SMinchan Kim if (!PageMovable(page)) { 963bda807d4SMinchan Kim rc = MIGRATEPAGE_SUCCESS; 964bda807d4SMinchan Kim __ClearPageIsolated(page); 965bda807d4SMinchan Kim goto out; 966bda807d4SMinchan Kim } 967bda807d4SMinchan Kim 968bda807d4SMinchan Kim rc = mapping->a_ops->migratepage(mapping, newpage, 969bda807d4SMinchan Kim page, mode); 970bda807d4SMinchan Kim WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && 971bda807d4SMinchan Kim !PageIsolated(page)); 972bda807d4SMinchan Kim } 973b20a3503SChristoph Lameter 9745c3f9a67SHugh Dickins /* 9755c3f9a67SHugh Dickins * When successful, old pagecache page->mapping must be cleared before 9765c3f9a67SHugh Dickins * page is freed; but stats require that PageAnon be left as PageAnon. 9775c3f9a67SHugh Dickins */ 9785c3f9a67SHugh Dickins if (rc == MIGRATEPAGE_SUCCESS) { 979bda807d4SMinchan Kim if (__PageMovable(page)) { 980bda807d4SMinchan Kim VM_BUG_ON_PAGE(!PageIsolated(page), page); 981bda807d4SMinchan Kim 982bda807d4SMinchan Kim /* 983bda807d4SMinchan Kim * We clear PG_movable under page_lock so any compactor 984bda807d4SMinchan Kim * cannot try to migrate this page. 985bda807d4SMinchan Kim */ 986bda807d4SMinchan Kim __ClearPageIsolated(page); 987bda807d4SMinchan Kim } 988bda807d4SMinchan Kim 989bda807d4SMinchan Kim /* 990c23a0c99SRalph Campbell * Anonymous and movable page->mapping will be cleared by 991bda807d4SMinchan Kim * free_pages_prepare so don't reset it here for keeping 992bda807d4SMinchan Kim * the type to work PageAnon, for example. 
993bda807d4SMinchan Kim */ 994bda807d4SMinchan Kim if (!PageMappingFlags(page)) 9955c3f9a67SHugh Dickins page->mapping = NULL; 996d2b2c6ddSLars Persson 99725b2995aSChristoph Hellwig if (likely(!is_zone_device_page(newpage))) 998d2b2c6ddSLars Persson flush_dcache_page(newpage); 999d2b2c6ddSLars Persson 10003fe2011fSMel Gorman } 1001bda807d4SMinchan Kim out: 1002e24f0b8fSChristoph Lameter return rc; 1003e24f0b8fSChristoph Lameter } 1004e24f0b8fSChristoph Lameter 10050dabec93SMinchan Kim static int __unmap_and_move(struct page *page, struct page *newpage, 10069c620e2bSHugh Dickins int force, enum migrate_mode mode) 1007e24f0b8fSChristoph Lameter { 10080dabec93SMinchan Kim int rc = -EAGAIN; 10092ebba6b7SHugh Dickins int page_was_mapped = 0; 10103f6c8272SMel Gorman struct anon_vma *anon_vma = NULL; 1011bda807d4SMinchan Kim bool is_lru = !__PageMovable(page); 101295a402c3SChristoph Lameter 1013529ae9aaSNick Piggin if (!trylock_page(page)) { 1014a6bc32b8SMel Gorman if (!force || mode == MIGRATE_ASYNC) 10150dabec93SMinchan Kim goto out; 10163e7d3449SMel Gorman 10173e7d3449SMel Gorman /* 10183e7d3449SMel Gorman * It's not safe for direct compaction to call lock_page. 10193e7d3449SMel Gorman * For example, during page readahead pages are added locked 10203e7d3449SMel Gorman * to the LRU. Later, when the IO completes the pages are 10213e7d3449SMel Gorman * marked uptodate and unlocked. However, the queueing 10223e7d3449SMel Gorman * could be merging multiple pages for one bio (e.g. 10233e7d3449SMel Gorman * mpage_readpages). If an allocation happens for the 10243e7d3449SMel Gorman * second or third page, the process can end up locking 10253e7d3449SMel Gorman * the same page twice and deadlocking. Rather than 10263e7d3449SMel Gorman * trying to be clever about what pages can be locked, 10273e7d3449SMel Gorman * avoid the use of lock_page for direct compaction 10283e7d3449SMel Gorman * altogether. 10293e7d3449SMel Gorman */ 10303e7d3449SMel Gorman if (current->flags & PF_MEMALLOC) 10310dabec93SMinchan Kim goto out; 10323e7d3449SMel Gorman 1033e24f0b8fSChristoph Lameter lock_page(page); 1034e24f0b8fSChristoph Lameter } 1035e24f0b8fSChristoph Lameter 1036e24f0b8fSChristoph Lameter if (PageWriteback(page)) { 103711bc82d6SAndrea Arcangeli /* 1038fed5b64aSJianguo Wu * Only in the case of a full synchronous migration is it 1039a6bc32b8SMel Gorman * necessary to wait for PageWriteback. In the async case, 1040a6bc32b8SMel Gorman * the retry loop is too short and in the sync-light case, 1041a6bc32b8SMel Gorman * the overhead of stalling is too much 104211bc82d6SAndrea Arcangeli */ 10432916ecc0SJérôme Glisse switch (mode) { 10442916ecc0SJérôme Glisse case MIGRATE_SYNC: 10452916ecc0SJérôme Glisse case MIGRATE_SYNC_NO_COPY: 10462916ecc0SJérôme Glisse break; 10472916ecc0SJérôme Glisse default: 104811bc82d6SAndrea Arcangeli rc = -EBUSY; 10490a31bc97SJohannes Weiner goto out_unlock; 105011bc82d6SAndrea Arcangeli } 105111bc82d6SAndrea Arcangeli if (!force) 10520a31bc97SJohannes Weiner goto out_unlock; 1053e24f0b8fSChristoph Lameter wait_on_page_writeback(page); 1054e24f0b8fSChristoph Lameter } 105503f15c86SHugh Dickins 1056e24f0b8fSChristoph Lameter /* 1057dc386d4dSKAMEZAWA Hiroyuki * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 1058dc386d4dSKAMEZAWA Hiroyuki * we cannot notice that anon_vma is freed while we migrates a page. 10591ce82b69SHugh Dickins * This get_anon_vma() delays freeing anon_vma pointer until the end 1060dc386d4dSKAMEZAWA Hiroyuki * of migration. 
File cache pages are no problem because of page_lock():
1061989f89c5SKAMEZAWA Hiroyuki * file caches may use writepage() or lock_page() during migration,
1062989f89c5SKAMEZAWA Hiroyuki * so we only need to care about anon pages here.
10633fe2011fSMel Gorman *
106403f15c86SHugh Dickins * Only page_get_anon_vma() understands the subtleties of
106503f15c86SHugh Dickins * getting a hold on an anon_vma from outside one of its mms.
106603f15c86SHugh Dickins * But if we cannot get anon_vma, then we won't need it anyway,
106703f15c86SHugh Dickins * because that implies that the anon page is no longer mapped
106803f15c86SHugh Dickins * (and cannot be remapped so long as we hold the page lock).
10693fe2011fSMel Gorman */
107003f15c86SHugh Dickins if (PageAnon(page) && !PageKsm(page))
107103f15c86SHugh Dickins anon_vma = page_get_anon_vma(page);
107262e1c553SShaohua Li
10737db7671fSHugh Dickins /*
10747db7671fSHugh Dickins * Block others from accessing the new page when we get around to
10757db7671fSHugh Dickins * establishing additional references. We are usually the only one
10767db7671fSHugh Dickins * holding a reference to newpage at this point. We used to have a BUG
10777db7671fSHugh Dickins * here if trylock_page(newpage) fails, but would like to allow for
10787db7671fSHugh Dickins * cases where there might be a race with the previous use of newpage.
10797db7671fSHugh Dickins * This is much like races on refcount of oldpage: just don't BUG().
10807db7671fSHugh Dickins */
10817db7671fSHugh Dickins if (unlikely(!trylock_page(newpage)))
10827db7671fSHugh Dickins goto out_unlock;
10837db7671fSHugh Dickins
1084bda807d4SMinchan Kim if (unlikely(!is_lru)) {
1085bda807d4SMinchan Kim rc = move_to_new_page(newpage, page, mode);
1086bda807d4SMinchan Kim goto out_unlock_both;
1087bda807d4SMinchan Kim }
1088bda807d4SMinchan Kim
1089dc386d4dSKAMEZAWA Hiroyuki /*
109062e1c553SShaohua Li * Corner case handling:
109162e1c553SShaohua Li * 1. When a new swap-cache page is read in, it is added to the LRU
109262e1c553SShaohua Li * and treated as swapcache but it has no rmap yet.
109362e1c553SShaohua Li * Calling try_to_unmap() against a page->mapping==NULL page will
109462e1c553SShaohua Li * trigger a BUG. So handle it here.
109562e1c553SShaohua Li * 2. An orphaned page (see truncate_complete_page) might have
109662e1c553SShaohua Li * fs-private metadata. The page can be picked up due to memory
109762e1c553SShaohua Li * offlining. Everywhere else except page reclaim, the page is
109862e1c553SShaohua Li * invisible to the vm, so the page cannot be migrated. So try to
109962e1c553SShaohua Li * free the metadata, so the page can be freed.
1100dc386d4dSKAMEZAWA Hiroyuki */ 110162e1c553SShaohua Li if (!page->mapping) { 1102309381feSSasha Levin VM_BUG_ON_PAGE(PageAnon(page), page); 11031ce82b69SHugh Dickins if (page_has_private(page)) { 110462e1c553SShaohua Li try_to_free_buffers(page); 11057db7671fSHugh Dickins goto out_unlock_both; 110662e1c553SShaohua Li } 11077db7671fSHugh Dickins } else if (page_mapped(page)) { 11087db7671fSHugh Dickins /* Establish migration ptes */ 110903f15c86SHugh Dickins VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, 111003f15c86SHugh Dickins page); 11112ebba6b7SHugh Dickins try_to_unmap(page, 1112da1b13ccSWanpeng Li TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 11132ebba6b7SHugh Dickins page_was_mapped = 1; 11142ebba6b7SHugh Dickins } 1115dc386d4dSKAMEZAWA Hiroyuki 1116e24f0b8fSChristoph Lameter if (!page_mapped(page)) 11175c3f9a67SHugh Dickins rc = move_to_new_page(newpage, page, mode); 1118e24f0b8fSChristoph Lameter 11195c3f9a67SHugh Dickins if (page_was_mapped) 11205c3f9a67SHugh Dickins remove_migration_ptes(page, 1121e388466dSKirill A. Shutemov rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); 11223f6c8272SMel Gorman 11237db7671fSHugh Dickins out_unlock_both: 11247db7671fSHugh Dickins unlock_page(newpage); 11257db7671fSHugh Dickins out_unlock: 11263f6c8272SMel Gorman /* Drop an anon_vma reference if we took one */ 112776545066SRik van Riel if (anon_vma) 11289e60109fSPeter Zijlstra put_anon_vma(anon_vma); 1129b20a3503SChristoph Lameter unlock_page(page); 11300dabec93SMinchan Kim out: 1131c6c919ebSMinchan Kim /* 1132c6c919ebSMinchan Kim * If migration is successful, decrease refcount of the newpage 1133c6c919ebSMinchan Kim * which will not free the page because new page owner increased 1134c6c919ebSMinchan Kim * refcounter. As well, if it is LRU page, add the page to LRU 1135e0a352faSDavid Hildenbrand * list in here. Use the old state of the isolated source page to 1136e0a352faSDavid Hildenbrand * determine if we migrated a LRU page. newpage was already unlocked 1137e0a352faSDavid Hildenbrand * and possibly modified by its owner - don't rely on the page 1138e0a352faSDavid Hildenbrand * state. 1139c6c919ebSMinchan Kim */ 1140c6c919ebSMinchan Kim if (rc == MIGRATEPAGE_SUCCESS) { 1141e0a352faSDavid Hildenbrand if (unlikely(!is_lru)) 1142c6c919ebSMinchan Kim put_page(newpage); 1143c6c919ebSMinchan Kim else 1144c6c919ebSMinchan Kim putback_lru_page(newpage); 1145c6c919ebSMinchan Kim } 1146c6c919ebSMinchan Kim 11470dabec93SMinchan Kim return rc; 11480dabec93SMinchan Kim } 114995a402c3SChristoph Lameter 11500dabec93SMinchan Kim /* 1151ef2a5153SGeert Uytterhoeven * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work 1152ef2a5153SGeert Uytterhoeven * around it. 1153ef2a5153SGeert Uytterhoeven */ 1154815f0ddbSNick Desaulniers #if defined(CONFIG_ARM) && \ 1155815f0ddbSNick Desaulniers defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 1156ef2a5153SGeert Uytterhoeven #define ICE_noinline noinline 1157ef2a5153SGeert Uytterhoeven #else 1158ef2a5153SGeert Uytterhoeven #define ICE_noinline 1159ef2a5153SGeert Uytterhoeven #endif 1160ef2a5153SGeert Uytterhoeven 1161ef2a5153SGeert Uytterhoeven /* 11620dabec93SMinchan Kim * Obtain the lock on page, remove all ptes and migrate the page 11630dabec93SMinchan Kim * to the newly allocated page in newpage. 
11640dabec93SMinchan Kim */ 1165ef2a5153SGeert Uytterhoeven static ICE_noinline int unmap_and_move(new_page_t get_new_page, 1166ef2a5153SGeert Uytterhoeven free_page_t put_new_page, 1167ef2a5153SGeert Uytterhoeven unsigned long private, struct page *page, 1168add05cecSNaoya Horiguchi int force, enum migrate_mode mode, 1169add05cecSNaoya Horiguchi enum migrate_reason reason) 11700dabec93SMinchan Kim { 11712def7424SHugh Dickins int rc = MIGRATEPAGE_SUCCESS; 117274d4a579SYang Shi struct page *newpage = NULL; 11730dabec93SMinchan Kim 117494723aafSMichal Hocko if (!thp_migration_supported() && PageTransHuge(page)) 117594723aafSMichal Hocko return -ENOMEM; 117694723aafSMichal Hocko 11770dabec93SMinchan Kim if (page_count(page) == 1) { 11780dabec93SMinchan Kim /* page was freed from under us. So we are done. */ 1179c6c919ebSMinchan Kim ClearPageActive(page); 1180c6c919ebSMinchan Kim ClearPageUnevictable(page); 1181bda807d4SMinchan Kim if (unlikely(__PageMovable(page))) { 1182bda807d4SMinchan Kim lock_page(page); 1183bda807d4SMinchan Kim if (!PageMovable(page)) 1184bda807d4SMinchan Kim __ClearPageIsolated(page); 1185bda807d4SMinchan Kim unlock_page(page); 1186bda807d4SMinchan Kim } 11870dabec93SMinchan Kim goto out; 11880dabec93SMinchan Kim } 11890dabec93SMinchan Kim 119074d4a579SYang Shi newpage = get_new_page(page, private); 119174d4a579SYang Shi if (!newpage) 119274d4a579SYang Shi return -ENOMEM; 119374d4a579SYang Shi 11949c620e2bSHugh Dickins rc = __unmap_and_move(page, newpage, force, mode); 1195c6c919ebSMinchan Kim if (rc == MIGRATEPAGE_SUCCESS) 11967cd12b4aSVlastimil Babka set_page_owner_migrate_reason(newpage, reason); 1197bf6bddf1SRafael Aquini 11980dabec93SMinchan Kim out: 1199e24f0b8fSChristoph Lameter if (rc != -EAGAIN) { 1200aaa994b3SChristoph Lameter /* 1201aaa994b3SChristoph Lameter * A page that has been migrated has all references 1202aaa994b3SChristoph Lameter * removed and will be freed. A page that has not been 1203c23a0c99SRalph Campbell * migrated will have kept its references and be restored. 1204aaa994b3SChristoph Lameter */ 1205aaa994b3SChristoph Lameter list_del(&page->lru); 12066afcf8efSMing Ling 12076afcf8efSMing Ling /* 12086afcf8efSMing Ling * Compaction can migrate also non-LRU pages which are 12096afcf8efSMing Ling * not accounted to NR_ISOLATED_*. They can be recognized 12106afcf8efSMing Ling * as __PageMovable 12116afcf8efSMing Ling */ 12126afcf8efSMing Ling if (likely(!__PageMovable(page))) 1213e8db67ebSNaoya Horiguchi mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + 1214e8db67ebSNaoya Horiguchi page_is_file_cache(page), -hpage_nr_pages(page)); 1215e24f0b8fSChristoph Lameter } 121668711a74SDavid Rientjes 121795a402c3SChristoph Lameter /* 1218c6c919ebSMinchan Kim * If migration is successful, releases reference grabbed during 1219c6c919ebSMinchan Kim * isolation. Otherwise, restore the page to right list unless 1220c6c919ebSMinchan Kim * we want to retry. 122195a402c3SChristoph Lameter */ 1222c6c919ebSMinchan Kim if (rc == MIGRATEPAGE_SUCCESS) { 1223c6c919ebSMinchan Kim put_page(page); 1224c6c919ebSMinchan Kim if (reason == MR_MEMORY_FAILURE) { 1225c6c919ebSMinchan Kim /* 1226c6c919ebSMinchan Kim * Set PG_HWPoison on just freed page 1227c6c919ebSMinchan Kim * intentionally. Although it's rather weird, 1228c6c919ebSMinchan Kim * it's how HWPoison flag works at the moment. 
1229c6c919ebSMinchan Kim */ 1230d4ae9916SNaoya Horiguchi if (set_hwpoison_free_buddy_page(page)) 1231c6c919ebSMinchan Kim num_poisoned_pages_inc(); 1232c6c919ebSMinchan Kim } 1233c6c919ebSMinchan Kim } else { 1234bda807d4SMinchan Kim if (rc != -EAGAIN) { 1235bda807d4SMinchan Kim if (likely(!__PageMovable(page))) { 1236c6c919ebSMinchan Kim putback_lru_page(page); 1237bda807d4SMinchan Kim goto put_new; 1238bda807d4SMinchan Kim } 1239bda807d4SMinchan Kim 1240bda807d4SMinchan Kim lock_page(page); 1241bda807d4SMinchan Kim if (PageMovable(page)) 1242bda807d4SMinchan Kim putback_movable_page(page); 1243bda807d4SMinchan Kim else 1244bda807d4SMinchan Kim __ClearPageIsolated(page); 1245bda807d4SMinchan Kim unlock_page(page); 1246bda807d4SMinchan Kim put_page(page); 1247bda807d4SMinchan Kim } 1248bda807d4SMinchan Kim put_new: 1249cf4b769aSHugh Dickins if (put_new_page) 125068711a74SDavid Rientjes put_new_page(newpage, private); 1251c6c919ebSMinchan Kim else 1252d6d86c0aSKonstantin Khlebnikov put_page(newpage); 1253c6c919ebSMinchan Kim } 125468711a74SDavid Rientjes 1255e24f0b8fSChristoph Lameter return rc; 1256e24f0b8fSChristoph Lameter } 1257b20a3503SChristoph Lameter 1258e24f0b8fSChristoph Lameter /* 1259290408d4SNaoya Horiguchi * Counterpart of unmap_and_move() for hugepage migration. 1260290408d4SNaoya Horiguchi * 1261290408d4SNaoya Horiguchi * This function doesn't wait for the completion of hugepage I/O 1262290408d4SNaoya Horiguchi * because there is no race between I/O and migration for hugepage. 1263290408d4SNaoya Horiguchi * Note that currently hugepage I/O occurs only in direct I/O 1264290408d4SNaoya Horiguchi * where no lock is held and PG_writeback is irrelevant, 1265290408d4SNaoya Horiguchi * and the writeback status of all subpages is counted in the reference 1266290408d4SNaoya Horiguchi * count of the head page (i.e. if all subpages of a 2MB hugepage are 1267290408d4SNaoya Horiguchi * under direct I/O, the reference count of the head page is 512 and a bit more.) 1268290408d4SNaoya Horiguchi * This means that when we try to migrate a hugepage whose subpages are 1269290408d4SNaoya Horiguchi * doing direct I/O, some references remain after try_to_unmap() and 1270290408d4SNaoya Horiguchi * the hugepage migration fails without data corruption. 1271290408d4SNaoya Horiguchi * 1272290408d4SNaoya Horiguchi * There is also no race when direct I/O is issued on the page under migration, 1273290408d4SNaoya Horiguchi * because then the pte is replaced with a migration swap entry and the direct I/O code 1274290408d4SNaoya Horiguchi * will wait in the page fault for migration to complete. 1275290408d4SNaoya Horiguchi */ 1276290408d4SNaoya Horiguchi static int unmap_and_move_huge_page(new_page_t get_new_page, 127768711a74SDavid Rientjes free_page_t put_new_page, unsigned long private, 127868711a74SDavid Rientjes struct page *hpage, int force, 12797cd12b4aSVlastimil Babka enum migrate_mode mode, int reason) 1280290408d4SNaoya Horiguchi { 12812def7424SHugh Dickins int rc = -EAGAIN; 12822ebba6b7SHugh Dickins int page_was_mapped = 0; 128332665f2bSJoonsoo Kim struct page *new_hpage; 1284290408d4SNaoya Horiguchi struct anon_vma *anon_vma = NULL; 1285c0d0381aSMike Kravetz struct address_space *mapping = NULL; 1286290408d4SNaoya Horiguchi 128783467efbSNaoya Horiguchi /* 12887ed2c31dSAnshuman Khandual * Migratability of hugepages depends on architectures and their size.
128983467efbSNaoya Horiguchi * This check is necessary because some callers of hugepage migration 129083467efbSNaoya Horiguchi * like soft offline and memory hotremove don't walk through page 129183467efbSNaoya Horiguchi * tables or check whether the hugepage is pmd-based or not before 129283467efbSNaoya Horiguchi * kicking migration. 129383467efbSNaoya Horiguchi */ 1294100873d7SNaoya Horiguchi if (!hugepage_migration_supported(page_hstate(hpage))) { 129532665f2bSJoonsoo Kim putback_active_hugepage(hpage); 129683467efbSNaoya Horiguchi return -ENOSYS; 129732665f2bSJoonsoo Kim } 129883467efbSNaoya Horiguchi 1299666feb21SMichal Hocko new_hpage = get_new_page(hpage, private); 1300290408d4SNaoya Horiguchi if (!new_hpage) 1301290408d4SNaoya Horiguchi return -ENOMEM; 1302290408d4SNaoya Horiguchi 1303290408d4SNaoya Horiguchi if (!trylock_page(hpage)) { 13042916ecc0SJérôme Glisse if (!force) 1305290408d4SNaoya Horiguchi goto out; 13062916ecc0SJérôme Glisse switch (mode) { 13072916ecc0SJérôme Glisse case MIGRATE_SYNC: 13082916ecc0SJérôme Glisse case MIGRATE_SYNC_NO_COPY: 13092916ecc0SJérôme Glisse break; 13102916ecc0SJérôme Glisse default: 13112916ecc0SJérôme Glisse goto out; 13122916ecc0SJérôme Glisse } 1313290408d4SNaoya Horiguchi lock_page(hpage); 1314290408d4SNaoya Horiguchi } 1315290408d4SNaoya Horiguchi 1316cb6acd01SMike Kravetz /* 1317cb6acd01SMike Kravetz * Check for pages which are in the process of being freed. Without 1318cb6acd01SMike Kravetz * page_mapping() set, hugetlbfs specific move page routine will not 1319cb6acd01SMike Kravetz * be called and we could leak usage counts for subpools. 1320cb6acd01SMike Kravetz */ 1321cb6acd01SMike Kravetz if (page_private(hpage) && !page_mapping(hpage)) { 1322cb6acd01SMike Kravetz rc = -EBUSY; 1323cb6acd01SMike Kravetz goto out_unlock; 1324cb6acd01SMike Kravetz } 1325cb6acd01SMike Kravetz 1326746b18d4SPeter Zijlstra if (PageAnon(hpage)) 1327746b18d4SPeter Zijlstra anon_vma = page_get_anon_vma(hpage); 1328290408d4SNaoya Horiguchi 13297db7671fSHugh Dickins if (unlikely(!trylock_page(new_hpage))) 13307db7671fSHugh Dickins goto put_anon; 13317db7671fSHugh Dickins 13322ebba6b7SHugh Dickins if (page_mapped(hpage)) { 1333c0d0381aSMike Kravetz /* 1334c0d0381aSMike Kravetz * try_to_unmap could potentially call huge_pmd_unshare. 1335c0d0381aSMike Kravetz * Because of this, take semaphore in write mode here and 1336c0d0381aSMike Kravetz * set TTU_RMAP_LOCKED to let lower levels know we have 1337c0d0381aSMike Kravetz * taken the lock. 1338c0d0381aSMike Kravetz */ 1339c0d0381aSMike Kravetz mapping = hugetlb_page_mapping_lock_write(hpage); 1340c0d0381aSMike Kravetz if (unlikely(!mapping)) 1341c0d0381aSMike Kravetz goto unlock_put_anon; 1342c0d0381aSMike Kravetz 13432ebba6b7SHugh Dickins try_to_unmap(hpage, 1344c0d0381aSMike Kravetz TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| 1345c0d0381aSMike Kravetz TTU_RMAP_LOCKED); 13462ebba6b7SHugh Dickins page_was_mapped = 1; 1347c0d0381aSMike Kravetz /* 1348c0d0381aSMike Kravetz * Leave mapping locked until after subsequent call to 1349c0d0381aSMike Kravetz * remove_migration_ptes() 1350c0d0381aSMike Kravetz */ 13512ebba6b7SHugh Dickins } 1352290408d4SNaoya Horiguchi 1353290408d4SNaoya Horiguchi if (!page_mapped(hpage)) 13545c3f9a67SHugh Dickins rc = move_to_new_page(new_hpage, hpage, mode); 1355290408d4SNaoya Horiguchi 1356c0d0381aSMike Kravetz if (page_was_mapped) { 13575c3f9a67SHugh Dickins remove_migration_ptes(hpage, 1358c0d0381aSMike Kravetz rc == MIGRATEPAGE_SUCCESS ? 
new_hpage : hpage, true); 1359c0d0381aSMike Kravetz i_mmap_unlock_write(mapping); 1360c0d0381aSMike Kravetz } 1361290408d4SNaoya Horiguchi 1362c0d0381aSMike Kravetz unlock_put_anon: 13637db7671fSHugh Dickins unlock_page(new_hpage); 13647db7671fSHugh Dickins 13657db7671fSHugh Dickins put_anon: 1366fd4a4663SHugh Dickins if (anon_vma) 13679e60109fSPeter Zijlstra put_anon_vma(anon_vma); 13688e6ac7faSAneesh Kumar K.V 13692def7424SHugh Dickins if (rc == MIGRATEPAGE_SUCCESS) { 1370ab5ac90aSMichal Hocko move_hugetlb_state(hpage, new_hpage, reason); 13712def7424SHugh Dickins put_new_page = NULL; 13722def7424SHugh Dickins } 13738e6ac7faSAneesh Kumar K.V 1374cb6acd01SMike Kravetz out_unlock: 1375290408d4SNaoya Horiguchi unlock_page(hpage); 137609761333SHillf Danton out: 1377b8ec1ceeSNaoya Horiguchi if (rc != -EAGAIN) 1378b8ec1ceeSNaoya Horiguchi putback_active_hugepage(hpage); 137968711a74SDavid Rientjes 138068711a74SDavid Rientjes /* 138168711a74SDavid Rientjes * If migration was not successful and there's a freeing callback, use 138268711a74SDavid Rientjes * it. Otherwise, put_page() will drop the reference grabbed during 138368711a74SDavid Rientjes * isolation. 138468711a74SDavid Rientjes */ 13852def7424SHugh Dickins if (put_new_page) 138668711a74SDavid Rientjes put_new_page(new_hpage, private); 138768711a74SDavid Rientjes else 13883aaa76e1SNaoya Horiguchi putback_active_hugepage(new_hpage); 138968711a74SDavid Rientjes 1390290408d4SNaoya Horiguchi return rc; 1391290408d4SNaoya Horiguchi } 1392290408d4SNaoya Horiguchi 1393290408d4SNaoya Horiguchi /* 1394c73e5c9cSSrivatsa S. Bhat * migrate_pages - migrate the pages specified in a list, to the free pages 1395c73e5c9cSSrivatsa S. Bhat * supplied as the target for the page migration 1396e24f0b8fSChristoph Lameter * 1397c73e5c9cSSrivatsa S. Bhat * @from: The list of pages to be migrated. 1398c73e5c9cSSrivatsa S. Bhat * @get_new_page: The function used to allocate free pages to be used 1399c73e5c9cSSrivatsa S. Bhat * as the target of the page migration. 140068711a74SDavid Rientjes * @put_new_page: The function used to free target pages if migration 140168711a74SDavid Rientjes * fails, or NULL if no special handling is necessary. 1402c73e5c9cSSrivatsa S. Bhat * @private: Private data to be passed on to get_new_page() 1403c73e5c9cSSrivatsa S. Bhat * @mode: The migration mode that specifies the constraints for 1404c73e5c9cSSrivatsa S. Bhat * page migration, if any. 1405c73e5c9cSSrivatsa S. Bhat * @reason: The reason for page migration. 1406e24f0b8fSChristoph Lameter * 1407c73e5c9cSSrivatsa S. Bhat * The function returns after 10 attempts or if no pages are movable any more 1408c73e5c9cSSrivatsa S. Bhat * because the list has become empty or no retryable pages exist any more. 140914e0f9bcSHugh Dickins * The caller should call putback_movable_pages() to return pages to the LRU 141028bd6578SMinchan Kim * or free list only if ret != 0. 1411e24f0b8fSChristoph Lameter * 1412c73e5c9cSSrivatsa S. Bhat * Returns the number of pages that were not migrated, or an error code. 
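 * A typical call sequence (an illustrative sketch, mirroring callers such as do_move_pages_to_node() below): run migrate_prep(), gather the pages on a private list with isolate_lru_page(), call migrate_pages(&list, get_new_page, NULL, private, MIGRATE_SYNC, reason), and if the return value is non-zero hand the remaining pages back with putback_movable_pages(&list).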
1413e24f0b8fSChristoph Lameter */ 14149c620e2bSHugh Dickins int migrate_pages(struct list_head *from, new_page_t get_new_page, 141568711a74SDavid Rientjes free_page_t put_new_page, unsigned long private, 141668711a74SDavid Rientjes enum migrate_mode mode, int reason) 1417e24f0b8fSChristoph Lameter { 1418e24f0b8fSChristoph Lameter int retry = 1; 1419e24f0b8fSChristoph Lameter int nr_failed = 0; 14205647bc29SMel Gorman int nr_succeeded = 0; 1421e24f0b8fSChristoph Lameter int pass = 0; 1422e24f0b8fSChristoph Lameter struct page *page; 1423e24f0b8fSChristoph Lameter struct page *page2; 1424e24f0b8fSChristoph Lameter int swapwrite = current->flags & PF_SWAPWRITE; 1425e24f0b8fSChristoph Lameter int rc; 14262d1db3b1SChristoph Lameter 1427e24f0b8fSChristoph Lameter if (!swapwrite) 1428e24f0b8fSChristoph Lameter current->flags |= PF_SWAPWRITE; 1429e24f0b8fSChristoph Lameter 1430e24f0b8fSChristoph Lameter for(pass = 0; pass < 10 && retry; pass++) { 1431e24f0b8fSChristoph Lameter retry = 0; 1432e24f0b8fSChristoph Lameter 1433e24f0b8fSChristoph Lameter list_for_each_entry_safe(page, page2, from, lru) { 143494723aafSMichal Hocko retry: 1435e24f0b8fSChristoph Lameter cond_resched(); 1436e24f0b8fSChristoph Lameter 143731caf665SNaoya Horiguchi if (PageHuge(page)) 143831caf665SNaoya Horiguchi rc = unmap_and_move_huge_page(get_new_page, 143968711a74SDavid Rientjes put_new_page, private, page, 14407cd12b4aSVlastimil Babka pass > 2, mode, reason); 144131caf665SNaoya Horiguchi else 144268711a74SDavid Rientjes rc = unmap_and_move(get_new_page, put_new_page, 1443add05cecSNaoya Horiguchi private, page, pass > 2, mode, 1444add05cecSNaoya Horiguchi reason); 1445e24f0b8fSChristoph Lameter 1446e24f0b8fSChristoph Lameter switch(rc) { 144795a402c3SChristoph Lameter case -ENOMEM: 144894723aafSMichal Hocko /* 144994723aafSMichal Hocko * THP migration might be unsupported or the 145094723aafSMichal Hocko * allocation could've failed so we should 145194723aafSMichal Hocko * retry on the same page with the THP split 145294723aafSMichal Hocko * to base pages. 145394723aafSMichal Hocko * 145494723aafSMichal Hocko * Head page is retried immediately and tail 145594723aafSMichal Hocko * pages are added to the tail of the list so 145694723aafSMichal Hocko * we encounter them after the rest of the list 145794723aafSMichal Hocko * is processed. 145894723aafSMichal Hocko */ 1459e6112fc3SAnshuman Khandual if (PageTransHuge(page) && !PageHuge(page)) { 146094723aafSMichal Hocko lock_page(page); 146194723aafSMichal Hocko rc = split_huge_page_to_list(page, from); 146294723aafSMichal Hocko unlock_page(page); 146394723aafSMichal Hocko if (!rc) { 146494723aafSMichal Hocko list_safe_reset_next(page, page2, lru); 146594723aafSMichal Hocko goto retry; 146694723aafSMichal Hocko } 146794723aafSMichal Hocko } 1468dfef2ef4SDavid Rientjes nr_failed++; 146995a402c3SChristoph Lameter goto out; 1470e24f0b8fSChristoph Lameter case -EAGAIN: 1471b20a3503SChristoph Lameter retry++; 1472e24f0b8fSChristoph Lameter break; 147378bd5209SRafael Aquini case MIGRATEPAGE_SUCCESS: 14745647bc29SMel Gorman nr_succeeded++; 1475e24f0b8fSChristoph Lameter break; 1476e24f0b8fSChristoph Lameter default: 1477354a3363SNaoya Horiguchi /* 1478354a3363SNaoya Horiguchi * Permanent failure (-EBUSY, -ENOSYS, etc.): 1479354a3363SNaoya Horiguchi * unlike -EAGAIN case, the failed page is 1480354a3363SNaoya Horiguchi * removed from migration page list and not 1481354a3363SNaoya Horiguchi * retried in the next outer loop. 
1482354a3363SNaoya Horiguchi */ 1483b20a3503SChristoph Lameter nr_failed++; 1484e24f0b8fSChristoph Lameter break; 1485b20a3503SChristoph Lameter } 1486b20a3503SChristoph Lameter } 1487e24f0b8fSChristoph Lameter } 1488f2f81fb2SVlastimil Babka nr_failed += retry; 1489f2f81fb2SVlastimil Babka rc = nr_failed; 149095a402c3SChristoph Lameter out: 14915647bc29SMel Gorman if (nr_succeeded) 14925647bc29SMel Gorman count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); 14935647bc29SMel Gorman if (nr_failed) 14945647bc29SMel Gorman count_vm_events(PGMIGRATE_FAIL, nr_failed); 14957b2a2d4aSMel Gorman trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); 14967b2a2d4aSMel Gorman 1497b20a3503SChristoph Lameter if (!swapwrite) 1498b20a3503SChristoph Lameter current->flags &= ~PF_SWAPWRITE; 1499b20a3503SChristoph Lameter 150095a402c3SChristoph Lameter return rc; 1501b20a3503SChristoph Lameter } 1502b20a3503SChristoph Lameter 1503742755a1SChristoph Lameter #ifdef CONFIG_NUMA 1504742755a1SChristoph Lameter 1505a49bd4d7SMichal Hocko static int store_status(int __user *status, int start, int value, int nr) 1506742755a1SChristoph Lameter { 1507a49bd4d7SMichal Hocko while (nr-- > 0) { 1508a49bd4d7SMichal Hocko if (put_user(value, status + start)) 1509a49bd4d7SMichal Hocko return -EFAULT; 1510a49bd4d7SMichal Hocko start++; 1511a49bd4d7SMichal Hocko } 1512742755a1SChristoph Lameter 1513a49bd4d7SMichal Hocko return 0; 1514a49bd4d7SMichal Hocko } 1515742755a1SChristoph Lameter 1516a49bd4d7SMichal Hocko static int do_move_pages_to_node(struct mm_struct *mm, 1517a49bd4d7SMichal Hocko struct list_head *pagelist, int node) 1518a49bd4d7SMichal Hocko { 1519a49bd4d7SMichal Hocko int err; 1520742755a1SChristoph Lameter 1521a49bd4d7SMichal Hocko err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, 1522a49bd4d7SMichal Hocko MIGRATE_SYNC, MR_SYSCALL); 1523a49bd4d7SMichal Hocko if (err) 1524a49bd4d7SMichal Hocko putback_movable_pages(pagelist); 1525a49bd4d7SMichal Hocko return err; 1526742755a1SChristoph Lameter } 1527742755a1SChristoph Lameter 1528742755a1SChristoph Lameter /* 1529a49bd4d7SMichal Hocko * Resolves the given address to a struct page, isolates it from the LRU and 1530a49bd4d7SMichal Hocko * puts it to the given pagelist. 1531e0153fc2SYang Shi * Returns: 1532e0153fc2SYang Shi * errno - if the page cannot be found/isolated 1533e0153fc2SYang Shi * 0 - when it doesn't have to be migrated because it is already on the 1534e0153fc2SYang Shi * target node 1535e0153fc2SYang Shi * 1 - when it has been queued 1536742755a1SChristoph Lameter */ 1537a49bd4d7SMichal Hocko static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, 1538a49bd4d7SMichal Hocko int node, struct list_head *pagelist, bool migrate_all) 1539742755a1SChristoph Lameter { 1540742755a1SChristoph Lameter struct vm_area_struct *vma; 1541742755a1SChristoph Lameter struct page *page; 1542e8db67ebSNaoya Horiguchi unsigned int follflags; 1543a49bd4d7SMichal Hocko int err; 1544742755a1SChristoph Lameter 1545a49bd4d7SMichal Hocko down_read(&mm->mmap_sem); 1546742755a1SChristoph Lameter err = -EFAULT; 1547a49bd4d7SMichal Hocko vma = find_vma(mm, addr); 1548a49bd4d7SMichal Hocko if (!vma || addr < vma->vm_start || !vma_migratable(vma)) 1549a49bd4d7SMichal Hocko goto out; 1550742755a1SChristoph Lameter 1551d899844eSKirill A. 
Shutemov /* FOLL_DUMP to ignore special (like zero) pages */ 1552e8db67ebSNaoya Horiguchi follflags = FOLL_GET | FOLL_DUMP; 1553a49bd4d7SMichal Hocko page = follow_page(vma, addr, follflags); 155489f5b7daSLinus Torvalds 155589f5b7daSLinus Torvalds err = PTR_ERR(page); 155689f5b7daSLinus Torvalds if (IS_ERR(page)) 1557a49bd4d7SMichal Hocko goto out; 155889f5b7daSLinus Torvalds 1559742755a1SChristoph Lameter err = -ENOENT; 1560742755a1SChristoph Lameter if (!page) 1561a49bd4d7SMichal Hocko goto out; 1562742755a1SChristoph Lameter 1563a49bd4d7SMichal Hocko err = 0; 1564a49bd4d7SMichal Hocko if (page_to_nid(page) == node) 1565a49bd4d7SMichal Hocko goto out_putpage; 1566742755a1SChristoph Lameter 1567742755a1SChristoph Lameter err = -EACCES; 1568a49bd4d7SMichal Hocko if (page_mapcount(page) > 1 && !migrate_all) 1569a49bd4d7SMichal Hocko goto out_putpage; 1570742755a1SChristoph Lameter 1571e632a938SNaoya Horiguchi if (PageHuge(page)) { 1572e8db67ebSNaoya Horiguchi if (PageHead(page)) { 1573a49bd4d7SMichal Hocko isolate_huge_page(page, pagelist); 1574e0153fc2SYang Shi err = 1; 1575e8db67ebSNaoya Horiguchi } 1576a49bd4d7SMichal Hocko } else { 1577a49bd4d7SMichal Hocko struct page *head; 1578e632a938SNaoya Horiguchi 1579e8db67ebSNaoya Horiguchi head = compound_head(page); 1580e8db67ebSNaoya Horiguchi err = isolate_lru_page(head); 1581a49bd4d7SMichal Hocko if (err) 1582a49bd4d7SMichal Hocko goto out_putpage; 1583a49bd4d7SMichal Hocko 1584e0153fc2SYang Shi err = 1; 1585a49bd4d7SMichal Hocko list_add_tail(&head->lru, pagelist); 1586e8db67ebSNaoya Horiguchi mod_node_page_state(page_pgdat(head), 1587e8db67ebSNaoya Horiguchi NR_ISOLATED_ANON + page_is_file_cache(head), 1588e8db67ebSNaoya Horiguchi hpage_nr_pages(head)); 15896d9c285aSKOSAKI Motohiro } 1590a49bd4d7SMichal Hocko out_putpage: 1591742755a1SChristoph Lameter /* 1592742755a1SChristoph Lameter * Either remove the duplicate refcount from 1593742755a1SChristoph Lameter * isolate_lru_page() or drop the page ref if it was 1594742755a1SChristoph Lameter * not isolated. 1595742755a1SChristoph Lameter */ 1596742755a1SChristoph Lameter put_page(page); 1597a49bd4d7SMichal Hocko out: 1598742755a1SChristoph Lameter up_read(&mm->mmap_sem); 1599742755a1SChristoph Lameter return err; 1600742755a1SChristoph Lameter } 1601742755a1SChristoph Lameter 16027ca8783aSWei Yang static int move_pages_and_store_status(struct mm_struct *mm, int node, 16037ca8783aSWei Yang struct list_head *pagelist, int __user *status, 16047ca8783aSWei Yang int start, int i, unsigned long nr_pages) 16057ca8783aSWei Yang { 16067ca8783aSWei Yang int err; 16077ca8783aSWei Yang 1608*5d7ae891SWei Yang if (list_empty(pagelist)) 1609*5d7ae891SWei Yang return 0; 1610*5d7ae891SWei Yang 16117ca8783aSWei Yang err = do_move_pages_to_node(mm, pagelist, node); 16127ca8783aSWei Yang if (err) { 16137ca8783aSWei Yang /* 16147ca8783aSWei Yang * A positive err means the number of pages that 16157ca8783aSWei Yang * failed to migrate. Since we are going to 16167ca8783aSWei Yang * abort and return the number of non-migrated 16177ca8783aSWei Yang * pages, we need to include the rest of the 16187ca8783aSWei Yang * nr_pages that have not been attempted as 16197ca8783aSWei Yang * well.
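 * For example (illustrative numbers only): with nr_pages == 8, i == 3 and a positive err of 2, the value reported back to the caller becomes 2 + (8 - 3 - 1) = 6.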
16207ca8783aSWei Yang */ 16217ca8783aSWei Yang if (err > 0) 16227ca8783aSWei Yang err += nr_pages - i - 1; 16237ca8783aSWei Yang return err; 16247ca8783aSWei Yang } 16257ca8783aSWei Yang return store_status(status, start, node, i - start); 16267ca8783aSWei Yang } 16277ca8783aSWei Yang 1628742755a1SChristoph Lameter /* 16295e9a0f02SBrice Goglin * Migrate an array of page address onto an array of nodes and fill 16305e9a0f02SBrice Goglin * the corresponding array of status. 16315e9a0f02SBrice Goglin */ 16323268c63eSChristoph Lameter static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, 16335e9a0f02SBrice Goglin unsigned long nr_pages, 16345e9a0f02SBrice Goglin const void __user * __user *pages, 16355e9a0f02SBrice Goglin const int __user *nodes, 16365e9a0f02SBrice Goglin int __user *status, int flags) 16375e9a0f02SBrice Goglin { 1638a49bd4d7SMichal Hocko int current_node = NUMA_NO_NODE; 1639a49bd4d7SMichal Hocko LIST_HEAD(pagelist); 1640a49bd4d7SMichal Hocko int start, i; 1641a49bd4d7SMichal Hocko int err = 0, err1; 164235282a2dSBrice Goglin 164335282a2dSBrice Goglin migrate_prep(); 164435282a2dSBrice Goglin 1645a49bd4d7SMichal Hocko for (i = start = 0; i < nr_pages; i++) { 16465e9a0f02SBrice Goglin const void __user *p; 1647a49bd4d7SMichal Hocko unsigned long addr; 16485e9a0f02SBrice Goglin int node; 16495e9a0f02SBrice Goglin 16503140a227SBrice Goglin err = -EFAULT; 1651a49bd4d7SMichal Hocko if (get_user(p, pages + i)) 1652a49bd4d7SMichal Hocko goto out_flush; 1653a49bd4d7SMichal Hocko if (get_user(node, nodes + i)) 1654a49bd4d7SMichal Hocko goto out_flush; 1655057d3389SAndrey Konovalov addr = (unsigned long)untagged_addr(p); 16565e9a0f02SBrice Goglin 16575e9a0f02SBrice Goglin err = -ENODEV; 16586f5a55f1SLinus Torvalds if (node < 0 || node >= MAX_NUMNODES) 1659a49bd4d7SMichal Hocko goto out_flush; 1660389162c2SLai Jiangshan if (!node_state(node, N_MEMORY)) 1661a49bd4d7SMichal Hocko goto out_flush; 16625e9a0f02SBrice Goglin 16635e9a0f02SBrice Goglin err = -EACCES; 16645e9a0f02SBrice Goglin if (!node_isset(node, task_nodes)) 1665a49bd4d7SMichal Hocko goto out_flush; 16665e9a0f02SBrice Goglin 1667a49bd4d7SMichal Hocko if (current_node == NUMA_NO_NODE) { 1668a49bd4d7SMichal Hocko current_node = node; 1669a49bd4d7SMichal Hocko start = i; 1670a49bd4d7SMichal Hocko } else if (node != current_node) { 16717ca8783aSWei Yang err = move_pages_and_store_status(mm, current_node, 16727ca8783aSWei Yang &pagelist, status, start, i, nr_pages); 1673a49bd4d7SMichal Hocko if (err) 1674a49bd4d7SMichal Hocko goto out; 1675a49bd4d7SMichal Hocko start = i; 1676a49bd4d7SMichal Hocko current_node = node; 16775e9a0f02SBrice Goglin } 16785e9a0f02SBrice Goglin 1679a49bd4d7SMichal Hocko /* 1680a49bd4d7SMichal Hocko * Errors in the page lookup or isolation are not fatal and we simply 1681a49bd4d7SMichal Hocko * report them via status 1682a49bd4d7SMichal Hocko */ 1683a49bd4d7SMichal Hocko err = add_page_for_migration(mm, addr, current_node, 1684a49bd4d7SMichal Hocko &pagelist, flags & MPOL_MF_MOVE_ALL); 1685e0153fc2SYang Shi 1686e0153fc2SYang Shi if (!err) { 1687e0153fc2SYang Shi /* The page is already on the target node */ 1688e0153fc2SYang Shi err = store_status(status, i, current_node, 1); 1689e0153fc2SYang Shi if (err) 1690e0153fc2SYang Shi goto out_flush; 1691a49bd4d7SMichal Hocko continue; 1692e0153fc2SYang Shi } else if (err > 0) { 1693e0153fc2SYang Shi /* The page is successfully queued for migration */ 1694e0153fc2SYang Shi continue; 1695e0153fc2SYang Shi } 16963140a227SBrice Goglin 
1697a49bd4d7SMichal Hocko err = store_status(status, i, err, 1); 1698a49bd4d7SMichal Hocko if (err) 1699a49bd4d7SMichal Hocko goto out_flush; 17003140a227SBrice Goglin 17017ca8783aSWei Yang err = move_pages_and_store_status(mm, current_node, &pagelist, 17027ca8783aSWei Yang status, start, i, nr_pages); 1703a49bd4d7SMichal Hocko if (err) 1704a49bd4d7SMichal Hocko goto out; 1705a49bd4d7SMichal Hocko current_node = NUMA_NO_NODE; 17063140a227SBrice Goglin } 1707a49bd4d7SMichal Hocko out_flush: 1708a49bd4d7SMichal Hocko /* Make sure we do not overwrite the existing error */ 17097ca8783aSWei Yang err1 = move_pages_and_store_status(mm, current_node, &pagelist, 17107ca8783aSWei Yang status, start, i, nr_pages); 1711dfe9aa23SWei Yang if (err >= 0) 1712a49bd4d7SMichal Hocko err = err1; 17135e9a0f02SBrice Goglin out: 17145e9a0f02SBrice Goglin return err; 17155e9a0f02SBrice Goglin } 17165e9a0f02SBrice Goglin 17175e9a0f02SBrice Goglin /* 17182f007e74SBrice Goglin * Determine the nodes of an array of pages and store it in an array of status. 1719742755a1SChristoph Lameter */ 172080bba129SBrice Goglin static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, 172180bba129SBrice Goglin const void __user **pages, int *status) 1722742755a1SChristoph Lameter { 17232f007e74SBrice Goglin unsigned long i; 1724742755a1SChristoph Lameter 17252f007e74SBrice Goglin down_read(&mm->mmap_sem); 17262f007e74SBrice Goglin 17272f007e74SBrice Goglin for (i = 0; i < nr_pages; i++) { 172880bba129SBrice Goglin unsigned long addr = (unsigned long)(*pages); 17292f007e74SBrice Goglin struct vm_area_struct *vma; 17302f007e74SBrice Goglin struct page *page; 1731c095adbcSKOSAKI Motohiro int err = -EFAULT; 17322f007e74SBrice Goglin 17332f007e74SBrice Goglin vma = find_vma(mm, addr); 173470384dc6SGleb Natapov if (!vma || addr < vma->vm_start) 1735742755a1SChristoph Lameter goto set_status; 1736742755a1SChristoph Lameter 1737d899844eSKirill A. Shutemov /* FOLL_DUMP to ignore special (like zero) pages */ 1738d899844eSKirill A. Shutemov page = follow_page(vma, addr, FOLL_DUMP); 173989f5b7daSLinus Torvalds 174089f5b7daSLinus Torvalds err = PTR_ERR(page); 174189f5b7daSLinus Torvalds if (IS_ERR(page)) 174289f5b7daSLinus Torvalds goto set_status; 174389f5b7daSLinus Torvalds 1744d899844eSKirill A. Shutemov err = page ? page_to_nid(page) : -ENOENT; 1745742755a1SChristoph Lameter set_status: 174680bba129SBrice Goglin *status = err; 174780bba129SBrice Goglin 174880bba129SBrice Goglin pages++; 174980bba129SBrice Goglin status++; 175080bba129SBrice Goglin } 175180bba129SBrice Goglin 175280bba129SBrice Goglin up_read(&mm->mmap_sem); 175380bba129SBrice Goglin } 175480bba129SBrice Goglin 175580bba129SBrice Goglin /* 175680bba129SBrice Goglin * Determine the nodes of a user array of pages and store it in 175780bba129SBrice Goglin * a user array of status. 175880bba129SBrice Goglin */ 175980bba129SBrice Goglin static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, 176080bba129SBrice Goglin const void __user * __user *pages, 176180bba129SBrice Goglin int __user *status) 176280bba129SBrice Goglin { 176380bba129SBrice Goglin #define DO_PAGES_STAT_CHUNK_NR 16 176480bba129SBrice Goglin const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 176580bba129SBrice Goglin int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 176680bba129SBrice Goglin 176787b8d1adSH. Peter Anvin while (nr_pages) { 176887b8d1adSH. Peter Anvin unsigned long chunk_nr; 176980bba129SBrice Goglin 177087b8d1adSH. Peter Anvin chunk_nr = nr_pages; 177187b8d1adSH. 
Peter Anvin if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) 177287b8d1adSH. Peter Anvin chunk_nr = DO_PAGES_STAT_CHUNK_NR; 177387b8d1adSH. Peter Anvin 177487b8d1adSH. Peter Anvin if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) 177587b8d1adSH. Peter Anvin break; 177680bba129SBrice Goglin 177780bba129SBrice Goglin do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 177880bba129SBrice Goglin 177987b8d1adSH. Peter Anvin if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) 178087b8d1adSH. Peter Anvin break; 1781742755a1SChristoph Lameter 178287b8d1adSH. Peter Anvin pages += chunk_nr; 178387b8d1adSH. Peter Anvin status += chunk_nr; 178487b8d1adSH. Peter Anvin nr_pages -= chunk_nr; 178587b8d1adSH. Peter Anvin } 178687b8d1adSH. Peter Anvin return nr_pages ? -EFAULT : 0; 1787742755a1SChristoph Lameter } 1788742755a1SChristoph Lameter 1789742755a1SChristoph Lameter /* 1790742755a1SChristoph Lameter * Move a list of pages in the address space of the currently executing 1791742755a1SChristoph Lameter * process. 1792742755a1SChristoph Lameter */ 17937addf443SDominik Brodowski static int kernel_move_pages(pid_t pid, unsigned long nr_pages, 17947addf443SDominik Brodowski const void __user * __user *pages, 17957addf443SDominik Brodowski const int __user *nodes, 17967addf443SDominik Brodowski int __user *status, int flags) 1797742755a1SChristoph Lameter { 1798742755a1SChristoph Lameter struct task_struct *task; 1799742755a1SChristoph Lameter struct mm_struct *mm; 18005e9a0f02SBrice Goglin int err; 18013268c63eSChristoph Lameter nodemask_t task_nodes; 1802742755a1SChristoph Lameter 1803742755a1SChristoph Lameter /* Check flags */ 1804742755a1SChristoph Lameter if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1805742755a1SChristoph Lameter return -EINVAL; 1806742755a1SChristoph Lameter 1807742755a1SChristoph Lameter if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1808742755a1SChristoph Lameter return -EPERM; 1809742755a1SChristoph Lameter 1810742755a1SChristoph Lameter /* Find the mm_struct */ 1811a879bf58SGreg Thelen rcu_read_lock(); 1812228ebcbeSPavel Emelyanov task = pid ? find_task_by_vpid(pid) : current; 1813742755a1SChristoph Lameter if (!task) { 1814a879bf58SGreg Thelen rcu_read_unlock(); 1815742755a1SChristoph Lameter return -ESRCH; 1816742755a1SChristoph Lameter } 18173268c63eSChristoph Lameter get_task_struct(task); 1818742755a1SChristoph Lameter 1819742755a1SChristoph Lameter /* 1820742755a1SChristoph Lameter * Check if this process has the right to modify the specified 1821197e7e52SLinus Torvalds * process. Use the regular "ptrace_may_access()" checks. 
1822742755a1SChristoph Lameter */ 1823197e7e52SLinus Torvalds if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1824c69e8d9cSDavid Howells rcu_read_unlock(); 1825742755a1SChristoph Lameter err = -EPERM; 18265e9a0f02SBrice Goglin goto out; 1827742755a1SChristoph Lameter } 1828c69e8d9cSDavid Howells rcu_read_unlock(); 1829742755a1SChristoph Lameter 183086c3a764SDavid Quigley err = security_task_movememory(task); 183186c3a764SDavid Quigley if (err) 1832742755a1SChristoph Lameter goto out; 1833742755a1SChristoph Lameter 18343268c63eSChristoph Lameter task_nodes = cpuset_mems_allowed(task); 18353268c63eSChristoph Lameter mm = get_task_mm(task); 18363268c63eSChristoph Lameter put_task_struct(task); 18373268c63eSChristoph Lameter 18386e8b09eaSSasha Levin if (!mm) 18396e8b09eaSSasha Levin return -EINVAL; 18406e8b09eaSSasha Levin 18413268c63eSChristoph Lameter if (nodes) 18423268c63eSChristoph Lameter err = do_pages_move(mm, task_nodes, nr_pages, pages, 18433268c63eSChristoph Lameter nodes, status, flags); 18443268c63eSChristoph Lameter else 18455e9a0f02SBrice Goglin err = do_pages_stat(mm, nr_pages, pages, status); 18463268c63eSChristoph Lameter 18473268c63eSChristoph Lameter mmput(mm); 18483268c63eSChristoph Lameter return err; 1849742755a1SChristoph Lameter 1850742755a1SChristoph Lameter out: 18513268c63eSChristoph Lameter put_task_struct(task); 1852742755a1SChristoph Lameter return err; 1853742755a1SChristoph Lameter } 1854742755a1SChristoph Lameter 18557addf443SDominik Brodowski SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, 18567addf443SDominik Brodowski const void __user * __user *, pages, 18577addf443SDominik Brodowski const int __user *, nodes, 18587addf443SDominik Brodowski int __user *, status, int, flags) 18597addf443SDominik Brodowski { 18607addf443SDominik Brodowski return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); 18617addf443SDominik Brodowski } 18627addf443SDominik Brodowski 18637addf443SDominik Brodowski #ifdef CONFIG_COMPAT 18647addf443SDominik Brodowski COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, 18657addf443SDominik Brodowski compat_uptr_t __user *, pages32, 18667addf443SDominik Brodowski const int __user *, nodes, 18677addf443SDominik Brodowski int __user *, status, 18687addf443SDominik Brodowski int, flags) 18697addf443SDominik Brodowski { 18707addf443SDominik Brodowski const void __user * __user *pages; 18717addf443SDominik Brodowski int i; 18727addf443SDominik Brodowski 18737addf443SDominik Brodowski pages = compat_alloc_user_space(nr_pages * sizeof(void *)); 18747addf443SDominik Brodowski for (i = 0; i < nr_pages; i++) { 18757addf443SDominik Brodowski compat_uptr_t p; 18767addf443SDominik Brodowski 18777addf443SDominik Brodowski if (get_user(p, pages32 + i) || 18787addf443SDominik Brodowski put_user(compat_ptr(p), pages + i)) 18797addf443SDominik Brodowski return -EFAULT; 18807addf443SDominik Brodowski } 18817addf443SDominik Brodowski return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); 18827addf443SDominik Brodowski } 18837addf443SDominik Brodowski #endif /* CONFIG_COMPAT */ 18847addf443SDominik Brodowski 18857039e1dbSPeter Zijlstra #ifdef CONFIG_NUMA_BALANCING 18867039e1dbSPeter Zijlstra /* 18877039e1dbSPeter Zijlstra * Returns true if this is a safe migration target node for misplaced NUMA 18887039e1dbSPeter Zijlstra * pages. 
Currently it only checks the watermarks, which is a crude check. 18897039e1dbSPeter Zijlstra */ 18907039e1dbSPeter Zijlstra static bool migrate_balanced_pgdat(struct pglist_data *pgdat, 18913abef4e6SMel Gorman unsigned long nr_migrate_pages) 18927039e1dbSPeter Zijlstra { 18937039e1dbSPeter Zijlstra int z; 1894599d0c95SMel Gorman 18957039e1dbSPeter Zijlstra for (z = pgdat->nr_zones - 1; z >= 0; z--) { 18967039e1dbSPeter Zijlstra struct zone *zone = pgdat->node_zones + z; 18977039e1dbSPeter Zijlstra 18987039e1dbSPeter Zijlstra if (!populated_zone(zone)) 18997039e1dbSPeter Zijlstra continue; 19007039e1dbSPeter Zijlstra 19017039e1dbSPeter Zijlstra /* Avoid waking kswapd by allocating pages_to_migrate pages. */ 19027039e1dbSPeter Zijlstra if (!zone_watermark_ok(zone, 0, 19037039e1dbSPeter Zijlstra high_wmark_pages(zone) + 19047039e1dbSPeter Zijlstra nr_migrate_pages, 1905bfe9d006SHuang Ying ZONE_MOVABLE, 0)) 19067039e1dbSPeter Zijlstra continue; 19077039e1dbSPeter Zijlstra return true; 19087039e1dbSPeter Zijlstra } 19097039e1dbSPeter Zijlstra return false; 19107039e1dbSPeter Zijlstra } 19117039e1dbSPeter Zijlstra 19127039e1dbSPeter Zijlstra static struct page *alloc_misplaced_dst_page(struct page *page, 1913666feb21SMichal Hocko unsigned long data) 19147039e1dbSPeter Zijlstra { 19157039e1dbSPeter Zijlstra int nid = (int) data; 19167039e1dbSPeter Zijlstra struct page *newpage; 19177039e1dbSPeter Zijlstra 191896db800fSVlastimil Babka newpage = __alloc_pages_node(nid, 1919e97ca8e5SJohannes Weiner (GFP_HIGHUSER_MOVABLE | 1920e97ca8e5SJohannes Weiner __GFP_THISNODE | __GFP_NOMEMALLOC | 1921e97ca8e5SJohannes Weiner __GFP_NORETRY | __GFP_NOWARN) & 19228479eba7SMel Gorman ~__GFP_RECLAIM, 0); 1923bac0382cSHillf Danton 19247039e1dbSPeter Zijlstra return newpage; 19257039e1dbSPeter Zijlstra } 19267039e1dbSPeter Zijlstra 19271c30e017SMel Gorman static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1928b32967ffSMel Gorman { 1929340ef390SHugh Dickins int page_lru; 1930b32967ffSMel Gorman 1931309381feSSasha Levin VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); 19323abef4e6SMel Gorman 1933b32967ffSMel Gorman /* Avoid migrating to a node that is nearly full */ 1934d8c6546bSMatthew Wilcox (Oracle) if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) 1935340ef390SHugh Dickins return 0; 1936b32967ffSMel Gorman 1937340ef390SHugh Dickins if (isolate_lru_page(page)) 1938340ef390SHugh Dickins return 0; 1939340ef390SHugh Dickins 1940340ef390SHugh Dickins /* 1941340ef390SHugh Dickins * migrate_misplaced_transhuge_page() skips page migration's usual 1942340ef390SHugh Dickins * check on page_count(), so we must do it here, now that the page 1943340ef390SHugh Dickins * has been isolated: a GUP pin, or any other pin, prevents migration. 1944340ef390SHugh Dickins * The expected page count is 3: 1 for the page's mapcount, 1 for the 1945340ef390SHugh Dickins * caller's pin and 1 for the reference taken by isolate_lru_page().
1946340ef390SHugh Dickins */ 1947340ef390SHugh Dickins if (PageTransHuge(page) && page_count(page) != 3) { 1948340ef390SHugh Dickins putback_lru_page(page); 1949b32967ffSMel Gorman return 0; 1950b32967ffSMel Gorman } 1951b32967ffSMel Gorman 1952b32967ffSMel Gorman page_lru = page_is_file_cache(page); 1953599d0c95SMel Gorman mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, 1954340ef390SHugh Dickins hpage_nr_pages(page)); 1955b32967ffSMel Gorman 1956b32967ffSMel Gorman /* 1957340ef390SHugh Dickins * Isolating the page has taken another reference, so the 1958340ef390SHugh Dickins * caller's reference can be safely dropped without the page 1959340ef390SHugh Dickins * disappearing underneath us during migration. 1960b32967ffSMel Gorman */ 1961b32967ffSMel Gorman put_page(page); 1962340ef390SHugh Dickins return 1; 1963b32967ffSMel Gorman } 1964b32967ffSMel Gorman 1965de466bd6SMel Gorman bool pmd_trans_migrating(pmd_t pmd) 1966de466bd6SMel Gorman { 1967de466bd6SMel Gorman struct page *page = pmd_page(pmd); 1968de466bd6SMel Gorman return PageLocked(page); 1969de466bd6SMel Gorman } 1970de466bd6SMel Gorman 1971a8f60772SMel Gorman /* 19727039e1dbSPeter Zijlstra * Attempt to migrate a misplaced page to the specified destination 19737039e1dbSPeter Zijlstra * node. Caller is expected to have an elevated reference count on 19747039e1dbSPeter Zijlstra * the page that will be dropped by this function before returning. 19757039e1dbSPeter Zijlstra */ 19761bc115d8SMel Gorman int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, 19771bc115d8SMel Gorman int node) 19787039e1dbSPeter Zijlstra { 1979a8f60772SMel Gorman pg_data_t *pgdat = NODE_DATA(node); 1980340ef390SHugh Dickins int isolated; 1981b32967ffSMel Gorman int nr_remaining; 19827039e1dbSPeter Zijlstra LIST_HEAD(migratepages); 19837039e1dbSPeter Zijlstra 19847039e1dbSPeter Zijlstra /* 19851bc115d8SMel Gorman * Don't migrate file pages that are mapped in multiple processes 19861bc115d8SMel Gorman * with execute permissions as they are probably shared libraries. 19877039e1dbSPeter Zijlstra */ 19881bc115d8SMel Gorman if (page_mapcount(page) != 1 && page_is_file_cache(page) && 19891bc115d8SMel Gorman (vma->vm_flags & VM_EXEC)) 19907039e1dbSPeter Zijlstra goto out; 19917039e1dbSPeter Zijlstra 1992a8f60772SMel Gorman /* 199309a913a7SMel Gorman * Also do not migrate dirty pages as not all filesystems can move 199409a913a7SMel Gorman * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. 
199509a913a7SMel Gorman */ 199609a913a7SMel Gorman if (page_is_file_cache(page) && PageDirty(page)) 199709a913a7SMel Gorman goto out; 199809a913a7SMel Gorman 1999b32967ffSMel Gorman isolated = numamigrate_isolate_page(pgdat, page); 2000b32967ffSMel Gorman if (!isolated) 20017039e1dbSPeter Zijlstra goto out; 20027039e1dbSPeter Zijlstra 20037039e1dbSPeter Zijlstra list_add(&page->lru, &migratepages); 20049c620e2bSHugh Dickins nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 200568711a74SDavid Rientjes NULL, node, MIGRATE_ASYNC, 200668711a74SDavid Rientjes MR_NUMA_MISPLACED); 20077039e1dbSPeter Zijlstra if (nr_remaining) { 200859c82b70SJoonsoo Kim if (!list_empty(&migratepages)) { 200959c82b70SJoonsoo Kim list_del(&page->lru); 2010599d0c95SMel Gorman dec_node_page_state(page, NR_ISOLATED_ANON + 201159c82b70SJoonsoo Kim page_is_file_cache(page)); 201259c82b70SJoonsoo Kim putback_lru_page(page); 201359c82b70SJoonsoo Kim } 20147039e1dbSPeter Zijlstra isolated = 0; 201503c5a6e1SMel Gorman } else 201603c5a6e1SMel Gorman count_vm_numa_event(NUMA_PAGE_MIGRATE); 20177039e1dbSPeter Zijlstra BUG_ON(!list_empty(&migratepages)); 20187039e1dbSPeter Zijlstra return isolated; 2019340ef390SHugh Dickins 2020340ef390SHugh Dickins out: 2021340ef390SHugh Dickins put_page(page); 2022340ef390SHugh Dickins return 0; 20237039e1dbSPeter Zijlstra } 2024220018d3SMel Gorman #endif /* CONFIG_NUMA_BALANCING */ 2025b32967ffSMel Gorman 2026220018d3SMel Gorman #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 2027340ef390SHugh Dickins /* 2028340ef390SHugh Dickins * Migrates a THP to a given target node. page must be locked and is unlocked 2029340ef390SHugh Dickins * before returning. 2030340ef390SHugh Dickins */ 2031b32967ffSMel Gorman int migrate_misplaced_transhuge_page(struct mm_struct *mm, 2032b32967ffSMel Gorman struct vm_area_struct *vma, 2033b32967ffSMel Gorman pmd_t *pmd, pmd_t entry, 2034b32967ffSMel Gorman unsigned long address, 2035b32967ffSMel Gorman struct page *page, int node) 2036b32967ffSMel Gorman { 2037c4088ebdSKirill A. Shutemov spinlock_t *ptl; 2038b32967ffSMel Gorman pg_data_t *pgdat = NODE_DATA(node); 2039b32967ffSMel Gorman int isolated = 0; 2040b32967ffSMel Gorman struct page *new_page = NULL; 2041b32967ffSMel Gorman int page_lru = page_is_file_cache(page); 20427066f0f9SAndrea Arcangeli unsigned long start = address & HPAGE_PMD_MASK; 2043b32967ffSMel Gorman 2044b32967ffSMel Gorman new_page = alloc_pages_node(node, 204525160354SVlastimil Babka (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), 2046e97ca8e5SJohannes Weiner HPAGE_PMD_ORDER); 2047340ef390SHugh Dickins if (!new_page) 2048340ef390SHugh Dickins goto out_fail; 20499a982250SKirill A. Shutemov prep_transhuge_page(new_page); 2050340ef390SHugh Dickins 2051b32967ffSMel Gorman isolated = numamigrate_isolate_page(pgdat, page); 2052340ef390SHugh Dickins if (!isolated) { 2053b32967ffSMel Gorman put_page(new_page); 2054340ef390SHugh Dickins goto out_fail; 2055b32967ffSMel Gorman } 2056b0943d61SMel Gorman 2057b32967ffSMel Gorman /* Prepare a page as a migration target */ 205848c935adSKirill A. 
Shutemov __SetPageLocked(new_page); 2059d44d363fSShaohua Li if (PageSwapBacked(page)) 2060fa9949daSHugh Dickins __SetPageSwapBacked(new_page); 2061b32967ffSMel Gorman 2062b32967ffSMel Gorman /* anon mapping, we can simply copy page->mapping to the new page: */ 2063b32967ffSMel Gorman new_page->mapping = page->mapping; 2064b32967ffSMel Gorman new_page->index = page->index; 20657eef5f97SAndrea Arcangeli /* flush the cache before copying using the kernel virtual address */ 20667eef5f97SAndrea Arcangeli flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); 2067b32967ffSMel Gorman migrate_page_copy(new_page, page); 2068b32967ffSMel Gorman WARN_ON(PageLRU(new_page)); 2069b32967ffSMel Gorman 2070b32967ffSMel Gorman /* Recheck the target PMD */ 2071c4088ebdSKirill A. Shutemov ptl = pmd_lock(mm, pmd); 2072f4e177d1SWill Deacon if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { 2073c4088ebdSKirill A. Shutemov spin_unlock(ptl); 2074b32967ffSMel Gorman 2075b32967ffSMel Gorman /* Reverse changes made by migrate_page_copy() */ 2076b32967ffSMel Gorman if (TestClearPageActive(new_page)) 2077b32967ffSMel Gorman SetPageActive(page); 2078b32967ffSMel Gorman if (TestClearPageUnevictable(new_page)) 2079b32967ffSMel Gorman SetPageUnevictable(page); 2080b32967ffSMel Gorman 2081b32967ffSMel Gorman unlock_page(new_page); 2082b32967ffSMel Gorman put_page(new_page); /* Free it */ 2083b32967ffSMel Gorman 2084a54a407fSMel Gorman /* Retake the callers reference and putback on LRU */ 2085a54a407fSMel Gorman get_page(page); 2086b32967ffSMel Gorman putback_lru_page(page); 2087599d0c95SMel Gorman mod_node_page_state(page_pgdat(page), 2088a54a407fSMel Gorman NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); 2089eb4489f6SMel Gorman 2090eb4489f6SMel Gorman goto out_unlock; 2091b32967ffSMel Gorman } 2092b32967ffSMel Gorman 209310102459SKirill A. Shutemov entry = mk_huge_pmd(new_page, vma->vm_page_prot); 2094f55e1014SLinus Torvalds entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 2095b32967ffSMel Gorman 20962b4847e7SMel Gorman /* 2097d7c33934SAndrea Arcangeli * Overwrite the old entry under pagetable lock and establish 2098d7c33934SAndrea Arcangeli * the new PTE. Any parallel GUP will either observe the old 2099d7c33934SAndrea Arcangeli * page blocking on the page lock, block on the page table 2100d7c33934SAndrea Arcangeli * lock or observe the new page. The SetPageUptodate on the 2101d7c33934SAndrea Arcangeli * new page and page_add_new_anon_rmap guarantee the copy is 2102d7c33934SAndrea Arcangeli * visible before the pagetable update. 21032b4847e7SMel Gorman */ 21047066f0f9SAndrea Arcangeli page_add_anon_rmap(new_page, vma, start, true); 2105d7c33934SAndrea Arcangeli /* 2106d7c33934SAndrea Arcangeli * At this point the pmd is numa/protnone (i.e. non present) and the TLB 2107d7c33934SAndrea Arcangeli * has already been flushed globally. So no TLB can be currently 2108d7c33934SAndrea Arcangeli * caching this non present pmd mapping. There's no need to clear the 2109d7c33934SAndrea Arcangeli * pmd before doing set_pmd_at(), nor to flush the TLB after 2110d7c33934SAndrea Arcangeli * set_pmd_at(). Clearing the pmd here would introduce a race 2111d7c33934SAndrea Arcangeli * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the 2112d7c33934SAndrea Arcangeli * mmap_sem for reading. If the pmd is set to NULL at any given time, 2113d7c33934SAndrea Arcangeli * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this 2114d7c33934SAndrea Arcangeli * pmd. 
2115d7c33934SAndrea Arcangeli */ 21167066f0f9SAndrea Arcangeli set_pmd_at(mm, start, pmd, entry); 2117ce4a9cc5SStephen Rothwell update_mmu_cache_pmd(vma, address, &entry); 21182b4847e7SMel Gorman 2119f4e177d1SWill Deacon page_ref_unfreeze(page, 2); 212051afb12bSHugh Dickins mlock_migrate_page(new_page, page); 2121d281ee61SKirill A. Shutemov page_remove_rmap(page, true); 21227cd12b4aSVlastimil Babka set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); 21232b4847e7SMel Gorman 2124c4088ebdSKirill A. Shutemov spin_unlock(ptl); 2125b32967ffSMel Gorman 212611de9927SMel Gorman /* Take an "isolate" reference and put new page on the LRU. */ 212711de9927SMel Gorman get_page(new_page); 212811de9927SMel Gorman putback_lru_page(new_page); 212911de9927SMel Gorman 2130b32967ffSMel Gorman unlock_page(new_page); 2131b32967ffSMel Gorman unlock_page(page); 2132b32967ffSMel Gorman put_page(page); /* Drop the rmap reference */ 2133b32967ffSMel Gorman put_page(page); /* Drop the LRU isolation reference */ 2134b32967ffSMel Gorman 2135b32967ffSMel Gorman count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); 2136b32967ffSMel Gorman count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); 2137b32967ffSMel Gorman 2138599d0c95SMel Gorman mod_node_page_state(page_pgdat(page), 2139b32967ffSMel Gorman NR_ISOLATED_ANON + page_lru, 2140b32967ffSMel Gorman -HPAGE_PMD_NR); 2141b32967ffSMel Gorman return isolated; 2142b32967ffSMel Gorman 2143340ef390SHugh Dickins out_fail: 2144340ef390SHugh Dickins count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 21452b4847e7SMel Gorman ptl = pmd_lock(mm, pmd); 21462b4847e7SMel Gorman if (pmd_same(*pmd, entry)) { 21474d942466SMel Gorman entry = pmd_modify(entry, vma->vm_page_prot); 21487066f0f9SAndrea Arcangeli set_pmd_at(mm, start, pmd, entry); 2149a54a407fSMel Gorman update_mmu_cache_pmd(vma, address, &entry); 21502b4847e7SMel Gorman } 21512b4847e7SMel Gorman spin_unlock(ptl); 2152a54a407fSMel Gorman 2153eb4489f6SMel Gorman out_unlock: 2154340ef390SHugh Dickins unlock_page(page); 2155b32967ffSMel Gorman put_page(page); 2156b32967ffSMel Gorman return 0; 2157b32967ffSMel Gorman } 21587039e1dbSPeter Zijlstra #endif /* CONFIG_NUMA_BALANCING */ 21597039e1dbSPeter Zijlstra 21607039e1dbSPeter Zijlstra #endif /* CONFIG_NUMA */ 21618763cb45SJérôme Glisse 21629b2ed9cbSChristoph Hellwig #ifdef CONFIG_DEVICE_PRIVATE 21638763cb45SJérôme Glisse static int migrate_vma_collect_hole(unsigned long start, 21648763cb45SJérôme Glisse unsigned long end, 2165b7a16c7aSSteven Price __always_unused int depth, 21668763cb45SJérôme Glisse struct mm_walk *walk) 21678763cb45SJérôme Glisse { 21688763cb45SJérôme Glisse struct migrate_vma *migrate = walk->private; 21698763cb45SJérôme Glisse unsigned long addr; 21708763cb45SJérôme Glisse 2171872ea707SRalph Campbell for (addr = start; addr < end; addr += PAGE_SIZE) { 2172e20d103bSMark Hairgrove migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; 21738315ada7SJérôme Glisse migrate->dst[migrate->npages] = 0; 2174e20d103bSMark Hairgrove migrate->npages++; 21758315ada7SJérôme Glisse migrate->cpages++; 21768315ada7SJérôme Glisse } 21778315ada7SJérôme Glisse 21788315ada7SJérôme Glisse return 0; 21798315ada7SJérôme Glisse } 21808315ada7SJérôme Glisse 21818315ada7SJérôme Glisse static int migrate_vma_collect_skip(unsigned long start, 21828315ada7SJérôme Glisse unsigned long end, 21838315ada7SJérôme Glisse struct mm_walk *walk) 21848315ada7SJérôme Glisse { 21858315ada7SJérôme Glisse struct migrate_vma *migrate = walk->private; 21868315ada7SJérôme Glisse unsigned long addr; 
21878315ada7SJérôme Glisse 2188872ea707SRalph Campbell for (addr = start; addr < end; addr += PAGE_SIZE) { 21898763cb45SJérôme Glisse migrate->dst[migrate->npages] = 0; 21908763cb45SJérôme Glisse migrate->src[migrate->npages++] = 0; 21918763cb45SJérôme Glisse } 21928763cb45SJérôme Glisse 21938763cb45SJérôme Glisse return 0; 21948763cb45SJérôme Glisse } 21958763cb45SJérôme Glisse 21968763cb45SJérôme Glisse static int migrate_vma_collect_pmd(pmd_t *pmdp, 21978763cb45SJérôme Glisse unsigned long start, 21988763cb45SJérôme Glisse unsigned long end, 21998763cb45SJérôme Glisse struct mm_walk *walk) 22008763cb45SJérôme Glisse { 22018763cb45SJérôme Glisse struct migrate_vma *migrate = walk->private; 22028763cb45SJérôme Glisse struct vm_area_struct *vma = walk->vma; 22038763cb45SJérôme Glisse struct mm_struct *mm = vma->vm_mm; 22048c3328f1SJérôme Glisse unsigned long addr = start, unmapped = 0; 22058763cb45SJérôme Glisse spinlock_t *ptl; 22068763cb45SJérôme Glisse pte_t *ptep; 22078763cb45SJérôme Glisse 22088763cb45SJérôme Glisse again: 22098763cb45SJérôme Glisse if (pmd_none(*pmdp)) 2210b7a16c7aSSteven Price return migrate_vma_collect_hole(start, end, -1, walk); 22118763cb45SJérôme Glisse 22128763cb45SJérôme Glisse if (pmd_trans_huge(*pmdp)) { 22138763cb45SJérôme Glisse struct page *page; 22148763cb45SJérôme Glisse 22158763cb45SJérôme Glisse ptl = pmd_lock(mm, pmdp); 22168763cb45SJérôme Glisse if (unlikely(!pmd_trans_huge(*pmdp))) { 22178763cb45SJérôme Glisse spin_unlock(ptl); 22188763cb45SJérôme Glisse goto again; 22198763cb45SJérôme Glisse } 22208763cb45SJérôme Glisse 22218763cb45SJérôme Glisse page = pmd_page(*pmdp); 22228763cb45SJérôme Glisse if (is_huge_zero_page(page)) { 22238763cb45SJérôme Glisse spin_unlock(ptl); 22248763cb45SJérôme Glisse split_huge_pmd(vma, pmdp, addr); 22258763cb45SJérôme Glisse if (pmd_trans_unstable(pmdp)) 22268315ada7SJérôme Glisse return migrate_vma_collect_skip(start, end, 22278763cb45SJérôme Glisse walk); 22288763cb45SJérôme Glisse } else { 22298763cb45SJérôme Glisse int ret; 22308763cb45SJérôme Glisse 22318763cb45SJérôme Glisse get_page(page); 22328763cb45SJérôme Glisse spin_unlock(ptl); 22338763cb45SJérôme Glisse if (unlikely(!trylock_page(page))) 22348315ada7SJérôme Glisse return migrate_vma_collect_skip(start, end, 22358763cb45SJérôme Glisse walk); 22368763cb45SJérôme Glisse ret = split_huge_page(page); 22378763cb45SJérôme Glisse unlock_page(page); 22388763cb45SJérôme Glisse put_page(page); 22398315ada7SJérôme Glisse if (ret) 22408315ada7SJérôme Glisse return migrate_vma_collect_skip(start, end, 22418315ada7SJérôme Glisse walk); 22428315ada7SJérôme Glisse if (pmd_none(*pmdp)) 2243b7a16c7aSSteven Price return migrate_vma_collect_hole(start, end, -1, 22448763cb45SJérôme Glisse walk); 22458763cb45SJérôme Glisse } 22468763cb45SJérôme Glisse } 22478763cb45SJérôme Glisse 22488763cb45SJérôme Glisse if (unlikely(pmd_bad(*pmdp))) 22498315ada7SJérôme Glisse return migrate_vma_collect_skip(start, end, walk); 22508763cb45SJérôme Glisse 22518763cb45SJérôme Glisse ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 22528c3328f1SJérôme Glisse arch_enter_lazy_mmu_mode(); 22538c3328f1SJérôme Glisse 22548763cb45SJérôme Glisse for (; addr < end; addr += PAGE_SIZE, ptep++) { 2255800bb1c8SChristoph Hellwig unsigned long mpfn = 0, pfn; 22568763cb45SJérôme Glisse struct page *page; 22578c3328f1SJérôme Glisse swp_entry_t entry; 22588763cb45SJérôme Glisse pte_t pte; 22598763cb45SJérôme Glisse 22608763cb45SJérôme Glisse pte = *ptep; 22618763cb45SJérôme Glisse 2262a5430ddaSJérôme Glisse 
if (pte_none(pte)) { 22638315ada7SJérôme Glisse mpfn = MIGRATE_PFN_MIGRATE; 22648315ada7SJérôme Glisse migrate->cpages++; 22658763cb45SJérôme Glisse goto next; 22668763cb45SJérôme Glisse } 22678763cb45SJérôme Glisse 2268a5430ddaSJérôme Glisse if (!pte_present(pte)) { 2269a5430ddaSJérôme Glisse /* 2270a5430ddaSJérôme Glisse * Only care about unaddressable device page special 2271a5430ddaSJérôme Glisse * page table entries. Other special swap entries are not 2272a5430ddaSJérôme Glisse * migratable, and we ignore regular swapped pages. 2273a5430ddaSJérôme Glisse */ 2274a5430ddaSJérôme Glisse entry = pte_to_swp_entry(pte); 2275a5430ddaSJérôme Glisse if (!is_device_private_entry(entry)) 2276a5430ddaSJérôme Glisse goto next; 2277a5430ddaSJérôme Glisse 2278a5430ddaSJérôme Glisse page = device_private_entry_to_page(entry); 2279800bb1c8SChristoph Hellwig if (page->pgmap->owner != migrate->src_owner) 2280800bb1c8SChristoph Hellwig goto next; 2281800bb1c8SChristoph Hellwig 2282a5430ddaSJérôme Glisse mpfn = migrate_pfn(page_to_pfn(page)) | 228306d462beSChristoph Hellwig MIGRATE_PFN_MIGRATE; 2284a5430ddaSJérôme Glisse if (is_write_device_private_entry(entry)) 2285a5430ddaSJérôme Glisse mpfn |= MIGRATE_PFN_WRITE; 2286a5430ddaSJérôme Glisse } else { 2287800bb1c8SChristoph Hellwig if (migrate->src_owner) 2288800bb1c8SChristoph Hellwig goto next; 2289276f756dSPingfan Liu pfn = pte_pfn(pte); 22908315ada7SJérôme Glisse if (is_zero_pfn(pfn)) { 22918315ada7SJérôme Glisse mpfn = MIGRATE_PFN_MIGRATE; 22928315ada7SJérôme Glisse migrate->cpages++; 22938315ada7SJérôme Glisse goto next; 22948315ada7SJérôme Glisse } 229525b2995aSChristoph Hellwig page = vm_normal_page(migrate->vma, addr, pte); 2296a5430ddaSJérôme Glisse mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 2297a5430ddaSJérôme Glisse mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; 2298a5430ddaSJérôme Glisse } 2299a5430ddaSJérôme Glisse 2300a5430ddaSJérôme Glisse /* FIXME support THP */ 23018763cb45SJérôme Glisse if (!page || !page->mapping || PageTransCompound(page)) { 2302276f756dSPingfan Liu mpfn = 0; 23038763cb45SJérôme Glisse goto next; 23048763cb45SJérôme Glisse } 23058763cb45SJérôme Glisse 23068763cb45SJérôme Glisse /* 23078763cb45SJérôme Glisse * By getting a reference on the page we pin it and that blocks 23088763cb45SJérôme Glisse * any kind of migration. A side effect is that it "freezes" the 23098763cb45SJérôme Glisse * pte. 23108763cb45SJérôme Glisse * 23118763cb45SJérôme Glisse * We drop this reference after isolating the page from the lru 23128763cb45SJérôme Glisse * for non-device pages (device pages are not on the lru and thus 23138763cb45SJérôme Glisse * can't be dropped from it). 23148763cb45SJérôme Glisse */ 23158763cb45SJérôme Glisse get_page(page); 23168763cb45SJérôme Glisse migrate->cpages++; 23178763cb45SJérôme Glisse 23188c3328f1SJérôme Glisse /* 23198c3328f1SJérôme Glisse * Optimize for the common case where the page is only mapped once 23208c3328f1SJérôme Glisse * in one process. If we can lock the page, then we can safely 23218c3328f1SJérôme Glisse * set up a special migration page table entry now.
23228c3328f1SJérôme Glisse */ 23238c3328f1SJérôme Glisse if (trylock_page(page)) { 23248c3328f1SJérôme Glisse pte_t swp_pte; 23258c3328f1SJérôme Glisse 23268c3328f1SJérôme Glisse mpfn |= MIGRATE_PFN_LOCKED; 23278c3328f1SJérôme Glisse ptep_get_and_clear(mm, addr, ptep); 23288c3328f1SJérôme Glisse 23298c3328f1SJérôme Glisse /* Setup special migration page table entry */ 233007707125SRalph Campbell entry = make_migration_entry(page, mpfn & 233107707125SRalph Campbell MIGRATE_PFN_WRITE); 23328c3328f1SJérôme Glisse swp_pte = swp_entry_to_pte(entry); 23338c3328f1SJérôme Glisse if (pte_soft_dirty(pte)) 23348c3328f1SJérôme Glisse swp_pte = pte_swp_mksoft_dirty(swp_pte); 23358c3328f1SJérôme Glisse set_pte_at(mm, addr, ptep, swp_pte); 23368c3328f1SJérôme Glisse 23378c3328f1SJérôme Glisse /* 23388c3328f1SJérôme Glisse * This is like regular unmap: we remove the rmap and 23398c3328f1SJérôme Glisse * drop page refcount. Page won't be freed, as we took 23408c3328f1SJérôme Glisse * a reference just above. 23418c3328f1SJérôme Glisse */ 23428c3328f1SJérôme Glisse page_remove_rmap(page, false); 23438c3328f1SJérôme Glisse put_page(page); 2344a5430ddaSJérôme Glisse 2345a5430ddaSJérôme Glisse if (pte_present(pte)) 23468c3328f1SJérôme Glisse unmapped++; 23478c3328f1SJérôme Glisse } 23488c3328f1SJérôme Glisse 23498763cb45SJérôme Glisse next: 2350a5430ddaSJérôme Glisse migrate->dst[migrate->npages] = 0; 23518763cb45SJérôme Glisse migrate->src[migrate->npages++] = mpfn; 23528763cb45SJérôme Glisse } 23538c3328f1SJérôme Glisse arch_leave_lazy_mmu_mode(); 23548763cb45SJérôme Glisse pte_unmap_unlock(ptep - 1, ptl); 23558763cb45SJérôme Glisse 23568c3328f1SJérôme Glisse /* Only flush the TLB if we actually modified any entries */ 23578c3328f1SJérôme Glisse if (unmapped) 23588c3328f1SJérôme Glisse flush_tlb_range(walk->vma, start, end); 23598c3328f1SJérôme Glisse 23608763cb45SJérôme Glisse return 0; 23618763cb45SJérôme Glisse } 23628763cb45SJérôme Glisse 23637b86ac33SChristoph Hellwig static const struct mm_walk_ops migrate_vma_walk_ops = { 23647b86ac33SChristoph Hellwig .pmd_entry = migrate_vma_collect_pmd, 23657b86ac33SChristoph Hellwig .pte_hole = migrate_vma_collect_hole, 23667b86ac33SChristoph Hellwig }; 23677b86ac33SChristoph Hellwig 23688763cb45SJérôme Glisse /* 23698763cb45SJérôme Glisse * migrate_vma_collect() - collect pages over a range of virtual addresses 23708763cb45SJérôme Glisse * @migrate: migrate struct containing all migration information 23718763cb45SJérôme Glisse * 23728763cb45SJérôme Glisse * This will walk the CPU page table. For each virtual address backed by a 23738763cb45SJérôme Glisse * valid page, it updates the src array and takes a reference on the page, in 23748763cb45SJérôme Glisse * order to pin the page until we lock it and unmap it. 
23758763cb45SJérôme Glisse  */
23768763cb45SJérôme Glisse static void migrate_vma_collect(struct migrate_vma *migrate)
23778763cb45SJérôme Glisse {
2378ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
23798763cb45SJérôme Glisse 
23807b86ac33SChristoph Hellwig 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
23817b86ac33SChristoph Hellwig 			migrate->vma->vm_mm, migrate->start, migrate->end);
2382ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
23838763cb45SJérôme Glisse 
23847b86ac33SChristoph Hellwig 	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
23857b86ac33SChristoph Hellwig 			&migrate_vma_walk_ops, migrate);
23867b86ac33SChristoph Hellwig 
23877b86ac33SChristoph Hellwig 	mmu_notifier_invalidate_range_end(&range);
23888763cb45SJérôme Glisse 	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
23898763cb45SJérôme Glisse }
23908763cb45SJérôme Glisse 
23918763cb45SJérôme Glisse /*
23928763cb45SJérôme Glisse  * migrate_vma_check_page() - check if page is pinned or not
23938763cb45SJérôme Glisse  * @page: struct page to check
23948763cb45SJérôme Glisse  *
23958763cb45SJérôme Glisse  * Pinned pages cannot be migrated. This is the same test as in
23968763cb45SJérôme Glisse  * migrate_page_move_mapping(), except that here we allow migration of a
23978763cb45SJérôme Glisse  * ZONE_DEVICE page.
23988763cb45SJérôme Glisse  */
23998763cb45SJérôme Glisse static bool migrate_vma_check_page(struct page *page)
24008763cb45SJérôme Glisse {
24018763cb45SJérôme Glisse 	/*
24028763cb45SJérôme Glisse 	 * One extra ref because caller holds an extra reference, either from
24038763cb45SJérôme Glisse 	 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
24048763cb45SJérôme Glisse 	 * a device page.
24058763cb45SJérôme Glisse 	 */
24068763cb45SJérôme Glisse 	int extra = 1;
24078763cb45SJérôme Glisse 
24088763cb45SJérôme Glisse 	/*
24098763cb45SJérôme Glisse 	 * FIXME support THP (transparent huge page), it is a bit more complex to
24108763cb45SJérôme Glisse 	 * check them than regular pages, because they can be mapped with a pmd
24118763cb45SJérôme Glisse 	 * or with a pte (split pte mapping).
24128763cb45SJérôme Glisse 	 */
24138763cb45SJérôme Glisse 	if (PageCompound(page))
24148763cb45SJérôme Glisse 		return false;
24158763cb45SJérôme Glisse 
2416a5430ddaSJérôme Glisse 	/* Pages from ZONE_DEVICE have one extra reference */
2417a5430ddaSJérôme Glisse 	if (is_zone_device_page(page)) {
2418a5430ddaSJérôme Glisse 		/*
2419a5430ddaSJérôme Glisse 		 * Private pages can never be pinned as they have no valid pte and
2420a5430ddaSJérôme Glisse 		 * GUP will fail for those. Yet if there is a pending migration
2421a5430ddaSJérôme Glisse 		 * a thread might try to wait on the pte migration entry and
2422a5430ddaSJérôme Glisse 		 * will bump the page reference count. Sadly there is no way to
2423a5430ddaSJérôme Glisse 		 * differentiate a regular pin from a migration wait. Hence, to
2424a5430ddaSJérôme Glisse 		 * avoid two racing threads trying to migrate back to the CPU and
2425a5430ddaSJérôme Glisse 		 * entering an infinite loop (one stopping migration because the
2426a5430ddaSJérôme Glisse 		 * other is waiting on the pte migration entry), we always return true here.
2427a5430ddaSJérôme Glisse 		 *
2428a5430ddaSJérôme Glisse 		 * FIXME proper solution is to rework migration_entry_wait() so
2429a5430ddaSJérôme Glisse 		 * it does not need to take a reference on page.
2430a5430ddaSJérôme Glisse 		 */
243125b2995aSChristoph Hellwig 		return is_device_private_page(page);
2432a5430ddaSJérôme Glisse 	}
2433a5430ddaSJérôme Glisse 
2434df6ad698SJérôme Glisse 	/* For file-backed pages */
2435df6ad698SJérôme Glisse 	if (page_mapping(page))
2436df6ad698SJérôme Glisse 		extra += 1 + page_has_private(page);
2437df6ad698SJérôme Glisse 
24388763cb45SJérôme Glisse 	if ((page_count(page) - extra) > page_mapcount(page))
24398763cb45SJérôme Glisse 		return false;
24408763cb45SJérôme Glisse 
24418763cb45SJérôme Glisse 	return true;
24428763cb45SJérôme Glisse }
24438763cb45SJérôme Glisse 
24448763cb45SJérôme Glisse /*
24458763cb45SJérôme Glisse  * migrate_vma_prepare() - lock pages and isolate them from the lru
24468763cb45SJérôme Glisse  * @migrate: migrate struct containing all migration information
24478763cb45SJérôme Glisse  *
24488763cb45SJérôme Glisse  * This locks pages that have been collected by migrate_vma_collect(). Once each
24498763cb45SJérôme Glisse  * page is locked it is isolated from the lru (for non-device pages). Finally,
24508763cb45SJérôme Glisse  * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
24518763cb45SJérôme Glisse  * migrated by concurrent kernel threads.
24528763cb45SJérôme Glisse  */
24538763cb45SJérôme Glisse static void migrate_vma_prepare(struct migrate_vma *migrate)
24548763cb45SJérôme Glisse {
24558763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
24568c3328f1SJérôme Glisse 	const unsigned long start = migrate->start;
24578c3328f1SJérôme Glisse 	unsigned long addr, i, restore = 0;
24588763cb45SJérôme Glisse 	bool allow_drain = true;
24598763cb45SJérôme Glisse 
24608763cb45SJérôme Glisse 	lru_add_drain();
24618763cb45SJérôme Glisse 
24628763cb45SJérôme Glisse 	for (i = 0; (i < npages) && migrate->cpages; i++) {
24638763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
24648c3328f1SJérôme Glisse 		bool remap = true;
24658763cb45SJérôme Glisse 
24668763cb45SJérôme Glisse 		if (!page)
24678763cb45SJérôme Glisse 			continue;
24688763cb45SJérôme Glisse 
24698c3328f1SJérôme Glisse 		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
24708763cb45SJérôme Glisse 			/*
24718763cb45SJérôme Glisse 			 * Because we are migrating several pages there can be
24728763cb45SJérôme Glisse 			 * a deadlock between two concurrent migrations where
24738763cb45SJérôme Glisse 			 * each is waiting on the other's page lock.
24748763cb45SJérôme Glisse 			 *
24758763cb45SJérôme Glisse 			 * Make migrate_vma() a best effort thing and back off
24768763cb45SJérôme Glisse 			 * for any page we cannot lock right away.
24778763cb45SJérôme Glisse */ 24788763cb45SJérôme Glisse if (!trylock_page(page)) { 24798763cb45SJérôme Glisse migrate->src[i] = 0; 24808763cb45SJérôme Glisse migrate->cpages--; 24818763cb45SJérôme Glisse put_page(page); 24828763cb45SJérôme Glisse continue; 24838763cb45SJérôme Glisse } 24848c3328f1SJérôme Glisse remap = false; 24858763cb45SJérôme Glisse migrate->src[i] |= MIGRATE_PFN_LOCKED; 24868c3328f1SJérôme Glisse } 24878763cb45SJérôme Glisse 2488a5430ddaSJérôme Glisse /* ZONE_DEVICE pages are not on LRU */ 2489a5430ddaSJérôme Glisse if (!is_zone_device_page(page)) { 24908763cb45SJérôme Glisse if (!PageLRU(page) && allow_drain) { 24918763cb45SJérôme Glisse /* Drain CPU's pagevec */ 24928763cb45SJérôme Glisse lru_add_drain_all(); 24938763cb45SJérôme Glisse allow_drain = false; 24948763cb45SJérôme Glisse } 24958763cb45SJérôme Glisse 24968763cb45SJérôme Glisse if (isolate_lru_page(page)) { 24978c3328f1SJérôme Glisse if (remap) { 24988c3328f1SJérôme Glisse migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 24998c3328f1SJérôme Glisse migrate->cpages--; 25008c3328f1SJérôme Glisse restore++; 25018c3328f1SJérôme Glisse } else { 25028763cb45SJérôme Glisse migrate->src[i] = 0; 25038763cb45SJérôme Glisse unlock_page(page); 25048763cb45SJérôme Glisse migrate->cpages--; 25058763cb45SJérôme Glisse put_page(page); 25068c3328f1SJérôme Glisse } 25078763cb45SJérôme Glisse continue; 25088763cb45SJérôme Glisse } 25098763cb45SJérôme Glisse 2510a5430ddaSJérôme Glisse /* Drop the reference we took in collect */ 2511a5430ddaSJérôme Glisse put_page(page); 2512a5430ddaSJérôme Glisse } 2513a5430ddaSJérôme Glisse 25148763cb45SJérôme Glisse if (!migrate_vma_check_page(page)) { 25158c3328f1SJérôme Glisse if (remap) { 25168c3328f1SJérôme Glisse migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 25178c3328f1SJérôme Glisse migrate->cpages--; 25188c3328f1SJérôme Glisse restore++; 25198c3328f1SJérôme Glisse 2520a5430ddaSJérôme Glisse if (!is_zone_device_page(page)) { 25218c3328f1SJérôme Glisse get_page(page); 25228c3328f1SJérôme Glisse putback_lru_page(page); 2523a5430ddaSJérôme Glisse } 25248c3328f1SJérôme Glisse } else { 25258763cb45SJérôme Glisse migrate->src[i] = 0; 25268763cb45SJérôme Glisse unlock_page(page); 25278763cb45SJérôme Glisse migrate->cpages--; 25288763cb45SJérôme Glisse 2529a5430ddaSJérôme Glisse if (!is_zone_device_page(page)) 25308763cb45SJérôme Glisse putback_lru_page(page); 2531a5430ddaSJérôme Glisse else 2532a5430ddaSJérôme Glisse put_page(page); 25338763cb45SJérôme Glisse } 25348763cb45SJérôme Glisse } 25358763cb45SJérôme Glisse } 25368763cb45SJérôme Glisse 25378c3328f1SJérôme Glisse for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) { 25388c3328f1SJérôme Glisse struct page *page = migrate_pfn_to_page(migrate->src[i]); 25398c3328f1SJérôme Glisse 25408c3328f1SJérôme Glisse if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 25418c3328f1SJérôme Glisse continue; 25428c3328f1SJérôme Glisse 25438c3328f1SJérôme Glisse remove_migration_pte(page, migrate->vma, addr, page); 25448c3328f1SJérôme Glisse 25458c3328f1SJérôme Glisse migrate->src[i] = 0; 25468c3328f1SJérôme Glisse unlock_page(page); 25478c3328f1SJérôme Glisse put_page(page); 25488c3328f1SJérôme Glisse restore--; 25498c3328f1SJérôme Glisse } 25508c3328f1SJérôme Glisse } 25518c3328f1SJérôme Glisse 25528763cb45SJérôme Glisse /* 25538763cb45SJérôme Glisse * migrate_vma_unmap() - replace page mapping with special migration pte entry 25548763cb45SJérôme Glisse * @migrate: migrate struct containing all migration information 25558763cb45SJérôme 
Glisse  *
25568763cb45SJérôme Glisse  * Replace page mapping (CPU page table pte) with a special migration pte entry
25578763cb45SJérôme Glisse  * and check again if it has been pinned. Pinned pages are restored because we
25588763cb45SJérôme Glisse  * cannot migrate them.
25598763cb45SJérôme Glisse  *
25608763cb45SJérôme Glisse  * This is the last step before we call the device driver callback to allocate
25618763cb45SJérôme Glisse  * destination memory and copy contents of original page over to new page.
25628763cb45SJérôme Glisse  */
25638763cb45SJérôme Glisse static void migrate_vma_unmap(struct migrate_vma *migrate)
25648763cb45SJérôme Glisse {
25658763cb45SJérôme Glisse 	int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
25668763cb45SJérôme Glisse 	const unsigned long npages = migrate->npages;
25678763cb45SJérôme Glisse 	const unsigned long start = migrate->start;
25688763cb45SJérôme Glisse 	unsigned long addr, i, restore = 0;
25698763cb45SJérôme Glisse 
25708763cb45SJérôme Glisse 	for (i = 0; i < npages; i++) {
25718763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25728763cb45SJérôme Glisse 
25738763cb45SJérôme Glisse 		if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
25748763cb45SJérôme Glisse 			continue;
25758763cb45SJérôme Glisse 
25768c3328f1SJérôme Glisse 		if (page_mapped(page)) {
25778763cb45SJérôme Glisse 			try_to_unmap(page, flags);
25788c3328f1SJérôme Glisse 			if (page_mapped(page))
25798c3328f1SJérôme Glisse 				goto restore;
25808c3328f1SJérôme Glisse 		}
25818c3328f1SJérôme Glisse 
25828c3328f1SJérôme Glisse 		if (migrate_vma_check_page(page))
25838c3328f1SJérôme Glisse 			continue;
25848c3328f1SJérôme Glisse 
25858c3328f1SJérôme Glisse restore:
25868763cb45SJérôme Glisse 		migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
25878763cb45SJérôme Glisse 		migrate->cpages--;
25888763cb45SJérôme Glisse 		restore++;
25898763cb45SJérôme Glisse 	}
25908763cb45SJérôme Glisse 
25918763cb45SJérôme Glisse 	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
25928763cb45SJérôme Glisse 		struct page *page = migrate_pfn_to_page(migrate->src[i]);
25938763cb45SJérôme Glisse 
25948763cb45SJérôme Glisse 		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
25958763cb45SJérôme Glisse 			continue;
25968763cb45SJérôme Glisse 
25978763cb45SJérôme Glisse 		remove_migration_ptes(page, page, false);
25988763cb45SJérôme Glisse 
25998763cb45SJérôme Glisse 		migrate->src[i] = 0;
26008763cb45SJérôme Glisse 		unlock_page(page);
26018763cb45SJérôme Glisse 		restore--;
26028763cb45SJérôme Glisse 
2603a5430ddaSJérôme Glisse 		if (is_zone_device_page(page))
2604a5430ddaSJérôme Glisse 			put_page(page);
2605a5430ddaSJérôme Glisse 		else
26068763cb45SJérôme Glisse 			putback_lru_page(page);
26078763cb45SJérôme Glisse 	}
26088763cb45SJérôme Glisse }
26098763cb45SJérôme Glisse 
2610a7d1f22bSChristoph Hellwig /**
2611a7d1f22bSChristoph Hellwig  * migrate_vma_setup() - prepare to migrate a range of memory
2612a7d1f22bSChristoph Hellwig  * @args: contains the vma, start, and pfns arrays for the migration
2613a7d1f22bSChristoph Hellwig  *
2614a7d1f22bSChristoph Hellwig  * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2615a7d1f22bSChristoph Hellwig  * without an error.
2616a7d1f22bSChristoph Hellwig  *
2617a7d1f22bSChristoph Hellwig  * Prepare to migrate a range of memory by collecting all
2618a7d1f22bSChristoph Hellwig  * the pages backing each virtual address in the range, saving them inside the
2619a7d1f22bSChristoph Hellwig  * src array.  Then lock those pages and unmap them.
Once the pages are locked
2620a7d1f22bSChristoph Hellwig  * and unmapped, check whether each page is pinned or not.  Pages that aren't
2621a7d1f22bSChristoph Hellwig  * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2622a7d1f22bSChristoph Hellwig  * corresponding src array entry.  Pinned pages are then restored, by
2623a7d1f22bSChristoph Hellwig  * remapping and unlocking them.
2624a7d1f22bSChristoph Hellwig  *
2625a7d1f22bSChristoph Hellwig  * The caller should then allocate destination memory and copy source memory to
2626a7d1f22bSChristoph Hellwig  * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2627a7d1f22bSChristoph Hellwig  * flag set).  Once these are allocated and copied, the caller must update each
2628a7d1f22bSChristoph Hellwig  * corresponding entry in the dst array with the pfn value of the destination
2629a7d1f22bSChristoph Hellwig  * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2630a7d1f22bSChristoph Hellwig  * (destination pages must have their struct pages locked, via lock_page()).
2631a7d1f22bSChristoph Hellwig  *
2632a7d1f22bSChristoph Hellwig  * Note that the caller does not have to migrate all the pages that are marked
2633a7d1f22bSChristoph Hellwig  * with the MIGRATE_PFN_MIGRATE flag in the src array unless this is a migration
2634a7d1f22bSChristoph Hellwig  * from device memory to system memory.  If the caller cannot migrate a device page
2635a7d1f22bSChristoph Hellwig  * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2636a7d1f22bSChristoph Hellwig  * consequences for the userspace process, so it must be avoided if at all
2637a7d1f22bSChristoph Hellwig  * possible.
2638a7d1f22bSChristoph Hellwig  *
2639a7d1f22bSChristoph Hellwig  * For empty entries inside the CPU page table (pte_none() or pmd_none() is true)
2640a7d1f22bSChristoph Hellwig  * we do set the MIGRATE_PFN_MIGRATE flag inside the corresponding source array
2641a7d1f22bSChristoph Hellwig  * entry, thus allowing the caller to allocate device memory for those unbacked
2642a7d1f22bSChristoph Hellwig  * virtual addresses.  For this the caller simply has to allocate device memory and
2643a7d1f22bSChristoph Hellwig  * properly set the destination entry like for regular migration.  Note that
2644a7d1f22bSChristoph Hellwig  * this can still fail, and thus the device driver must check whether the
2645a7d1f22bSChristoph Hellwig  * migration was successful for those entries after calling migrate_vma_pages(),
2646a7d1f22bSChristoph Hellwig  * just like for regular migration.
2647a7d1f22bSChristoph Hellwig  *
2648a7d1f22bSChristoph Hellwig  * After that, the caller must call migrate_vma_pages() to go over each entry
2649a7d1f22bSChristoph Hellwig  * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags
2650a7d1f22bSChristoph Hellwig  * set.  If the corresponding entry in the dst array has the MIGRATE_PFN_VALID
2651a7d1f22bSChristoph Hellwig  * flag set, then migrate_vma_pages() migrates struct page information from the
2652a7d1f22bSChristoph Hellwig  * source struct page to the destination struct page.  If it fails to migrate the
2653a7d1f22bSChristoph Hellwig  * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
2654a7d1f22bSChristoph Hellwig  * src array.
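 *
 * As a rough, purely illustrative sketch of the call order described above
 * (not code from any real driver): dev_alloc_page() below is an assumed,
 * driver-specific helper for allocating a destination device page, the data
 * copy is elided, and a single-page migration is shown for brevity.
 *
 *	unsigned long src_pfn = 0, dst_pfn = 0;
 *	struct page *dpage;
 *	struct migrate_vma args = {
 *		.vma   = vma,
 *		.start = addr,
 *		.end   = addr + PAGE_SIZE,
 *		.src   = &src_pfn,
 *		.dst   = &dst_pfn,
 *	};
 *
 *	if (migrate_vma_setup(&args))
 *		return -EINVAL;
 *
 *	if (args.cpages) {
 *		dpage = dev_alloc_page();	// assumed driver helper
 *		lock_page(dpage);
 *		// ... copy the source data into dpage here ...
 *		dst_pfn = migrate_pfn(page_to_pfn(dpage)) |
 *			  MIGRATE_PFN_VALID | MIGRATE_PFN_LOCKED;
 *		migrate_vma_pages(&args);
 *	}
 *	migrate_vma_finalize(&args);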
2655a7d1f22bSChristoph Hellwig * 2656a7d1f22bSChristoph Hellwig * At this point all successfully migrated pages have an entry in the src 2657a7d1f22bSChristoph Hellwig * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst 2658a7d1f22bSChristoph Hellwig * array entry with MIGRATE_PFN_VALID flag set. 2659a7d1f22bSChristoph Hellwig * 2660a7d1f22bSChristoph Hellwig * Once migrate_vma_pages() returns the caller may inspect which pages were 2661a7d1f22bSChristoph Hellwig * successfully migrated, and which were not. Successfully migrated pages will 2662a7d1f22bSChristoph Hellwig * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. 2663a7d1f22bSChristoph Hellwig * 2664a7d1f22bSChristoph Hellwig * It is safe to update device page table after migrate_vma_pages() because 2665a7d1f22bSChristoph Hellwig * both destination and source page are still locked, and the mmap_sem is held 2666a7d1f22bSChristoph Hellwig * in read mode (hence no one can unmap the range being migrated). 2667a7d1f22bSChristoph Hellwig * 2668a7d1f22bSChristoph Hellwig * Once the caller is done cleaning up things and updating its page table (if it 2669a7d1f22bSChristoph Hellwig * chose to do so, this is not an obligation) it finally calls 2670a7d1f22bSChristoph Hellwig * migrate_vma_finalize() to update the CPU page table to point to new pages 2671a7d1f22bSChristoph Hellwig * for successfully migrated pages or otherwise restore the CPU page table to 2672a7d1f22bSChristoph Hellwig * point to the original source pages. 2673a7d1f22bSChristoph Hellwig */ 2674a7d1f22bSChristoph Hellwig int migrate_vma_setup(struct migrate_vma *args) 2675a7d1f22bSChristoph Hellwig { 2676a7d1f22bSChristoph Hellwig long nr_pages = (args->end - args->start) >> PAGE_SHIFT; 2677a7d1f22bSChristoph Hellwig 2678a7d1f22bSChristoph Hellwig args->start &= PAGE_MASK; 2679a7d1f22bSChristoph Hellwig args->end &= PAGE_MASK; 2680a7d1f22bSChristoph Hellwig if (!args->vma || is_vm_hugetlb_page(args->vma) || 2681a7d1f22bSChristoph Hellwig (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) 2682a7d1f22bSChristoph Hellwig return -EINVAL; 2683a7d1f22bSChristoph Hellwig if (nr_pages <= 0) 2684a7d1f22bSChristoph Hellwig return -EINVAL; 2685a7d1f22bSChristoph Hellwig if (args->start < args->vma->vm_start || 2686a7d1f22bSChristoph Hellwig args->start >= args->vma->vm_end) 2687a7d1f22bSChristoph Hellwig return -EINVAL; 2688a7d1f22bSChristoph Hellwig if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) 2689a7d1f22bSChristoph Hellwig return -EINVAL; 2690a7d1f22bSChristoph Hellwig if (!args->src || !args->dst) 2691a7d1f22bSChristoph Hellwig return -EINVAL; 2692a7d1f22bSChristoph Hellwig 2693a7d1f22bSChristoph Hellwig memset(args->src, 0, sizeof(*args->src) * nr_pages); 2694a7d1f22bSChristoph Hellwig args->cpages = 0; 2695a7d1f22bSChristoph Hellwig args->npages = 0; 2696a7d1f22bSChristoph Hellwig 2697a7d1f22bSChristoph Hellwig migrate_vma_collect(args); 2698a7d1f22bSChristoph Hellwig 2699a7d1f22bSChristoph Hellwig if (args->cpages) 2700a7d1f22bSChristoph Hellwig migrate_vma_prepare(args); 2701a7d1f22bSChristoph Hellwig if (args->cpages) 2702a7d1f22bSChristoph Hellwig migrate_vma_unmap(args); 2703a7d1f22bSChristoph Hellwig 2704a7d1f22bSChristoph Hellwig /* 2705a7d1f22bSChristoph Hellwig * At this point pages are locked and unmapped, and thus they have 2706a7d1f22bSChristoph Hellwig * stable content and can safely be copied to destination memory that 2707a7d1f22bSChristoph Hellwig * is allocated by the drivers. 
2708a7d1f22bSChristoph Hellwig */ 2709a7d1f22bSChristoph Hellwig return 0; 2710a7d1f22bSChristoph Hellwig 2711a7d1f22bSChristoph Hellwig } 2712a7d1f22bSChristoph Hellwig EXPORT_SYMBOL(migrate_vma_setup); 2713a7d1f22bSChristoph Hellwig 271434290e2cSRalph Campbell /* 271534290e2cSRalph Campbell * This code closely matches the code in: 271634290e2cSRalph Campbell * __handle_mm_fault() 271734290e2cSRalph Campbell * handle_pte_fault() 271834290e2cSRalph Campbell * do_anonymous_page() 271934290e2cSRalph Campbell * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE 272034290e2cSRalph Campbell * private page. 272134290e2cSRalph Campbell */ 27228315ada7SJérôme Glisse static void migrate_vma_insert_page(struct migrate_vma *migrate, 27238315ada7SJérôme Glisse unsigned long addr, 27248315ada7SJérôme Glisse struct page *page, 27258315ada7SJérôme Glisse unsigned long *src, 27268315ada7SJérôme Glisse unsigned long *dst) 27278315ada7SJérôme Glisse { 27288315ada7SJérôme Glisse struct vm_area_struct *vma = migrate->vma; 27298315ada7SJérôme Glisse struct mm_struct *mm = vma->vm_mm; 27308315ada7SJérôme Glisse struct mem_cgroup *memcg; 27318315ada7SJérôme Glisse bool flush = false; 27328315ada7SJérôme Glisse spinlock_t *ptl; 27338315ada7SJérôme Glisse pte_t entry; 27348315ada7SJérôme Glisse pgd_t *pgdp; 27358315ada7SJérôme Glisse p4d_t *p4dp; 27368315ada7SJérôme Glisse pud_t *pudp; 27378315ada7SJérôme Glisse pmd_t *pmdp; 27388315ada7SJérôme Glisse pte_t *ptep; 27398315ada7SJérôme Glisse 27408315ada7SJérôme Glisse /* Only allow populating anonymous memory */ 27418315ada7SJérôme Glisse if (!vma_is_anonymous(vma)) 27428315ada7SJérôme Glisse goto abort; 27438315ada7SJérôme Glisse 27448315ada7SJérôme Glisse pgdp = pgd_offset(mm, addr); 27458315ada7SJérôme Glisse p4dp = p4d_alloc(mm, pgdp, addr); 27468315ada7SJérôme Glisse if (!p4dp) 27478315ada7SJérôme Glisse goto abort; 27488315ada7SJérôme Glisse pudp = pud_alloc(mm, p4dp, addr); 27498315ada7SJérôme Glisse if (!pudp) 27508315ada7SJérôme Glisse goto abort; 27518315ada7SJérôme Glisse pmdp = pmd_alloc(mm, pudp, addr); 27528315ada7SJérôme Glisse if (!pmdp) 27538315ada7SJérôme Glisse goto abort; 27548315ada7SJérôme Glisse 27558315ada7SJérôme Glisse if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) 27568315ada7SJérôme Glisse goto abort; 27578315ada7SJérôme Glisse 27588315ada7SJérôme Glisse /* 27598315ada7SJérôme Glisse * Use pte_alloc() instead of pte_alloc_map(). We can't run 27608315ada7SJérôme Glisse * pte_offset_map() on pmds where a huge pmd might be created 27618315ada7SJérôme Glisse * from a different thread. 27628315ada7SJérôme Glisse * 27638315ada7SJérôme Glisse * pte_alloc_map() is safe to use under down_write(mmap_sem) or when 27648315ada7SJérôme Glisse * parallel threads are excluded by other means. 27658315ada7SJérôme Glisse * 27668315ada7SJérôme Glisse * Here we only have down_read(mmap_sem). 
27678315ada7SJérôme Glisse */ 27684cf58924SJoel Fernandes (Google) if (pte_alloc(mm, pmdp)) 27698315ada7SJérôme Glisse goto abort; 27708315ada7SJérôme Glisse 27718315ada7SJérôme Glisse /* See the comment in pte_alloc_one_map() */ 27728315ada7SJérôme Glisse if (unlikely(pmd_trans_unstable(pmdp))) 27738315ada7SJérôme Glisse goto abort; 27748315ada7SJérôme Glisse 27758315ada7SJérôme Glisse if (unlikely(anon_vma_prepare(vma))) 27768315ada7SJérôme Glisse goto abort; 27778315ada7SJérôme Glisse if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) 27788315ada7SJérôme Glisse goto abort; 27798315ada7SJérôme Glisse 27808315ada7SJérôme Glisse /* 27818315ada7SJérôme Glisse * The memory barrier inside __SetPageUptodate makes sure that 27828315ada7SJérôme Glisse * preceding stores to the page contents become visible before 27838315ada7SJérôme Glisse * the set_pte_at() write. 27848315ada7SJérôme Glisse */ 27858315ada7SJérôme Glisse __SetPageUptodate(page); 27868315ada7SJérôme Glisse 2787df6ad698SJérôme Glisse if (is_zone_device_page(page)) { 2788df6ad698SJérôme Glisse if (is_device_private_page(page)) { 27898315ada7SJérôme Glisse swp_entry_t swp_entry; 27908315ada7SJérôme Glisse 27918315ada7SJérôme Glisse swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); 27928315ada7SJérôme Glisse entry = swp_entry_to_pte(swp_entry); 2793df6ad698SJérôme Glisse } 27948315ada7SJérôme Glisse } else { 27958315ada7SJérôme Glisse entry = mk_pte(page, vma->vm_page_prot); 27968315ada7SJérôme Glisse if (vma->vm_flags & VM_WRITE) 27978315ada7SJérôme Glisse entry = pte_mkwrite(pte_mkdirty(entry)); 27988315ada7SJérôme Glisse } 27998315ada7SJérôme Glisse 28008315ada7SJérôme Glisse ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 28018315ada7SJérôme Glisse 280234290e2cSRalph Campbell if (check_stable_address_space(mm)) 280334290e2cSRalph Campbell goto unlock_abort; 280434290e2cSRalph Campbell 28058315ada7SJérôme Glisse if (pte_present(*ptep)) { 28068315ada7SJérôme Glisse unsigned long pfn = pte_pfn(*ptep); 28078315ada7SJérôme Glisse 2808c23a0c99SRalph Campbell if (!is_zero_pfn(pfn)) 2809c23a0c99SRalph Campbell goto unlock_abort; 28108315ada7SJérôme Glisse flush = true; 2811c23a0c99SRalph Campbell } else if (!pte_none(*ptep)) 2812c23a0c99SRalph Campbell goto unlock_abort; 28138315ada7SJérôme Glisse 28148315ada7SJérôme Glisse /* 2815c23a0c99SRalph Campbell * Check for userfaultfd but do not deliver the fault. Instead, 28168315ada7SJérôme Glisse * just back off. 
28178315ada7SJérôme Glisse */ 2818c23a0c99SRalph Campbell if (userfaultfd_missing(vma)) 2819c23a0c99SRalph Campbell goto unlock_abort; 28208315ada7SJérôme Glisse 28218315ada7SJérôme Glisse inc_mm_counter(mm, MM_ANONPAGES); 28228315ada7SJérôme Glisse page_add_new_anon_rmap(page, vma, addr, false); 28238315ada7SJérôme Glisse mem_cgroup_commit_charge(page, memcg, false, false); 28248315ada7SJérôme Glisse if (!is_zone_device_page(page)) 28258315ada7SJérôme Glisse lru_cache_add_active_or_unevictable(page, vma); 28268315ada7SJérôme Glisse get_page(page); 28278315ada7SJérôme Glisse 28288315ada7SJérôme Glisse if (flush) { 28298315ada7SJérôme Glisse flush_cache_page(vma, addr, pte_pfn(*ptep)); 28308315ada7SJérôme Glisse ptep_clear_flush_notify(vma, addr, ptep); 28318315ada7SJérôme Glisse set_pte_at_notify(mm, addr, ptep, entry); 28328315ada7SJérôme Glisse update_mmu_cache(vma, addr, ptep); 28338315ada7SJérôme Glisse } else { 28348315ada7SJérôme Glisse /* No need to invalidate - it was non-present before */ 28358315ada7SJérôme Glisse set_pte_at(mm, addr, ptep, entry); 28368315ada7SJérôme Glisse update_mmu_cache(vma, addr, ptep); 28378315ada7SJérôme Glisse } 28388315ada7SJérôme Glisse 28398315ada7SJérôme Glisse pte_unmap_unlock(ptep, ptl); 28408315ada7SJérôme Glisse *src = MIGRATE_PFN_MIGRATE; 28418315ada7SJérôme Glisse return; 28428315ada7SJérôme Glisse 2843c23a0c99SRalph Campbell unlock_abort: 2844c23a0c99SRalph Campbell pte_unmap_unlock(ptep, ptl); 2845c23a0c99SRalph Campbell mem_cgroup_cancel_charge(page, memcg, false); 28468315ada7SJérôme Glisse abort: 28478315ada7SJérôme Glisse *src &= ~MIGRATE_PFN_MIGRATE; 28488315ada7SJérôme Glisse } 28498315ada7SJérôme Glisse 2850a7d1f22bSChristoph Hellwig /** 28518763cb45SJérôme Glisse * migrate_vma_pages() - migrate meta-data from src page to dst page 28528763cb45SJérôme Glisse * @migrate: migrate struct containing all migration information 28538763cb45SJérôme Glisse * 28548763cb45SJérôme Glisse * This migrates struct page meta-data from source struct page to destination 28558763cb45SJérôme Glisse * struct page. This effectively finishes the migration from source page to the 28568763cb45SJérôme Glisse * destination page. 
28578763cb45SJérôme Glisse */ 2858a7d1f22bSChristoph Hellwig void migrate_vma_pages(struct migrate_vma *migrate) 28598763cb45SJérôme Glisse { 28608763cb45SJérôme Glisse const unsigned long npages = migrate->npages; 28618763cb45SJérôme Glisse const unsigned long start = migrate->start; 2862ac46d4f3SJérôme Glisse struct mmu_notifier_range range; 2863ac46d4f3SJérôme Glisse unsigned long addr, i; 28648315ada7SJérôme Glisse bool notified = false; 28658763cb45SJérôme Glisse 28668763cb45SJérôme Glisse for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 28678763cb45SJérôme Glisse struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 28688763cb45SJérôme Glisse struct page *page = migrate_pfn_to_page(migrate->src[i]); 28698763cb45SJérôme Glisse struct address_space *mapping; 28708763cb45SJérôme Glisse int r; 28718763cb45SJérôme Glisse 28728315ada7SJérôme Glisse if (!newpage) { 28738315ada7SJérôme Glisse migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 28748763cb45SJérôme Glisse continue; 28758315ada7SJérôme Glisse } 28768315ada7SJérôme Glisse 28778315ada7SJérôme Glisse if (!page) { 2878c23a0c99SRalph Campbell if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) 28798763cb45SJérôme Glisse continue; 28808315ada7SJérôme Glisse if (!notified) { 28818315ada7SJérôme Glisse notified = true; 2882ac46d4f3SJérôme Glisse 2883ac46d4f3SJérôme Glisse mmu_notifier_range_init(&range, 28847269f999SJérôme Glisse MMU_NOTIFY_CLEAR, 0, 28856f4f13e8SJérôme Glisse NULL, 2886ac46d4f3SJérôme Glisse migrate->vma->vm_mm, 2887ac46d4f3SJérôme Glisse addr, migrate->end); 2888ac46d4f3SJérôme Glisse mmu_notifier_invalidate_range_start(&range); 28898315ada7SJérôme Glisse } 28908315ada7SJérôme Glisse migrate_vma_insert_page(migrate, addr, newpage, 28918315ada7SJérôme Glisse &migrate->src[i], 28928315ada7SJérôme Glisse &migrate->dst[i]); 28938315ada7SJérôme Glisse continue; 28948315ada7SJérôme Glisse } 28958763cb45SJérôme Glisse 28968763cb45SJérôme Glisse mapping = page_mapping(page); 28978763cb45SJérôme Glisse 2898a5430ddaSJérôme Glisse if (is_zone_device_page(newpage)) { 2899a5430ddaSJérôme Glisse if (is_device_private_page(newpage)) { 2900a5430ddaSJérôme Glisse /* 2901a5430ddaSJérôme Glisse * For now only support private anonymous when 2902a5430ddaSJérôme Glisse * migrating to un-addressable device memory. 2903a5430ddaSJérôme Glisse */ 2904a5430ddaSJérôme Glisse if (mapping) { 2905a5430ddaSJérôme Glisse migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2906a5430ddaSJérôme Glisse continue; 2907a5430ddaSJérôme Glisse } 290825b2995aSChristoph Hellwig } else { 2909a5430ddaSJérôme Glisse /* 2910a5430ddaSJérôme Glisse * Other types of ZONE_DEVICE page are not 2911a5430ddaSJérôme Glisse * supported. 2912a5430ddaSJérôme Glisse */ 2913a5430ddaSJérôme Glisse migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2914a5430ddaSJérôme Glisse continue; 2915a5430ddaSJérôme Glisse } 2916a5430ddaSJérôme Glisse } 2917a5430ddaSJérôme Glisse 29188763cb45SJérôme Glisse r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); 29198763cb45SJérôme Glisse if (r != MIGRATEPAGE_SUCCESS) 29208763cb45SJérôme Glisse migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 29218763cb45SJérôme Glisse } 29228315ada7SJérôme Glisse 29234645b9feSJérôme Glisse /* 29244645b9feSJérôme Glisse * No need to double call mmu_notifier->invalidate_range() callback as 29254645b9feSJérôme Glisse * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() 29264645b9feSJérôme Glisse * did already call it. 
29274645b9feSJérôme Glisse */ 29288315ada7SJérôme Glisse if (notified) 2929ac46d4f3SJérôme Glisse mmu_notifier_invalidate_range_only_end(&range); 29308763cb45SJérôme Glisse } 2931a7d1f22bSChristoph Hellwig EXPORT_SYMBOL(migrate_vma_pages); 29328763cb45SJérôme Glisse 2933a7d1f22bSChristoph Hellwig /** 29348763cb45SJérôme Glisse * migrate_vma_finalize() - restore CPU page table entry 29358763cb45SJérôme Glisse * @migrate: migrate struct containing all migration information 29368763cb45SJérôme Glisse * 29378763cb45SJérôme Glisse * This replaces the special migration pte entry with either a mapping to the 29388763cb45SJérôme Glisse * new page if migration was successful for that page, or to the original page 29398763cb45SJérôme Glisse * otherwise. 29408763cb45SJérôme Glisse * 29418763cb45SJérôme Glisse * This also unlocks the pages and puts them back on the lru, or drops the extra 29428763cb45SJérôme Glisse * refcount, for device pages. 29438763cb45SJérôme Glisse */ 2944a7d1f22bSChristoph Hellwig void migrate_vma_finalize(struct migrate_vma *migrate) 29458763cb45SJérôme Glisse { 29468763cb45SJérôme Glisse const unsigned long npages = migrate->npages; 29478763cb45SJérôme Glisse unsigned long i; 29488763cb45SJérôme Glisse 29498763cb45SJérôme Glisse for (i = 0; i < npages; i++) { 29508763cb45SJérôme Glisse struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 29518763cb45SJérôme Glisse struct page *page = migrate_pfn_to_page(migrate->src[i]); 29528763cb45SJérôme Glisse 29538315ada7SJérôme Glisse if (!page) { 29548315ada7SJérôme Glisse if (newpage) { 29558315ada7SJérôme Glisse unlock_page(newpage); 29568315ada7SJérôme Glisse put_page(newpage); 29578315ada7SJérôme Glisse } 29588763cb45SJérôme Glisse continue; 29598315ada7SJérôme Glisse } 29608315ada7SJérôme Glisse 29618763cb45SJérôme Glisse if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { 29628763cb45SJérôme Glisse if (newpage) { 29638763cb45SJérôme Glisse unlock_page(newpage); 29648763cb45SJérôme Glisse put_page(newpage); 29658763cb45SJérôme Glisse } 29668763cb45SJérôme Glisse newpage = page; 29678763cb45SJérôme Glisse } 29688763cb45SJérôme Glisse 29698763cb45SJérôme Glisse remove_migration_ptes(page, newpage, false); 29708763cb45SJérôme Glisse unlock_page(page); 29718763cb45SJérôme Glisse migrate->cpages--; 29728763cb45SJérôme Glisse 2973a5430ddaSJérôme Glisse if (is_zone_device_page(page)) 2974a5430ddaSJérôme Glisse put_page(page); 2975a5430ddaSJérôme Glisse else 29768763cb45SJérôme Glisse putback_lru_page(page); 29778763cb45SJérôme Glisse 29788763cb45SJérôme Glisse if (newpage != page) { 29798763cb45SJérôme Glisse unlock_page(newpage); 2980a5430ddaSJérôme Glisse if (is_zone_device_page(newpage)) 2981a5430ddaSJérôme Glisse put_page(newpage); 2982a5430ddaSJérôme Glisse else 29838763cb45SJérôme Glisse putback_lru_page(newpage); 29848763cb45SJérôme Glisse } 29858763cb45SJérôme Glisse } 29868763cb45SJérôme Glisse } 2987a7d1f22bSChristoph Hellwig EXPORT_SYMBOL(migrate_vma_finalize); 29889b2ed9cbSChristoph Hellwig #endif /* CONFIG_DEVICE_PRIVATE */ 2989
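
/*
 * Purely illustrative sketch, not used by the code above: between
 * migrate_vma_pages() and migrate_vma_finalize() a caller can walk the src
 * array to see which entries actually migrated and, for example, update its
 * own device page tables only for those. Only helpers already referenced in
 * this file are used; "args" is the caller's struct migrate_vma as filled in
 * by migrate_vma_setup().
 *
 *	unsigned long i;
 *
 *	for (i = 0; i < args.npages; i++) {
 *		struct page *dpage = migrate_pfn_to_page(args.dst[i]);
 *
 *		if (!(args.src[i] & MIGRATE_PFN_MIGRATE) || !dpage)
 *			continue;	// this entry was not migrated
 *		// point the device page table at dpage here
 *	}
 *	migrate_vma_finalize(&args);
 */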