// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
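
/*
 * Note on the encoding above (derived from the masks themselves, not
 * from separate documentation): vma->swap_readahead_info packs three
 * fields into a single atomic long - the page-aligned address of the
 * previous swap fault in the bits above PAGE_SHIFT, the last readahead
 * window size in the PAGE_SHIFT/2 bits below that, and the recent hit
 * count in the low PAGE_SHIFT/2 bits.  A value of zero reads back as
 * an initial hit count of 4 via GET_SWAP_RA_VAL().
 */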

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	struct page *page;

	page = xa_load(&address_space->i_pages, idx);
	if (xa_is_value(page))
		return page;
	return NULL;
}

/*
 * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
			gfp_t gfp, void **shadowp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
	unsigned long i, nr = folio_nr_pages(folio);
	void *old;

	xas_set_update(&xas, workingset_update_node);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

	folio_ref_add(folio, nr);
	folio_set_swapcache(folio);

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
			old = xas_load(&xas);
			if (xa_is_value(old)) {
				if (shadowp)
					*shadowp = old;
			}
			set_page_private(folio_page(folio, i), entry.val + i);
			xas_store(&xas, folio);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (!xas_error(&xas))
		return 0;

	folio_clear_swapcache(folio);
	folio_ref_sub(folio, nr);
	return xas_error(&xas);
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct folio *folio,
			swp_entry_t entry, void *shadow)
{
	struct address_space *address_space = swap_address_space(entry);
	int i;
	long nr = folio_nr_pages(folio);
	pgoff_t idx = swp_offset(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	xas_set_update(&xas, workingset_update_node);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, shadow);
		VM_BUG_ON_PAGE(entry != folio, entry);
		set_page_private(folio_page(folio, i), 0);
		xas_next(&xas);
	}
	folio_clear_swapcache(folio);
	address_space->nrpages -= nr;
	__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}

/**
 * add_to_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
bool add_to_swap(struct folio *folio)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	entry = folio_alloc_swap(folio);
	if (!entry.val)
		return false;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(folio, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the folio will be dirtied in unmap because its
	 * pte should be dirty. A special case is MADV_FREE page. The
	 * page's pte could have dirty bit cleared but the folio's
	 * SwapBacked flag is still set because clearing the dirty bit
	 * and SwapBacked flag has no lock protected. For such folio,
	 * unmap will not set dirty bit for it, so folio reclaim will
	 * not write the folio out. This can cause data corruption when
	 * the folio is swapped in later. Always setting the dirty flag
	 * for the folio solves the problem.
	 */
	folio_mark_dirty(folio);

	return true;

fail:
	put_swap_folio(folio, entry);
	return false;
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache and locked.
 * It will never put the folio into the free list,
 * the caller has a reference on the folio.
 */
void delete_from_swap_cache(struct folio *folio)
{
	swp_entry_t entry = folio_swap_entry(folio);
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(folio, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_folio(folio, entry);
	folio_ref_sub(folio, folio_nr_pages(folio));
}

void clear_shadow_from_swap_cache(int type, unsigned long begin,
				unsigned long end)
{
	unsigned long curr = begin;
	void *old;

	for (;;) {
		swp_entry_t entry = swp_entry(type, curr);
		struct address_space *address_space = swap_address_space(entry);
		XA_STATE(xas, &address_space->i_pages, curr);

		xas_set_update(&xas, workingset_update_node);

		xa_lock_irq(&address_space->i_pages);
		xas_for_each(&xas, old, end) {
			if (!xa_is_value(old))
				continue;
			xas_store(&xas, NULL);
		}
		xa_unlock_irq(&address_space->i_pages);

		/* search the next swapcache until we meet end */
		curr >>= SWAP_ADDRESS_SPACE_SHIFT;
		curr++;
		curr <<= SWAP_ADDRESS_SPACE_SHIFT;
		if (curr > end)
			break;
	}
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * Its ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 * 					- Marcelo
 */
void free_swap_cache(struct page *page)
{
	struct folio *folio = page_folio(page);

	if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
	    folio_trylock(folio)) {
		folio_free_swap(folio);
		folio_unlock(folio);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
	lru_add_drain();
	for (int i = 0; i < nr; i++)
		free_swap_cache(encoded_page_ptr(pages[i]));
	release_pages(pages, nr);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found folio will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the folio
 * lock before returning.
 *
 * Caller must lock the swap device or hold a reference to keep it valid.
 */
struct folio *swap_cache_get_folio(swp_entry_t entry,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct folio *folio;

	folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
	if (!IS_ERR(folio)) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(folio_test_large(folio)))
			return folio;

		readahead = folio_test_clear_readahead(folio);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	} else {
		folio = NULL;
	}

	return folio;
}

/**
 * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * This differs from filemap_get_folio() in that it will also look for the
 * folio in the swap cache.
 *
 * Return: The found folio or %NULL.
 */
struct folio *filemap_get_incore_folio(struct address_space *mapping,
		pgoff_t index)
{
	swp_entry_t swp;
	struct swap_info_struct *si;
	struct folio *folio = filemap_get_entry(mapping, index);

	if (!folio)
		return ERR_PTR(-ENOENT);
	if (!xa_is_value(folio))
		return folio;
	if (!shmem_mapping(mapping))
		return ERR_PTR(-ENOENT);

	swp = radix_to_swp_entry(folio);
	/* There might be swapin error entries in shmem mapping. */
	if (non_swap_entry(swp))
		return ERR_PTR(-ENOENT);
	/* Prevent swapoff from happening to us */
	si = get_swap_device(swp);
	if (!si)
		return ERR_PTR(-ENOENT);
	index = swp_offset(swp);
	folio = filemap_get_folio(swap_address_space(swp), index);
	put_swap_device(si);
	return folio;
}

struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct swap_info_struct *si;
	struct folio *folio;
	void *shadow = NULL;

	*new_page_allocated = false;

	for (;;) {
		int err;
		/*
		 * First check the swap cache.  Since this is normally
		 * called after swap_cache_get_folio() failed, re-calling
		 * that would confuse statistics.
		 */
		si = get_swap_device(entry);
		if (!si)
			return NULL;
		folio = filemap_get_folio(swap_address_space(entry),
						swp_offset(entry));
		put_swap_device(si);
		if (!IS_ERR(folio))
			return folio_file_page(folio, swp_offset(entry));

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in later part of code or
		 * else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			return NULL;

		/*
		 * Get a new page to read into from swap.  Allocate it now,
		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
		 * cause any racers to loop around until we add it to cache.
		 */
		folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
		if (!folio)
			return NULL;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (!err)
			break;

		folio_put(folio);
		if (err != -EEXIST)
			return NULL;

		/*
		 * We might race against __delete_from_swap_cache(), and
		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
		 * has not yet been cleared.  Or race against another
		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
		 * in swap_map, but not yet added its page to swap cache.
		 */
		schedule_timeout_uninterruptible(1);
	}

	/*
	 * The swap entry is ours to swap in. Prepare the new page.
	 */

	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
		goto fail_unlock;

	/* May fail (-ENOMEM) if XArray node allocation failed. */
	if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
		goto fail_unlock;

	mem_cgroup_swapin_uncharge_swap(entry);

	if (shadow)
		workingset_refault(folio, shadow);

	/* Caller will initiate read into locked folio */
	folio_add_lru(folio);
	*new_page_allocated = true;
	return &folio->page;

fail_unlock:
	put_swap_folio(folio, entry);
	folio_unlock(folio);
	folio_put(folio);
	return NULL;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
				   struct vm_area_struct *vma,
				   unsigned long addr, bool do_poll,
				   struct swap_iocb **plug)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll, plug);

	return retpage;
}

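/*
 * Worked example for the window heuristic below (illustrative only,
 * assuming the common default of page_cluster == 3, i.e. max_pages == 8):
 * with 4 recent readahead hits, pages = 4 + 2 = 6, rounded up to the
 * next power of two = 8, so the whole cluster is read; with no hits and
 * a non-adjacent offset the window drops to a single page, except that
 * it never shrinks below half of the previous window (prev_win / 2).
 */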
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false, &splug);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	int i;
	struct address_space *spaces = swapper_spaces[type];

	for (i = 0; i < nr_swapper_spaces[type]; i++)
		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
	kvfree(spaces);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

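/*
 * Overview (descriptive note, summarising the code below): swap_ra_info()
 * reads the per-VMA readahead state, asks __swapin_nr_pages() for a
 * window size (bounded by page_cluster and SWAP_RA_ORDER_CEILING), then
 * places that window ahead of, behind, or roughly centred on the
 * faulting pfn depending on the direction implied by the previous fault
 * address, clamps it to the VMA and to the PMD containing the fault,
 * and records the covered PTEs so they can be examined after the page
 * table is unmapped.
 */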
static void swap_ra_info(struct vm_fault *vmf,
			 struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, pfn, fpfn, lpfn, rpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1)
		return;

	/* Copy the PTEs because the page table may be unmapped */
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
	if (fpfn == pfn + 1) {
		lpfn = fpfn;
		rpfn = fpfn + win;
	} else if (pfn == fpfn + 1) {
		lpfn = fpfn - win + 1;
		rpfn = fpfn + 1;
	} else {
		unsigned int left = (win - 1) / 2;

		lpfn = fpfn - left;
		rpfn = fpfn + win - left;
	}
	start = max3(lpfn, PFN_DOWN(vma->vm_start),
		     PFN_DOWN(faddr & PMD_MASK));
	end = min3(rpfn, PFN_DOWN(vma->vm_end),
		   PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));

	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @fentry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				       struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {
		.win = 1,
	};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (!is_swap_pte(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false, &splug);
			if (i != ra_info.offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The page was likely read above, so no need for plugging here */
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     ra_info.win == 1, NULL);
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * It's a main entry function for swap readahead. By the configuration,
 * it will read ahead blocks by cluster-based(ie, physical disk based)
 * or vma-based(ie, virtual address based on faulty address) readahead.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	return swap_use_vma_readahead() ?
		swap_vma_readahead(entry, gfp_mask, vmf) :
		swap_cluster_readahead(entry, gfp_mask, vmf);
}

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &enable_vma_readahead);
	if (ret)
		return ret;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif