1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0 21da177e4SLinus Torvalds /* 31da177e4SLinus Torvalds * linux/mm/swap_state.c 41da177e4SLinus Torvalds * 51da177e4SLinus Torvalds * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 61da177e4SLinus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie 71da177e4SLinus Torvalds * 81da177e4SLinus Torvalds * Rewritten to use page cache, (C) 1998 Stephen Tweedie 91da177e4SLinus Torvalds */ 101da177e4SLinus Torvalds #include <linux/mm.h> 115a0e3ad6STejun Heo #include <linux/gfp.h> 121da177e4SLinus Torvalds #include <linux/kernel_stat.h> 131da177e4SLinus Torvalds #include <linux/swap.h> 1446017e95SHugh Dickins #include <linux/swapops.h> 151da177e4SLinus Torvalds #include <linux/init.h> 161da177e4SLinus Torvalds #include <linux/pagemap.h> 171da177e4SLinus Torvalds #include <linux/backing-dev.h> 183fb5c298SChristian Ehrhardt #include <linux/blkdev.h> 19c484d410SHugh Dickins #include <linux/pagevec.h> 20b20a3503SChristoph Lameter #include <linux/migrate.h> 214b3ef9daSHuang, Ying #include <linux/vmalloc.h> 2267afa38eSTim Chen #include <linux/swap_slots.h> 2338d8b4e6SHuang Ying #include <linux/huge_mm.h> 2461ef1865SMatthew Wilcox (Oracle) #include <linux/shmem_fs.h> 25243bce09SHugh Dickins #include "internal.h" 26014bb1deSNeilBrown #include "swap.h" 271da177e4SLinus Torvalds 281da177e4SLinus Torvalds /* 291da177e4SLinus Torvalds * swapper_space is a fiction, retained to simplify the path through 307eaceaccSJens Axboe * vmscan's shrink_page_list. 
311da177e4SLinus Torvalds */ 32f5e54d6eSChristoph Hellwig static const struct address_space_operations swap_aops = { 331da177e4SLinus Torvalds .writepage = swap_writepage, 344c4a7634SNeilBrown .dirty_folio = noop_dirty_folio, 351c93923cSAndrew Morton #ifdef CONFIG_MIGRATION 3654184650SMatthew Wilcox (Oracle) .migrate_folio = migrate_folio, 371c93923cSAndrew Morton #endif 381da177e4SLinus Torvalds }; 391da177e4SLinus Torvalds 40783cb68eSChangbin Du struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; 41783cb68eSChangbin Du static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; 42f5c754d6SColin Ian King static bool enable_vma_readahead __read_mostly = true; 43ec560175SHuang Ying 44ec560175SHuang Ying #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) 45ec560175SHuang Ying #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) 46ec560175SHuang Ying #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK 47ec560175SHuang Ying #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) 48ec560175SHuang Ying 49ec560175SHuang Ying #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) 50ec560175SHuang Ying #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) 51ec560175SHuang Ying #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) 52ec560175SHuang Ying 53ec560175SHuang Ying #define SWAP_RA_VAL(addr, win, hits) \ 54ec560175SHuang Ying (((addr) & PAGE_MASK) | \ 55ec560175SHuang Ying (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ 56ec560175SHuang Ying ((hits) & SWAP_RA_HITS_MASK)) 57ec560175SHuang Ying 58ec560175SHuang Ying /* Initial readahead hits is 4 to start up with a small window */ 59ec560175SHuang Ying #define GET_SWAP_RA_VAL(vma) \ 60ec560175SHuang Ying (atomic_long_read(&(vma)->swap_readahead_info) ? 
: 4) 611da177e4SLinus Torvalds 62579f8290SShaohua Li static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); 63579f8290SShaohua Li 641da177e4SLinus Torvalds void show_swap_cache_info(void) 651da177e4SLinus Torvalds { 6633806f06SShaohua Li printk("%lu pages in swap cache\n", total_swapcache_pages()); 67ec8acf20SShaohua Li printk("Free swap = %ldkB\n", 68ec8acf20SShaohua Li get_nr_swap_pages() << (PAGE_SHIFT - 10)); 691da177e4SLinus Torvalds printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 701da177e4SLinus Torvalds } 711da177e4SLinus Torvalds 72aae466b0SJoonsoo Kim void *get_shadow_from_swap_cache(swp_entry_t entry) 73aae466b0SJoonsoo Kim { 74aae466b0SJoonsoo Kim struct address_space *address_space = swap_address_space(entry); 75aae466b0SJoonsoo Kim pgoff_t idx = swp_offset(entry); 76aae466b0SJoonsoo Kim struct page *page; 77aae466b0SJoonsoo Kim 788c647dd1SMatthew Wilcox (Oracle) page = xa_load(&address_space->i_pages, idx); 79aae466b0SJoonsoo Kim if (xa_is_value(page)) 80aae466b0SJoonsoo Kim return page; 81aae466b0SJoonsoo Kim return NULL; 82aae466b0SJoonsoo Kim } 83aae466b0SJoonsoo Kim 841da177e4SLinus Torvalds /* 852bb876b5SMatthew Wilcox (Oracle) * add_to_swap_cache resembles filemap_add_folio on swapper_space, 861da177e4SLinus Torvalds * but sets SwapCache flag and private instead of mapping and index. 
871da177e4SLinus Torvalds */ 88a4c366f0SMatthew Wilcox (Oracle) int add_to_swap_cache(struct folio *folio, swp_entry_t entry, 893852f676SJoonsoo Kim gfp_t gfp, void **shadowp) 901da177e4SLinus Torvalds { 918d93b41cSMatthew Wilcox struct address_space *address_space = swap_address_space(entry); 9238d8b4e6SHuang Ying pgoff_t idx = swp_offset(entry); 93a4c366f0SMatthew Wilcox (Oracle) XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); 94a4c366f0SMatthew Wilcox (Oracle) unsigned long i, nr = folio_nr_pages(folio); 953852f676SJoonsoo Kim void *old; 961da177e4SLinus Torvalds 975649d113SYang Yang xas_set_update(&xas, workingset_update_node); 985649d113SYang Yang 99a4c366f0SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 100a4c366f0SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); 101a4c366f0SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); 10251726b12SHugh Dickins 103a4c366f0SMatthew Wilcox (Oracle) folio_ref_add(folio, nr); 104a4c366f0SMatthew Wilcox (Oracle) folio_set_swapcache(folio); 105e286781dSNick Piggin 1068d93b41cSMatthew Wilcox do { 1078d93b41cSMatthew Wilcox xas_lock_irq(&xas); 1088d93b41cSMatthew Wilcox xas_create_range(&xas); 1098d93b41cSMatthew Wilcox if (xas_error(&xas)) 1108d93b41cSMatthew Wilcox goto unlock; 11138d8b4e6SHuang Ying for (i = 0; i < nr; i++) { 112a4c366f0SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); 1133852f676SJoonsoo Kim old = xas_load(&xas); 1143852f676SJoonsoo Kim if (xa_is_value(old)) { 1153852f676SJoonsoo Kim if (shadowp) 1163852f676SJoonsoo Kim *shadowp = old; 1173852f676SJoonsoo Kim } 118a4c366f0SMatthew Wilcox (Oracle) set_page_private(folio_page(folio, i), entry.val + i); 119a4c366f0SMatthew Wilcox (Oracle) xas_store(&xas, folio); 1208d93b41cSMatthew Wilcox xas_next(&xas); 1211da177e4SLinus Torvalds } 12238d8b4e6SHuang Ying address_space->nrpages += nr; 123a4c366f0SMatthew Wilcox (Oracle) 
__node_stat_mod_folio(folio, NR_FILE_PAGES, nr); 124a4c366f0SMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); 1258d93b41cSMatthew Wilcox unlock: 1268d93b41cSMatthew Wilcox xas_unlock_irq(&xas); 1278d93b41cSMatthew Wilcox } while (xas_nomem(&xas, gfp)); 1288d93b41cSMatthew Wilcox 1298d93b41cSMatthew Wilcox if (!xas_error(&xas)) 1308d93b41cSMatthew Wilcox return 0; 1318d93b41cSMatthew Wilcox 132a4c366f0SMatthew Wilcox (Oracle) folio_clear_swapcache(folio); 133a4c366f0SMatthew Wilcox (Oracle) folio_ref_sub(folio, nr); 1348d93b41cSMatthew Wilcox return xas_error(&xas); 1351da177e4SLinus Torvalds } 1361da177e4SLinus Torvalds 1371da177e4SLinus Torvalds /* 138ceff9d33SMatthew Wilcox (Oracle) * This must be called only on folios that have 1391da177e4SLinus Torvalds * been verified to be in the swap cache. 1401da177e4SLinus Torvalds */ 141ceff9d33SMatthew Wilcox (Oracle) void __delete_from_swap_cache(struct folio *folio, 1423852f676SJoonsoo Kim swp_entry_t entry, void *shadow) 1431da177e4SLinus Torvalds { 1444e17ec25SMatthew Wilcox struct address_space *address_space = swap_address_space(entry); 145ceff9d33SMatthew Wilcox (Oracle) int i; 146ceff9d33SMatthew Wilcox (Oracle) long nr = folio_nr_pages(folio); 1474e17ec25SMatthew Wilcox pgoff_t idx = swp_offset(entry); 1484e17ec25SMatthew Wilcox XA_STATE(xas, &address_space->i_pages, idx); 14933806f06SShaohua Li 1505649d113SYang Yang xas_set_update(&xas, workingset_update_node); 1515649d113SYang Yang 152ceff9d33SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 153ceff9d33SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); 154ceff9d33SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); 1551da177e4SLinus Torvalds 15638d8b4e6SHuang Ying for (i = 0; i < nr; i++) { 1573852f676SJoonsoo Kim void *entry = xas_store(&xas, shadow); 158b9eb7776SMatthew Wilcox (Oracle) VM_BUG_ON_PAGE(entry != folio, entry); 159ceff9d33SMatthew Wilcox 
(Oracle) set_page_private(folio_page(folio, i), 0); 1604e17ec25SMatthew Wilcox xas_next(&xas); 16138d8b4e6SHuang Ying } 162ceff9d33SMatthew Wilcox (Oracle) folio_clear_swapcache(folio); 16338d8b4e6SHuang Ying address_space->nrpages -= nr; 164ceff9d33SMatthew Wilcox (Oracle) __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); 165ceff9d33SMatthew Wilcox (Oracle) __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); 1661da177e4SLinus Torvalds } 1671da177e4SLinus Torvalds 1681da177e4SLinus Torvalds /** 16909c02e56SMatthew Wilcox (Oracle) * add_to_swap - allocate swap space for a folio 17009c02e56SMatthew Wilcox (Oracle) * @folio: folio we want to move to swap 1711da177e4SLinus Torvalds * 17209c02e56SMatthew Wilcox (Oracle) * Allocate swap space for the folio and add the folio to the 17309c02e56SMatthew Wilcox (Oracle) * swap cache. 17409c02e56SMatthew Wilcox (Oracle) * 17509c02e56SMatthew Wilcox (Oracle) * Context: Caller needs to hold the folio lock. 17609c02e56SMatthew Wilcox (Oracle) * Return: Whether the folio was added to the swap cache. 1771da177e4SLinus Torvalds */ 17809c02e56SMatthew Wilcox (Oracle) bool add_to_swap(struct folio *folio) 1791da177e4SLinus Torvalds { 1801da177e4SLinus Torvalds swp_entry_t entry; 1811da177e4SLinus Torvalds int err; 1821da177e4SLinus Torvalds 18309c02e56SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 18409c02e56SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); 1851da177e4SLinus Torvalds 186e2e3fdc7SMatthew Wilcox (Oracle) entry = folio_alloc_swap(folio); 1871da177e4SLinus Torvalds if (!entry.val) 18809c02e56SMatthew Wilcox (Oracle) return false; 1890f074658SMinchan Kim 190bd53b714SNick Piggin /* 1918d93b41cSMatthew Wilcox * XArray node allocations from PF_MEMALLOC contexts could 192bd53b714SNick Piggin * completely exhaust the page allocator. __GFP_NOMEMALLOC 193bd53b714SNick Piggin * stops emergency reserves from being allocated. 
1941da177e4SLinus Torvalds * 195bd53b714SNick Piggin * TODO: this could cause a theoretical memory reclaim 196bd53b714SNick Piggin * deadlock in the swap out path. 1971da177e4SLinus Torvalds */ 1981da177e4SLinus Torvalds /* 199854e9ed0SMinchan Kim * Add it to the swap cache. 2001da177e4SLinus Torvalds */ 201a4c366f0SMatthew Wilcox (Oracle) err = add_to_swap_cache(folio, entry, 2023852f676SJoonsoo Kim __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); 20338d8b4e6SHuang Ying if (err) 2042ca4532aSDaisuke Nishimura /* 2052ca4532aSDaisuke Nishimura * add_to_swap_cache() doesn't return -EEXIST, so we can safely 2062ca4532aSDaisuke Nishimura * clear SWAP_HAS_CACHE flag. 2072ca4532aSDaisuke Nishimura */ 2080f074658SMinchan Kim goto fail; 2099625456cSShaohua Li /* 21009c02e56SMatthew Wilcox (Oracle) * Normally the folio will be dirtied in unmap because its 21109c02e56SMatthew Wilcox (Oracle) * pte should be dirty. A special case is MADV_FREE page. The 21209c02e56SMatthew Wilcox (Oracle) * page's pte could have dirty bit cleared but the folio's 21309c02e56SMatthew Wilcox (Oracle) * SwapBacked flag is still set because clearing the dirty bit 21409c02e56SMatthew Wilcox (Oracle) * and SwapBacked flag has no lock protected. For such folio, 21509c02e56SMatthew Wilcox (Oracle) * unmap will not set dirty bit for it, so folio reclaim will 21609c02e56SMatthew Wilcox (Oracle) * not write the folio out. This can cause data corruption when 21709c02e56SMatthew Wilcox (Oracle) * the folio is swapped in later. Always setting the dirty flag 21809c02e56SMatthew Wilcox (Oracle) * for the folio solves the problem. 
2199625456cSShaohua Li */ 22009c02e56SMatthew Wilcox (Oracle) folio_mark_dirty(folio); 2211da177e4SLinus Torvalds 22209c02e56SMatthew Wilcox (Oracle) return true; 22338d8b4e6SHuang Ying 22438d8b4e6SHuang Ying fail: 2254081f744SMatthew Wilcox (Oracle) put_swap_folio(folio, entry); 22609c02e56SMatthew Wilcox (Oracle) return false; 22738d8b4e6SHuang Ying } 22838d8b4e6SHuang Ying 2291da177e4SLinus Torvalds /* 23075fa68a5SMatthew Wilcox (Oracle) * This must be called only on folios that have 2311da177e4SLinus Torvalds * been verified to be in the swap cache and locked. 23275fa68a5SMatthew Wilcox (Oracle) * It will never put the folio into the free list, 23375fa68a5SMatthew Wilcox (Oracle) * the caller has a reference on the folio. 2341da177e4SLinus Torvalds */ 23575fa68a5SMatthew Wilcox (Oracle) void delete_from_swap_cache(struct folio *folio) 2361da177e4SLinus Torvalds { 23775fa68a5SMatthew Wilcox (Oracle) swp_entry_t entry = folio_swap_entry(folio); 2384e17ec25SMatthew Wilcox struct address_space *address_space = swap_address_space(entry); 2391da177e4SLinus Torvalds 240b93b0163SMatthew Wilcox xa_lock_irq(&address_space->i_pages); 241ceff9d33SMatthew Wilcox (Oracle) __delete_from_swap_cache(folio, entry, NULL); 242b93b0163SMatthew Wilcox xa_unlock_irq(&address_space->i_pages); 2431da177e4SLinus Torvalds 2444081f744SMatthew Wilcox (Oracle) put_swap_folio(folio, entry); 24575fa68a5SMatthew Wilcox (Oracle) folio_ref_sub(folio, folio_nr_pages(folio)); 2461da177e4SLinus Torvalds } 2471da177e4SLinus Torvalds 2483852f676SJoonsoo Kim void clear_shadow_from_swap_cache(int type, unsigned long begin, 2493852f676SJoonsoo Kim unsigned long end) 2503852f676SJoonsoo Kim { 2513852f676SJoonsoo Kim unsigned long curr = begin; 2523852f676SJoonsoo Kim void *old; 2533852f676SJoonsoo Kim 2543852f676SJoonsoo Kim for (;;) { 2553852f676SJoonsoo Kim swp_entry_t entry = swp_entry(type, curr); 2563852f676SJoonsoo Kim struct address_space *address_space = swap_address_space(entry); 
2573852f676SJoonsoo Kim XA_STATE(xas, &address_space->i_pages, curr); 2583852f676SJoonsoo Kim 2595649d113SYang Yang xas_set_update(&xas, workingset_update_node); 2605649d113SYang Yang 2613852f676SJoonsoo Kim xa_lock_irq(&address_space->i_pages); 2623852f676SJoonsoo Kim xas_for_each(&xas, old, end) { 2633852f676SJoonsoo Kim if (!xa_is_value(old)) 2643852f676SJoonsoo Kim continue; 2653852f676SJoonsoo Kim xas_store(&xas, NULL); 2663852f676SJoonsoo Kim } 2673852f676SJoonsoo Kim xa_unlock_irq(&address_space->i_pages); 2683852f676SJoonsoo Kim 2693852f676SJoonsoo Kim /* search the next swapcache until we meet end */ 2703852f676SJoonsoo Kim curr >>= SWAP_ADDRESS_SPACE_SHIFT; 2713852f676SJoonsoo Kim curr++; 2723852f676SJoonsoo Kim curr <<= SWAP_ADDRESS_SPACE_SHIFT; 2733852f676SJoonsoo Kim if (curr > end) 2743852f676SJoonsoo Kim break; 2753852f676SJoonsoo Kim } 2763852f676SJoonsoo Kim } 2773852f676SJoonsoo Kim 2781da177e4SLinus Torvalds /* 2791da177e4SLinus Torvalds * If we are the only user, then try to free up the swap cache. 2801da177e4SLinus Torvalds * 281aedd74d4SMatthew Wilcox (Oracle) * Its ok to check the swapcache flag without the folio lock 2821da177e4SLinus Torvalds * here because we are going to recheck again inside 283aedd74d4SMatthew Wilcox (Oracle) * folio_free_swap() _with_ the lock. 
2841da177e4SLinus Torvalds * - Marcelo 2851da177e4SLinus Torvalds */ 286f4c4a3f4SHuang Ying void free_swap_cache(struct page *page) 2871da177e4SLinus Torvalds { 288aedd74d4SMatthew Wilcox (Oracle) struct folio *folio = page_folio(page); 289aedd74d4SMatthew Wilcox (Oracle) 290aedd74d4SMatthew Wilcox (Oracle) if (folio_test_swapcache(folio) && !folio_mapped(folio) && 291aedd74d4SMatthew Wilcox (Oracle) folio_trylock(folio)) { 292aedd74d4SMatthew Wilcox (Oracle) folio_free_swap(folio); 293aedd74d4SMatthew Wilcox (Oracle) folio_unlock(folio); 2941da177e4SLinus Torvalds } 2951da177e4SLinus Torvalds } 2961da177e4SLinus Torvalds 2971da177e4SLinus Torvalds /* 2981da177e4SLinus Torvalds * Perform a free_page(), also freeing any swap cache associated with 299b8072f09SHugh Dickins * this page if it is the last user of the page. 3001da177e4SLinus Torvalds */ 3011da177e4SLinus Torvalds void free_page_and_swap_cache(struct page *page) 3021da177e4SLinus Torvalds { 3031da177e4SLinus Torvalds free_swap_cache(page); 3046fcb52a5SAaron Lu if (!is_huge_zero_page(page)) 30509cbfeafSKirill A. Shutemov put_page(page); 3061da177e4SLinus Torvalds } 3071da177e4SLinus Torvalds 3081da177e4SLinus Torvalds /* 3091da177e4SLinus Torvalds * Passed an array of pages, drop them all from swapcache and then release 3101da177e4SLinus Torvalds * them. They are removed from the LRU and freed if this is their last use. 
3111da177e4SLinus Torvalds */ 3127cc8f9c7SLinus Torvalds void free_pages_and_swap_cache(struct encoded_page **pages, int nr) 3131da177e4SLinus Torvalds { 314aabfb572SMichal Hocko lru_add_drain(); 3157cc8f9c7SLinus Torvalds for (int i = 0; i < nr; i++) 3167cc8f9c7SLinus Torvalds free_swap_cache(encoded_page_ptr(pages[i])); 3177cc8f9c7SLinus Torvalds release_pages(pages, nr); 3181da177e4SLinus Torvalds } 3191da177e4SLinus Torvalds 320e9e9b7ecSMinchan Kim static inline bool swap_use_vma_readahead(void) 321e9e9b7ecSMinchan Kim { 322e9e9b7ecSMinchan Kim return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); 323e9e9b7ecSMinchan Kim } 324e9e9b7ecSMinchan Kim 3251da177e4SLinus Torvalds /* 326c9edc242SMatthew Wilcox (Oracle) * Lookup a swap entry in the swap cache. A found folio will be returned 3271da177e4SLinus Torvalds * unlocked and with its refcount incremented - we rely on the kernel 328c9edc242SMatthew Wilcox (Oracle) * lock getting page table operations atomic even if we drop the folio 3291da177e4SLinus Torvalds * lock before returning. 330cbc2bd98SKairui Song * 331cbc2bd98SKairui Song * Caller must lock the swap device or hold a reference to keep it valid. 3321da177e4SLinus Torvalds */ 333c9edc242SMatthew Wilcox (Oracle) struct folio *swap_cache_get_folio(swp_entry_t entry, 334c9edc242SMatthew Wilcox (Oracle) struct vm_area_struct *vma, unsigned long addr) 3351da177e4SLinus Torvalds { 336c9edc242SMatthew Wilcox (Oracle) struct folio *folio; 3371da177e4SLinus Torvalds 338c9edc242SMatthew Wilcox (Oracle) folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); 33966dabbb6SChristoph Hellwig if (!IS_ERR(folio)) { 340eaf649ebSMinchan Kim bool vma_ra = swap_use_vma_readahead(); 341eaf649ebSMinchan Kim bool readahead; 342eaf649ebSMinchan Kim 343eaf649ebSMinchan Kim /* 344eaf649ebSMinchan Kim * At the moment, we don't support PG_readahead for anon THP 345eaf649ebSMinchan Kim * so let's bail out rather than confusing the readahead stat. 
346eaf649ebSMinchan Kim */ 347c9edc242SMatthew Wilcox (Oracle) if (unlikely(folio_test_large(folio))) 348c9edc242SMatthew Wilcox (Oracle) return folio; 349eaf649ebSMinchan Kim 350c9edc242SMatthew Wilcox (Oracle) readahead = folio_test_clear_readahead(folio); 351eaf649ebSMinchan Kim if (vma && vma_ra) { 352eaf649ebSMinchan Kim unsigned long ra_val; 353eaf649ebSMinchan Kim int win, hits; 354eaf649ebSMinchan Kim 355eaf649ebSMinchan Kim ra_val = GET_SWAP_RA_VAL(vma); 356eaf649ebSMinchan Kim win = SWAP_RA_WIN(ra_val); 357eaf649ebSMinchan Kim hits = SWAP_RA_HITS(ra_val); 358ec560175SHuang Ying if (readahead) 359ec560175SHuang Ying hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); 360ec560175SHuang Ying atomic_long_set(&vma->swap_readahead_info, 361ec560175SHuang Ying SWAP_RA_VAL(addr, win, hits)); 362ec560175SHuang Ying } 363eaf649ebSMinchan Kim 364ec560175SHuang Ying if (readahead) { 365ec560175SHuang Ying count_vm_event(SWAP_RA_HIT); 366eaf649ebSMinchan Kim if (!vma || !vma_ra) 367ec560175SHuang Ying atomic_inc(&swapin_readahead_hits); 368ec560175SHuang Ying } 36966dabbb6SChristoph Hellwig } else { 37066dabbb6SChristoph Hellwig folio = NULL; 371ec560175SHuang Ying } 372eaf649ebSMinchan Kim 373c9edc242SMatthew Wilcox (Oracle) return folio; 374c9edc242SMatthew Wilcox (Oracle) } 375c9edc242SMatthew Wilcox (Oracle) 37661ef1865SMatthew Wilcox (Oracle) /** 377524984ffSMatthew Wilcox (Oracle) * filemap_get_incore_folio - Find and get a folio from the page or swap caches. 37861ef1865SMatthew Wilcox (Oracle) * @mapping: The address_space to search. 37961ef1865SMatthew Wilcox (Oracle) * @index: The page cache index. 38061ef1865SMatthew Wilcox (Oracle) * 381524984ffSMatthew Wilcox (Oracle) * This differs from filemap_get_folio() in that it will also look for the 382524984ffSMatthew Wilcox (Oracle) * folio in the swap cache. 38361ef1865SMatthew Wilcox (Oracle) * 384524984ffSMatthew Wilcox (Oracle) * Return: The found folio or %NULL. 
38561ef1865SMatthew Wilcox (Oracle) */ 386524984ffSMatthew Wilcox (Oracle) struct folio *filemap_get_incore_folio(struct address_space *mapping, 387524984ffSMatthew Wilcox (Oracle) pgoff_t index) 38861ef1865SMatthew Wilcox (Oracle) { 38961ef1865SMatthew Wilcox (Oracle) swp_entry_t swp; 39061ef1865SMatthew Wilcox (Oracle) struct swap_info_struct *si; 391097b3e59SChristoph Hellwig struct folio *folio = filemap_get_entry(mapping, index); 39261ef1865SMatthew Wilcox (Oracle) 39366dabbb6SChristoph Hellwig if (!folio) 39466dabbb6SChristoph Hellwig return ERR_PTR(-ENOENT); 395dd8095b1SMatthew Wilcox (Oracle) if (!xa_is_value(folio)) 39666dabbb6SChristoph Hellwig return folio; 39761ef1865SMatthew Wilcox (Oracle) if (!shmem_mapping(mapping)) 39866dabbb6SChristoph Hellwig return ERR_PTR(-ENOENT); 39961ef1865SMatthew Wilcox (Oracle) 400dd8095b1SMatthew Wilcox (Oracle) swp = radix_to_swp_entry(folio); 401ba6851b4SMiaohe Lin /* There might be swapin error entries in shmem mapping. */ 402ba6851b4SMiaohe Lin if (non_swap_entry(swp)) 40366dabbb6SChristoph Hellwig return ERR_PTR(-ENOENT); 40461ef1865SMatthew Wilcox (Oracle) /* Prevent swapoff from happening to us */ 40561ef1865SMatthew Wilcox (Oracle) si = get_swap_device(swp); 40661ef1865SMatthew Wilcox (Oracle) if (!si) 40766dabbb6SChristoph Hellwig return ERR_PTR(-ENOENT); 408dd8095b1SMatthew Wilcox (Oracle) index = swp_offset(swp); 409dd8095b1SMatthew Wilcox (Oracle) folio = filemap_get_folio(swap_address_space(swp), index); 41061ef1865SMatthew Wilcox (Oracle) put_swap_device(si); 411524984ffSMatthew Wilcox (Oracle) return folio; 41261ef1865SMatthew Wilcox (Oracle) } 41361ef1865SMatthew Wilcox (Oracle) 4145b999aadSDmitry Safonov struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 4155b999aadSDmitry Safonov struct vm_area_struct *vma, unsigned long addr, 4165b999aadSDmitry Safonov bool *new_page_allocated) 4171da177e4SLinus Torvalds { 418eb085574SHuang Ying struct swap_info_struct *si; 419a0d3374bSMatthew 
Wilcox (Oracle) struct folio *folio; 42046a774d3SHuang Ying struct page *page; 421aae466b0SJoonsoo Kim void *shadow = NULL; 4224c6355b2SJohannes Weiner 4235b999aadSDmitry Safonov *new_page_allocated = false; 42446a774d3SHuang Ying si = get_swap_device(entry); 42546a774d3SHuang Ying if (!si) 42646a774d3SHuang Ying return NULL; 4271da177e4SLinus Torvalds 4284c6355b2SJohannes Weiner for (;;) { 4294c6355b2SJohannes Weiner int err; 4301da177e4SLinus Torvalds /* 4311da177e4SLinus Torvalds * First check the swap cache. Since this is normally 432cb691e2fSMatthew Wilcox (Oracle) * called after swap_cache_get_folio() failed, re-calling 4331da177e4SLinus Torvalds * that would confuse statistics. 4341da177e4SLinus Torvalds */ 435a0d3374bSMatthew Wilcox (Oracle) folio = filemap_get_folio(swap_address_space(entry), 436eb085574SHuang Ying swp_offset(entry)); 43746a774d3SHuang Ying if (!IS_ERR(folio)) { 43846a774d3SHuang Ying page = folio_file_page(folio, swp_offset(entry)); 43946a774d3SHuang Ying goto got_page; 44046a774d3SHuang Ying } 4411da177e4SLinus Torvalds 442ba81f838SHuang Ying /* 443ba81f838SHuang Ying * Just skip read ahead for unused swap slot. 444ba81f838SHuang Ying * During swap_off when swap_slot_cache is disabled, 445ba81f838SHuang Ying * we have to handle the race between putting 446ba81f838SHuang Ying * swap entry in swap cache and marking swap slot 447ba81f838SHuang Ying * as SWAP_HAS_CACHE. That's done in later part of code or 448ba81f838SHuang Ying * else swap_off will be aborted if we return NULL. 449ba81f838SHuang Ying */ 4503ecdeb0fSHuang Ying if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) 45146a774d3SHuang Ying goto fail_put_swap; 452e8c26ab6STim Chen 4531da177e4SLinus Torvalds /* 4544c6355b2SJohannes Weiner * Get a new page to read into from swap. Allocate it now, 4554c6355b2SJohannes Weiner * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will 4564c6355b2SJohannes Weiner * cause any racers to loop around until we add it to cache. 
4571da177e4SLinus Torvalds */ 458a0d3374bSMatthew Wilcox (Oracle) folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); 459a0d3374bSMatthew Wilcox (Oracle) if (!folio) 46046a774d3SHuang Ying goto fail_put_swap; 4611da177e4SLinus Torvalds 4621da177e4SLinus Torvalds /* 463f000944dSHugh Dickins * Swap entry may have been freed since our caller observed it. 464f000944dSHugh Dickins */ 465355cfa73SKAMEZAWA Hiroyuki err = swapcache_prepare(entry); 4664c6355b2SJohannes Weiner if (!err) 467f000944dSHugh Dickins break; 468f000944dSHugh Dickins 469a0d3374bSMatthew Wilcox (Oracle) folio_put(folio); 4704c6355b2SJohannes Weiner if (err != -EEXIST) 47146a774d3SHuang Ying goto fail_put_swap; 4721da177e4SLinus Torvalds 4734c6355b2SJohannes Weiner /* 4744c6355b2SJohannes Weiner * We might race against __delete_from_swap_cache(), and 4754c6355b2SJohannes Weiner * stumble across a swap_map entry whose SWAP_HAS_CACHE 4764c6355b2SJohannes Weiner * has not yet been cleared. Or race against another 4774c6355b2SJohannes Weiner * __read_swap_cache_async(), which has set SWAP_HAS_CACHE 4784c6355b2SJohannes Weiner * in swap_map, but not yet added its page to swap cache. 4794c6355b2SJohannes Weiner */ 480029c4628SGuo Ziliang schedule_timeout_uninterruptible(1); 4814c6355b2SJohannes Weiner } 4824c6355b2SJohannes Weiner 4834c6355b2SJohannes Weiner /* 4844c6355b2SJohannes Weiner * The swap entry is ours to swap in. Prepare the new page. 4854c6355b2SJohannes Weiner */ 4864c6355b2SJohannes Weiner 487a0d3374bSMatthew Wilcox (Oracle) __folio_set_locked(folio); 488a0d3374bSMatthew Wilcox (Oracle) __folio_set_swapbacked(folio); 4894c6355b2SJohannes Weiner 49065995918SMatthew Wilcox (Oracle) if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) 4914c6355b2SJohannes Weiner goto fail_unlock; 4924c6355b2SJohannes Weiner 4930add0c77SShakeel Butt /* May fail (-ENOMEM) if XArray node allocation failed. 
*/ 494a4c366f0SMatthew Wilcox (Oracle) if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) 4954c6355b2SJohannes Weiner goto fail_unlock; 4960add0c77SShakeel Butt 4970add0c77SShakeel Butt mem_cgroup_swapin_uncharge_swap(entry); 4984c6355b2SJohannes Weiner 499aae466b0SJoonsoo Kim if (shadow) 500a0d3374bSMatthew Wilcox (Oracle) workingset_refault(folio, shadow); 501314b57fbSJohannes Weiner 502a0d3374bSMatthew Wilcox (Oracle) /* Caller will initiate read into locked folio */ 503a0d3374bSMatthew Wilcox (Oracle) folio_add_lru(folio); 5044c6355b2SJohannes Weiner *new_page_allocated = true; 50546a774d3SHuang Ying page = &folio->page; 50646a774d3SHuang Ying got_page: 50746a774d3SHuang Ying put_swap_device(si); 50846a774d3SHuang Ying return page; 5094c6355b2SJohannes Weiner 5104c6355b2SJohannes Weiner fail_unlock: 5114081f744SMatthew Wilcox (Oracle) put_swap_folio(folio, entry); 512a0d3374bSMatthew Wilcox (Oracle) folio_unlock(folio); 513a0d3374bSMatthew Wilcox (Oracle) folio_put(folio); 51446a774d3SHuang Ying fail_put_swap: 51546a774d3SHuang Ying put_swap_device(si); 5164c6355b2SJohannes Weiner return NULL; 5171da177e4SLinus Torvalds } 51846017e95SHugh Dickins 5195b999aadSDmitry Safonov /* 5205b999aadSDmitry Safonov * Locate a page of swap in physical memory, reserving swap cache space 5215b999aadSDmitry Safonov * and reading the disk if it is not already cached. 5225b999aadSDmitry Safonov * A failure return means that either the page allocation failed or that 5235b999aadSDmitry Safonov * the swap entry is no longer in use. 52446a774d3SHuang Ying * 52546a774d3SHuang Ying * get/put_swap_device() aren't needed to call this function, because 52646a774d3SHuang Ying * __read_swap_cache_async() call them and swap_readpage() holds the 52746a774d3SHuang Ying * swap cache folio lock. 
5285b999aadSDmitry Safonov */ 5295b999aadSDmitry Safonov struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, 5305169b844SNeilBrown struct vm_area_struct *vma, 5315169b844SNeilBrown unsigned long addr, bool do_poll, 5325169b844SNeilBrown struct swap_iocb **plug) 5335b999aadSDmitry Safonov { 5345b999aadSDmitry Safonov bool page_was_allocated; 5355b999aadSDmitry Safonov struct page *retpage = __read_swap_cache_async(entry, gfp_mask, 5365b999aadSDmitry Safonov vma, addr, &page_was_allocated); 5375b999aadSDmitry Safonov 5385b999aadSDmitry Safonov if (page_was_allocated) 5395169b844SNeilBrown swap_readpage(retpage, do_poll, plug); 5405b999aadSDmitry Safonov 5415b999aadSDmitry Safonov return retpage; 5425b999aadSDmitry Safonov } 5435b999aadSDmitry Safonov 544ec560175SHuang Ying static unsigned int __swapin_nr_pages(unsigned long prev_offset, 545ec560175SHuang Ying unsigned long offset, 546ec560175SHuang Ying int hits, 547ec560175SHuang Ying int max_pages, 548ec560175SHuang Ying int prev_win) 549579f8290SShaohua Li { 550ec560175SHuang Ying unsigned int pages, last_ra; 551579f8290SShaohua Li 552579f8290SShaohua Li /* 553579f8290SShaohua Li * This heuristic has been found to work well on both sequential and 554579f8290SShaohua Li * random loads, swapping to hard disk or to SSD: please don't ask 555579f8290SShaohua Li * what the "+ 2" means, it just happens to work well, that's all. 556579f8290SShaohua Li */ 557ec560175SHuang Ying pages = hits + 2; 558579f8290SShaohua Li if (pages == 2) { 559579f8290SShaohua Li /* 560579f8290SShaohua Li * We can have no readahead hits to judge by: but must not get 561579f8290SShaohua Li * stuck here forever, so check for an adjacent offset instead 562579f8290SShaohua Li * (and don't even bother to check whether swap type is same). 
563579f8290SShaohua Li */ 564579f8290SShaohua Li if (offset != prev_offset + 1 && offset != prev_offset - 1) 565579f8290SShaohua Li pages = 1; 566579f8290SShaohua Li } else { 567579f8290SShaohua Li unsigned int roundup = 4; 568579f8290SShaohua Li while (roundup < pages) 569579f8290SShaohua Li roundup <<= 1; 570579f8290SShaohua Li pages = roundup; 571579f8290SShaohua Li } 572579f8290SShaohua Li 573579f8290SShaohua Li if (pages > max_pages) 574579f8290SShaohua Li pages = max_pages; 575579f8290SShaohua Li 576579f8290SShaohua Li /* Don't shrink readahead too fast */ 577ec560175SHuang Ying last_ra = prev_win / 2; 578579f8290SShaohua Li if (pages < last_ra) 579579f8290SShaohua Li pages = last_ra; 580ec560175SHuang Ying 581ec560175SHuang Ying return pages; 582ec560175SHuang Ying } 583ec560175SHuang Ying 584ec560175SHuang Ying static unsigned long swapin_nr_pages(unsigned long offset) 585ec560175SHuang Ying { 586ec560175SHuang Ying static unsigned long prev_offset; 587ec560175SHuang Ying unsigned int hits, pages, max_pages; 588ec560175SHuang Ying static atomic_t last_readahead_pages; 589ec560175SHuang Ying 590ec560175SHuang Ying max_pages = 1 << READ_ONCE(page_cluster); 591ec560175SHuang Ying if (max_pages <= 1) 592ec560175SHuang Ying return 1; 593ec560175SHuang Ying 594ec560175SHuang Ying hits = atomic_xchg(&swapin_readahead_hits, 0); 595d6c1f098SQian Cai pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, 596d6c1f098SQian Cai max_pages, 597ec560175SHuang Ying atomic_read(&last_readahead_pages)); 598ec560175SHuang Ying if (!hits) 599d6c1f098SQian Cai WRITE_ONCE(prev_offset, offset); 600579f8290SShaohua Li atomic_set(&last_readahead_pages, pages); 601579f8290SShaohua Li 602579f8290SShaohua Li return pages; 603579f8290SShaohua Li } 604579f8290SShaohua Li 60546017e95SHugh Dickins /** 606e9e9b7ecSMinchan Kim * swap_cluster_readahead - swap in pages in hope we need them soon 60746017e95SHugh Dickins * @entry: swap entry of this memory 6087682486bSRandy Dunlap * 
@gfp_mask: memory allocation flags
609e9e9b7ecSMinchan Kim * @vmf: fault information
61046017e95SHugh Dickins *
61146017e95SHugh Dickins * Returns the struct page for entry and addr, after queueing swapin.
61246017e95SHugh Dickins *
61346017e95SHugh Dickins * Primitive swap readahead code. We simply read an aligned block of
61446017e95SHugh Dickins * (1 << page_cluster) entries in the swap area. This method is chosen
61546017e95SHugh Dickins * because it doesn't cost us any seek time. We also make sure to queue
61646017e95SHugh Dickins * the 'original' request together with the readahead ones...
61746017e95SHugh Dickins *
61846017e95SHugh Dickins * This has been extended to use the NUMA policies from the mm triggering
61946017e95SHugh Dickins * the readahead.
62046017e95SHugh Dickins *
621c1e8d7c6SMichel Lespinasse * Caller must hold read mmap_lock if vmf->vma is not NULL.
62246017e95SHugh Dickins */
623e9e9b7ecSMinchan Kim struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
624e9e9b7ecSMinchan Kim struct vm_fault *vmf)
62546017e95SHugh Dickins {
62646017e95SHugh Dickins struct page *page;
627579f8290SShaohua Li unsigned long entry_offset = swp_offset(entry);
628579f8290SShaohua Li unsigned long offset = entry_offset;
62967f96aa2SRik van Riel unsigned long start_offset, end_offset;
630579f8290SShaohua Li unsigned long mask;
631e9a6effaSHuang Ying struct swap_info_struct *si = swp_swap_info(entry);
6323fb5c298SChristian Ehrhardt struct blk_plug plug;
6335169b844SNeilBrown struct swap_iocb *splug = NULL;
634c4fa6309SHuang Ying bool do_poll = true, page_allocated;
635e9e9b7ecSMinchan Kim struct vm_area_struct *vma = vmf->vma;
636e9e9b7ecSMinchan Kim unsigned long addr = vmf->address;
63746017e95SHugh Dickins
/* Window of 1 means no readahead: fall through to the plain read. */
638579f8290SShaohua Li mask = swapin_nr_pages(offset) - 1;
639579f8290SShaohua Li if (!mask)
640579f8290SShaohua Li goto skip;
641579f8290SShaohua Li
/* When reading ahead, the faulting page itself is also read async. */
64223955622SShaohua Li do_poll = false;
64367f96aa2SRik van Riel /* Read a page_cluster sized and aligned cluster around offset. */
64467f96aa2SRik van Riel start_offset = offset & ~mask;
64567f96aa2SRik van Riel end_offset = offset | mask;
64667f96aa2SRik van Riel if (!start_offset) /* First page is swap header. */
64767f96aa2SRik van Riel start_offset++;
648e9a6effaSHuang Ying if (end_offset >= si->max)
649e9a6effaSHuang Ying end_offset = si->max - 1;
65067f96aa2SRik van Riel
6513fb5c298SChristian Ehrhardt blk_start_plug(&plug);
65267f96aa2SRik van Riel for (offset = start_offset; offset <= end_offset ; offset++) {
65346017e95SHugh Dickins /* Ok, do the async read-ahead now */
654c4fa6309SHuang Ying page = __read_swap_cache_async(
655c4fa6309SHuang Ying swp_entry(swp_type(entry), offset),
656c4fa6309SHuang Ying gfp_mask, vma, addr, &page_allocated);
65746017e95SHugh Dickins if (!page)
65867f96aa2SRik van Riel continue;
659c4fa6309SHuang Ying if (page_allocated) {
6605169b844SNeilBrown swap_readpage(page, false, &splug);
661eaf649ebSMinchan Kim if (offset != entry_offset) {
662579f8290SShaohua Li SetPageReadahead(page);
663cbc65df2SHuang Ying count_vm_event(SWAP_RA);
664cbc65df2SHuang Ying }
665c4fa6309SHuang Ying }
66609cbfeafSKirill A. Shutemov put_page(page);
66746017e95SHugh Dickins }
6683fb5c298SChristian Ehrhardt blk_finish_plug(&plug);
6695169b844SNeilBrown swap_read_unplug(splug);
6703fb5c298SChristian Ehrhardt
67146017e95SHugh Dickins lru_add_drain(); /* Push any new pages onto the LRU now */
672579f8290SShaohua Li skip:
6735169b844SNeilBrown /* The page was likely read above, so no need for plugging here */
6745169b844SNeilBrown return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
67546017e95SHugh Dickins }
6764b3ef9daSHuang, Ying
/*
 * Allocate and initialise the swap-cache address spaces for swap device
 * @type. Returns 0 on success or -ENOMEM.
 */
6774b3ef9daSHuang, Ying int init_swap_address_space(unsigned int type, unsigned long nr_pages)
6784b3ef9daSHuang, Ying {
6794b3ef9daSHuang, Ying struct address_space *spaces, *space;
6804b3ef9daSHuang, Ying unsigned int i, nr;
6814b3ef9daSHuang, Ying
/*
 * One address_space per SWAP_ADDRESS_SPACE_PAGES chunk of the device,
 * so swap-cache xarray lock contention is spread over several trees.
 */
6824b3ef9daSHuang, Ying nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
683778e1cddSKees Cook spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
6844b3ef9daSHuang, Ying if (!spaces)
6854b3ef9daSHuang, Ying return -ENOMEM;
6864b3ef9daSHuang, Ying for (i = 0; i < nr; i++) {
6874b3ef9daSHuang, Ying space = spaces + i;
688a2833486SMatthew Wilcox xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
6894b3ef9daSHuang, Ying atomic_set(&space->i_mmap_writable, 0);
6904b3ef9daSHuang, Ying space->a_ops = &swap_aops;
6914b3ef9daSHuang, Ying /* swap cache doesn't use writeback related tags */
6924b3ef9daSHuang, Ying mapping_set_no_writeback_tags(space);
6934b3ef9daSHuang, Ying }
6944b3ef9daSHuang, Ying nr_swapper_spaces[type] = nr;
695054f1d1fSHuang Ying swapper_spaces[type] = spaces;
6964b3ef9daSHuang, Ying
6974b3ef9daSHuang, Ying return 0;
6984b3ef9daSHuang, Ying }
6994b3ef9daSHuang, Ying
/* Tear down the swap-cache address spaces created for swap device @type. */
7004b3ef9daSHuang, Ying void exit_swap_address_space(unsigned int type)
7014b3ef9daSHuang, Ying {
702eea4a501SHuang Ying int i;
703eea4a501SHuang Ying struct address_space *spaces = swapper_spaces[type];
704eea4a501SHuang Ying
705eea4a501SHuang Ying for (i = 0; i < nr_swapper_spaces[type]; i++)
/* Every swap cache tree must already be empty at swapoff time. */
706eea4a501SHuang Ying VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
707eea4a501SHuang Ying kvfree(spaces);
7084b3ef9daSHuang, Ying nr_swapper_spaces[type] = 0;
709054f1d1fSHuang Ying swapper_spaces[type] = NULL;
7104b3ef9daSHuang, Ying }
711ec560175SHuang Ying
712*4f8fcf4cSHugh Dickins #define SWAP_RA_ORDER_CEILING 5
713*4f8fcf4cSHugh Dickins
/*
 * Per-fault description of a VMA-based readahead window, filled in by
 * swap_ra_info() and consumed by swap_vma_readahead().
 */
714*4f8fcf4cSHugh Dickins struct vma_swap_readahead {
/* Window size in pages; 1 means no readahead. */
715*4f8fcf4cSHugh Dickins unsigned short win;
/* Index of the faulting page within the scanned PTE range. */
716*4f8fcf4cSHugh Dickins unsigned short offset;
/* Number of PTEs to scan (window clamped to VMA and PMD). */
717*4f8fcf4cSHugh Dickins unsigned short nr_pte;
718*4f8fcf4cSHugh Dickins };
719*4f8fcf4cSHugh Dickins
720eaf649ebSMinchan Kim static void swap_ra_info(struct vm_fault *vmf,
721eaf649ebSMinchan Kim struct vma_swap_readahead *ra_info)
722ec560175SHuang Ying {
723ec560175SHuang Ying struct vm_area_struct *vma = vmf->vma;
724eaf649ebSMinchan Kim unsigned long ra_val;
72516ba391eSKairui Song unsigned long faddr, pfn, fpfn, lpfn, rpfn;
726ec560175SHuang Ying unsigned long start, end;
72716ba391eSKairui Song unsigned int max_win, hits, prev_win, win;
728ec560175SHuang Ying
72961b63972SHuang Ying max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
73061b63972SHuang Ying SWAP_RA_ORDER_CEILING);
73161b63972SHuang Ying if (max_win == 1) {
732eaf649ebSMinchan Kim ra_info->win = 1;
733eaf649ebSMinchan Kim return;
73461b63972SHuang Ying }
73561b63972SHuang Ying
/* Unpack the previous fault address / window / hit count from the VMA. */
736ec560175SHuang Ying faddr = vmf->address;
737ec560175SHuang Ying fpfn = PFN_DOWN(faddr);
738eaf649ebSMinchan Kim ra_val = GET_SWAP_RA_VAL(vma);
739eaf649ebSMinchan Kim pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
740eaf649ebSMinchan Kim prev_win = SWAP_RA_WIN(ra_val);
741eaf649ebSMinchan Kim hits = SWAP_RA_HITS(ra_val);
742eaf649ebSMinchan Kim ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
743ec560175SHuang Ying max_win, prev_win);
744ec560175SHuang Ying atomic_long_set(&vma->swap_readahead_info,
745ec560175SHuang Ying SWAP_RA_VAL(faddr, win, 0));
74618ad72f5SKairui Song if (win == 1)
747eaf649ebSMinchan Kim return;
748ec560175SHuang Ying
/*
 * Place the window: extend forward for a forward sequential fault,
 * backward for a backward one, otherwise centre it on the fault pfn.
 */
74916ba391eSKairui Song if (fpfn == pfn + 1) {
75016ba391eSKairui Song lpfn = fpfn;
75116ba391eSKairui Song rpfn = fpfn + win;
75216ba391eSKairui Song } else if (pfn == fpfn + 1) {
75316ba391eSKairui Song lpfn = fpfn - win + 1;
75416ba391eSKairui Song rpfn = fpfn + 1;
75516ba391eSKairui Song } else {
75616ba391eSKairui Song unsigned int left = (win - 1) / 2;
75716ba391eSKairui Song
75816ba391eSKairui Song lpfn = fpfn - left;
75916ba391eSKairui Song rpfn = fpfn + win - left;
760ec560175SHuang Ying }
/* Clamp the window to the VMA and to the PMD that covers faddr. */
76116ba391eSKairui Song start = max3(lpfn, PFN_DOWN(vma->vm_start),
76216ba391eSKairui Song PFN_DOWN(faddr & PMD_MASK));
76316ba391eSKairui Song end = min3(rpfn, PFN_DOWN(vma->vm_end),
76416ba391eSKairui Song PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
76516ba391eSKairui Song
766eaf649ebSMinchan Kim ra_info->nr_pte = end - start;
767eaf649ebSMinchan Kim ra_info->offset = fpfn - start;
768ec560175SHuang Ying }
769ec560175SHuang Ying
770e9f59873SYang Shi /**
771e9f59873SYang Shi * swap_vma_readahead - swap in pages in hope we need them soon
77227ec4878SKrzysztof Kozlowski * @fentry: swap entry of this memory
773e9f59873SYang Shi * @gfp_mask: memory allocation flags
774e9f59873SYang Shi * @vmf: fault information
775e9f59873SYang Shi *
776e9f59873SYang Shi * Returns the struct page for entry and addr, after queueing swapin.
777e9f59873SYang Shi *
778cb152a1aSShijie Luo * Primitive swap readahead code. We simply read in a few pages whose
779e9f59873SYang Shi * virtual addresses are around the fault address in the same vma.
780e9f59873SYang Shi *
781c1e8d7c6SMichel Lespinasse * Caller must hold read mmap_lock if vmf->vma is not NULL.
782e9f59873SYang Shi *
783e9f59873SYang Shi */
784f5c754d6SColin Ian King static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
785eaf649ebSMinchan Kim struct vm_fault *vmf)
786ec560175SHuang Ying {
787ec560175SHuang Ying struct blk_plug plug;
7885169b844SNeilBrown struct swap_iocb *splug = NULL;
789ec560175SHuang Ying struct vm_area_struct *vma = vmf->vma;
790ec560175SHuang Ying struct page *page;
791*4f8fcf4cSHugh Dickins pte_t *pte = NULL, pentry;
792*4f8fcf4cSHugh Dickins unsigned long addr;
793ec560175SHuang Ying swp_entry_t entry;
794ec560175SHuang Ying unsigned int i;
795ec560175SHuang Ying bool page_allocated;
796e97af699SMiaohe Lin struct vma_swap_readahead ra_info = {
797e97af699SMiaohe Lin .win = 1,
798e97af699SMiaohe Lin };
799ec560175SHuang Ying
800eaf649ebSMinchan Kim swap_ra_info(vmf, &ra_info);
801eaf649ebSMinchan Kim if (ra_info.win == 1)
802ec560175SHuang Ying goto skip;
803ec560175SHuang Ying
/* First virtual address of the readahead window. */
804*4f8fcf4cSHugh Dickins addr = vmf->address - (ra_info.offset * PAGE_SIZE);
805*4f8fcf4cSHugh Dickins
806ec560175SHuang Ying blk_start_plug(&plug);
807*4f8fcf4cSHugh Dickins for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
/*
 * Map the PTE page lazily; pte_offset_map() can fail if the page
 * table changed under us, in which case give up on readahead.
 */
808*4f8fcf4cSHugh Dickins if (!pte++) {
809*4f8fcf4cSHugh Dickins pte = pte_offset_map(vmf->pmd, addr);
810*4f8fcf4cSHugh Dickins if (!pte)
811*4f8fcf4cSHugh Dickins break;
812*4f8fcf4cSHugh Dickins }
813*4f8fcf4cSHugh Dickins pentry = ptep_get_lockless(pte);
81492bafb20SMiaohe Lin if (!is_swap_pte(pentry))
815ec560175SHuang Ying continue;
816ec560175SHuang Ying entry = pte_to_swp_entry(pentry);
817ec560175SHuang Ying if (unlikely(non_swap_entry(entry)))
818ec560175SHugh Dickins continue;
/*
 * NOTE(review): the PTE mapping is dropped before calling into the
 * swap cache — presumably because those calls may sleep; confirm.
 */
819*4f8fcf4cSHugh Dickins pte_unmap(pte);
820*4f8fcf4cSHugh Dickins pte = NULL;
821ec560175SHuang Ying page = __read_swap_cache_async(entry, gfp_mask, vma,
822*4f8fcf4cSHugh Dickins addr, &page_allocated);
823ec560175SHuang Ying if (!page)
824ec560175SHuang Ying continue;
825ec560175SHuang Ying if (page_allocated) {
8265169b844SNeilBrown swap_readpage(page, false, &splug);
827eaf649ebSMinchan Kim if (i != ra_info.offset) {
828ec560175SHuang Ying SetPageReadahead(page);
829ec560175SHuang Ying count_vm_event(SWAP_RA);
830ec560175SHuang Ying }
831ec560175SHuang Ying }
832ec560175SHuang Ying put_page(page);
833ec560175SHuang Ying }
834*4f8fcf4cSHugh Dickins if (pte)
835*4f8fcf4cSHugh Dickins pte_unmap(pte);
836ec560175SHuang Ying blk_finish_plug(&plug);
8375169b844SNeilBrown swap_read_unplug(splug);
838ec560175SHuang Ying lru_add_drain();
839ec560175SHuang Ying skip:
8405169b844SNeilBrown /* The page was likely read above, so no need for plugging here */
841ec560175SHuang Ying return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
8425169b844SNeilBrown ra_info.win == 1, NULL);
843ec560175SHuang Ying }
844d9bfcfdcSHuang Ying
845e9e9b7ecSMinchan Kim /**
846e9e9b7ecSMinchan Kim * swapin_readahead - swap in pages in hope we need them soon
847e9e9b7ecSMinchan Kim * @entry: swap entry of this memory
848e9e9b7ecSMinchan Kim * @gfp_mask: memory allocation flags
849e9e9b7ecSMinchan Kim * @vmf: fault information
850e9e9b7ecSMinchan Kim *
851e9e9b7ecSMinchan Kim * Returns the struct page for entry and addr, after queueing swapin.
852e9e9b7ecSMinchan Kim *
853e9e9b7ecSMinchan Kim * It's a main entry function for swap readahead. By the configuration,
854e9e9b7ecSMinchan Kim * it will read ahead blocks by cluster-based(ie, physical disk based)
855e9e9b7ecSMinchan Kim * or vma-based(ie, virtual address based on faulty address) readahead.
856e9e9b7ecSMinchan Kim */
857e9e9b7ecSMinchan Kim struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
858e9e9b7ecSMinchan Kim struct vm_fault *vmf)
859e9e9b7ecSMinchan Kim {
860e9e9b7ecSMinchan Kim return swap_use_vma_readahead() ?
861e9e9b7ecSMinchan Kim swap_vma_readahead(entry, gfp_mask, vmf) :
862e9e9b7ecSMinchan Kim swap_cluster_readahead(entry, gfp_mask, vmf);
863e9e9b7ecSMinchan Kim }
864e9e9b7ecSMinchan Kim
865d9bfcfdcSHuang Ying #ifdef CONFIG_SYSFS
/* sysfs show: report vma readahead enablement as "true" or "false". */
866d9bfcfdcSHuang Ying static ssize_t vma_ra_enabled_show(struct kobject *kobj,
867d9bfcfdcSHuang Ying struct kobj_attribute *attr, char *buf)
868d9bfcfdcSHuang Ying {
869ae7a927dSJoe Perches return sysfs_emit(buf, "%s\n",
870ae7a927dSJoe Perches enable_vma_readahead ? "true" : "false");
871d9bfcfdcSHuang Ying }
/* sysfs store: parse a boolean and update enable_vma_readahead. */
872d9bfcfdcSHuang Ying static ssize_t vma_ra_enabled_store(struct kobject *kobj,
873d9bfcfdcSHuang Ying struct kobj_attribute *attr,
874d9bfcfdcSHuang Ying const char *buf, size_t count)
875d9bfcfdcSHuang Ying {
876717aeab4SJagdish Gediya ssize_t ret;
877717aeab4SJagdish Gediya
878717aeab4SJagdish Gediya ret = kstrtobool(buf, &enable_vma_readahead);
879717aeab4SJagdish Gediya if (ret)
880717aeab4SJagdish Gediya return ret;
881d9bfcfdcSHuang Ying
882d9bfcfdcSHuang Ying return count;
883d9bfcfdcSHuang Ying }
8846106b93eSMiaohe Lin static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
885d9bfcfdcSHuang Ying
886d9bfcfdcSHuang Ying static struct attribute *swap_attrs[] = {
887d9bfcfdcSHuang Ying &vma_ra_enabled_attr.attr,
888d9bfcfdcSHuang Ying NULL,
889d9bfcfdcSHuang Ying };
890d9bfcfdcSHuang Ying
891e48333b6SRikard Falkeborn static const struct attribute_group swap_attr_group = {
892d9bfcfdcSHuang Ying .attrs = swap_attrs,
893d9bfcfdcSHuang Ying };
894d9bfcfdcSHuang Ying
/* Create the "swap" kobject under mm_kobj and register its attributes. */
895d9bfcfdcSHuang Ying static int __init swap_init_sysfs(void)
896d9bfcfdcSHuang Ying {
897d9bfcfdcSHuang Ying int err;
898d9bfcfdcSHuang Ying struct kobject *swap_kobj;
899d9bfcfdcSHuang Ying
900d9bfcfdcSHuang Ying swap_kobj = kobject_create_and_add("swap", mm_kobj);
901d9bfcfdcSHuang Ying if (!swap_kobj) {
902d9bfcfdcSHuang Ying pr_err("failed to create swap kobject\n");
903d9bfcfdcSHuang Ying return -ENOMEM;
904d9bfcfdcSHuang Ying }
905d9bfcfdcSHuang Ying err = sysfs_create_group(swap_kobj, &swap_attr_group);
906d9bfcfdcSHuang Ying if (err) {
907d9bfcfdcSHuang Ying pr_err("failed to register swap group\n");
908d9bfcfdcSHuang Ying goto delete_obj;
909d9bfcfdcSHuang Ying }
910d9bfcfdcSHuang Ying return 0;
911d9bfcfdcSHuang Ying
912d9bfcfdcSHuang Ying delete_obj:
913d9bfcfdcSHuang Ying kobject_put(swap_kobj);
914d9bfcfdcSHuang Ying return err;
915d9bfcfdcSHuang Ying }
916d9bfcfdcSHuang Ying subsys_initcall(swap_init_sysfs);
917d9bfcfdcSHuang Ying #endif
918