// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
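
/*
 * vma->swap_readahead_info packs the per-VMA readahead state into a single
 * atomic_long: the page-aligned address of the last swapin fault in the
 * high bits (SWAP_RA_ADDR), and the current readahead window and hit count
 * in the sub-PAGE_SIZE bits (SWAP_RA_WIN, SWAP_RA_HITS), so the whole state
 * can be read and updated without taking a lock.
 */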

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap  = %ldkB\n", K(get_nr_swap_pages()));
	printk("Total swap = %lukB\n", K(total_swap_pages));
}

void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	struct page *page;

	page = xa_load(&address_space->i_pages, idx);
	if (xa_is_value(page))
		return page;
	return NULL;
}

/*
 * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
			gfp_t gfp, void **shadowp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
	unsigned long i, nr = folio_nr_pages(folio);
	void *old;

	xas_set_update(&xas, workingset_update_node);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

	folio_ref_add(folio, nr);
	folio_set_swapcache(folio);
	folio->swap = entry;

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
			if (shadowp) {
				old = xas_load(&xas);
				if (xa_is_value(old))
					*shadowp = old;
			}
			xas_store(&xas, folio);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

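	/*
	 * xas_nomem() drops out of the locked section to allocate XArray
	 * node memory and asks us to retry; once it returns false the
	 * store above has either succeeded or failed for good.
	 */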
	if (!xas_error(&xas))
		return 0;

	folio_clear_swapcache(folio);
	folio_ref_sub(folio, nr);
	return xas_error(&xas);
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct folio *folio,
			swp_entry_t entry, void *shadow)
{
	struct address_space *address_space = swap_address_space(entry);
	int i;
	long nr = folio_nr_pages(folio);
	pgoff_t idx = swp_offset(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	xas_set_update(&xas, workingset_update_node);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, shadow);
		VM_BUG_ON_PAGE(entry != folio, entry);
		xas_next(&xas);
	}
	folio->swap.val = 0;
	folio_clear_swapcache(folio);
	address_space->nrpages -= nr;
	__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}

/**
 * add_to_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
bool add_to_swap(struct folio *folio)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	entry = folio_alloc_swap(folio);
	if (!entry.val)
		return false;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(folio, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the folio will be dirtied in unmap because its pte should
	 * be dirty. A special case is an MADV_FREE page. The page's pte could
	 * have the dirty bit cleared while the folio's SwapBacked flag is
	 * still set, because clearing the dirty bit and the SwapBacked flag
	 * is not protected by a lock. For such a folio, unmap will not set
	 * the dirty bit, so folio reclaim will not write the folio out.
	 * This can cause data corruption when the folio is swapped in later.
	 * Always setting the dirty flag for the folio solves the problem.
	 */
	folio_mark_dirty(folio);

	return true;

fail:
	put_swap_folio(folio, entry);
	return false;
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache and locked.
 * It will never put the folio into the free list;
 * the caller has a reference on the folio.
 */
void delete_from_swap_cache(struct folio *folio)
{
	swp_entry_t entry = folio->swap;
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(folio, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_folio(folio, entry);
	folio_ref_sub(folio, folio_nr_pages(folio));
}

void clear_shadow_from_swap_cache(int type, unsigned long begin,
				unsigned long end)
{
	unsigned long curr = begin;
	void *old;

	for (;;) {
		swp_entry_t entry = swp_entry(type, curr);
		struct address_space *address_space = swap_address_space(entry);
		XA_STATE(xas, &address_space->i_pages, curr);

		xas_set_update(&xas, workingset_update_node);

		xa_lock_irq(&address_space->i_pages);
		xas_for_each(&xas, old, end) {
			if (!xa_is_value(old))
				continue;
			xas_store(&xas, NULL);
		}
		xa_unlock_irq(&address_space->i_pages);

		/* search the next swapcache until we meet end */
		curr >>= SWAP_ADDRESS_SPACE_SHIFT;
		curr++;
		curr <<= SWAP_ADDRESS_SPACE_SHIFT;
		if (curr > end)
			break;
	}
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 * 					- Marcelo
 */
void free_swap_cache(struct folio *folio)
{
	if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
	    folio_trylock(folio)) {
		folio_free_swap(folio);
		folio_unlock(folio);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	struct folio *folio = page_folio(page);

	free_swap_cache(folio);
	if (!is_huge_zero_page(page))
		folio_put(folio);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
	struct folio_batch folios;
	unsigned int refs[PAGEVEC_SIZE];

	lru_add_drain();
	folio_batch_init(&folios);
	for (int i = 0; i < nr; i++) {
		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));

		free_swap_cache(folio);
		refs[folios.nr] = 1;
		if (unlikely(encoded_page_flags(pages[i]) &
			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
			refs[folios.nr] = encoded_nr_pages(pages[++i]);

		if (folio_batch_add(&folios, folio) == 0)
			folios_put_refs(&folios, refs);
	}
	if (folios.nr)
		folios_put_refs(&folios, refs);
}

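/*
 * VMA-based readahead is used unless it has been disabled through sysfs
 * or a rotating swap device is in use (nr_rotate_swap), in which case
 * swapin_readahead() falls back to cluster-based readahead.
 */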
static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found folio will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the folio
 * lock before returning.
 *
 * Caller must lock the swap device or hold a reference to keep it valid.
 */
struct folio *swap_cache_get_folio(swp_entry_t entry,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct folio *folio;

	folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
	if (!IS_ERR(folio)) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(folio_test_large(folio)))
			return folio;

		readahead = folio_test_clear_readahead(folio);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	} else {
		folio = NULL;
	}

	return folio;
}

/**
 * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * This differs from filemap_get_folio() in that it will also look for the
 * folio in the swap cache.
 *
 * Return: The found folio or an ERR_PTR() if not found.
 */
struct folio *filemap_get_incore_folio(struct address_space *mapping,
		pgoff_t index)
{
	swp_entry_t swp;
	struct swap_info_struct *si;
	struct folio *folio = filemap_get_entry(mapping, index);

	if (!folio)
		return ERR_PTR(-ENOENT);
	if (!xa_is_value(folio))
		return folio;
	if (!shmem_mapping(mapping))
		return ERR_PTR(-ENOENT);

	swp = radix_to_swp_entry(folio);
	/* There might be swapin error entries in shmem mapping. */
	if (non_swap_entry(swp))
		return ERR_PTR(-ENOENT);
	/* Prevent swapoff from happening to us */
	si = get_swap_device(swp);
	if (!si)
		return ERR_PTR(-ENOENT);
	index = swp_offset(swp);
	folio = filemap_get_folio(swap_address_space(swp), index);
	put_swap_device(si);
	return folio;
}

struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
		bool skip_if_exists)
{
	struct swap_info_struct *si;
	struct folio *folio;
	void *shadow = NULL;

	*new_page_allocated = false;
	si = get_swap_device(entry);
	if (!si)
		return NULL;

	for (;;) {
		int err;
		/*
		 * First check the swap cache.  Since this is normally
		 * called after swap_cache_get_folio() failed, re-calling
		 * that would confuse statistics.
		 */
		folio = filemap_get_folio(swap_address_space(entry),
						swp_offset(entry));
		if (!IS_ERR(folio))
			goto got_folio;

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in later part of code or
		 * else swap_off will be aborted if we return NULL.
		 */
		if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
			goto fail_put_swap;

		/*
		 * Get a new folio to read into from swap.  Allocate it now,
		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
		 * cause any racers to loop around until we add it to cache.
		 */
		folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
						mpol, ilx, numa_node_id());
		if (!folio)
			goto fail_put_swap;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (!err)
			break;

		folio_put(folio);
		if (err != -EEXIST)
			goto fail_put_swap;

		/*
		 * Protect against a recursive call to __read_swap_cache_async()
		 * on the same entry waiting forever here because SWAP_HAS_CACHE
		 * is set but the folio is not in the swap cache yet. This can
		 * happen today if mem_cgroup_swapin_charge_folio() below
		 * triggers reclaim through zswap, which may call
		 * __read_swap_cache_async() in the writeback path.
		 */
		if (skip_if_exists)
			goto fail_put_swap;

		/*
		 * We might race against __delete_from_swap_cache(), and
		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
		 * has not yet been cleared.  Or race against another
		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
		 * in swap_map, but not yet added its folio to swap cache.
		 */
		schedule_timeout_uninterruptible(1);
	}

	/*
	 * The swap entry is ours to swap in. Prepare the new folio.
	 */

	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
		goto fail_unlock;

	/* May fail (-ENOMEM) if XArray node allocation failed. */
	if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
		goto fail_unlock;

	mem_cgroup_swapin_uncharge_swap(entry);

	if (shadow)
		workingset_refault(folio, shadow);

	/* Caller will initiate read into locked folio */
	folio_add_lru(folio);
	*new_page_allocated = true;
got_folio:
	put_swap_device(si);
	return folio;

fail_unlock:
	put_swap_folio(folio, entry);
	folio_unlock(folio);
	folio_put(folio);
fail_put_swap:
	put_swap_device(si);
	return NULL;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 *
 * get/put_swap_device() aren't needed to call this function, because
 * __read_swap_cache_async() calls them and swap_read_folio() holds the
 * swap cache folio lock.
 */
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr,
		struct swap_iocb **plug)
{
	bool page_allocated;
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vma, addr, 0, &ilx);
	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
					&page_allocated, false);
	mpol_cond_put(mpol);

	if (page_allocated)
		swap_read_folio(folio, false, plug);
	return folio;
}

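/*
 * Compute the readahead window: grow it with the recent hit count
 * (hits + 2, rounded up to a power of two when there were hits), never
 * beyond max_pages, and never shrink it below half of the previous
 * window in a single step.
 */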
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * Note: it is intentional that the same NUMA policy and interleave index
 * are used for every page of the readahead: neighbouring pages on swap
 * are fairly likely to have been swapped out from the same node.
 */
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t ilx)
{
	struct folio *folio;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool page_allocated;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		folio = __read_swap_cache_async(
				swp_entry(swp_type(entry), offset),
				gfp_mask, mpol, ilx, &page_allocated, false);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, false, &splug);
			if (offset != entry_offset) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
					&page_allocated, false);
	if (unlikely(page_allocated)) {
		zswap_folio_swapin(folio);
		swap_read_folio(folio, false, NULL);
	}
	return folio;
}

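/*
 * The swap cache for a swap device is split into multiple address_spaces
 * of SWAP_ADDRESS_SPACE_PAGES slots each; swap_address_space() picks one
 * by offset, so lookups and insertions for different regions of the device
 * contend on different i_pages locks.
 */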
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	int i;
	struct address_space *spaces = swapper_spaces[type];

	for (i = 0; i < nr_swapper_spaces[type]; i++)
		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
	kvfree(spaces);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

#define SWAP_RA_ORDER_CEILING	5

struct vma_swap_readahead {
	unsigned short win;
	unsigned short offset;
	unsigned short nr_pte;
};

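/*
 * Fill @ra_info with the window of PTEs to read around the faulting
 * address: the window size comes from __swapin_nr_pages(), is biased
 * towards the direction of a detected sequential access, and is clamped
 * to the VMA and to the page table (PMD) covering the fault.
 */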
static void swap_ra_info(struct vm_fault *vmf,
			 struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, pfn, fpfn, lpfn, rpfn;
	unsigned long start, end;
	unsigned int max_win, hits, prev_win, win;

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));
	if (win == 1)
		return;

	if (fpfn == pfn + 1) {
		lpfn = fpfn;
		rpfn = fpfn + win;
	} else if (pfn == fpfn + 1) {
		lpfn = fpfn - win + 1;
		rpfn = fpfn + 1;
	} else {
		unsigned int left = (win - 1) / 2;

		lpfn = fpfn - left;
		rpfn = fpfn + win - left;
	}
	start = max3(lpfn, PFN_DOWN(vma->vm_start),
		     PFN_DOWN(faddr & PMD_MASK));
	end = min3(rpfn, PFN_DOWN(vma->vm_end),
		   PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));

	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @targ_entry: swap entry of the targeted memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct folio *folio;
	pte_t *pte = NULL, pentry;
	unsigned long addr;
	swp_entry_t entry;
	pgoff_t ilx;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {
		.win = 1,
	};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	addr = vmf->address - (ra_info.offset * PAGE_SIZE);
	ilx = targ_ilx - ra_info.offset;

	blk_start_plug(&plug);
	for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
		if (!pte++) {
			pte = pte_offset_map(vmf->pmd, addr);
			if (!pte)
				break;
		}
		pentry = ptep_get_lockless(pte);
		if (!is_swap_pte(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		pte_unmap(pte);
		pte = NULL;
		folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
						&page_allocated, false);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, false, &splug);
			if (i != ra_info.offset) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	if (pte)
		pte_unmap(pte);
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The folio was likely read above, so no need for plugging here */
	folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
					&page_allocated, false);
	if (unlikely(page_allocated)) {
		zswap_folio_swapin(folio);
		swap_read_folio(folio, false, NULL);
	}
	return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * It's the main entry function for swap readahead. Depending on the
 * configuration, it reads ahead blocks using either cluster-based
 * (i.e. physical disk based) or vma-based (i.e. virtual address based
 * on the faulting address) readahead.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			      struct vm_fault *vmf)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
	folio = swap_use_vma_readahead() ?
		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
	mpol_cond_put(mpol);

	if (!folio)
		return NULL;
	return folio_file_page(folio, swp_offset(entry));
}

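/*
 * Expose the VMA readahead toggle as /sys/kernel/mm/swap/vma_ra_enabled
 * (the "swap" kobject under mm_kobj); writing false makes every swapin
 * fault use cluster-based readahead instead.
 */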
"true" : "false"); 918d9bfcfdcSHuang Ying } 919d9bfcfdcSHuang Ying static ssize_t vma_ra_enabled_store(struct kobject *kobj, 920d9bfcfdcSHuang Ying struct kobj_attribute *attr, 921d9bfcfdcSHuang Ying const char *buf, size_t count) 922d9bfcfdcSHuang Ying { 923717aeab4SJagdish Gediya ssize_t ret; 924717aeab4SJagdish Gediya 925717aeab4SJagdish Gediya ret = kstrtobool(buf, &enable_vma_readahead); 926717aeab4SJagdish Gediya if (ret) 927717aeab4SJagdish Gediya return ret; 928d9bfcfdcSHuang Ying 929d9bfcfdcSHuang Ying return count; 930d9bfcfdcSHuang Ying } 9316106b93eSMiaohe Lin static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); 932d9bfcfdcSHuang Ying 933d9bfcfdcSHuang Ying static struct attribute *swap_attrs[] = { 934d9bfcfdcSHuang Ying &vma_ra_enabled_attr.attr, 935d9bfcfdcSHuang Ying NULL, 936d9bfcfdcSHuang Ying }; 937d9bfcfdcSHuang Ying 938e48333b6SRikard Falkeborn static const struct attribute_group swap_attr_group = { 939d9bfcfdcSHuang Ying .attrs = swap_attrs, 940d9bfcfdcSHuang Ying }; 941d9bfcfdcSHuang Ying 942d9bfcfdcSHuang Ying static int __init swap_init_sysfs(void) 943d9bfcfdcSHuang Ying { 944d9bfcfdcSHuang Ying int err; 945d9bfcfdcSHuang Ying struct kobject *swap_kobj; 946d9bfcfdcSHuang Ying 947d9bfcfdcSHuang Ying swap_kobj = kobject_create_and_add("swap", mm_kobj); 948d9bfcfdcSHuang Ying if (!swap_kobj) { 949d9bfcfdcSHuang Ying pr_err("failed to create swap kobject\n"); 950d9bfcfdcSHuang Ying return -ENOMEM; 951d9bfcfdcSHuang Ying } 952d9bfcfdcSHuang Ying err = sysfs_create_group(swap_kobj, &swap_attr_group); 953d9bfcfdcSHuang Ying if (err) { 954d9bfcfdcSHuang Ying pr_err("failed to register swap group\n"); 955d9bfcfdcSHuang Ying goto delete_obj; 956d9bfcfdcSHuang Ying } 957d9bfcfdcSHuang Ying return 0; 958d9bfcfdcSHuang Ying 959d9bfcfdcSHuang Ying delete_obj: 960d9bfcfdcSHuang Ying kobject_put(swap_kobj); 961d9bfcfdcSHuang Ying return err; 962d9bfcfdcSHuang Ying } 963d9bfcfdcSHuang Ying subsys_initcall(swap_init_sysfs); 964d9bfcfdcSHuang Ying #endif 965