// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
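
/*
 * Illustrative note (not part of the original source): the macros above pack
 * three fields of vma->swap_readahead_info into one long.  Assuming 4KB pages
 * (PAGE_SHIFT == 12, so SWAP_RA_WIN_SHIFT == 6):
 *
 *	bits  0-5	readahead hits counted since the last window update
 *	bits  6-11	current readahead window size, in pages
 *	bits 12 and up	page-aligned address of the last swapin fault
 *
 * For example, SWAP_RA_VAL(addr, 8, 3) evaluates to
 * (addr & PAGE_MASK) | (8 << 6) | 3.
 */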

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap  = %ldkB\n", K(get_nr_swap_pages()));
	printk("Total swap = %lukB\n", K(total_swap_pages));
}

void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	struct page *page;

	page = xa_load(&address_space->i_pages, idx);
	if (xa_is_value(page))
		return page;
	return NULL;
}

/*
 * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
			gfp_t gfp, void **shadowp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
	unsigned long i, nr = folio_nr_pages(folio);
	void *old;

	xas_set_update(&xas, workingset_update_node);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

	folio_ref_add(folio, nr);
	folio_set_swapcache(folio);
	folio->swap = entry;

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
			if (shadowp) {
				old = xas_load(&xas);
				if (xa_is_value(old))
					*shadowp = old;
			}
			xas_store(&xas, folio);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

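	/*
	 * Illustrative note (not part of the original source): if the store
	 * above failed for lack of XArray nodes, xas_nomem() allocates a node
	 * with @gfp outside the lock and the loop retries; whatever error is
	 * still recorded in the xa_state at this point is final.
	 */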
	if (!xas_error(&xas))
		return 0;

	folio_clear_swapcache(folio);
	folio_ref_sub(folio, nr);
	return xas_error(&xas);
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct folio *folio,
			swp_entry_t entry, void *shadow)
{
	struct address_space *address_space = swap_address_space(entry);
	int i;
	long nr = folio_nr_pages(folio);
	pgoff_t idx = swp_offset(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	xas_set_update(&xas, workingset_update_node);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, shadow);
		VM_BUG_ON_PAGE(entry != folio, entry);
		xas_next(&xas);
	}
	folio->swap.val = 0;
	folio_clear_swapcache(folio);
	address_space->nrpages -= nr;
	__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}

/**
 * add_to_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
bool add_to_swap(struct folio *folio)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	entry = folio_alloc_swap(folio);
	if (!entry.val)
		return false;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(folio, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the folio will be dirtied in unmap because its
	 * pte should be dirty. A special case is an MADV_FREE page: its
	 * pte could have the dirty bit cleared while the folio's
	 * SwapBacked flag is still set, because clearing the dirty bit
	 * and the SwapBacked flag is not protected by a lock. For such a
	 * folio, unmap will not set the dirty bit, so folio reclaim will
	 * not write the folio out. This can cause data corruption when
	 * the folio is swapped in later. Always setting the dirty flag
	 * for the folio solves the problem.
	 */
	folio_mark_dirty(folio);

	return true;

fail:
	put_swap_folio(folio, entry);
	return false;
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache and locked.
 * It will never put the folio into the free list,
 * the caller has a reference on the folio.
 */
void delete_from_swap_cache(struct folio *folio)
{
	swp_entry_t entry = folio->swap;
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(folio, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_folio(folio, entry);
	folio_ref_sub(folio, folio_nr_pages(folio));
}

void clear_shadow_from_swap_cache(int type, unsigned long begin,
				unsigned long end)
{
	unsigned long curr = begin;
	void *old;

	for (;;) {
		swp_entry_t entry = swp_entry(type, curr);
		struct address_space *address_space = swap_address_space(entry);
		XA_STATE(xas, &address_space->i_pages, curr);

		xas_set_update(&xas, workingset_update_node);

		xa_lock_irq(&address_space->i_pages);
		xas_for_each(&xas, old, end) {
			if (!xa_is_value(old))
				continue;
			xas_store(&xas, NULL);
		}
		xa_unlock_irq(&address_space->i_pages);

		/* search the next swapcache until we meet end */
		curr >>= SWAP_ADDRESS_SPACE_SHIFT;
		curr++;
		curr <<= SWAP_ADDRESS_SPACE_SHIFT;
		if (curr > end)
			break;
	}
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 *					- Marcelo
 */
void free_swap_cache(struct page *page)
{
	struct folio *folio = page_folio(page);

	if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
	    folio_trylock(folio)) {
		folio_free_swap(folio);
		folio_unlock(folio);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
	lru_add_drain();
	for (int i = 0; i < nr; i++)
		free_swap_cache(encoded_page_ptr(pages[i]));
	release_pages(pages, nr);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Look up a swap entry in the swap cache. A found folio will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the folio
 * lock before returning.
 *
 * Caller must lock the swap device or hold a reference to keep it valid.
 */
struct folio *swap_cache_get_folio(swp_entry_t entry,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct folio *folio;

	folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
	if (!IS_ERR(folio)) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(folio_test_large(folio)))
			return folio;

		readahead = folio_test_clear_readahead(folio);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	} else {
		folio = NULL;
	}

	return folio;
}

/**
 * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * This differs from filemap_get_folio() in that it will also look for the
 * folio in the swap cache.
 *
 * Return: The found folio or an ERR_PTR().
 */
struct folio *filemap_get_incore_folio(struct address_space *mapping,
		pgoff_t index)
{
	swp_entry_t swp;
	struct swap_info_struct *si;
	struct folio *folio = filemap_get_entry(mapping, index);

	if (!folio)
		return ERR_PTR(-ENOENT);
	if (!xa_is_value(folio))
		return folio;
	if (!shmem_mapping(mapping))
		return ERR_PTR(-ENOENT);

	swp = radix_to_swp_entry(folio);
	/* There might be swapin error entries in shmem mapping. */
	if (non_swap_entry(swp))
		return ERR_PTR(-ENOENT);
	/* Prevent swapoff from happening to us */
	si = get_swap_device(swp);
	if (!si)
		return ERR_PTR(-ENOENT);
	index = swp_offset(swp);
	folio = filemap_get_folio(swap_address_space(swp), index);
	put_swap_device(si);
	return folio;
}

struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
		bool skip_if_exists)
{
	struct swap_info_struct *si;
	struct folio *folio;
	void *shadow = NULL;

	*new_page_allocated = false;
	si = get_swap_device(entry);
	if (!si)
		return NULL;

	for (;;) {
		int err;
		/*
		 * First check the swap cache.  Since this is normally
		 * called after swap_cache_get_folio() failed, re-calling
		 * that would confuse statistics.
		 */
		folio = filemap_get_folio(swap_address_space(entry),
					  swp_offset(entry));
		if (!IS_ERR(folio))
			goto got_folio;

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in later part of code or
		 * else swap_off will be aborted if we return NULL.
		 */
		if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
			goto fail_put_swap;

		/*
		 * Get a new folio to read into from swap.  Allocate it now,
		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
		 * cause any racers to loop around until we add it to cache.
		 */
		folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
						mpol, ilx, numa_node_id());
		if (!folio)
			goto fail_put_swap;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (!err)
			break;

		folio_put(folio);
		if (err != -EEXIST)
			goto fail_put_swap;

		/*
		 * Protect against a recursive call to __read_swap_cache_async()
		 * on the same entry waiting forever here because SWAP_HAS_CACHE
		 * is set but the folio is not in the swap cache yet. This can
		 * happen today if mem_cgroup_swapin_charge_folio() below
		 * triggers reclaim through zswap, which may call
		 * __read_swap_cache_async() in the writeback path.
		 */
		if (skip_if_exists)
			goto fail_put_swap;

		/*
		 * We might race against __delete_from_swap_cache(), and
		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
		 * has not yet been cleared.  Or race against another
		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
		 * in swap_map, but not yet added its folio to swap cache.
		 */
		schedule_timeout_uninterruptible(1);
	}

	/*
	 * The swap entry is ours to swap in. Prepare the new folio.
	 */

	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
		goto fail_unlock;

	/* May fail (-ENOMEM) if XArray node allocation failed. */
	if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
		goto fail_unlock;

	mem_cgroup_swapin_uncharge_swap(entry);

	if (shadow)
		workingset_refault(folio, shadow);

	/* Caller will initiate read into locked folio */
	folio_add_lru(folio);
	*new_page_allocated = true;
got_folio:
	put_swap_device(si);
	return folio;

fail_unlock:
	put_swap_folio(folio, entry);
	folio_unlock(folio);
	folio_put(folio);
fail_put_swap:
	put_swap_device(si);
	return NULL;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 *
 * get/put_swap_device() aren't needed to call this function, because
 * __read_swap_cache_async() calls them and swap_read_folio() holds the
 * swap cache folio lock.
 */
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr,
		struct swap_iocb **plug)
{
	bool page_allocated;
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vma, addr, 0, &ilx);
	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
					&page_allocated, false);
	mpol_cond_put(mpol);

	if (page_allocated)
		swap_read_folio(folio, false, plug);
	return folio;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * Note: it is intentional that the same NUMA policy and interleave index
 * are used for every page of the readahead: neighbouring pages on swap
 * are fairly likely to have been swapped out from the same node.
 */
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t ilx)
{
	struct folio *folio;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool page_allocated;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		folio = __read_swap_cache_async(
				swp_entry(swp_type(entry), offset),
				gfp_mask, mpol, ilx, &page_allocated, false);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, false, &splug);
			if (offset != entry_offset) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
					&page_allocated, false);
	if (unlikely(page_allocated)) {
		zswap_folio_swapin(folio);
		swap_read_folio(folio, false, NULL);
	}
	return folio;
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
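
	/*
	 * Illustrative note (not part of the original source): the swap cache
	 * for one swap device is split across several address_spaces, one per
	 * SWAP_ADDRESS_SPACE_PAGES worth of swap slots, to spread contention
	 * on the i_pages lock; swap_address_space(entry) selects the right
	 * one from the swap offset.
	 */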
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	int i;
	struct address_space *spaces = swapper_spaces[type];

	for (i = 0; i < nr_swapper_spaces[type]; i++)
		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
	kvfree(spaces);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

#define SWAP_RA_ORDER_CEILING	5

struct vma_swap_readahead {
	unsigned short win;
	unsigned short offset;
	unsigned short nr_pte;
};

static void swap_ra_info(struct vm_fault *vmf,
			 struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, pfn, fpfn, lpfn, rpfn;
	unsigned long start, end;
	unsigned int max_win, hits, prev_win, win;

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));
	if (win == 1)
		return;

	if (fpfn == pfn + 1) {
		lpfn = fpfn;
		rpfn = fpfn + win;
	} else if (pfn == fpfn + 1) {
		lpfn = fpfn - win + 1;
		rpfn = fpfn + 1;
	} else {
		unsigned int left = (win - 1) / 2;

		lpfn = fpfn - left;
		rpfn = fpfn + win - left;
	}
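	/*
	 * Illustrative note (not part of the original source): the window is
	 * placed ahead of, behind, or centred on the faulting pfn depending
	 * on the detected access direction, then clamped below to the VMA and
	 * to the PMD covering the fault address, so the PTE walk in
	 * swap_vma_readahead() stays within a single page table.
	 */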
	start = max3(lpfn, PFN_DOWN(vma->vm_start),
		     PFN_DOWN(faddr & PMD_MASK));
	end = min3(rpfn, PFN_DOWN(vma->vm_end),
		   PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));

	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @targ_entry: swap entry of the targeted memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct folio *folio;
	pte_t *pte = NULL, pentry;
	unsigned long addr;
	swp_entry_t entry;
	pgoff_t ilx;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {
		.win = 1,
	};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	addr = vmf->address - (ra_info.offset * PAGE_SIZE);
	ilx = targ_ilx - ra_info.offset;

	blk_start_plug(&plug);
	for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
		if (!pte++) {
			pte = pte_offset_map(vmf->pmd, addr);
			if (!pte)
				break;
		}
		pentry = ptep_get_lockless(pte);
		if (!is_swap_pte(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		pte_unmap(pte);
		pte = NULL;
		folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
						&page_allocated, false);
		if (!folio)
			continue;
		if (page_allocated) {
			swap_read_folio(folio, false, &splug);
			if (i != ra_info.offset) {
				folio_set_readahead(folio);
				count_vm_event(SWAP_RA);
			}
		}
		folio_put(folio);
	}
	if (pte)
		pte_unmap(pte);
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The folio was likely read above, so no need for plugging here */
	folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
					&page_allocated, false);
	if (unlikely(page_allocated)) {
		zswap_folio_swapin(folio);
		swap_read_folio(folio, false, NULL);
	}
	return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * It's the main entry point for swap readahead. Depending on the
 * configuration, it will read ahead blocks using cluster-based (i.e.
 * physical disk based) or vma-based (i.e. virtual addresses around the
 * fault address) readahead.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			      struct vm_fault *vmf)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
	folio = swap_use_vma_readahead() ?
		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
	mpol_cond_put(mpol);

	if (!folio)
		return NULL;
	return folio_file_page(folio, swp_offset(entry));
}

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &enable_vma_readahead);
	if (ret)
		return ret;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif