// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	struct page *page;

	page = xa_load(&address_space->i_pages, idx);
	if (xa_is_value(page))
		return page;
	return NULL;
}

/*
 * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
			gfp_t gfp, void **shadowp)
{
	struct address_space *address_space = swap_address_space(entry);
	pgoff_t idx = swp_offset(entry);
	XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
	unsigned long i, nr = folio_nr_pages(folio);
	void *old;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

	folio_ref_add(folio, nr);
	folio_set_swapcache(folio);

	do {
		xas_lock_irq(&xas);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
		for (i = 0; i < nr; i++) {
			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
			old = xas_load(&xas);
			if (xa_is_value(old)) {
				if (shadowp)
					*shadowp = old;
			}
			set_page_private(folio_page(folio, i), entry.val + i);
			xas_store(&xas, folio);
			xas_next(&xas);
		}
		address_space->nrpages += nr;
		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

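	/*
	 * If the store above failed for lack of memory, xas_nomem()
	 * allocates the missing XArray node outside the lock and returns
	 * true, so the insertion is retried; once it returns false, any
	 * remaining error is reported by xas_error() below.
	 */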
	if (!xas_error(&xas))
		return 0;

	folio_clear_swapcache(folio);
	folio_ref_sub(folio, nr);
	return xas_error(&xas);
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct folio *folio,
			swp_entry_t entry, void *shadow)
{
	struct address_space *address_space = swap_address_space(entry);
	int i;
	long nr = folio_nr_pages(folio);
	pgoff_t idx = swp_offset(entry);
	XA_STATE(xas, &address_space->i_pages, idx);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);

	for (i = 0; i < nr; i++) {
		void *entry = xas_store(&xas, shadow);
		VM_BUG_ON_PAGE(entry != folio, entry);
		set_page_private(folio_page(folio, i), 0);
		xas_next(&xas);
	}
	folio_clear_swapcache(folio);
	address_space->nrpages -= nr;
	__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}

/**
 * add_to_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
bool add_to_swap(struct folio *folio)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

	entry = folio_alloc_swap(folio);
	if (!entry.val)
		return false;

	/*
	 * XArray node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(folio, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the folio will be dirtied in unmap because its
	 * pte should be dirty. A special case is MADV_FREE page. The
	 * page's pte could have dirty bit cleared but the folio's
	 * SwapBacked flag is still set because clearing the dirty bit
	 * and the SwapBacked flag is not protected by a lock. For such
	 * a folio, unmap will not set the dirty bit for it, so folio
	 * reclaim will not write the folio out. This can cause data
	 * corruption when the folio is swapped in later. Always setting
	 * the dirty flag for the folio solves the problem.
	 */
	folio_mark_dirty(folio);

	return true;

fail:
	put_swap_folio(folio, entry);
	return false;
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache and locked.
 * It will never put the folio into the free list,
 * the caller has a reference on the folio.
 */
void delete_from_swap_cache(struct folio *folio)
{
	swp_entry_t entry = folio_swap_entry(folio);
	struct address_space *address_space = swap_address_space(entry);

	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(folio, entry, NULL);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_folio(folio, entry);
	folio_ref_sub(folio, folio_nr_pages(folio));
}

void clear_shadow_from_swap_cache(int type, unsigned long begin,
				unsigned long end)
{
	unsigned long curr = begin;
	void *old;

	for (;;) {
		swp_entry_t entry = swp_entry(type, curr);
		struct address_space *address_space = swap_address_space(entry);
		XA_STATE(xas, &address_space->i_pages, curr);

		xa_lock_irq(&address_space->i_pages);
		xas_for_each(&xas, old, end) {
			if (!xa_is_value(old))
				continue;
			xas_store(&xas, NULL);
		}
		xa_unlock_irq(&address_space->i_pages);

		/* search the next swapcache until we meet end */
		curr >>= SWAP_ADDRESS_SPACE_SHIFT;
		curr++;
		curr <<= SWAP_ADDRESS_SPACE_SHIFT;
		if (curr > end)
			break;
	}
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 *					- Marcelo
 */
void free_swap_cache(struct page *page)
{
	struct folio *folio = page_folio(page);

	if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
	    folio_trylock(folio)) {
		folio_free_swap(folio);
		folio_unlock(folio);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
	lru_add_drain();
	for (int i = 0; i < nr; i++)
		free_swap_cache(encoded_page_ptr(pages[i]));
	release_pages(pages, nr);
}

static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Look up a swap entry in the swap cache. A found folio will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the folio
 * lock before returning.
 */
struct folio *swap_cache_get_folio(swp_entry_t entry,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct folio *folio;
	struct swap_info_struct *si;

	si = get_swap_device(entry);
	if (!si)
		return NULL;
	folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
	put_swap_device(si);

	if (folio) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(folio_test_large(folio)))
			return folio;

		readahead = folio_test_clear_readahead(folio);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	}

	return folio;
}

/**
 * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * This differs from filemap_get_folio() in that it will also look for the
 * folio in the swap cache.
 *
 * Return: The found folio or %NULL.
 */
struct folio *filemap_get_incore_folio(struct address_space *mapping,
		pgoff_t index)
{
	swp_entry_t swp;
	struct swap_info_struct *si;
	struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0);

	if (!xa_is_value(folio))
		goto out;
	if (!shmem_mapping(mapping))
		return NULL;

	swp = radix_to_swp_entry(folio);
	/* There might be swapin error entries in shmem mapping. */
	if (non_swap_entry(swp))
		return NULL;
	/* Prevent swapoff from happening to us */
	si = get_swap_device(swp);
	if (!si)
		return NULL;
	index = swp_offset(swp);
	folio = filemap_get_folio(swap_address_space(swp), index);
	put_swap_device(si);
out:
	return folio;
}

struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct swap_info_struct *si;
	struct folio *folio;
	void *shadow = NULL;

	*new_page_allocated = false;

	for (;;) {
		int err;
		/*
		 * First check the swap cache.  Since this is normally
		 * called after swap_cache_get_folio() failed, re-calling
		 * that would confuse statistics.
		 */
		si = get_swap_device(entry);
		if (!si)
			return NULL;
		folio = filemap_get_folio(swap_address_space(entry),
						swp_offset(entry));
		put_swap_device(si);
		if (folio)
			return folio_file_page(folio, swp_offset(entry));

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in later part of code or
		 * else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			return NULL;

		/*
		 * Get a new page to read into from swap.  Allocate it now,
		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
		 * cause any racers to loop around until we add it to cache.
		 */
		folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
		if (!folio)
			return NULL;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (!err)
			break;

		folio_put(folio);
		if (err != -EEXIST)
			return NULL;

		/*
		 * We might race against __delete_from_swap_cache(), and
		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
		 * has not yet been cleared.  Or race against another
		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
		 * in swap_map, but not yet added its page to swap cache.
		 */
		schedule_timeout_uninterruptible(1);
	}

	/*
	 * The swap entry is ours to swap in. Prepare the new page.
	 */

	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
		goto fail_unlock;

	/* May fail (-ENOMEM) if XArray node allocation failed. */
	if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
		goto fail_unlock;

	mem_cgroup_swapin_uncharge_swap(entry);

	if (shadow)
		workingset_refault(folio, shadow);

	/* Caller will initiate read into locked folio */
	folio_add_lru(folio);
	*new_page_allocated = true;
	return &folio->page;

fail_unlock:
	put_swap_folio(folio, entry);
	folio_unlock(folio);
	folio_put(folio);
	return NULL;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
				   struct vm_area_struct *vma,
				   unsigned long addr, bool do_poll,
				   struct swap_iocb **plug)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll, plug);

	return retpage;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false, &splug);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	/* The page was likely read above, so no need for plugging here */
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	int i;
	struct address_space *spaces = swapper_spaces[type];

	for (i = 0; i < nr_swapper_spaces[type]; i++)
		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
	kvfree(spaces);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

static void swap_ra_info(struct vm_fault *vmf,
			 struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);

	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1) {
		pte_unmap(orig_pte);
		return;
	}

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @fentry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				       struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct swap_iocb *splug = NULL;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {
		.win = 1,
	};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (!is_swap_pte(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false, &splug);
			if (i != ra_info.offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	swap_read_unplug(splug);
	lru_add_drain();
skip:
	/* The page was likely read above, so no need for plugging here */
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     ra_info.win == 1, NULL);
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * It's the main entry function for swap readahead. Depending on the
 * configuration, it will read ahead blocks by cluster-based (ie, physical
 * disk based) or vma-based (ie, virtual address based on the faulting
 * address) readahead.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	return swap_use_vma_readahead() ?
			swap_vma_readahead(entry, gfp_mask, vmf) :
			swap_cluster_readahead(entry, gfp_mask, vmf);
}

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &enable_vma_readahead);
	if (ret)
		return ret;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static const struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif