/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
 * future use of radix_tree tags in the swap cache.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.sync_page	= block_sync_page,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.migratepage	= migrate_page,
};

static struct backing_dev_info swap_backing_dev_info = {
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
	.unplug_io_fn	= swap_unplug_io_fn,
};

struct address_space swapper_space = {
	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
	.tree_lock	= __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
	.a_ops		= &swap_aops,
	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
	.backing_dev_info = &swap_backing_dev_info,
};

#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

void show_swap_cache_info(void)
{
	printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
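
/*
 * How the cache is keyed: swapper_space has no file behind it, so its radix
 * tree is indexed by the raw swap entry value rather than by a file offset,
 * and each cached page records that value in page_private().  Illustrative
 * lookup pattern (a sketch of what the helpers below already do, not a new
 * interface):
 *
 *	swp_entry_t entry = { .val = page_private(page) };
 *	struct page *cached = find_get_page(&swapper_space, entry.val);
 */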

/*
 * add_to_swap_cache resembles add_to_page_cache on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	BUG_ON(!PageLocked(page));
	BUG_ON(PageSwapCache(page));
	BUG_ON(PagePrivate(page));
	error = radix_tree_preload(gfp_mask);
	if (!error) {
		write_lock_irq(&swapper_space.tree_lock);
		error = radix_tree_insert(&swapper_space.page_tree,
						entry.val, page);
		if (!error) {
			page_cache_get(page);
			SetPageSwapCache(page);
			set_page_private(page, entry.val);
			total_swapcache_pages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
			INC_CACHE_INFO(add_total);
		}
		write_unlock_irq(&swapper_space.tree_lock);
		radix_tree_preload_end();
	}
	return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(!PageSwapCache(page));
	BUG_ON(PageWriteback(page));
	BUG_ON(PagePrivate(page));

	radix_tree_delete(&swapper_space.page_tree, page_private(page));
	set_page_private(page, 0);
	ClearPageSwapCache(page);
	total_swapcache_pages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	INC_CACHE_INFO(del_total);
}
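
/*
 * Locking note for the two helpers above: add_to_swap_cache() takes
 * swapper_space.tree_lock itself, whereas __delete_from_swap_cache()
 * expects the caller to already hold it; delete_from_swap_cache() below
 * shows the usual lock/delete/unlock sequence.
 */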

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 * @gfp_mask: memory allocation flags
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page, gfp_t gfp_mask)
{
	swp_entry_t entry;
	int err;

	BUG_ON(!PageLocked(page));
	BUG_ON(!PageUptodate(page));

	for (;;) {
		entry = get_swap_page();
		if (!entry.val)
			return 0;

		/*
		 * Radix-tree node allocations from PF_MEMALLOC contexts could
		 * completely exhaust the page allocator.  __GFP_NOMEMALLOC
		 * stops emergency reserves from being allocated.
		 *
		 * TODO: this could cause a theoretical memory reclaim
		 * deadlock in the swap out path.
		 */
		/*
		 * Add it to the swap cache and mark it dirty
		 */
		err = add_to_swap_cache(page, entry,
				gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);

		switch (err) {
		case 0:				/* Success */
			SetPageDirty(page);
			return 1;
		case -EEXIST:
			/* Raced with "speculative" read_swap_cache_async */
			swap_free(entry);
			continue;
		default:
			/* -ENOMEM radix-tree allocation failure */
			swap_free(entry);
			return 0;
		}
	}
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;

	entry.val = page_private(page);

	write_lock_irq(&swapper_space.tree_lock);
	__delete_from_swap_cache(page);
	write_unlock_irq(&swapper_space.tree_lock);

	swap_free(entry);
	page_cache_release(page);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * exclusive_swap_page() _with_ the lock.
 *					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !TestSetPageLocked(page)) {
		remove_exclusive_swap_page(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	page_cache_release(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;

	lru_add_drain();
	while (nr) {
		int todo = min(nr, PAGEVEC_SIZE);
		int i;

		for (i = 0; i < todo; i++)
			free_swap_cache(pagep[i]);
		release_pages(pagep, todo, 0);
		pagep += todo;
		nr -= todo;
	}
}

/*
 * Lookup a swap entry in the swap cache.  A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *page;

	page = find_get_page(&swapper_space, entry.val);

	if (page)
		INC_CACHE_INFO(find_success);

	INC_CACHE_INFO(find_total);
	return page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		if (!swap_duplicate(entry))
			break;

		/*
		 * Associate the page with swap entry in the swap cache.
		 * May fail (-EEXIST) if there is already a page associated
		 * with this entry in the swap cache: added by a racing
		 * read_swap_cache_async, or add_to_swap or shmem_writepage
		 * re-using the just freed swap entry for an existing page.
		 * May fail (-ENOMEM) if radix-tree node allocation failed.
		 */
		SetPageLocked(new_page);
		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
		if (!err) {
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_active(new_page);
			swap_readpage(NULL, new_page);
			return new_page;
		}
		ClearPageLocked(new_page);
		swap_free(entry);
	} while (err != -ENOMEM);

	if (new_page)
		page_cache_release(new_page);
	return found_page;
}
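
/*
 * Note on the retry loop above: only an -EEXIST from add_to_swap_cache()
 * loops back (a racing task already cached a page for this entry, which
 * the next find_get_page() will normally find); -ENOMEM ends the loop and
 * the function returns NULL.
 */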

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code.  We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area.  This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	int nr_pages;
	struct page *page;
	unsigned long offset;
	unsigned long end_offset;

	/*
	 * Get starting offset for readaround, and number of pages to read.
	 * Adjust starting address by readbehind (for NUMA interleave case)?
	 * No, it's very unlikely that swap layout would follow vma layout,
	 * more likely that neighbouring swap pages came from the same node:
	 * so use the same "addr" to choose the same node for each swap read.
	 */
	nr_pages = valid_swaphandles(entry, &offset);
	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
		/* Ok, do the async read-ahead now */
		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
						gfp_mask, vma, addr);
		if (!page)
			break;
		page_cache_release(page);
	}
	lru_add_drain();	/* Push any new pages onto the LRU now */
	return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
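
/*
 * Illustrative caller pattern (a sketch, not verbatim kernel code): the
 * page fault path consults the swap cache first and only then starts
 * readahead, roughly
 *
 *	page = lookup_swap_cache(entry);
 *	if (!page)
 *		page = swapin_readahead(entry, gfp_mask, vma, address);
 *
 * with gfp_mask and the surrounding page table locking chosen by the
 * caller; see do_swap_page() in mm/memory.c for the real sequence.
 */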