/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
 * future use of radix_tree tags in the swap cache.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.sync_page	= block_sync_page,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.migratepage	= migrate_page,
};

static struct backing_dev_info swap_backing_dev_info = {
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
	.unplug_io_fn	= swap_unplug_io_fn,
};

struct address_space swapper_space = {
	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
	.tree_lock	= __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
	.a_ops		= &swap_aops,
	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
	.backing_dev_info = &swap_backing_dev_info,
};

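/*
 * Lightweight hit/miss statistics for the swap cache (a descriptive
 * note, not from the original source): INC_CACHE_INFO() does no
 * locking of its own, so the totals reported by show_swap_cache_info()
 * may be slightly off under concurrent updates - fine for output that
 * is purely diagnostic.
 */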
#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

void show_swap_cache_info(void)
{
	printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
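
/*
 * Example of the output above (all values purely illustrative):
 *
 *	Swap cache: add 5763, delete 5521, find 1892/4315
 *	Free swap  = 1048572kB
 *	Total swap = 2097148kB
 */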

/*
 * add_to_swap_cache resembles add_to_page_cache on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	BUG_ON(!PageLocked(page));
	BUG_ON(PageSwapCache(page));
	BUG_ON(PagePrivate(page));
	error = radix_tree_preload(gfp_mask);
	if (!error) {
		write_lock_irq(&swapper_space.tree_lock);
		error = radix_tree_insert(&swapper_space.page_tree,
						entry.val, page);
		if (!error) {
			page_cache_get(page);
			SetPageSwapCache(page);
			set_page_private(page, entry.val);
			total_swapcache_pages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
			INC_CACHE_INFO(add_total);
		}
		write_unlock_irq(&swapper_space.tree_lock);
		radix_tree_preload_end();
	}
	return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(!PageSwapCache(page));
	BUG_ON(PageWriteback(page));
	BUG_ON(PagePrivate(page));

	radix_tree_delete(&swapper_space.page_tree, page_private(page));
	set_page_private(page, 0);
	ClearPageSwapCache(page);
	total_swapcache_pages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	INC_CACHE_INFO(del_total);
}
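
/*
 * A sketch of the expected calling pattern for the helper above (see
 * delete_from_swap_cache() below for the real thing): the caller takes
 * tree_lock, calls the helper, then drops the lock and releases the
 * swap entry and page reference itself:
 *
 *	write_lock_irq(&swapper_space.tree_lock);
 *	__delete_from_swap_cache(page);
 *	write_unlock_irq(&swapper_space.tree_lock);
 *	swap_free(entry);
 *	page_cache_release(page);
 */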

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 * @gfp_mask: memory allocation flags
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page, gfp_t gfp_mask)
{
	swp_entry_t entry;
	int err;

	BUG_ON(!PageLocked(page));
	BUG_ON(!PageUptodate(page));

	for (;;) {
		entry = get_swap_page();
		if (!entry.val)
			return 0;

		/*
		 * Radix-tree node allocations from PF_MEMALLOC contexts could
		 * completely exhaust the page allocator. __GFP_NOMEMALLOC
		 * stops emergency reserves from being allocated.
		 *
		 * TODO: this could cause a theoretical memory reclaim
		 * deadlock in the swap out path.
		 */
		/*
		 * Add it to the swap cache and mark it dirty
		 */
		err = add_to_swap_cache(page, entry,
				gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);

		switch (err) {
		case 0:				/* Success */
			SetPageDirty(page);
			return 1;
		case -EEXIST:
			/* Raced with "speculative" read_swap_cache_async */
			swap_free(entry);
			continue;
		default:
			/* -ENOMEM radix-tree allocation failure */
			swap_free(entry);
			return 0;
		}
	}
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;

	entry.val = page_private(page);

	write_lock_irq(&swapper_space.tree_lock);
	__delete_from_swap_cache(page);
	write_unlock_irq(&swapper_space.tree_lock);

	swap_free(entry);
	page_cache_release(page);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * remove_exclusive_swap_page() _with_ the lock.
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !TestSetPageLocked(page)) {
		remove_exclusive_swap_page(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	page_cache_release(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;

	lru_add_drain();
	while (nr) {
		int todo = min(nr, PAGEVEC_SIZE);
		int i;

		for (i = 0; i < todo; i++)
			free_swap_cache(pagep[i]);
		release_pages(pagep, todo, 0);
		pagep += todo;
		nr -= todo;
	}
}
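
/*
 * Example caller (a sketch of the typical pattern, not part of this
 * file): the mmu_gather code frees page batches this way when tearing
 * down address space mappings, e.g.
 *
 *	free_pages_and_swap_cache(tlb->pages, tlb->nr);
 *
 * Working in PAGEVEC_SIZE chunks above bounds how much is done per
 * release_pages() call, and so how long its LRU locks are held.
 */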

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock to keep page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *page;

	page = find_get_page(&swapper_space, entry.val);

	if (page)
		INC_CACHE_INFO(find_success);

	INC_CACHE_INFO(find_total);
	return page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		if (!swap_duplicate(entry))
			break;

		/*
		 * Associate the page with swap entry in the swap cache.
		 * May fail (-EEXIST) if there is already a page associated
		 * with this entry in the swap cache: added by a racing
		 * read_swap_cache_async, or add_to_swap or shmem_writepage
		 * re-using the just freed swap entry for an existing page.
		 * May fail (-ENOMEM) if radix-tree node allocation failed.
		 */
		SetPageLocked(new_page);
		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
		if (!err) {
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_active(new_page);
			swap_readpage(NULL, new_page);
			return new_page;
		}
		ClearPageLocked(new_page);
		swap_free(entry);
	} while (err != -ENOMEM);

	if (new_page)
		page_cache_release(new_page);
	return found_page;
}
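
/*
 * Typical usage (an illustrative sketch, not a quote of any caller;
 * the gfp mask shown is an assumption):
 *
 *	page = lookup_swap_cache(entry);
 *	if (!page)
 *		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
 *						vma, address);
 *
 * The returned page may still be under I/O; a caller that needs its
 * contents must wait for it to become up to date, e.g. via lock_page()
 * followed by a PageUptodate() check.
 */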

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	int nr_pages;
	struct page *page;
	unsigned long offset;
	unsigned long end_offset;

	/*
	 * Get starting offset for readaround, and number of pages to read.
	 * Adjust starting address by readbehind (for NUMA interleave case)?
	 * No, it's very unlikely that swap layout would follow vma layout,
	 * more likely that neighbouring swap pages came from the same node:
	 * so use the same "addr" to choose the same node for each swap read.
	 */
	nr_pages = valid_swaphandles(entry, &offset);
	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
		/* Ok, do the async read-ahead now */
		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
						gfp_mask, vma, addr);
		if (!page)
			break;
		page_cache_release(page);
	}
	lru_add_drain();	/* Push any new pages onto the LRU now */
	return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
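
/*
 * Readahead sizing (an illustrative note, not from the original
 * comments): with the default page_cluster of 3, valid_swaphandles()
 * describes an aligned block of up to 1 << 3 = 8 swap slots, so a
 * fault on slot 13 reads around slots 8..15 (fewer if neighbouring
 * slots are unused).  The final read_swap_cache_async() then returns
 * the target page itself, normally found already in the swap cache.
 */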