xref: /linux/mm/readahead.c (revision 30002ed2e41830ec03ec3e577ad83ac6b188f96e)
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	akpm@zip.com.au
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>

void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}
EXPORT_SYMBOL(default_unplug_io_fn);

struct backing_dev_info default_backing_dev_info = {
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
	.unplug_io_fn	= default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = mapping->backing_dev_info->ra_pages;
	ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

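/*
 * Illustrative sketch (editorial addition, not part of mm/readahead.c): how
 * an open-time helper might honour the "caller has memset *ra to zero"
 * contract documented above.  The name example_setup_file_ra() is
 * hypothetical; the core kernel does the equivalent when it sets up a
 * struct file.
 */
static void example_setup_file_ra(struct file *filp, struct inode *inode)
{
	/* file_ra_state_init() assumes the state has already been zeroed. */
	memset(&filp->f_ra, 0, sizeof(filp->f_ra));
	file_ra_state_init(&filp->f_ra, inode->i_mapping);
}
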
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	int ret = 0;

	while (!list_empty(pages)) {
		page = list_to_page(pages);
		list_del(&page->lru);
		if (add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
			page_cache_release(page);
			continue;
		}
		page_cache_release(page);

		ret = filler(data, page);
		if (unlikely(ret)) {
			put_pages_list(pages);
			break;
		}
		task_io_account_read(PAGE_CACHE_SIZE);
	}
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);

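/*
 * Illustrative sketch (editorial addition, not part of mm/readahead.c): a
 * filesystem's ->readpages() implementation can hand its page list straight
 * to read_cache_pages() with a per-page filler.  example_filler() and
 * example_readpages() are hypothetical names; real users (e.g. NFS) pass a
 * richer private cookie than the bare struct file used here.
 */
static int example_filler(void *data, struct page *page)
{
	struct file *filp = data;

	/*
	 * The page is already locked and in the page cache; delegate the
	 * actual I/O to the filesystem's single-page ->readpage() method,
	 * which unlocks the page when the read completes or fails.
	 */
	return page->mapping->a_ops->readpage(filp, page);
}

static int example_readpages(struct file *filp, struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages)
{
	return read_cache_pages(mapping, pages, example_filler, filp);
}
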
static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	unsigned page_idx;
	int ret;

	if (mapping->a_ops->readpages) {
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Clean up the remaining pages */
		put_pages_list(pages);
		goto out;
	}

	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
			mapping->a_ops->readpage(filp, page);
		}
		page_cache_release(page);
	}
	ret = 0;
out:
	return ret;
}

/*
 * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
 * all the pages first, then submits them all for I/O.  This avoids the very
 * bad behaviour which would occur if page allocations were causing VM
 * writeback.  We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 *
 * do_page_cache_readahead() returns -1 if it encountered request queue
 * congestion.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		if (page_offset > end_index)
			break;

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		rcu_read_unlock();
		if (page)
			continue;

		page = page_cache_alloc_cold(mapping);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ret++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}

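/*
 * Worked example (editorial note, not part of the original source): with
 * offset == 100, nr_to_read == 16 and lookahead_size == 8, the loop above
 * allocates pages for indexes 100..115 and sets PG_readahead on the page at
 * index 108 (page_idx == nr_to_read - lookahead_size).  When the application
 * later reaches page 108, page_cache_async_readahead() is triggered and the
 * next window is submitted before pages 109..115 run out.
 */
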
/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
		pgoff_t offset, unsigned long nr_to_read)
{
	int ret = 0;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
		return -EINVAL;

	while (nr_to_read) {
		int err;

		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		err = __do_page_cache_readahead(mapping, filp,
						offset, this_chunk, 0);
		if (err < 0) {
			ret = err;
			break;
		}
		ret += err;
		offset += this_chunk;
		nr_to_read -= this_chunk;
	}
	return ret;
}

/*
 * This version skips the IO if the queue is read-congested, and will tell the
 * block layer to abandon the readahead if request allocation would block.
 *
 * force_page_cache_readahead() will ignore queue congestion and will block on
 * request queues.
 */
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read)
{
	if (bdi_read_congested(mapping->backing_dev_info))
		return -1;

	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
}

/*
 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 * sensible upper limit.
 */
unsigned long max_sane_readahead(unsigned long nr)
{
	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}

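/*
 * Illustrative sketch (editorial addition, not part of mm/readahead.c): how a
 * POSIX_FADV_WILLNEED-style caller might combine force_page_cache_readahead()
 * with max_sane_readahead().  The name example_willneed() and its byte-range
 * arguments are hypothetical, and len is assumed to be non-zero.
 */
static int example_willneed(struct file *filp, loff_t offset, loff_t len)
{
	struct address_space *mapping = filp->f_mapping;
	pgoff_t start = offset >> PAGE_CACHE_SHIFT;
	pgoff_t end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
	unsigned long nr_to_read = end - start + 1;

	/* Clamp the request so we never try to pin an absurd amount of memory. */
	nr_to_read = max_sane_readahead(nr_to_read);

	/* Submits the reads in 2MB chunks, ignoring queue congestion. */
	return force_page_cache_readahead(mapping, filp, start, nr_to_read);
}
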
static int __init readahead_init(void)
{
	int err;

	err = bdi_init(&default_backing_dev_info);
	if (!err)
		bdi_register(&default_backing_dev_info, NULL, "default");

	return err;
}
subsys_initcall(readahead_init);

/*
 * Submit IO for the read-ahead request in file_ra_state.
 */
static unsigned long ra_submit(struct file_ra_state *ra,
		       struct address_space *mapping, struct file *filp)
{
	int actual;

	actual = __do_page_cache_readahead(mapping, filp,
					ra->start, ra->size, ra->async_size);

	return actual;
}

/*
 * Set the initial window size: round the request up to the next power of 2,
 * then scale it up - x 4 for small requests, x 2 for medium ones - and clip
 * the result to the max readahead size.  For a 128k (32 page) max readahead
 * this gives a 16k-32k initial window for 1-4 page requests, 64k for 5-8
 * pages, and the full 128k for anything larger.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}

/*
 *  Get the previous window size, ramp it up, and
 *  return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
						unsigned long max)
{
	unsigned long cur = ra->size;
	unsigned long newsize;

	if (cur < max / 16)
		newsize = 4 * cur;
	else
		newsize = 2 * cur;

	return min(newsize, max);
}

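/*
 * Worked example (editorial note, not part of the original source): with the
 * default 128k (max = 32 page) readahead and an application issuing 4-page
 * sequential reads, get_init_ra_size(4, 32) leaves roundup_pow_of_two(4) at 4,
 * which falls in the "medium" bucket and is doubled, so the first window is
 * 8 pages.  Each subsequent hit on the readahead marker calls
 * get_next_ra_size(), growing the window 8 -> 16 -> 32 pages, after which it
 * stays clamped at max.
 */
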
/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application has consumed all
 * readahead pages and stalls on the missing page just beyond the window;
 * instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * invalidate each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as the
 * readahead indicator. The flag won't be set on already cached pages, to
 * avoid the readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
		   struct file_ra_state *ra, struct file *filp,
		   bool hit_readahead_marker, pgoff_t offset,
		   unsigned long req_size)
{
	int	max = ra->ra_pages;	/* max readahead pages */
	pgoff_t prev_offset;
	int	sequential;

	/*
	 * It's the expected callback offset, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
			offset == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
	sequential = offset - prev_offset <= 1UL || req_size > max;

	/*
	 * Standalone, small read.
	 * Read as is, and do not pollute the readahead state.
	 */
	if (!hit_readahead_marker && !sequential) {
		return __do_page_cache_readahead(mapping, filp,
						offset, req_size, 0);
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for the async_size, which normally equals the
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
		rcu_read_unlock();

		if (!start || start - offset > max)
			return 0;

		ra->start = start;
		ra->size = start - offset;	/* old async_size */
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * It may be one of
	 * 	- first read on start of file
	 * 	- sequential cache miss
	 * 	- oversize random read
	 * Start readahead for it.
	 */
	ra->start = offset;
	ra->size = get_init_ra_size(req_size, max);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	return ra_submit(ra, mapping, filp);
}

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_sync_readahead() should be called when a cache miss has
 * occurred: it will submit the read.  The readahead logic may decide to
 * piggyback more pages onto the read request if access patterns suggest it
 * will improve performance.
 */
void page_cache_sync_readahead(struct address_space *mapping,
			       struct file_ra_state *ra, struct file *filp,
			       pgoff_t offset, unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/* do read-ahead */
	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @page: the page at @offset which has the PG_readahead flag set
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_async_readahead() should be called when a page is used which
 * has the PG_readahead flag; this is a marker to suggest that the application
 * has used up enough of the readahead window that we should start pulling in
 * more pages.
 */
void
page_cache_async_readahead(struct address_space *mapping,
			   struct file_ra_state *ra, struct file *filp,
			   struct page *page, pgoff_t offset,
			   unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/*
	 * Same bit is used for PG_readahead and PG_reclaim: while the page
	 * is under writeback that bit means PG_reclaim, so it cannot be
	 * trusted as a readahead marker.
	 */
	if (PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/*
	 * Defer asynchronous read-ahead on IO congestion.
	 */
	if (bdi_read_congested(mapping->backing_dev_info))
		return;

	/* do read-ahead */
	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
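
/*
 * Illustrative sketch (editorial addition, not part of mm/readahead.c): how a
 * generic read loop is expected to drive the two entry points above.  The
 * function example_read_page() and its error handling are hypothetical and
 * heavily simplified compared to do_generic_mapping_read().
 */
static struct page *example_read_page(struct file *filp, pgoff_t index,
			unsigned long remaining_pages)
{
	struct address_space *mapping = filp->f_mapping;
	struct page *page;

	page = find_get_page(mapping, index);
	if (!page) {
		/* Cache miss: start synchronous readahead, then look again. */
		page_cache_sync_readahead(mapping, &filp->f_ra, filp,
					  index, remaining_pages);
		page = find_get_page(mapping, index);
		if (!page)
			return NULL;	/* caller would fall back to ->readpage() */
	}

	/* Hit the PG_readahead marker: pipeline the next readahead window. */
	if (PageReadahead(page))
		page_cache_async_readahead(mapping, &filp->f_ra, filp,
					   page, index, remaining_pages);

	/* The caller still has to wait for PageUptodate() before copying data. */
	return page;
}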