// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
	ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
 *   before calling, such as the NFS fs marking pages that are cached locally
 *   on disk, thus we need to give the fs a chance to clean up in the event of
 *   an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
					     struct page *page)
{
	if (page_has_private(page)) {
		if (!trylock_page(page))
			BUG();
		page->mapping = mapping;
		do_invalidatepage(page, 0, PAGE_SIZE);
		page->mapping = NULL;
		unlock_page(page);
	}
	put_page(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
					      struct list_head *pages)
{
	struct page *victim;

	while (!list_empty(pages)) {
		victim = lru_to_page(pages);
		list_del(&victim->lru);
		read_cache_pages_invalidate_page(mapping, victim);
	}
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 *
 * Returns: %0 on success, the error returned by @filler otherwise.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	int ret = 0;

	while (!list_empty(pages)) {
		page = lru_to_page(pages);
		list_del(&page->lru);
		if (add_to_page_cache_lru(page, mapping, page->index,
				readahead_gfp_mask(mapping))) {
			read_cache_pages_invalidate_page(mapping, page);
			continue;
		}
		put_page(page);

		ret = filler(data, page);
		if (unlikely(ret)) {
			read_cache_pages_invalidate_pages(mapping, pages);
			break;
		}
		task_io_account_read(PAGE_SIZE);
	}
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);
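/*
 * For illustration only (not part of this file): a @filler callback is
 * handed each locked, not-uptodate page and is responsible for starting
 * the read and eventually unlocking the page, along the lines of:
 *
 *	static int my_filler(void *data, struct page *page)
 *	{
 *		struct file *file = data;		// hypothetical caller
 *
 *		return my_fs_readpage(file, page);	// unlocks @page
 *	}
 *
 *	err = read_cache_pages(mapping, &pages, my_filler, file);
 */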

static void read_pages(struct readahead_control *rac, struct list_head *pages,
		bool skip_page)
{
	const struct address_space_operations *aops = rac->mapping->a_ops;
	struct page *page;
	struct blk_plug plug;

	if (!readahead_count(rac))
		goto out;

	blk_start_plug(&plug);

	if (aops->readahead) {
		aops->readahead(rac);
		/* Clean up the remaining pages */
		while ((page = readahead_page(rac))) {
			unlock_page(page);
			put_page(page);
		}
	} else if (aops->readpages) {
		aops->readpages(rac->file, rac->mapping, pages,
				readahead_count(rac));
		/* Clean up the remaining pages */
		put_pages_list(pages);
		rac->_index += rac->_nr_pages;
		rac->_nr_pages = 0;
	} else {
		while ((page = readahead_page(rac))) {
			aops->readpage(rac->file, page);
			put_page(page);
		}
	}

	blk_finish_plug(&plug);

	BUG_ON(!list_empty(pages));
	BUG_ON(readahead_count(rac));

out:
	if (skip_page)
		rac->_index++;
}
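/*
 * A sketch of the ->readahead() contract that read_pages() relies on
 * above: the op consumes pages with readahead_page(), submits the I/O
 * and drops its reference; any pages it leaves in the ractl are unlocked
 * and released by the cleanup loop, to be retried later via ->readpage().
 * See mpage_readahead() in fs/mpage.c for a real implementation.
 *
 *	static void my_readahead(struct readahead_control *rac)	// sketch
 *	{
 *		struct page *page;
 *
 *		while ((page = readahead_page(rac))) {
 *			my_fs_submit_read(page);	// hypothetical helper;
 *			put_page(page);			// unlock on I/O completion
 *		}
 *	}
 */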

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct address_space *mapping = ractl->mapping;
	unsigned long index = readahead_index(ractl);
	LIST_HEAD(page_pool);
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	unsigned long i;

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);

		BUG_ON(index + i != ractl->_index + ractl->_nr_pages);

		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(ractl, &page_pool, true);
			continue;
		}

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(ractl, &page_pool, true);
			continue;
		}
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ractl->_nr_pages++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(ractl, &page_pool, false);
	memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
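/*
 * Worked example for the marking logic above, assuming 4KiB pages:
 * with nr_to_read = 16 and lookahead_size = 4, pages index .. index+15
 * are started and PG_readahead is set on page index+12 (i == 16 - 4),
 * so the next asynchronous readahead is kicked off four pages before
 * the current window runs out.
 */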

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O.  This avoids the very bad
 * behaviour which would occur if page allocations were causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void do_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct inode *inode = ractl->mapping->host;
	unsigned long index = readahead_index(ractl);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index)
		nr_to_read = end_index - index + 1;

	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
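/*
 * For illustration, with 4KiB pages: for isize = 10000 the last valid
 * page is end_index = 9999 >> 12 = 2, so a request for 8 pages starting
 * at index 0 is trimmed to nr_to_read = 2 - 0 + 1 = 3 pages.
 */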

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read)
{
	struct address_space *mapping = ractl->mapping;
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	struct file_ra_state *ra = &ractl->file->f_ra;
	unsigned long max_pages, index;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
			!mapping->a_ops->readahead))
		return;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	index = readahead_index(ractl);
	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
	nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
	while (nr_to_read) {
		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		ractl->_index = index;
		do_page_cache_ra(ractl, this_chunk, 0);

		index += this_chunk;
		nr_to_read -= this_chunk;
	}
}
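/*
 * With 4KiB pages the chunk above is (2 * 1024 * 1024) / 4096 = 512
 * pages, so (subject to the max_pages clamp above) a forced 3MiB read
 * is issued as one 512-page I/O followed by one 256-page I/O.
 */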

/*
 * Set the initial window size: round the request up to the next power
 * of 2, then scale it up - x 4 for small requests, x 2 for medium ones,
 * and cap it at max for anything larger.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}

/*
 * Get the previous window size, ramp it up, and
 * return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;

	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}
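/*
 * Worked example with max = 32 pages (128KiB of 4KiB pages): a first
 * sequential read of 4 pages gets get_init_ra_size(4, 32) = 8 pages
 * (4 is already a power of 2 and 4 <= 32/4, so it is doubled), and
 * later windows ramp via get_next_ra_size(): 8 -> 16 -> 32, then stay
 * at the 32-page cap.
 */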

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application has consumed
 * all readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window.  Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state.  So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator.  The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads.  Note that the readahead algorithm checks loosely
 * for sequential patterns.  Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * Count contiguously cached pages from @index-1 to @index-@max;
 * this count is a conservative estimate of
 * 	- length of the sequential read sequence, or
 * 	- thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
				   pgoff_t index, unsigned long max)
{
	pgoff_t head;

	rcu_read_lock();
	head = page_cache_prev_miss(mapping, index - 1, max);
	rcu_read_unlock();

	return index - 1 - head;
}
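/*
 * For illustration: if pages @index-1 .. @index-5 are cached but
 * @index-6 is not, page_cache_prev_miss() returns @index-6 and the
 * function above reports 5 contiguously cached history pages.
 */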

/*
 * page cache context-based read-ahead
 */
static int try_context_readahead(struct address_space *mapping,
				 struct file_ra_state *ra,
				 pgoff_t index,
				 unsigned long req_size,
				 unsigned long max)
{
	pgoff_t size;

	size = count_history_pages(mapping, index, max);

	/*
	 * not enough history pages:
	 * it could be a random read
	 */
	if (size <= req_size)
		return 0;

	/*
	 * starts from beginning of file:
	 * it is a strong indication of long-run stream (or whole-file-read)
	 */
	if (size >= index)
		size *= 2;

	ra->start = index;
	ra->size = min(size + req_size, max);
	ra->async_size = 1;

	return 1;
}
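/*
 * Example for try_context_readahead(): a stream that already has 64
 * pages cached immediately before @index and now asks for req_size = 8
 * sees size = 64 > 8, so (unless the history reaches back to the start
 * of the file, which doubles size) the new window becomes
 * min(64 + 8, max) pages with a single page of async lookahead.
 */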

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct readahead_control *ractl,
		struct file_ra_state *ra, bool hit_readahead_marker,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	unsigned long index = readahead_index(ractl);
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals the
	 * readahead size.  Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(ractl->mapping, index + 1,
				max_pages);
		rcu_read_unlock();

		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces (cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(ractl->mapping, ra, index, req_size,
			max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	do_page_cache_ra(ractl, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulting next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	ractl->_index = ra->start;
	do_page_cache_ra(ractl, ra->size, ra->async_size);
}
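/*
 * Example of the sequential branch in ondemand_readahead(): with a
 * current window of {start = 100, size = 16, async_size = 16}, a read
 * arriving at index 100 (== start + size - async_size) advances the
 * window to start = 116 and ramps its size via get_next_ra_size(), so
 * the next I/O is already in flight before the reader reaches page 116.
 */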

void page_cache_sync_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long req_count)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	if (blk_cgroup_congested())
		return;

	/* be dumb */
	if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM)) {
		force_page_cache_ra(ractl, req_count);
		return;
	}

	/* do read-ahead */
	ondemand_readahead(ractl, ra, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
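/*
 * Callers normally reach page_cache_sync_ra() through the
 * page_cache_sync_readahead() wrapper in <linux/pagemap.h>, roughly:
 *
 *	DEFINE_READAHEAD(ractl, file, mapping, index);
 *	page_cache_sync_ra(&ractl, ra, req_count);
 */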

void page_cache_async_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, struct page *page,
		unsigned long req_count)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/*
	 * Same bit is used for PG_readahead and PG_reclaim.
	 */
	if (PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/*
	 * Defer asynchronous read-ahead on IO congestion.
	 */
	if (inode_read_congested(ractl->mapping->host))
		return;

	if (blk_cgroup_congested())
		return;

	/* do read-ahead */
	ondemand_readahead(ractl, ra, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct fd f;

	ret = -EBADF;
	f = fdget(fd);
	if (!f.file || !(f.file->f_mode & FMODE_READ))
		goto out;

	/*
	 * The readahead() syscall is intended to run only on files
	 * that can execute readahead.  If readahead is not possible
	 * on this file, then we must return -EINVAL.
	 */
	ret = -EINVAL;
	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
	    !S_ISREG(file_inode(f.file)->i_mode))
		goto out;

	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
	fdput(f);
	return ret;
}

SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
	return ksys_readahead(fd, offset, count);
}