/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002    Andrew Morton
 *              Initial version.
 */

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->ra_pages = mapping->backing_dev_info->ra_pages;
        ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
 *   before calling, such as the NFS fs marking pages that are cached locally
 *   on disk, thus we need to give the fs a chance to clean up in the event of
 *   an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
                                             struct page *page)
{
        if (page_has_private(page)) {
                if (!trylock_page(page))
                        BUG();
                page->mapping = mapping;
                do_invalidatepage(page, 0);
                page->mapping = NULL;
                unlock_page(page);
        }
        page_cache_release(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
                                              struct list_head *pages)
{
        struct page *victim;

        while (!list_empty(pages)) {
                victim = list_to_page(pages);
                list_del(&victim->lru);
                read_cache_pages_invalidate_page(mapping, victim);
        }
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
                        int (*filler)(void *, struct page *), void *data)
{
        struct page *page;
        int ret = 0;

        while (!list_empty(pages)) {
                page = list_to_page(pages);
                list_del(&page->lru);
                if (add_to_page_cache_lru(page, mapping,
                                        page->index, GFP_KERNEL)) {
                        read_cache_pages_invalidate_page(mapping, page);
                        continue;
                }
                page_cache_release(page);

                ret = filler(data, page);
                if (unlikely(ret)) {
                        read_cache_pages_invalidate_pages(mapping, pages);
                        break;
                }
                task_io_account_read(PAGE_CACHE_SIZE);
        }
        return ret;
}

EXPORT_SYMBOL(read_cache_pages);
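
/*
 * Illustrative sketch (hypothetical code, not part of this file): a
 * filesystem's ->readpages() implementation could hand its page list
 * straight to read_cache_pages() together with a per-page filler callback.
 * The names my_readpages(), my_fill_page() and my_read_one_page() are
 * placeholders, shown only to clarify the expected calling convention:
 *
 *      static int my_fill_page(void *data, struct page *page)
 *      {
 *              struct file *filp = data;
 *
 *              // start the read of @page; return 0 on success or a
 *              // negative errno so read_cache_pages() can discard the
 *              // not-yet-processed pages on error
 *              return my_read_one_page(filp, page);
 *      }
 *
 *      static int my_readpages(struct file *filp, struct address_space *mapping,
 *                              struct list_head *pages, unsigned nr_pages)
 *      {
 *              return read_cache_pages(mapping, pages, my_fill_page, filp);
 *      }
 */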

static int read_pages(struct address_space *mapping, struct file *filp,
                struct list_head *pages, unsigned nr_pages)
{
        unsigned page_idx;
        int ret;

        if (mapping->a_ops->readpages) {
                ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
                /* Clean up the remaining pages */
                put_pages_list(pages);
                goto out;
        }

        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
                struct page *page = list_to_page(pages);
                list_del(&page->lru);
                if (!add_to_page_cache_lru(page, mapping,
                                        page->index, GFP_KERNEL)) {
                        mapping->a_ops->readpage(filp, page);
                }
                page_cache_release(page);
        }
        ret = 0;
out:
        return ret;
}

/*
 * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates all
 * the pages first, then submits them all for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
                        pgoff_t offset, unsigned long nr_to_read,
                        unsigned long lookahead_size)
{
        struct inode *inode = mapping->host;
        struct page *page;
        unsigned long end_index;        /* The last page we want to read */
        LIST_HEAD(page_pool);
        int page_idx;
        int ret = 0;
        loff_t isize = i_size_read(inode);

        if (isize == 0)
                goto out;

        end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

        /*
         * Preallocate as many pages as we will need.
         */
        for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
                pgoff_t page_offset = offset + page_idx;

                if (page_offset > end_index)
                        break;

                rcu_read_lock();
                page = radix_tree_lookup(&mapping->page_tree, page_offset);
                rcu_read_unlock();
                if (page)
                        continue;

                page = page_cache_alloc_cold(mapping);
                if (!page)
                        break;
                page->index = page_offset;
                list_add(&page->lru, &page_pool);
                if (page_idx == nr_to_read - lookahead_size)
                        SetPageReadahead(page);
                ret++;
        }

        /*
         * Now start the IO.  We ignore I/O errors - if the page is not
         * uptodate then the caller will launch readpage again, and
         * will then handle the error.
         */
        if (ret)
                read_pages(mapping, filp, &page_pool, ret);
        BUG_ON(!list_empty(&page_pool));
out:
        return ret;
}
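
/*
 * Worked example (illustrative numbers only): a call such as
 *
 *      __do_page_cache_readahead(mapping, filp, 100, 16, 8);
 *
 * tries to allocate and read pages 100..115 (skipping any that are already
 * cached, and stopping at the end of the file), and marks the page at index
 * 100 + (16 - 8) = 108 with PG_readahead.  When the reader later touches
 * page 108, page_cache_async_readahead() is expected to pipeline the next
 * window.
 */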

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
                pgoff_t offset, unsigned long nr_to_read)
{
        int ret = 0;

        if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
                return -EINVAL;

        nr_to_read = max_sane_readahead(nr_to_read);
        while (nr_to_read) {
                int err;

                unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;

                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
                err = __do_page_cache_readahead(mapping, filp,
                                                offset, this_chunk, 0);
                if (err < 0) {
                        ret = err;
                        break;
                }
                ret += err;
                offset += this_chunk;
                nr_to_read -= this_chunk;
        }
        return ret;
}

/*
 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 * sensible upper limit.
 */
unsigned long max_sane_readahead(unsigned long nr)
{
        return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
                + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}

/*
 * Submit IO for the read-ahead request in file_ra_state.
 */
unsigned long ra_submit(struct file_ra_state *ra,
                        struct address_space *mapping, struct file *filp)
{
        int actual;

        actual = __do_page_cache_readahead(mapping, filp,
                                        ra->start, ra->size, ra->async_size);

        return actual;
}

/*
 * Set the initial window size: round the request up to the next power of 2,
 * then scale it up (x4 when small relative to max, x2 when medium-sized) and
 * clamp the result to max.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long newsize = roundup_pow_of_two(size);

        if (newsize <= max / 32)
                newsize = newsize * 4;
        else if (newsize <= max / 4)
                newsize = newsize * 2;
        else
                newsize = max;

        return newsize;
}

/*
 * Get the previous window size, ramp it up, and
 * return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                      unsigned long max)
{
        unsigned long cur = ra->size;
        unsigned long newsize;

        if (cur < max / 16)
                newsize = 4 * cur;
        else
                newsize = 2 * cur;

        return min(newsize, max);
}
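
/*
 * Worked example of the window sizing above, assuming max == 32 pages
 * (a 128k readahead limit with 4k pages):
 *
 *      get_init_ra_size(1, 32)  -> rounds to 1,  1 <= 32/32, so 1 * 4 = 4
 *      get_init_ra_size(6, 32)  -> rounds to 8,  8 <= 32/4,  so 8 * 2 = 16
 *      get_init_ra_size(20, 32) -> rounds to 32, clamped to max      = 32
 *
 * On each subsequent sequential hit, get_next_ra_size() doubles (or, for
 * windows smaller than max/16, quadruples) the previous size, capped at
 * max: 4 -> 8 -> 16 -> 32 -> 32 -> ...
 */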

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *          ^start        ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * invalidate each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as a
 * readahead indicator. The flag won't be set on already cached pages, to
 * avoid the readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * Count contiguously cached pages from @offset-1 to @offset-@max; this count
 * is a conservative estimate of
 *      - the length of the sequential read sequence, or
 *      - the thrashing threshold in memory-tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
                                   struct file_ra_state *ra,
                                   pgoff_t offset, unsigned long max)
{
        pgoff_t head;

        rcu_read_lock();
        head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
        rcu_read_unlock();

        return offset - 1 - head;
}

/*
 * page cache context based read-ahead
 */
static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
                                 pgoff_t offset,
                                 unsigned long req_size,
                                 unsigned long max)
{
        pgoff_t size;

        size = count_history_pages(mapping, ra, offset, max);

        /*
         * no history pages:
         * it could be a random read
         */
        if (!size)
                return 0;

        /*
         * starts from beginning of file:
         * it is a strong indication of a long-run stream (or whole-file read)
         */
        if (size >= offset)
                size *= 2;

        ra->start = offset;
        ra->size = get_init_ra_size(size + req_size, max);
        ra->async_size = ra->size;

        return 1;
}
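
/*
 * Worked example (illustrative numbers only): suppose a read lands at
 * offset 1000 with req_size 1 and max 32, with no usable readahead state,
 * and pages 995..999 happen to be cached while page 994 is not.  Then
 * count_history_pages() finds the hole at 994 and returns
 * 1000 - 1 - 994 = 5, and try_context_readahead() sets up a window of
 * get_init_ra_size(5 + 1, 32) = 16 pages starting at offset 1000, fully
 * asynchronous, on the assumption that a sequential stream left those
 * history pages behind.
 */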

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
                   struct file_ra_state *ra, struct file *filp,
                   bool hit_readahead_marker, pgoff_t offset,
                   unsigned long req_size)
{
        unsigned long max = max_sane_readahead(ra->ra_pages);

        /*
         * start of file
         */
        if (!offset)
                goto initial_readahead;

        /*
         * It's the expected callback offset, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
        if ((offset == (ra->start + ra->size - ra->async_size) ||
             offset == (ra->start + ra->size))) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked page without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals the
         * readahead size. Ramp it up and use it as the new readahead size.
         */
        if (hit_readahead_marker) {
                pgoff_t start;

                rcu_read_lock();
                start = radix_tree_next_hole(&mapping->page_tree, offset + 1, max);
                rcu_read_unlock();

                if (!start || start - offset > max)
                        return 0;

                ra->start = start;
                ra->size = start - offset;      /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * oversized read
         */
        if (req_size > max)
                goto initial_readahead;

        /*
         * sequential cache miss
         */
        if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
                goto initial_readahead;

        /*
         * Query the page cache and look for the traces (cached history pages)
         * that a sequential stream would leave behind.
         */
        if (try_context_readahead(mapping, ra, offset, req_size, max))
                goto readit;

        /*
         * standalone, small random read
         * Read it as-is, and do not pollute the readahead state.
         */
        return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);

initial_readahead:
        ra->start = offset;
        ra->size = get_init_ra_size(req_size, max);
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
        /*
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulting next readahead window into the current one.
         */
        if (offset == ra->start && ra->size == ra->async_size) {
                ra->async_size = get_next_ra_size(ra, max);
                ra->size += ra->async_size;
        }

        return ra_submit(ra, mapping, filp);
}
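
/*
 * Worked trace (illustrative, assuming max == 32 pages and a reader that
 * touches one page at a time starting from offset 0):
 *
 * 1. Cache miss at offset 0: the start-of-file case sets start=0,
 *    size=get_init_ra_size(1, 32)=4, async_size=4-1=3, and pages 0..3 are
 *    submitted with PG_readahead on page 0 + 4 - 3 = 1.
 * 2. The reader hits the marked page 1, which matches
 *    start + size - async_size, so the window moves to start=4,
 *    size=get_next_ra_size()=8, async_size=8, and pages 4..11 are
 *    submitted with the marker on page 4.
 * 3. Each later marker hit doubles the window (16, then 32) until it is
 *    capped at max.
 */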

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_sync_readahead() should be called when a cache miss happened:
 * it will submit the read.  The readahead logic may decide to piggyback more
 * pages onto the read request if access patterns suggest it will improve
 * performance.
 */
void page_cache_sync_readahead(struct address_space *mapping,
                               struct file_ra_state *ra, struct file *filp,
                               pgoff_t offset, unsigned long req_size)
{
        /* no read-ahead */
        if (!ra->ra_pages)
                return;

        /* be dumb */
        if (filp && (filp->f_mode & FMODE_RANDOM)) {
                force_page_cache_readahead(mapping, filp, offset, req_size);
                return;
        }

        /* do read-ahead */
        ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @page: the page at @offset which has the PG_readahead flag set
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_async_readahead() should be called when a page is used which
 * has the PG_readahead flag; this is a marker to suggest that the application
 * has used up enough of the readahead window that we should start pulling in
 * more pages.
 */
void
page_cache_async_readahead(struct address_space *mapping,
                           struct file_ra_state *ra, struct file *filp,
                           struct page *page, pgoff_t offset,
                           unsigned long req_size)
{
        /* no read-ahead */
        if (!ra->ra_pages)
                return;

        /*
         * Same bit is used for PG_readahead and PG_reclaim.
         */
        if (PageWriteback(page))
                return;

        ClearPageReadahead(page);

        /*
         * Defer asynchronous read-ahead on IO congestion.
         */
        if (bdi_read_congested(mapping->backing_dev_info))
                return;

        /* do read-ahead */
        ondemand_readahead(mapping, ra, filp, true, offset, req_size);

#ifdef CONFIG_BLOCK
        /*
         * Normally the current page is !uptodate and lock_page() will be
         * immediately called to implicitly unplug the device. However this
         * is not always true for RAID configurations, where data does not
         * arrive strictly in submission order. In this case we need to
         * explicitly kick off the IO.
         */
        if (PageUptodate(page))
                blk_run_backing_dev(mapping->backing_dev_info, NULL);
#endif
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
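
/*
 * Illustrative caller sketch (hypothetical; the real generic read path lives
 * elsewhere, e.g. mm/filemap.c): a typical reader issues synchronous
 * readahead on a page-cache miss and asynchronous readahead when it reaches
 * a PG_readahead-marked page:
 *
 *      page = find_get_page(mapping, index);
 *      if (!page) {
 *              page_cache_sync_readahead(mapping, ra, filp, index, nr_pages);
 *              page = find_get_page(mapping, index);
 *      } else if (PageReadahead(page)) {
 *              page_cache_async_readahead(mapping, ra, filp, page,
 *                                         index, nr_pages);
 *      }
 */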