1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * fs/mpage.c 4 * 5 * Copyright (C) 2002, Linus Torvalds. 6 * 7 * Contains functions related to preparing and submitting BIOs which contain 8 * multiple pagecache pages. 9 * 10 * 15May2002 Andrew Morton 11 * Initial version 12 * 27Jun2002 axboe@suse.de 13 * use bio_add_page() to build bio's just the right size 14 */ 15 16 #include <linux/kernel.h> 17 #include <linux/export.h> 18 #include <linux/mm.h> 19 #include <linux/kdev_t.h> 20 #include <linux/gfp.h> 21 #include <linux/bio.h> 22 #include <linux/fs.h> 23 #include <linux/buffer_head.h> 24 #include <linux/blkdev.h> 25 #include <linux/highmem.h> 26 #include <linux/prefetch.h> 27 #include <linux/mpage.h> 28 #include <linux/mm_inline.h> 29 #include <linux/writeback.h> 30 #include <linux/backing-dev.h> 31 #include <linux/pagevec.h> 32 #include "internal.h" 33 34 /* 35 * I/O completion handler for multipage BIOs. 36 * 37 * The mpage code never puts partial pages into a BIO (except for end-of-file). 38 * If a page does not map to a contiguous run of blocks then it simply falls 39 * back to block_read_full_folio(). 40 * 41 * Why is this? If a page's completion depends on a number of different BIOs 42 * which can complete in any order (or at the same time) then determining the 43 * status of that page is hard. See end_buffer_async_read() for the details. 44 * There is no point in duplicating all that complexity. 45 */ 46 static void mpage_read_end_io(struct bio *bio) 47 { 48 struct folio_iter fi; 49 int err = blk_status_to_errno(bio->bi_status); 50 51 bio_for_each_folio_all(fi, bio) { 52 if (err) 53 folio_set_error(fi.folio); 54 else 55 folio_mark_uptodate(fi.folio); 56 folio_unlock(fi.folio); 57 } 58 59 bio_put(bio); 60 } 61 62 static void mpage_write_end_io(struct bio *bio) 63 { 64 struct folio_iter fi; 65 int err = blk_status_to_errno(bio->bi_status); 66 67 bio_for_each_folio_all(fi, bio) { 68 if (err) { 69 folio_set_error(fi.folio); 70 mapping_set_error(fi.folio->mapping, err); 71 } 72 folio_end_writeback(fi.folio); 73 } 74 75 bio_put(bio); 76 } 77 78 static struct bio *mpage_bio_submit_read(struct bio *bio) 79 { 80 bio->bi_end_io = mpage_read_end_io; 81 guard_bio_eod(bio); 82 submit_bio(bio); 83 return NULL; 84 } 85 86 static struct bio *mpage_bio_submit_write(struct bio *bio) 87 { 88 bio->bi_end_io = mpage_write_end_io; 89 guard_bio_eod(bio); 90 submit_bio(bio); 91 return NULL; 92 } 93 94 /* 95 * support function for mpage_readahead. The fs supplied get_block might 96 * return an up to date buffer. This is used to map that buffer into 97 * the page, which allows read_folio to avoid triggering a duplicate call 98 * to get_block. 99 * 100 * The idea is to avoid adding buffers to pages that don't already have 101 * them. So when the buffer is up to date and the page size == block size, 102 * this marks the page up to date instead of adding new buffers. 103 */ 104 static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, 105 int page_block) 106 { 107 struct inode *inode = folio->mapping->host; 108 struct buffer_head *page_bh, *head; 109 int block = 0; 110 111 head = folio_buffers(folio); 112 if (!head) { 113 /* 114 * don't make any buffers if there is only one buffer on 115 * the folio and the folio just needs to be set up to date 116 */ 117 if (inode->i_blkbits == PAGE_SHIFT && 118 buffer_uptodate(bh)) { 119 folio_mark_uptodate(folio); 120 return; 121 } 122 head = create_empty_buffers(folio, i_blocksize(inode), 0); 123 } 124 125 page_bh = head; 126 do { 127 if (block == page_block) { 128 page_bh->b_state = bh->b_state; 129 page_bh->b_bdev = bh->b_bdev; 130 page_bh->b_blocknr = bh->b_blocknr; 131 break; 132 } 133 page_bh = page_bh->b_this_page; 134 block++; 135 } while (page_bh != head); 136 } 137 138 struct mpage_readpage_args { 139 struct bio *bio; 140 struct folio *folio; 141 unsigned int nr_pages; 142 bool is_readahead; 143 sector_t last_block_in_bio; 144 struct buffer_head map_bh; 145 unsigned long first_logical_block; 146 get_block_t *get_block; 147 }; 148 149 /* 150 * This is the worker routine which does all the work of mapping the disk 151 * blocks and constructs largest possible bios, submits them for IO if the 152 * blocks are not contiguous on the disk. 153 * 154 * We pass a buffer_head back and forth and use its buffer_mapped() flag to 155 * represent the validity of its disk mapping and to decide when to do the next 156 * get_block() call. 157 */ 158 static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) 159 { 160 struct folio *folio = args->folio; 161 struct inode *inode = folio->mapping->host; 162 const unsigned blkbits = inode->i_blkbits; 163 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 164 const unsigned blocksize = 1 << blkbits; 165 struct buffer_head *map_bh = &args->map_bh; 166 sector_t block_in_file; 167 sector_t last_block; 168 sector_t last_block_in_file; 169 sector_t first_block; 170 unsigned page_block; 171 unsigned first_hole = blocks_per_page; 172 struct block_device *bdev = NULL; 173 int length; 174 int fully_mapped = 1; 175 blk_opf_t opf = REQ_OP_READ; 176 unsigned nblocks; 177 unsigned relative_block; 178 gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); 179 180 /* MAX_BUF_PER_PAGE, for example */ 181 VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 182 183 if (args->is_readahead) { 184 opf |= REQ_RAHEAD; 185 gfp |= __GFP_NORETRY | __GFP_NOWARN; 186 } 187 188 if (folio_buffers(folio)) 189 goto confused; 190 191 block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); 192 last_block = block_in_file + args->nr_pages * blocks_per_page; 193 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; 194 if (last_block > last_block_in_file) 195 last_block = last_block_in_file; 196 page_block = 0; 197 198 /* 199 * Map blocks using the result from the previous get_blocks call first. 200 */ 201 nblocks = map_bh->b_size >> blkbits; 202 if (buffer_mapped(map_bh) && 203 block_in_file > args->first_logical_block && 204 block_in_file < (args->first_logical_block + nblocks)) { 205 unsigned map_offset = block_in_file - args->first_logical_block; 206 unsigned last = nblocks - map_offset; 207 208 first_block = map_bh->b_blocknr + map_offset; 209 for (relative_block = 0; ; relative_block++) { 210 if (relative_block == last) { 211 clear_buffer_mapped(map_bh); 212 break; 213 } 214 if (page_block == blocks_per_page) 215 break; 216 page_block++; 217 block_in_file++; 218 } 219 bdev = map_bh->b_bdev; 220 } 221 222 /* 223 * Then do more get_blocks calls until we are done with this folio. 224 */ 225 map_bh->b_folio = folio; 226 while (page_block < blocks_per_page) { 227 map_bh->b_state = 0; 228 map_bh->b_size = 0; 229 230 if (block_in_file < last_block) { 231 map_bh->b_size = (last_block-block_in_file) << blkbits; 232 if (args->get_block(inode, block_in_file, map_bh, 0)) 233 goto confused; 234 args->first_logical_block = block_in_file; 235 } 236 237 if (!buffer_mapped(map_bh)) { 238 fully_mapped = 0; 239 if (first_hole == blocks_per_page) 240 first_hole = page_block; 241 page_block++; 242 block_in_file++; 243 continue; 244 } 245 246 /* some filesystems will copy data into the page during 247 * the get_block call, in which case we don't want to 248 * read it again. map_buffer_to_folio copies the data 249 * we just collected from get_block into the folio's buffers 250 * so read_folio doesn't have to repeat the get_block call 251 */ 252 if (buffer_uptodate(map_bh)) { 253 map_buffer_to_folio(folio, map_bh, page_block); 254 goto confused; 255 } 256 257 if (first_hole != blocks_per_page) 258 goto confused; /* hole -> non-hole */ 259 260 /* Contiguous blocks? */ 261 if (!page_block) 262 first_block = map_bh->b_blocknr; 263 else if (first_block + page_block != map_bh->b_blocknr) 264 goto confused; 265 nblocks = map_bh->b_size >> blkbits; 266 for (relative_block = 0; ; relative_block++) { 267 if (relative_block == nblocks) { 268 clear_buffer_mapped(map_bh); 269 break; 270 } else if (page_block == blocks_per_page) 271 break; 272 page_block++; 273 block_in_file++; 274 } 275 bdev = map_bh->b_bdev; 276 } 277 278 if (first_hole != blocks_per_page) { 279 folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE); 280 if (first_hole == 0) { 281 folio_mark_uptodate(folio); 282 folio_unlock(folio); 283 goto out; 284 } 285 } else if (fully_mapped) { 286 folio_set_mappedtodisk(folio); 287 } 288 289 /* 290 * This folio will go to BIO. Do we need to send this BIO off first? 291 */ 292 if (args->bio && (args->last_block_in_bio != first_block - 1)) 293 args->bio = mpage_bio_submit_read(args->bio); 294 295 alloc_new: 296 if (args->bio == NULL) { 297 args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, 298 gfp); 299 if (args->bio == NULL) 300 goto confused; 301 args->bio->bi_iter.bi_sector = first_block << (blkbits - 9); 302 } 303 304 length = first_hole << blkbits; 305 if (!bio_add_folio(args->bio, folio, length, 0)) { 306 args->bio = mpage_bio_submit_read(args->bio); 307 goto alloc_new; 308 } 309 310 relative_block = block_in_file - args->first_logical_block; 311 nblocks = map_bh->b_size >> blkbits; 312 if ((buffer_boundary(map_bh) && relative_block == nblocks) || 313 (first_hole != blocks_per_page)) 314 args->bio = mpage_bio_submit_read(args->bio); 315 else 316 args->last_block_in_bio = first_block + blocks_per_page - 1; 317 out: 318 return args->bio; 319 320 confused: 321 if (args->bio) 322 args->bio = mpage_bio_submit_read(args->bio); 323 if (!folio_test_uptodate(folio)) 324 block_read_full_folio(folio, args->get_block); 325 else 326 folio_unlock(folio); 327 goto out; 328 } 329 330 /** 331 * mpage_readahead - start reads against pages 332 * @rac: Describes which pages to read. 333 * @get_block: The filesystem's block mapper function. 334 * 335 * This function walks the pages and the blocks within each page, building and 336 * emitting large BIOs. 337 * 338 * If anything unusual happens, such as: 339 * 340 * - encountering a page which has buffers 341 * - encountering a page which has a non-hole after a hole 342 * - encountering a page with non-contiguous blocks 343 * 344 * then this code just gives up and calls the buffer_head-based read function. 345 * It does handle a page which has holes at the end - that is a common case: 346 * the end-of-file on blocksize < PAGE_SIZE setups. 347 * 348 * BH_Boundary explanation: 349 * 350 * There is a problem. The mpage read code assembles several pages, gets all 351 * their disk mappings, and then submits them all. That's fine, but obtaining 352 * the disk mappings may require I/O. Reads of indirect blocks, for example. 353 * 354 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be 355 * submitted in the following order: 356 * 357 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 358 * 359 * because the indirect block has to be read to get the mappings of blocks 360 * 13,14,15,16. Obviously, this impacts performance. 361 * 362 * So what we do it to allow the filesystem's get_block() function to set 363 * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block 364 * after this one will require I/O against a block which is probably close to 365 * this one. So you should push what I/O you have currently accumulated. 366 * 367 * This all causes the disk requests to be issued in the correct order. 368 */ 369 void mpage_readahead(struct readahead_control *rac, get_block_t get_block) 370 { 371 struct folio *folio; 372 struct mpage_readpage_args args = { 373 .get_block = get_block, 374 .is_readahead = true, 375 }; 376 377 while ((folio = readahead_folio(rac))) { 378 prefetchw(&folio->flags); 379 args.folio = folio; 380 args.nr_pages = readahead_count(rac); 381 args.bio = do_mpage_readpage(&args); 382 } 383 if (args.bio) 384 mpage_bio_submit_read(args.bio); 385 } 386 EXPORT_SYMBOL(mpage_readahead); 387 388 /* 389 * This isn't called much at all 390 */ 391 int mpage_read_folio(struct folio *folio, get_block_t get_block) 392 { 393 struct mpage_readpage_args args = { 394 .folio = folio, 395 .nr_pages = 1, 396 .get_block = get_block, 397 }; 398 399 args.bio = do_mpage_readpage(&args); 400 if (args.bio) 401 mpage_bio_submit_read(args.bio); 402 return 0; 403 } 404 EXPORT_SYMBOL(mpage_read_folio); 405 406 /* 407 * Writing is not so simple. 408 * 409 * If the page has buffers then they will be used for obtaining the disk 410 * mapping. We only support pages which are fully mapped-and-dirty, with a 411 * special case for pages which are unmapped at the end: end-of-file. 412 * 413 * If the page has no buffers (preferred) then the page is mapped here. 414 * 415 * If all blocks are found to be contiguous then the page can go into the 416 * BIO. Otherwise fall back to the mapping's writepage(). 417 * 418 * FIXME: This code wants an estimate of how many pages are still to be 419 * written, so it can intelligently allocate a suitably-sized BIO. For now, 420 * just allocate full-size (16-page) BIOs. 421 */ 422 423 struct mpage_data { 424 struct bio *bio; 425 sector_t last_block_in_bio; 426 get_block_t *get_block; 427 }; 428 429 /* 430 * We have our BIO, so we can now mark the buffers clean. Make 431 * sure to only clean buffers which we know we'll be writing. 432 */ 433 static void clean_buffers(struct folio *folio, unsigned first_unmapped) 434 { 435 unsigned buffer_counter = 0; 436 struct buffer_head *bh, *head = folio_buffers(folio); 437 438 if (!head) 439 return; 440 bh = head; 441 442 do { 443 if (buffer_counter++ == first_unmapped) 444 break; 445 clear_buffer_dirty(bh); 446 bh = bh->b_this_page; 447 } while (bh != head); 448 449 /* 450 * we cannot drop the bh if the page is not uptodate or a concurrent 451 * read_folio would fail to serialize with the bh and it would read from 452 * disk before we reach the platter. 453 */ 454 if (buffer_heads_over_limit && folio_test_uptodate(folio)) 455 try_to_free_buffers(folio); 456 } 457 458 static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, 459 void *data) 460 { 461 struct mpage_data *mpd = data; 462 struct bio *bio = mpd->bio; 463 struct address_space *mapping = folio->mapping; 464 struct inode *inode = mapping->host; 465 const unsigned blkbits = inode->i_blkbits; 466 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 467 sector_t last_block; 468 sector_t block_in_file; 469 sector_t first_block; 470 unsigned page_block; 471 unsigned first_unmapped = blocks_per_page; 472 struct block_device *bdev = NULL; 473 int boundary = 0; 474 sector_t boundary_block = 0; 475 struct block_device *boundary_bdev = NULL; 476 size_t length; 477 struct buffer_head map_bh; 478 loff_t i_size = i_size_read(inode); 479 int ret = 0; 480 struct buffer_head *head = folio_buffers(folio); 481 482 if (head) { 483 struct buffer_head *bh = head; 484 485 /* If they're all mapped and dirty, do it */ 486 page_block = 0; 487 do { 488 BUG_ON(buffer_locked(bh)); 489 if (!buffer_mapped(bh)) { 490 /* 491 * unmapped dirty buffers are created by 492 * block_dirty_folio -> mmapped data 493 */ 494 if (buffer_dirty(bh)) 495 goto confused; 496 if (first_unmapped == blocks_per_page) 497 first_unmapped = page_block; 498 continue; 499 } 500 501 if (first_unmapped != blocks_per_page) 502 goto confused; /* hole -> non-hole */ 503 504 if (!buffer_dirty(bh) || !buffer_uptodate(bh)) 505 goto confused; 506 if (page_block) { 507 if (bh->b_blocknr != first_block + page_block) 508 goto confused; 509 } else { 510 first_block = bh->b_blocknr; 511 } 512 page_block++; 513 boundary = buffer_boundary(bh); 514 if (boundary) { 515 boundary_block = bh->b_blocknr; 516 boundary_bdev = bh->b_bdev; 517 } 518 bdev = bh->b_bdev; 519 } while ((bh = bh->b_this_page) != head); 520 521 if (first_unmapped) 522 goto page_is_mapped; 523 524 /* 525 * Page has buffers, but they are all unmapped. The page was 526 * created by pagein or read over a hole which was handled by 527 * block_read_full_folio(). If this address_space is also 528 * using mpage_readahead then this can rarely happen. 529 */ 530 goto confused; 531 } 532 533 /* 534 * The page has no buffers: map it to disk 535 */ 536 BUG_ON(!folio_test_uptodate(folio)); 537 block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); 538 /* 539 * Whole page beyond EOF? Skip allocating blocks to avoid leaking 540 * space. 541 */ 542 if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) 543 goto page_is_mapped; 544 last_block = (i_size - 1) >> blkbits; 545 map_bh.b_folio = folio; 546 for (page_block = 0; page_block < blocks_per_page; ) { 547 548 map_bh.b_state = 0; 549 map_bh.b_size = 1 << blkbits; 550 if (mpd->get_block(inode, block_in_file, &map_bh, 1)) 551 goto confused; 552 if (!buffer_mapped(&map_bh)) 553 goto confused; 554 if (buffer_new(&map_bh)) 555 clean_bdev_bh_alias(&map_bh); 556 if (buffer_boundary(&map_bh)) { 557 boundary_block = map_bh.b_blocknr; 558 boundary_bdev = map_bh.b_bdev; 559 } 560 if (page_block) { 561 if (map_bh.b_blocknr != first_block + page_block) 562 goto confused; 563 } else { 564 first_block = map_bh.b_blocknr; 565 } 566 page_block++; 567 boundary = buffer_boundary(&map_bh); 568 bdev = map_bh.b_bdev; 569 if (block_in_file == last_block) 570 break; 571 block_in_file++; 572 } 573 BUG_ON(page_block == 0); 574 575 first_unmapped = page_block; 576 577 page_is_mapped: 578 /* Don't bother writing beyond EOF, truncate will discard the folio */ 579 if (folio_pos(folio) >= i_size) 580 goto confused; 581 length = folio_size(folio); 582 if (folio_pos(folio) + length > i_size) { 583 /* 584 * The page straddles i_size. It must be zeroed out on each 585 * and every writepage invocation because it may be mmapped. 586 * "A file is mapped in multiples of the page size. For a file 587 * that is not a multiple of the page size, the remaining memory 588 * is zeroed when mapped, and writes to that region are not 589 * written out to the file." 590 */ 591 length = i_size - folio_pos(folio); 592 folio_zero_segment(folio, length, folio_size(folio)); 593 } 594 595 /* 596 * This page will go to BIO. Do we need to send this BIO off first? 597 */ 598 if (bio && mpd->last_block_in_bio != first_block - 1) 599 bio = mpage_bio_submit_write(bio); 600 601 alloc_new: 602 if (bio == NULL) { 603 bio = bio_alloc(bdev, BIO_MAX_VECS, 604 REQ_OP_WRITE | wbc_to_write_flags(wbc), 605 GFP_NOFS); 606 bio->bi_iter.bi_sector = first_block << (blkbits - 9); 607 wbc_init_bio(wbc, bio); 608 bio->bi_write_hint = inode->i_write_hint; 609 } 610 611 /* 612 * Must try to add the page before marking the buffer clean or 613 * the confused fail path above (OOM) will be very confused when 614 * it finds all bh marked clean (i.e. it will not write anything) 615 */ 616 wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); 617 length = first_unmapped << blkbits; 618 if (!bio_add_folio(bio, folio, length, 0)) { 619 bio = mpage_bio_submit_write(bio); 620 goto alloc_new; 621 } 622 623 clean_buffers(folio, first_unmapped); 624 625 BUG_ON(folio_test_writeback(folio)); 626 folio_start_writeback(folio); 627 folio_unlock(folio); 628 if (boundary || (first_unmapped != blocks_per_page)) { 629 bio = mpage_bio_submit_write(bio); 630 if (boundary_block) { 631 write_boundary_block(boundary_bdev, 632 boundary_block, 1 << blkbits); 633 } 634 } else { 635 mpd->last_block_in_bio = first_block + blocks_per_page - 1; 636 } 637 goto out; 638 639 confused: 640 if (bio) 641 bio = mpage_bio_submit_write(bio); 642 643 /* 644 * The caller has a ref on the inode, so *mapping is stable 645 */ 646 ret = block_write_full_folio(folio, wbc, mpd->get_block); 647 mapping_set_error(mapping, ret); 648 out: 649 mpd->bio = bio; 650 return ret; 651 } 652 653 /** 654 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 655 * @mapping: address space structure to write 656 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 657 * @get_block: the filesystem's block mapper function. 658 * 659 * This is a library function, which implements the writepages() 660 * address_space_operation. 661 */ 662 int 663 mpage_writepages(struct address_space *mapping, 664 struct writeback_control *wbc, get_block_t get_block) 665 { 666 struct mpage_data mpd = { 667 .get_block = get_block, 668 }; 669 struct blk_plug plug; 670 int ret; 671 672 blk_start_plug(&plug); 673 ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); 674 if (mpd.bio) 675 mpage_bio_submit_write(mpd.bio); 676 blk_finish_plug(&plug); 677 return ret; 678 } 679 EXPORT_SYMBOL(mpage_writepages); 680