// SPDX-License-Identifier: GPL-2.0
/*
 * fs/mpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to preparing and submitting BIOs which contain
 * multiple pagecache pages.
 *
 * 15May2002	Andrew Morton
 *		Initial version
 * 27Jun2002	axboe@suse.de
 *		use bio_add_page() to build bio's just the right size
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_folio().
 *
 * Why is this? If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_read_end_io(struct bio *bio)
{
	struct folio_iter fi;
	int err = blk_status_to_errno(bio->bi_status);

	bio_for_each_folio_all(fi, bio)
		folio_end_read(fi.folio, err == 0);

	bio_put(bio);
}

static void mpage_write_end_io(struct bio *bio)
{
	struct folio_iter fi;
	int err = blk_status_to_errno(bio->bi_status);

	bio_for_each_folio_all(fi, bio) {
		if (err)
			mapping_set_error(fi.folio->mapping, err);
		folio_end_writeback(fi.folio);
	}

	bio_put(bio);
}

static struct bio *mpage_bio_submit_read(struct bio *bio)
{
	bio->bi_end_io = mpage_read_end_io;
	guard_bio_eod(bio);
	submit_bio(bio);
	return NULL;
}

static struct bio *mpage_bio_submit_write(struct bio *bio)
{
	bio->bi_end_io = mpage_write_end_io;
	guard_bio_eod(bio);
	submit_bio(bio);
	return NULL;
}

/*
 * support function for mpage_readahead.  The fs supplied get_block might
 * return an up to date buffer.  This is used to map that buffer into
 * the page, which allows read_folio to avoid triggering a duplicate call
 * to get_block.
 *
 * The idea is to avoid adding buffers to pages that don't already have
 * them.  So when the buffer is up to date and the page size == block size,
 * this marks the page up to date instead of adding new buffers.
 */
static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
		int page_block)
{
	struct inode *inode = folio->mapping->host;
	struct buffer_head *page_bh, *head;
	int block = 0;

	head = folio_buffers(folio);
	if (!head) {
		/*
		 * don't make any buffers if there is only one buffer on
		 * the folio and the folio just needs to be set up to date
		 */
		if (inode->i_blkbits == folio_shift(folio) &&
		    buffer_uptodate(bh)) {
			folio_mark_uptodate(folio);
			return;
		}
		head = create_empty_buffers(folio, i_blocksize(inode), 0);
	}

	page_bh = head;
	do {
		if (block == page_block) {
			page_bh->b_state = bh->b_state;
			page_bh->b_bdev = bh->b_bdev;
			page_bh->b_blocknr = bh->b_blocknr;
			break;
		}
		page_bh = page_bh->b_this_page;
		block++;
	} while (page_bh != head);
}

struct mpage_readpage_args {
	struct bio *bio;
	struct folio *folio;
	unsigned int nr_pages;
	bool is_readahead;
	sector_t last_block_in_bio;
	struct buffer_head map_bh;
	unsigned long first_logical_block;
	get_block_t *get_block;
};

/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructing the largest possible BIOs, submitting them for IO
 * when the blocks are not contiguous on disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
static void do_mpage_readpage(struct mpage_readpage_args *args)
{
	struct folio *folio = args->folio;
	struct inode *inode = folio->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *map_bh = &args->map_bh;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t first_block;
	unsigned page_block;
	unsigned first_hole = blocks_per_folio;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	blk_opf_t opf = REQ_OP_READ;
	unsigned nblocks;
	unsigned relative_block;
	gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);

	if (args->is_readahead) {
		opf |= REQ_RAHEAD;
		gfp |= __GFP_NORETRY | __GFP_NOWARN;
	}

	if (folio_buffers(folio))
		goto confused;

	block_in_file = folio_pos(folio) >> blkbits;
	last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits);
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;
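
	/*
	 * Worked example (illustrative figures only, not from the original
	 * source): with 1 KiB blocks (blkbits == 10), a 4 KiB folio at file
	 * offset 8192 gives block_in_file == 8 and blocks_per_folio == 4.
	 * If i_size is 10000, last_block_in_file == (10000 + 1023) >> 10
	 * == 10, so last_block is clamped to 10 and the folio's last two
	 * blocks fall into the zero-filled hole at end-of-file.
	 */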

	/*
	 * Map blocks using the result from the previous get_block call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	if (buffer_mapped(map_bh) &&
			block_in_file > args->first_logical_block &&
			block_in_file < (args->first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - args->first_logical_block;
		unsigned last = nblocks - map_offset;

		first_block = map_bh->b_blocknr + map_offset;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_folio)
				break;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_block calls until we are done with this folio.
	 */
	map_bh->b_folio = folio;
	while (page_block < blocks_per_folio) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (args->get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			args->first_logical_block = block_in_file;
		}

		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_folio)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_folio copies the data
		 * we just collected from get_block into the folio's buffers
		 * so read_folio doesn't have to repeat the get_block call.
		 */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_folio(folio, map_bh, page_block);
			goto confused;
		}

		if (first_hole != blocks_per_folio)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (!page_block)
			first_block = map_bh->b_blocknr;
		else if (first_block + page_block != map_bh->b_blocknr)
			goto confused;
		nblocks = map_bh->b_size >> blkbits;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_folio)
				break;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	if (first_hole != blocks_per_folio) {
		folio_zero_segment(folio, first_hole << blkbits, folio_size(folio));
		if (first_hole == 0) {
			folio_mark_uptodate(folio);
			folio_unlock(folio);
			goto out;
		}
	} else if (fully_mapped) {
		folio_set_mappedtodisk(folio);
	}

	/*
	 * This folio will go to BIO.  Do we need to send this BIO off first?
	 */
	if (args->bio && (args->last_block_in_bio != first_block - 1))
		args->bio = mpage_bio_submit_read(args->bio);

alloc_new:
	if (args->bio == NULL) {
		args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf,
				      gfp);
		if (args->bio == NULL)
			goto confused;
		args->bio->bi_iter.bi_sector = first_block << (blkbits - 9);
	}

	length = first_hole << blkbits;
	if (!bio_add_folio(args->bio, folio, length, 0)) {
		args->bio = mpage_bio_submit_read(args->bio);
		goto alloc_new;
	}

	relative_block = block_in_file - args->first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
	    (first_hole != blocks_per_folio))
		args->bio = mpage_bio_submit_read(args->bio);
	else
		args->last_block_in_bio = first_block + blocks_per_folio - 1;
out:
	return;

confused:
	if (args->bio)
		args->bio = mpage_bio_submit_read(args->bio);
	if (!folio_test_uptodate(folio))
		block_read_full_folio(folio, args->get_block);
	else
		folio_unlock(folio);
	goto out;
}

/**
 * mpage_readahead - start reads against pages
 * @rac: Describes which pages to read.
 * @get_block: The filesystem's block mapper function.
 *
 * This function walks the pages and the blocks within each page, building and
 * emitting large BIOs.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 * BH_Boundary explanation:
 *
 * There is a problem.  The mpage read code assembles several pages, gets all
 * their disk mappings, and then submits them all.  That's fine, but obtaining
 * the disk mappings may require I/O.  Reads of indirect blocks, for example.
 *
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 *
 *	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
 *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
 * So what we do is allow the filesystem's get_block() function to set
 * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
 * after this one will require I/O against a block which is probably close to
 * this one.  So you should push what I/O you have currently accumulated.
 *
 * This all causes the disk requests to be issued in the correct order.
 */
void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
	struct folio *folio;
	struct mpage_readpage_args args = {
		.get_block = get_block,
		.is_readahead = true,
	};

	while ((folio = readahead_folio(rac))) {
		prefetchw(&folio->flags);
		args.folio = folio;
		args.nr_pages = readahead_count(rac);
		do_mpage_readpage(&args);
		/*
		 * If readahead failed synchronously, it may have been caused
		 * by a removed device or a filesystem metadata error.
		 */
		if (!folio_test_locked(folio) && !folio_test_uptodate(folio))
			break;
	}
	if (args.bio)
		mpage_bio_submit_read(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);

/*
 * This isn't called much at all
 */
int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
	struct mpage_readpage_args args = {
		.folio = folio,
		.nr_pages = folio_nr_pages(folio),
		.get_block = get_block,
	};

	do_mpage_readpage(&args);
	if (args.bio)
		mpage_bio_submit_read(args.bio);
	return 0;
}
EXPORT_SYMBOL(mpage_read_folio);

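/*
 * Illustrative usage sketch (the myfs_* names below are placeholders, not
 * part of this file): a get_block-based filesystem typically wires the read
 * helpers above into its address_space_operations along these lines:
 *
 *	static int myfs_read_folio(struct file *file, struct folio *folio)
 *	{
 *		return mpage_read_folio(folio, myfs_get_block);
 *	}
 *
 *	static void myfs_readahead(struct readahead_control *rac)
 *	{
 *		mpage_readahead(rac, myfs_get_block);
 *	}
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.read_folio	= myfs_read_folio,
 *		.readahead	= myfs_readahead,
 *		...
 *	};
 *
 * A get_block() implementation that wants the BH_Boundary optimisation
 * described above would call set_buffer_boundary() on the buffer_head when
 * mapping the next block will require reading a nearby metadata (e.g.
 * indirect) block.
 */
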
/*
 * Writing is not so simple.
 *
 * If the page has buffers then they will be used for obtaining the disk
 * mapping.  We only support pages which are fully mapped-and-dirty, with a
 * special case for pages which are unmapped at the end: end-of-file.
 *
 * If the page has no buffers (preferred) then the page is mapped here.
 *
 * If all blocks are found to be contiguous then the page can go into the
 * BIO.  Otherwise fall back to the mapping's writepage().
 *
 * FIXME: This code wants an estimate of how many pages are still to be
 * written, so it can intelligently allocate a suitably-sized BIO.  For now,
 * just allocate full-size (BIO_MAX_VECS) BIOs.
 */

struct mpage_data {
	struct bio *bio;
	sector_t last_block_in_bio;
	get_block_t *get_block;
};

/*
 * We have our BIO, so we can now mark the buffers clean.  Make
 * sure to only clean buffers which we know we'll be writing.
 */
static void clean_buffers(struct folio *folio, unsigned first_unmapped)
{
	unsigned buffer_counter = 0;
	struct buffer_head *bh, *head = folio_buffers(folio);

	if (!head)
		return;
	bh = head;

	do {
		if (buffer_counter++ == first_unmapped)
			break;
		clear_buffer_dirty(bh);
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * We cannot drop the bh if the folio is not uptodate, or else a
	 * concurrent read_folio would fail to serialize with the bh and
	 * could read from disk before our write reaches the platter.
	 */
	if (buffer_heads_over_limit && folio_test_uptodate(folio))
		try_to_free_buffers(folio);
}

static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio,
		struct mpage_data *mpd)
{
	struct bio *bio = mpd->bio;
	struct address_space *mapping = folio->mapping;
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
	sector_t last_block;
	sector_t block_in_file;
	sector_t first_block;
	unsigned page_block;
	unsigned first_unmapped = blocks_per_folio;
	struct block_device *bdev = NULL;
	int boundary = 0;
	sector_t boundary_block = 0;
	struct block_device *boundary_bdev = NULL;
	size_t length;
	struct buffer_head map_bh;
	loff_t i_size = i_size_read(inode);
	int ret = 0;
	struct buffer_head *head = folio_buffers(folio);

	if (head) {
		struct buffer_head *bh = head;

		/* If they're all mapped and dirty, do it */
		page_block = 0;
		do {
			BUG_ON(buffer_locked(bh));
			if (!buffer_mapped(bh)) {
				/*
				 * unmapped dirty buffers are created by
				 * block_dirty_folio -> mmapped data
				 */
				if (buffer_dirty(bh))
					goto confused;
				if (first_unmapped == blocks_per_folio)
					first_unmapped = page_block;
				continue;
			}

			if (first_unmapped != blocks_per_folio)
				goto confused;	/* hole -> non-hole */

			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
				goto confused;
			if (page_block) {
				if (bh->b_blocknr != first_block + page_block)
					goto confused;
			} else {
				first_block = bh->b_blocknr;
			}
			page_block++;
			boundary = buffer_boundary(bh);
			if (boundary) {
				boundary_block = bh->b_blocknr;
				boundary_bdev = bh->b_bdev;
			}
			bdev = bh->b_bdev;
		} while ((bh = bh->b_this_page) != head);

		if (first_unmapped)
			goto page_is_mapped;

		/*
		 * Page has buffers, but they are all unmapped. The page was
		 * created by pagein or read over a hole which was handled by
		 * block_read_full_folio().  If this address_space is also
		 * using mpage_readahead then this can rarely happen.
		 */
		goto confused;
	}

	/*
	 * The page has no buffers: map it to disk
	 */
	BUG_ON(!folio_test_uptodate(folio));
	block_in_file = folio_pos(folio) >> blkbits;
	/*
	 * Whole page beyond EOF? Skip allocating blocks to avoid leaking
	 * space.
	 */
	if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits)
		goto page_is_mapped;
	last_block = (i_size - 1) >> blkbits;
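
	/*
	 * Worked example (illustrative figures only, not from the original
	 * source): with 1 KiB blocks (blkbits == 10) and i_size == 10000,
	 * the rounded-up block count in the EOF check above is
	 * (10000 + 1023) >> 10 == 10, so a folio whose first block index is
	 * >= 10 lies wholly beyond EOF and is skipped, while
	 * last_block == (10000 - 1) >> 10 == 9 is the last block that may
	 * be mapped below.
	 */
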
	map_bh.b_folio = folio;
	for (page_block = 0; page_block < blocks_per_folio; ) {

		map_bh.b_state = 0;
		map_bh.b_size = 1 << blkbits;
		if (mpd->get_block(inode, block_in_file, &map_bh, 1))
			goto confused;
		if (!buffer_mapped(&map_bh))
			goto confused;
		if (buffer_new(&map_bh))
			clean_bdev_bh_alias(&map_bh);
		if (buffer_boundary(&map_bh)) {
			boundary_block = map_bh.b_blocknr;
			boundary_bdev = map_bh.b_bdev;
		}
		if (page_block) {
			if (map_bh.b_blocknr != first_block + page_block)
				goto confused;
		} else {
			first_block = map_bh.b_blocknr;
		}
		page_block++;
		boundary = buffer_boundary(&map_bh);
		bdev = map_bh.b_bdev;
		if (block_in_file == last_block)
			break;
		block_in_file++;
	}
	BUG_ON(page_block == 0);

	first_unmapped = page_block;

page_is_mapped:
	/* Don't bother writing beyond EOF, truncate will discard the folio */
	if (folio_pos(folio) >= i_size)
		goto confused;
	length = folio_size(folio);
	if (folio_pos(folio) + length > i_size) {
		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining memory
		 * is zeroed when mapped, and writes to that region are not
		 * written out to the file."
		 */
		length = i_size - folio_pos(folio);
		folio_zero_segment(folio, length, folio_size(folio));
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && mpd->last_block_in_bio != first_block - 1)
		bio = mpage_bio_submit_write(bio);

alloc_new:
	if (bio == NULL) {
		bio = bio_alloc(bdev, BIO_MAX_VECS,
				REQ_OP_WRITE | wbc_to_write_flags(wbc),
				GFP_NOFS);
		bio->bi_iter.bi_sector = first_block << (blkbits - 9);
		wbc_init_bio(wbc, bio);
		bio->bi_write_hint = inode->i_write_hint;
	}

	/*
	 * Must try to add the page before marking the buffer clean or
	 * the confused fail path above (OOM) will be very confused when
	 * it finds all bh marked clean (i.e. it will not write anything)
	 */
	wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
	length = first_unmapped << blkbits;
	if (!bio_add_folio(bio, folio, length, 0)) {
		bio = mpage_bio_submit_write(bio);
		goto alloc_new;
	}

	clean_buffers(folio, first_unmapped);

	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);
	folio_unlock(folio);
	if (boundary || (first_unmapped != blocks_per_folio)) {
		bio = mpage_bio_submit_write(bio);
		if (boundary_block) {
			write_boundary_block(boundary_bdev,
					boundary_block, 1 << blkbits);
		}
	} else {
		mpd->last_block_in_bio = first_block + blocks_per_folio - 1;
	}
	goto out;

confused:
	if (bio)
		bio = mpage_bio_submit_write(bio);

	/*
	 * The caller has a ref on the inode, so *mapping is stable
	 */
	ret = block_write_full_folio(folio, wbc, mpd->get_block);
	mapping_set_error(mapping, ret);
out:
	mpd->bio = bio;
	return ret;
}

/**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */
int
mpage_writepages(struct address_space *mapping,
		struct writeback_control *wbc, get_block_t get_block)
{
	struct mpage_data mpd = {
		.get_block = get_block,
	};
	struct folio *folio = NULL;
	struct blk_plug plug;
	int error;

	blk_start_plug(&plug);
	while ((folio = writeback_iter(mapping, wbc, folio, &error)))
		error = mpage_write_folio(wbc, folio, &mpd);
	if (mpd.bio)
		mpage_bio_submit_write(mpd.bio);
	blk_finish_plug(&plug);
	return error;
}
EXPORT_SYMBOL(mpage_writepages);
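
/*
 * Illustrative usage sketch (the myfs_* names are placeholders, not part of
 * this file): the write side is wired up the same way, with the block mapper
 * called with create == 1 here, so it must be able to allocate blocks:
 *
 *	static int myfs_writepages(struct address_space *mapping,
 *				   struct writeback_control *wbc)
 *	{
 *		return mpage_writepages(mapping, wbc, myfs_get_block);
 *	}
 *
 *	static const struct address_space_operations myfs_aops = {
 *		...
 *		.writepages	= myfs_writepages,
 *	};
 */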