1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * fs/mpage.c 4 * 5 * Copyright (C) 2002, Linus Torvalds. 6 * 7 * Contains functions related to preparing and submitting BIOs which contain 8 * multiple pagecache pages. 9 * 10 * 15May2002 Andrew Morton 11 * Initial version 12 * 27Jun2002 axboe@suse.de 13 * use bio_add_page() to build bio's just the right size 14 */ 15 16 #include <linux/kernel.h> 17 #include <linux/export.h> 18 #include <linux/mm.h> 19 #include <linux/kdev_t.h> 20 #include <linux/gfp.h> 21 #include <linux/bio.h> 22 #include <linux/fs.h> 23 #include <linux/buffer_head.h> 24 #include <linux/blkdev.h> 25 #include <linux/highmem.h> 26 #include <linux/prefetch.h> 27 #include <linux/mpage.h> 28 #include <linux/mm_inline.h> 29 #include <linux/writeback.h> 30 #include <linux/backing-dev.h> 31 #include <linux/pagevec.h> 32 #include "internal.h" 33 34 /* 35 * I/O completion handler for multipage BIOs. 36 * 37 * The mpage code never puts partial pages into a BIO (except for end-of-file). 38 * If a page does not map to a contiguous run of blocks then it simply falls 39 * back to block_read_full_folio(). 40 * 41 * Why is this? If a page's completion depends on a number of different BIOs 42 * which can complete in any order (or at the same time) then determining the 43 * status of that page is hard. See end_buffer_async_read() for the details. 44 * There is no point in duplicating all that complexity. 
45 */ 46 static void mpage_read_end_io(struct bio *bio) 47 { 48 struct folio_iter fi; 49 int err = blk_status_to_errno(bio->bi_status); 50 51 bio_for_each_folio_all(fi, bio) 52 folio_end_read(fi.folio, err == 0); 53 54 bio_put(bio); 55 } 56 57 static void mpage_write_end_io(struct bio *bio) 58 { 59 struct folio_iter fi; 60 int err = blk_status_to_errno(bio->bi_status); 61 62 bio_for_each_folio_all(fi, bio) { 63 if (err) 64 mapping_set_error(fi.folio->mapping, err); 65 folio_end_writeback(fi.folio); 66 } 67 68 bio_put(bio); 69 } 70 71 static struct bio *mpage_bio_submit_read(struct bio *bio) 72 { 73 bio->bi_end_io = mpage_read_end_io; 74 guard_bio_eod(bio); 75 submit_bio(bio); 76 return NULL; 77 } 78 79 static struct bio *mpage_bio_submit_write(struct bio *bio) 80 { 81 bio->bi_end_io = mpage_write_end_io; 82 guard_bio_eod(bio); 83 submit_bio(bio); 84 return NULL; 85 } 86 87 /* 88 * support function for mpage_readahead. The fs supplied get_block might 89 * return an up to date buffer. This is used to map that buffer into 90 * the page, which allows read_folio to avoid triggering a duplicate call 91 * to get_block. 92 * 93 * The idea is to avoid adding buffers to pages that don't already have 94 * them. So when the buffer is up to date and the page size == block size, 95 * this marks the page up to date instead of adding new buffers. 
96 */ 97 static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, 98 int page_block) 99 { 100 struct inode *inode = folio->mapping->host; 101 struct buffer_head *page_bh, *head; 102 int block = 0; 103 104 head = folio_buffers(folio); 105 if (!head) { 106 /* 107 * don't make any buffers if there is only one buffer on 108 * the folio and the folio just needs to be set up to date 109 */ 110 if (inode->i_blkbits == folio_shift(folio) && 111 buffer_uptodate(bh)) { 112 folio_mark_uptodate(folio); 113 return; 114 } 115 head = create_empty_buffers(folio, i_blocksize(inode), 0); 116 } 117 118 page_bh = head; 119 do { 120 if (block == page_block) { 121 page_bh->b_state = bh->b_state; 122 page_bh->b_bdev = bh->b_bdev; 123 page_bh->b_blocknr = bh->b_blocknr; 124 break; 125 } 126 page_bh = page_bh->b_this_page; 127 block++; 128 } while (page_bh != head); 129 } 130 131 struct mpage_readpage_args { 132 struct bio *bio; 133 struct folio *folio; 134 unsigned int nr_pages; 135 bool is_readahead; 136 sector_t last_block_in_bio; 137 struct buffer_head map_bh; 138 unsigned long first_logical_block; 139 get_block_t *get_block; 140 }; 141 142 /* 143 * This is the worker routine which does all the work of mapping the disk 144 * blocks and constructs largest possible bios, submits them for IO if the 145 * blocks are not contiguous on the disk. 146 * 147 * We pass a buffer_head back and forth and use its buffer_mapped() flag to 148 * represent the validity of its disk mapping and to decide when to do the next 149 * get_block() call. 
 */
/*
 * @args: read state shared between calls: the folio to read, the bio being
 *        assembled, the cached get_block() result (map_bh together with
 *        first_logical_block) and the last disk block already in the bio.
 *
 * Returns args->bio: the bio that is still being built, or NULL when nothing
 * is pending (mpage_bio_submit_read() hands back NULL after submitting).
 * Anything this routine cannot handle goes through the "confused" label,
 * which submits the pending bio and falls back to block_read_full_folio().
 */
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
	struct folio *folio = args->folio;
	struct inode *inode = folio->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *map_bh = &args->map_bh;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t first_block;
	unsigned page_block;
	/* index of the first unmapped block; blocks_per_folio means "no hole" */
	unsigned first_hole = blocks_per_folio;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	blk_opf_t opf = REQ_OP_READ;
	unsigned nblocks;
	unsigned relative_block;
	gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);

	/* Readahead: tag the request and relax the bio allocation flags. */
	if (args->is_readahead) {
		opf |= REQ_RAHEAD;
		gfp |= __GFP_NORETRY | __GFP_NOWARN;
	}

	/* A folio that already has buffers is left to the buffer_head path. */
	if (folio_buffers(folio))
		goto confused;

	/*
	 * block_in_file is the first file block of this folio.  last_block
	 * bounds the mapping request: nr_pages worth of blocks from here,
	 * clamped to the block count implied by i_size (rounded up).
	 */
	block_in_file = folio_pos(folio) >> blkbits;
	last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits);
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	if (buffer_mapped(map_bh) &&
			block_in_file > args->first_logical_block &&
			block_in_file < (args->first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - args->first_logical_block;
		unsigned last = nblocks - map_offset;

		first_block = map_bh->b_blocknr + map_offset;
		/*
		 * Walk forward until the cached mapping or the folio runs
		 * out.  Using up the mapping clears the mapped flag so that
		 * the loop below asks get_block again.
		 */
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_folio)
				break;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_blocks calls until we are done with this folio.
	 */
	map_bh->b_folio = folio;
	while (page_block < blocks_per_folio) {
		/* Start from an empty mapping for every get_block request. */
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		/*
		 * Blocks at or past last_block are never passed to get_block,
		 * so they stay unmapped and are handled as a hole below.
		 */
		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (args->get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			args->first_logical_block = block_in_file;
		}

		/* Hole: remember where the first one starts and move on. */
		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_folio)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_folio copies the data
		 * we just collected from get_block into the folio's buffers
		 * so read_folio doesn't have to repeat the get_block call
		 */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_folio(folio, map_bh, page_block);
			goto confused;
		}

		if (first_hole != blocks_per_folio)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (!page_block)
			first_block = map_bh->b_blocknr;
		else if (first_block + page_block != map_bh->b_blocknr)
			goto confused;
		nblocks = map_bh->b_size >> blkbits;
		/* Same walk as above, this time over the fresh mapping. */
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_folio)
				break;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	if (first_hole != blocks_per_folio) {
		/* Zero everything from the first hole to the end of the folio. */
		folio_zero_segment(folio, first_hole << blkbits, folio_size(folio));
		if (first_hole == 0) {
			/* The whole folio is a hole: nothing to read. */
			folio_mark_uptodate(folio);
			folio_unlock(folio);
			goto out;
		}
	} else if (fully_mapped) {
		folio_set_mappedtodisk(folio);
	}

	/*
	 * This folio will go to BIO.  Do we need to send this BIO off first?
	 */
	if (args->bio && (args->last_block_in_bio != first_block - 1))
		args->bio = mpage_bio_submit_read(args->bio);

alloc_new:
	if (args->bio == NULL) {
		args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf,
				      gfp);
		if (args->bio == NULL)
			goto confused;
		/* filesystem block number -> 512-byte sector number */
		args->bio->bi_iter.bi_sector = first_block << (blkbits - 9);
	}

	/* Only the mapped part of the folio (up to the first hole) is read. */
	length = first_hole << blkbits;
	if (!bio_add_folio(args->bio, folio, length, 0)) {
		/* Could not add the folio: submit this bio, retry with a new one. */
		args->bio = mpage_bio_submit_read(args->bio);
		goto alloc_new;
	}

	/*
	 * Submit right away if the mapping is flagged BH_Boundary and has been
	 * used up, or if this folio ends in a hole.  Otherwise remember the
	 * last block so the next folio can be checked for contiguity.
	 */
	relative_block = block_in_file - args->first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
	    (first_hole != blocks_per_folio))
		args->bio = mpage_bio_submit_read(args->bio);
	else
		args->last_block_in_bio = first_block + blocks_per_folio - 1;
out:
	return args->bio;

confused:
	/* Flush what has been built so far, then use the buffer_head path. */
	if (args->bio)
		args->bio = mpage_bio_submit_read(args->bio);
	if (!folio_test_uptodate(folio))
		block_read_full_folio(folio, args->get_block);
	else
		folio_unlock(folio);
	goto out;
}

/**
 * mpage_readahead - 
start reads against pages
 * @rac: Describes which pages to read.
 * @get_block: The filesystem's block mapper function.
 *
 * This function walks the pages and the blocks within each page, building and
 * emitting large BIOs.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 * BH_Boundary explanation:
 *
 * There is a problem.  The mpage read code assembles several pages, gets all
 * their disk mappings, and then submits them all.  That's fine, but obtaining
 * the disk mappings may require I/O.  Reads of indirect blocks, for example.
 *
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 *
 *	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
 *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
 * So what we do is to allow the filesystem's get_block() function to set
 * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
 * after this one will require I/O against a block which is probably close to
 * this one.  So you should push what I/O you have currently accumulated.
 *
 * This all causes the disk requests to be issued in the correct order.
358 */ 359 void mpage_readahead(struct readahead_control *rac, get_block_t get_block) 360 { 361 struct folio *folio; 362 struct mpage_readpage_args args = { 363 .get_block = get_block, 364 .is_readahead = true, 365 }; 366 367 while ((folio = readahead_folio(rac))) { 368 prefetchw(&folio->flags); 369 args.folio = folio; 370 args.nr_pages = readahead_count(rac); 371 args.bio = do_mpage_readpage(&args); 372 } 373 if (args.bio) 374 mpage_bio_submit_read(args.bio); 375 } 376 EXPORT_SYMBOL(mpage_readahead); 377 378 /* 379 * This isn't called much at all 380 */ 381 int mpage_read_folio(struct folio *folio, get_block_t get_block) 382 { 383 struct mpage_readpage_args args = { 384 .folio = folio, 385 .nr_pages = folio_nr_pages(folio), 386 .get_block = get_block, 387 }; 388 389 args.bio = do_mpage_readpage(&args); 390 if (args.bio) 391 mpage_bio_submit_read(args.bio); 392 return 0; 393 } 394 EXPORT_SYMBOL(mpage_read_folio); 395 396 /* 397 * Writing is not so simple. 398 * 399 * If the page has buffers then they will be used for obtaining the disk 400 * mapping. We only support pages which are fully mapped-and-dirty, with a 401 * special case for pages which are unmapped at the end: end-of-file. 402 * 403 * If the page has no buffers (preferred) then the page is mapped here. 404 * 405 * If all blocks are found to be contiguous then the page can go into the 406 * BIO. Otherwise fall back to the mapping's writepage(). 407 * 408 * FIXME: This code wants an estimate of how many pages are still to be 409 * written, so it can intelligently allocate a suitably-sized BIO. For now, 410 * just allocate full-size (16-page) BIOs. 411 */ 412 413 struct mpage_data { 414 struct bio *bio; 415 sector_t last_block_in_bio; 416 get_block_t *get_block; 417 }; 418 419 /* 420 * We have our BIO, so we can now mark the buffers clean. Make 421 * sure to only clean buffers which we know we'll be writing. 
 */
static void clean_buffers(struct folio *folio, unsigned first_unmapped)
{
	unsigned buffer_counter = 0;
	struct buffer_head *bh, *head = folio_buffers(folio);

	/* No buffers attached: nothing to clean. */
	if (!head)
		return;
	bh = head;

	/* Clear the dirty bit on the first first_unmapped buffers only. */
	do {
		if (buffer_counter++ == first_unmapped)
			break;
		clear_buffer_dirty(bh);
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * we cannot drop the bh if the page is not uptodate or a concurrent
	 * read_folio would fail to serialize with the bh and it would read from
	 * disk before we reach the platter.
	 */
	if (buffer_heads_over_limit && folio_test_uptodate(folio))
		try_to_free_buffers(folio);
}

/*
 * Write one folio, either by adding it to the bio carried in @mpd or, when
 * its blocks do not fit the rules checked below (the "confused" label), by
 * handing it to block_write_full_folio().
 *
 * @wbc: writeback control; supplies the write flags and the cgroup accounting
 * @folio: the folio to write
 * @mpd: carries the bio under construction, the last disk block placed in it
 *       and the filesystem's get_block
 *
 * Returns 0 when the folio went into a bio, otherwise the return value of
 * block_write_full_folio().  mpd->bio is updated on every exit path.
 */
static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio,
		      struct mpage_data *mpd)
{
	struct bio *bio = mpd->bio;
	struct address_space *mapping = folio->mapping;
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
	sector_t last_block;
	sector_t block_in_file;
	sector_t first_block;
	unsigned page_block;
	/* index of the first unmapped block; blocks_per_folio means "none" */
	unsigned first_unmapped = blocks_per_folio;
	struct block_device *bdev = NULL;
	int boundary = 0;
	sector_t boundary_block = 0;
	struct block_device *boundary_bdev = NULL;
	size_t length;
	struct buffer_head map_bh;
	loff_t i_size = i_size_read(inode);
	int ret = 0;
	struct buffer_head *head = folio_buffers(folio);

	if (head) {
		struct buffer_head *bh = head;

		/* If they're all mapped and dirty, do it */
		page_block = 0;
		do {
			BUG_ON(buffer_locked(bh));
			if (!buffer_mapped(bh)) {
				/*
				 * unmapped dirty buffers are created by
				 * block_dirty_folio -> mmapped data
				 */
				if (buffer_dirty(bh))
					goto confused;
				if (first_unmapped == blocks_per_folio)
					first_unmapped = page_block;
				/*
				 * "continue" in a do/while jumps to the loop
				 * condition, so bh still advances; page_block
				 * is deliberately not incremented.
				 */
				continue;
			}

			if (first_unmapped != blocks_per_folio)
				goto confused;	/* hole -> non-hole */

			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
				goto confused;
			/* Every block must follow the first one on disk. */
			if (page_block) {
				if (bh->b_blocknr != first_block + page_block)
					goto confused;
			} else {
				first_block = bh->b_blocknr;
			}
			page_block++;
			boundary = buffer_boundary(bh);
			if (boundary) {
				boundary_block = bh->b_blocknr;
				boundary_bdev = bh->b_bdev;
			}
			bdev = bh->b_bdev;
		} while ((bh = bh->b_this_page) != head);

		if (first_unmapped)
			goto page_is_mapped;

		/*
		 * Page has buffers, but they are all unmapped. The page was
		 * created by pagein or read over a hole which was handled by
		 * block_read_full_folio().  If this address_space is also
		 * using mpage_readahead then this can rarely happen.
		 */
		goto confused;
	}

	/*
	 * The page has no buffers: map it to disk
	 */
	BUG_ON(!folio_test_uptodate(folio));
	block_in_file = folio_pos(folio) >> blkbits;
	/*
	 * Whole page beyond EOF? Skip allocating blocks to avoid leaking
	 * space.
	 */
	if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits)
		goto page_is_mapped;
	/* last_block is the file block that holds the final byte of the file. */
	last_block = (i_size - 1) >> blkbits;
	map_bh.b_folio = folio;
	/* Map one block per iteration; get_block is called with create == 1. */
	for (page_block = 0; page_block < blocks_per_folio; ) {

		map_bh.b_state = 0;
		map_bh.b_size = 1 << blkbits;
		if (mpd->get_block(inode, block_in_file, &map_bh, 1))
			goto confused;
		if (!buffer_mapped(&map_bh))
			goto confused;
		if (buffer_new(&map_bh))
			clean_bdev_bh_alias(&map_bh);
		if (buffer_boundary(&map_bh)) {
			boundary_block = map_bh.b_blocknr;
			boundary_bdev = map_bh.b_bdev;
		}
		if (page_block) {
			if (map_bh.b_blocknr != first_block + page_block)
				goto confused;
		} else {
			first_block = map_bh.b_blocknr;
		}
		page_block++;
		boundary = buffer_boundary(&map_bh);
		bdev = map_bh.b_bdev;
		/* Stop once the block containing EOF has been mapped. */
		if (block_in_file == last_block)
			break;
		block_in_file++;
	}
	BUG_ON(page_block == 0);

	first_unmapped = page_block;

page_is_mapped:
	/* Don't bother writing beyond EOF, truncate will discard the folio */
	if (folio_pos(folio) >= i_size)
		goto confused;
	length = folio_size(folio);
	if (folio_pos(folio) + length > i_size) {
		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining memory
		 * is zeroed when mapped, and writes to that region are not
		 * written out to the file."
		 */
		length = i_size - folio_pos(folio);
		folio_zero_segment(folio, length, folio_size(folio));
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && mpd->last_block_in_bio != first_block - 1)
		bio = mpage_bio_submit_write(bio);

alloc_new:
	if (bio == NULL) {
		bio = bio_alloc(bdev, BIO_MAX_VECS,
				REQ_OP_WRITE | wbc_to_write_flags(wbc),
				GFP_NOFS);
		/* filesystem block number -> 512-byte sector number */
		bio->bi_iter.bi_sector = first_block << (blkbits - 9);
		wbc_init_bio(wbc, bio);
		bio->bi_write_hint = inode->i_write_hint;
	}

	/*
	 * Must try to add the page before marking the buffer clean or
	 * the confused fail path above (OOM) will be very confused when
	 * it finds all bh marked clean (i.e. it will not write anything)
	 */
	/*
	 * NOTE(review): this accounting call sits after the alloc_new label,
	 * so it runs a second time when bio_add_folio() fails and we retry
	 * with a fresh bio - confirm the repeated accounting is intended.
	 */
	wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
	/* Only the mapped blocks of the folio are written. */
	length = first_unmapped << blkbits;
	if (!bio_add_folio(bio, folio, length, 0)) {
		bio = mpage_bio_submit_write(bio);
		goto alloc_new;
	}

	clean_buffers(folio, first_unmapped);

	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);
	folio_unlock(folio);
	/*
	 * Submit now if the last mapped block was flagged BH_Boundary or the
	 * folio is only partially mapped.  Otherwise remember the last block
	 * so the next folio can be checked for contiguity.
	 */
	if (boundary || (first_unmapped != blocks_per_folio)) {
		bio = mpage_bio_submit_write(bio);
		if (boundary_block) {
			write_boundary_block(boundary_bdev,
					boundary_block, 1 << blkbits);
		}
	} else {
		mpd->last_block_in_bio = first_block + blocks_per_folio - 1;
	}
	goto out;

confused:
	/* Flush what has been built so far, then use the buffer_head path. */
	if (bio)
		bio = mpage_bio_submit_write(bio);

	/*
	 * The caller has a ref on the inode, so *mapping is stable
	 */
	ret = block_write_full_folio(folio, wbc, mpd->get_block);
	mapping_set_error(mapping, ret);
out:
	mpd->bio = bio;
	return ret;
}

/**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
650 */ 651 int 652 mpage_writepages(struct address_space *mapping, 653 struct writeback_control *wbc, get_block_t get_block) 654 { 655 struct mpage_data mpd = { 656 .get_block = get_block, 657 }; 658 struct folio *folio = NULL; 659 struct blk_plug plug; 660 int error; 661 662 blk_start_plug(&plug); 663 while ((folio = writeback_iter(mapping, wbc, folio, &error))) 664 error = mpage_write_folio(wbc, folio, &mpd); 665 if (mpd.bio) 666 mpage_bio_submit_write(mpd.bio); 667 blk_finish_plug(&plug); 668 return error; 669 } 670 EXPORT_SYMBOL(mpage_writepages); 671