1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * fs/mpage.c 4 * 5 * Copyright (C) 2002, Linus Torvalds. 6 * 7 * Contains functions related to preparing and submitting BIOs which contain 8 * multiple pagecache pages. 9 * 10 * 15May2002 Andrew Morton 11 * Initial version 12 * 27Jun2002 axboe@suse.de 13 * use bio_add_page() to build bio's just the right size 14 */ 15 16 #include <linux/kernel.h> 17 #include <linux/export.h> 18 #include <linux/mm.h> 19 #include <linux/kdev_t.h> 20 #include <linux/gfp.h> 21 #include <linux/bio.h> 22 #include <linux/fs.h> 23 #include <linux/buffer_head.h> 24 #include <linux/blkdev.h> 25 #include <linux/highmem.h> 26 #include <linux/prefetch.h> 27 #include <linux/mpage.h> 28 #include <linux/mm_inline.h> 29 #include <linux/writeback.h> 30 #include <linux/backing-dev.h> 31 #include <linux/pagevec.h> 32 #include "internal.h" 33 34 /* 35 * I/O completion handler for multipage BIOs. 36 * 37 * The mpage code never puts partial pages into a BIO (except for end-of-file). 38 * If a page does not map to a contiguous run of blocks then it simply falls 39 * back to block_read_full_folio(). 40 * 41 * Why is this? If a page's completion depends on a number of different BIOs 42 * which can complete in any order (or at the same time) then determining the 43 * status of that page is hard. See end_buffer_async_read() for the details. 44 * There is no point in duplicating all that complexity. 
45 */ 46 static void mpage_read_end_io(struct bio *bio) 47 { 48 struct folio_iter fi; 49 int err = blk_status_to_errno(bio->bi_status); 50 51 bio_for_each_folio_all(fi, bio) { 52 if (err) 53 folio_set_error(fi.folio); 54 else 55 folio_mark_uptodate(fi.folio); 56 folio_unlock(fi.folio); 57 } 58 59 bio_put(bio); 60 } 61 62 static void mpage_write_end_io(struct bio *bio) 63 { 64 struct folio_iter fi; 65 int err = blk_status_to_errno(bio->bi_status); 66 67 bio_for_each_folio_all(fi, bio) { 68 if (err) { 69 folio_set_error(fi.folio); 70 mapping_set_error(fi.folio->mapping, err); 71 } 72 folio_end_writeback(fi.folio); 73 } 74 75 bio_put(bio); 76 } 77 78 static struct bio *mpage_bio_submit_read(struct bio *bio) 79 { 80 bio->bi_end_io = mpage_read_end_io; 81 guard_bio_eod(bio); 82 submit_bio(bio); 83 return NULL; 84 } 85 86 static struct bio *mpage_bio_submit_write(struct bio *bio) 87 { 88 bio->bi_end_io = mpage_write_end_io; 89 guard_bio_eod(bio); 90 submit_bio(bio); 91 return NULL; 92 } 93 94 /* 95 * support function for mpage_readahead. The fs supplied get_block might 96 * return an up to date buffer. This is used to map that buffer into 97 * the page, which allows read_folio to avoid triggering a duplicate call 98 * to get_block. 99 * 100 * The idea is to avoid adding buffers to pages that don't already have 101 * them. So when the buffer is up to date and the page size == block size, 102 * this marks the page up to date instead of adding new buffers. 
103 */ 104 static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, 105 int page_block) 106 { 107 struct inode *inode = folio->mapping->host; 108 struct buffer_head *page_bh, *head; 109 int block = 0; 110 111 head = folio_buffers(folio); 112 if (!head) { 113 /* 114 * don't make any buffers if there is only one buffer on 115 * the folio and the folio just needs to be set up to date 116 */ 117 if (inode->i_blkbits == PAGE_SHIFT && 118 buffer_uptodate(bh)) { 119 folio_mark_uptodate(folio); 120 return; 121 } 122 head = create_empty_buffers(folio, i_blocksize(inode), 0); 123 } 124 125 page_bh = head; 126 do { 127 if (block == page_block) { 128 page_bh->b_state = bh->b_state; 129 page_bh->b_bdev = bh->b_bdev; 130 page_bh->b_blocknr = bh->b_blocknr; 131 break; 132 } 133 page_bh = page_bh->b_this_page; 134 block++; 135 } while (page_bh != head); 136 } 137 138 struct mpage_readpage_args { 139 struct bio *bio; 140 struct folio *folio; 141 unsigned int nr_pages; 142 bool is_readahead; 143 sector_t last_block_in_bio; 144 struct buffer_head map_bh; 145 unsigned long first_logical_block; 146 get_block_t *get_block; 147 }; 148 149 /* 150 * This is the worker routine which does all the work of mapping the disk 151 * blocks and constructs largest possible bios, submits them for IO if the 152 * blocks are not contiguous on the disk. 153 * 154 * We pass a buffer_head back and forth and use its buffer_mapped() flag to 155 * represent the validity of its disk mapping and to decide when to do the next 156 * get_block() call. 
157 */ 158 static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) 159 { 160 struct folio *folio = args->folio; 161 struct inode *inode = folio->mapping->host; 162 const unsigned blkbits = inode->i_blkbits; 163 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 164 const unsigned blocksize = 1 << blkbits; 165 struct buffer_head *map_bh = &args->map_bh; 166 sector_t block_in_file; 167 sector_t last_block; 168 sector_t last_block_in_file; 169 sector_t blocks[MAX_BUF_PER_PAGE]; 170 unsigned page_block; 171 unsigned first_hole = blocks_per_page; 172 struct block_device *bdev = NULL; 173 int length; 174 int fully_mapped = 1; 175 blk_opf_t opf = REQ_OP_READ; 176 unsigned nblocks; 177 unsigned relative_block; 178 gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); 179 180 /* MAX_BUF_PER_PAGE, for example */ 181 VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 182 183 if (args->is_readahead) { 184 opf |= REQ_RAHEAD; 185 gfp |= __GFP_NORETRY | __GFP_NOWARN; 186 } 187 188 if (folio_buffers(folio)) 189 goto confused; 190 191 block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); 192 last_block = block_in_file + args->nr_pages * blocks_per_page; 193 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; 194 if (last_block > last_block_in_file) 195 last_block = last_block_in_file; 196 page_block = 0; 197 198 /* 199 * Map blocks using the result from the previous get_blocks call first. 
200 */ 201 nblocks = map_bh->b_size >> blkbits; 202 if (buffer_mapped(map_bh) && 203 block_in_file > args->first_logical_block && 204 block_in_file < (args->first_logical_block + nblocks)) { 205 unsigned map_offset = block_in_file - args->first_logical_block; 206 unsigned last = nblocks - map_offset; 207 208 for (relative_block = 0; ; relative_block++) { 209 if (relative_block == last) { 210 clear_buffer_mapped(map_bh); 211 break; 212 } 213 if (page_block == blocks_per_page) 214 break; 215 blocks[page_block] = map_bh->b_blocknr + map_offset + 216 relative_block; 217 page_block++; 218 block_in_file++; 219 } 220 bdev = map_bh->b_bdev; 221 } 222 223 /* 224 * Then do more get_blocks calls until we are done with this folio. 225 */ 226 map_bh->b_folio = folio; 227 while (page_block < blocks_per_page) { 228 map_bh->b_state = 0; 229 map_bh->b_size = 0; 230 231 if (block_in_file < last_block) { 232 map_bh->b_size = (last_block-block_in_file) << blkbits; 233 if (args->get_block(inode, block_in_file, map_bh, 0)) 234 goto confused; 235 args->first_logical_block = block_in_file; 236 } 237 238 if (!buffer_mapped(map_bh)) { 239 fully_mapped = 0; 240 if (first_hole == blocks_per_page) 241 first_hole = page_block; 242 page_block++; 243 block_in_file++; 244 continue; 245 } 246 247 /* some filesystems will copy data into the page during 248 * the get_block call, in which case we don't want to 249 * read it again. map_buffer_to_folio copies the data 250 * we just collected from get_block into the folio's buffers 251 * so read_folio doesn't have to repeat the get_block call 252 */ 253 if (buffer_uptodate(map_bh)) { 254 map_buffer_to_folio(folio, map_bh, page_block); 255 goto confused; 256 } 257 258 if (first_hole != blocks_per_page) 259 goto confused; /* hole -> non-hole */ 260 261 /* Contiguous blocks? 
*/ 262 if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1) 263 goto confused; 264 nblocks = map_bh->b_size >> blkbits; 265 for (relative_block = 0; ; relative_block++) { 266 if (relative_block == nblocks) { 267 clear_buffer_mapped(map_bh); 268 break; 269 } else if (page_block == blocks_per_page) 270 break; 271 blocks[page_block] = map_bh->b_blocknr+relative_block; 272 page_block++; 273 block_in_file++; 274 } 275 bdev = map_bh->b_bdev; 276 } 277 278 if (first_hole != blocks_per_page) { 279 folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE); 280 if (first_hole == 0) { 281 folio_mark_uptodate(folio); 282 folio_unlock(folio); 283 goto out; 284 } 285 } else if (fully_mapped) { 286 folio_set_mappedtodisk(folio); 287 } 288 289 /* 290 * This folio will go to BIO. Do we need to send this BIO off first? 291 */ 292 if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) 293 args->bio = mpage_bio_submit_read(args->bio); 294 295 alloc_new: 296 if (args->bio == NULL) { 297 args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, 298 gfp); 299 if (args->bio == NULL) 300 goto confused; 301 args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); 302 } 303 304 length = first_hole << blkbits; 305 if (!bio_add_folio(args->bio, folio, length, 0)) { 306 args->bio = mpage_bio_submit_read(args->bio); 307 goto alloc_new; 308 } 309 310 relative_block = block_in_file - args->first_logical_block; 311 nblocks = map_bh->b_size >> blkbits; 312 if ((buffer_boundary(map_bh) && relative_block == nblocks) || 313 (first_hole != blocks_per_page)) 314 args->bio = mpage_bio_submit_read(args->bio); 315 else 316 args->last_block_in_bio = blocks[blocks_per_page - 1]; 317 out: 318 return args->bio; 319 320 confused: 321 if (args->bio) 322 args->bio = mpage_bio_submit_read(args->bio); 323 if (!folio_test_uptodate(folio)) 324 block_read_full_folio(folio, args->get_block); 325 else 326 folio_unlock(folio); 327 goto out; 328 } 329 330 /** 331 * mpage_readahead - start reads 
 * against pages
 * @rac: Describes which pages to read.
 * @get_block: The filesystem's block mapper function.
 *
 * This function walks the pages and the blocks within each page, building and
 * emitting large BIOs.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 * BH_Boundary explanation:
 *
 * There is a problem.  The mpage read code assembles several pages, gets all
 * their disk mappings, and then submits them all.  That's fine, but obtaining
 * the disk mappings may require I/O.  Reads of indirect blocks, for example.
 *
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 *
 * 	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
 *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
 * So what we do is to allow the filesystem's get_block() function to set
 * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
 * after this one will require I/O against a block which is probably close to
 * this one.  So you should push what I/O you have currently accumulated.
 *
 * This all causes the disk requests to be issued in the correct order.
368 */ 369 void mpage_readahead(struct readahead_control *rac, get_block_t get_block) 370 { 371 struct folio *folio; 372 struct mpage_readpage_args args = { 373 .get_block = get_block, 374 .is_readahead = true, 375 }; 376 377 while ((folio = readahead_folio(rac))) { 378 prefetchw(&folio->flags); 379 args.folio = folio; 380 args.nr_pages = readahead_count(rac); 381 args.bio = do_mpage_readpage(&args); 382 } 383 if (args.bio) 384 mpage_bio_submit_read(args.bio); 385 } 386 EXPORT_SYMBOL(mpage_readahead); 387 388 /* 389 * This isn't called much at all 390 */ 391 int mpage_read_folio(struct folio *folio, get_block_t get_block) 392 { 393 struct mpage_readpage_args args = { 394 .folio = folio, 395 .nr_pages = 1, 396 .get_block = get_block, 397 }; 398 399 args.bio = do_mpage_readpage(&args); 400 if (args.bio) 401 mpage_bio_submit_read(args.bio); 402 return 0; 403 } 404 EXPORT_SYMBOL(mpage_read_folio); 405 406 /* 407 * Writing is not so simple. 408 * 409 * If the page has buffers then they will be used for obtaining the disk 410 * mapping. We only support pages which are fully mapped-and-dirty, with a 411 * special case for pages which are unmapped at the end: end-of-file. 412 * 413 * If the page has no buffers (preferred) then the page is mapped here. 414 * 415 * If all blocks are found to be contiguous then the page can go into the 416 * BIO. Otherwise fall back to the mapping's writepage(). 417 * 418 * FIXME: This code wants an estimate of how many pages are still to be 419 * written, so it can intelligently allocate a suitably-sized BIO. For now, 420 * just allocate full-size (16-page) BIOs. 421 */ 422 423 struct mpage_data { 424 struct bio *bio; 425 sector_t last_block_in_bio; 426 get_block_t *get_block; 427 }; 428 429 /* 430 * We have our BIO, so we can now mark the buffers clean. Make 431 * sure to only clean buffers which we know we'll be writing. 
432 */ 433 static void clean_buffers(struct page *page, unsigned first_unmapped) 434 { 435 unsigned buffer_counter = 0; 436 struct buffer_head *bh, *head; 437 if (!page_has_buffers(page)) 438 return; 439 head = page_buffers(page); 440 bh = head; 441 442 do { 443 if (buffer_counter++ == first_unmapped) 444 break; 445 clear_buffer_dirty(bh); 446 bh = bh->b_this_page; 447 } while (bh != head); 448 449 /* 450 * we cannot drop the bh if the page is not uptodate or a concurrent 451 * read_folio would fail to serialize with the bh and it would read from 452 * disk before we reach the platter. 453 */ 454 if (buffer_heads_over_limit && PageUptodate(page)) 455 try_to_free_buffers(page_folio(page)); 456 } 457 458 /* 459 * For situations where we want to clean all buffers attached to a page. 460 * We don't need to calculate how many buffers are attached to the page, 461 * we just need to specify a number larger than the maximum number of buffers. 462 */ 463 void clean_page_buffers(struct page *page) 464 { 465 clean_buffers(page, ~0U); 466 } 467 468 static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, 469 void *data) 470 { 471 struct mpage_data *mpd = data; 472 struct bio *bio = mpd->bio; 473 struct address_space *mapping = folio->mapping; 474 struct inode *inode = mapping->host; 475 const unsigned blkbits = inode->i_blkbits; 476 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 477 sector_t last_block; 478 sector_t block_in_file; 479 sector_t blocks[MAX_BUF_PER_PAGE]; 480 unsigned page_block; 481 unsigned first_unmapped = blocks_per_page; 482 struct block_device *bdev = NULL; 483 int boundary = 0; 484 sector_t boundary_block = 0; 485 struct block_device *boundary_bdev = NULL; 486 size_t length; 487 struct buffer_head map_bh; 488 loff_t i_size = i_size_read(inode); 489 int ret = 0; 490 struct buffer_head *head = folio_buffers(folio); 491 492 if (head) { 493 struct buffer_head *bh = head; 494 495 /* If they're all mapped and dirty, do it */ 496 
page_block = 0; 497 do { 498 BUG_ON(buffer_locked(bh)); 499 if (!buffer_mapped(bh)) { 500 /* 501 * unmapped dirty buffers are created by 502 * block_dirty_folio -> mmapped data 503 */ 504 if (buffer_dirty(bh)) 505 goto confused; 506 if (first_unmapped == blocks_per_page) 507 first_unmapped = page_block; 508 continue; 509 } 510 511 if (first_unmapped != blocks_per_page) 512 goto confused; /* hole -> non-hole */ 513 514 if (!buffer_dirty(bh) || !buffer_uptodate(bh)) 515 goto confused; 516 if (page_block) { 517 if (bh->b_blocknr != blocks[page_block-1] + 1) 518 goto confused; 519 } 520 blocks[page_block++] = bh->b_blocknr; 521 boundary = buffer_boundary(bh); 522 if (boundary) { 523 boundary_block = bh->b_blocknr; 524 boundary_bdev = bh->b_bdev; 525 } 526 bdev = bh->b_bdev; 527 } while ((bh = bh->b_this_page) != head); 528 529 if (first_unmapped) 530 goto page_is_mapped; 531 532 /* 533 * Page has buffers, but they are all unmapped. The page was 534 * created by pagein or read over a hole which was handled by 535 * block_read_full_folio(). If this address_space is also 536 * using mpage_readahead then this can rarely happen. 537 */ 538 goto confused; 539 } 540 541 /* 542 * The page has no buffers: map it to disk 543 */ 544 BUG_ON(!folio_test_uptodate(folio)); 545 block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); 546 /* 547 * Whole page beyond EOF? Skip allocating blocks to avoid leaking 548 * space. 
549 */ 550 if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) 551 goto page_is_mapped; 552 last_block = (i_size - 1) >> blkbits; 553 map_bh.b_folio = folio; 554 for (page_block = 0; page_block < blocks_per_page; ) { 555 556 map_bh.b_state = 0; 557 map_bh.b_size = 1 << blkbits; 558 if (mpd->get_block(inode, block_in_file, &map_bh, 1)) 559 goto confused; 560 if (!buffer_mapped(&map_bh)) 561 goto confused; 562 if (buffer_new(&map_bh)) 563 clean_bdev_bh_alias(&map_bh); 564 if (buffer_boundary(&map_bh)) { 565 boundary_block = map_bh.b_blocknr; 566 boundary_bdev = map_bh.b_bdev; 567 } 568 if (page_block) { 569 if (map_bh.b_blocknr != blocks[page_block-1] + 1) 570 goto confused; 571 } 572 blocks[page_block++] = map_bh.b_blocknr; 573 boundary = buffer_boundary(&map_bh); 574 bdev = map_bh.b_bdev; 575 if (block_in_file == last_block) 576 break; 577 block_in_file++; 578 } 579 BUG_ON(page_block == 0); 580 581 first_unmapped = page_block; 582 583 page_is_mapped: 584 /* Don't bother writing beyond EOF, truncate will discard the folio */ 585 if (folio_pos(folio) >= i_size) 586 goto confused; 587 length = folio_size(folio); 588 if (folio_pos(folio) + length > i_size) { 589 /* 590 * The page straddles i_size. It must be zeroed out on each 591 * and every writepage invocation because it may be mmapped. 592 * "A file is mapped in multiples of the page size. For a file 593 * that is not a multiple of the page size, the remaining memory 594 * is zeroed when mapped, and writes to that region are not 595 * written out to the file." 596 */ 597 length = i_size - folio_pos(folio); 598 folio_zero_segment(folio, length, folio_size(folio)); 599 } 600 601 /* 602 * This page will go to BIO. Do we need to send this BIO off first? 
603 */ 604 if (bio && mpd->last_block_in_bio != blocks[0] - 1) 605 bio = mpage_bio_submit_write(bio); 606 607 alloc_new: 608 if (bio == NULL) { 609 bio = bio_alloc(bdev, BIO_MAX_VECS, 610 REQ_OP_WRITE | wbc_to_write_flags(wbc), 611 GFP_NOFS); 612 bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); 613 wbc_init_bio(wbc, bio); 614 } 615 616 /* 617 * Must try to add the page before marking the buffer clean or 618 * the confused fail path above (OOM) will be very confused when 619 * it finds all bh marked clean (i.e. it will not write anything) 620 */ 621 wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); 622 length = first_unmapped << blkbits; 623 if (!bio_add_folio(bio, folio, length, 0)) { 624 bio = mpage_bio_submit_write(bio); 625 goto alloc_new; 626 } 627 628 clean_buffers(&folio->page, first_unmapped); 629 630 BUG_ON(folio_test_writeback(folio)); 631 folio_start_writeback(folio); 632 folio_unlock(folio); 633 if (boundary || (first_unmapped != blocks_per_page)) { 634 bio = mpage_bio_submit_write(bio); 635 if (boundary_block) { 636 write_boundary_block(boundary_bdev, 637 boundary_block, 1 << blkbits); 638 } 639 } else { 640 mpd->last_block_in_bio = blocks[blocks_per_page - 1]; 641 } 642 goto out; 643 644 confused: 645 if (bio) 646 bio = mpage_bio_submit_write(bio); 647 648 /* 649 * The caller has a ref on the inode, so *mapping is stable 650 */ 651 ret = block_write_full_page(&folio->page, mpd->get_block, wbc); 652 mapping_set_error(mapping, ret); 653 out: 654 mpd->bio = bio; 655 return ret; 656 } 657 658 /** 659 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 660 * @mapping: address space structure to write 661 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 662 * @get_block: the filesystem's block mapper function. 663 * 664 * This is a library function, which implements the writepages() 665 * address_space_operation. 
666 */ 667 int 668 mpage_writepages(struct address_space *mapping, 669 struct writeback_control *wbc, get_block_t get_block) 670 { 671 struct mpage_data mpd = { 672 .get_block = get_block, 673 }; 674 struct blk_plug plug; 675 int ret; 676 677 blk_start_plug(&plug); 678 ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); 679 if (mpd.bio) 680 mpage_bio_submit_write(mpd.bio); 681 blk_finish_plug(&plug); 682 return ret; 683 } 684 EXPORT_SYMBOL(mpage_writepages); 685