1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * fs/mpage.c 4 * 5 * Copyright (C) 2002, Linus Torvalds. 6 * 7 * Contains functions related to preparing and submitting BIOs which contain 8 * multiple pagecache pages. 9 * 10 * 15May2002 Andrew Morton 11 * Initial version 12 * 27Jun2002 axboe@suse.de 13 * use bio_add_page() to build bio's just the right size 14 */ 15 16 #include <linux/kernel.h> 17 #include <linux/export.h> 18 #include <linux/mm.h> 19 #include <linux/kdev_t.h> 20 #include <linux/gfp.h> 21 #include <linux/bio.h> 22 #include <linux/fs.h> 23 #include <linux/buffer_head.h> 24 #include <linux/blkdev.h> 25 #include <linux/highmem.h> 26 #include <linux/prefetch.h> 27 #include <linux/mpage.h> 28 #include <linux/mm_inline.h> 29 #include <linux/writeback.h> 30 #include <linux/backing-dev.h> 31 #include <linux/pagevec.h> 32 #include "internal.h" 33 34 /* 35 * I/O completion handler for multipage BIOs. 36 * 37 * The mpage code never puts partial pages into a BIO (except for end-of-file). 38 * If a page does not map to a contiguous run of blocks then it simply falls 39 * back to block_read_full_folio(). 40 * 41 * Why is this? If a page's completion depends on a number of different BIOs 42 * which can complete in any order (or at the same time) then determining the 43 * status of that page is hard. See end_buffer_async_read() for the details. 44 * There is no point in duplicating all that complexity. 45 */ 46 static void mpage_read_end_io(struct bio *bio) 47 { 48 struct folio_iter fi; 49 int err = blk_status_to_errno(bio->bi_status); 50 51 bio_for_each_folio_all(fi, bio) 52 folio_end_read(fi.folio, err == 0); 53 54 bio_put(bio); 55 } 56 57 static void mpage_write_end_io(struct bio *bio) 58 { 59 struct folio_iter fi; 60 int err = blk_status_to_errno(bio->bi_status); 61 62 bio_for_each_folio_all(fi, bio) { 63 if (err) 64 mapping_set_error(fi.folio->mapping, err); 65 folio_end_writeback(fi.folio); 66 } 67 68 bio_put(bio); 69 } 70 71 static struct bio *mpage_bio_submit_read(struct bio *bio) 72 { 73 bio->bi_end_io = mpage_read_end_io; 74 guard_bio_eod(bio); 75 submit_bio(bio); 76 return NULL; 77 } 78 79 static struct bio *mpage_bio_submit_write(struct bio *bio) 80 { 81 bio->bi_end_io = mpage_write_end_io; 82 guard_bio_eod(bio); 83 submit_bio(bio); 84 return NULL; 85 } 86 87 /* 88 * support function for mpage_readahead. The fs supplied get_block might 89 * return an up to date buffer. This is used to map that buffer into 90 * the page, which allows read_folio to avoid triggering a duplicate call 91 * to get_block. 92 * 93 * The idea is to avoid adding buffers to pages that don't already have 94 * them. So when the buffer is up to date and the page size == block size, 95 * this marks the page up to date instead of adding new buffers. 96 */ 97 static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, 98 int page_block) 99 { 100 struct inode *inode = folio->mapping->host; 101 struct buffer_head *page_bh, *head; 102 int block = 0; 103 104 head = folio_buffers(folio); 105 if (!head) { 106 /* 107 * don't make any buffers if there is only one buffer on 108 * the folio and the folio just needs to be set up to date 109 */ 110 if (inode->i_blkbits == PAGE_SHIFT && 111 buffer_uptodate(bh)) { 112 folio_mark_uptodate(folio); 113 return; 114 } 115 head = create_empty_buffers(folio, i_blocksize(inode), 0); 116 } 117 118 page_bh = head; 119 do { 120 if (block == page_block) { 121 page_bh->b_state = bh->b_state; 122 page_bh->b_bdev = bh->b_bdev; 123 page_bh->b_blocknr = bh->b_blocknr; 124 break; 125 } 126 page_bh = page_bh->b_this_page; 127 block++; 128 } while (page_bh != head); 129 } 130 131 struct mpage_readpage_args { 132 struct bio *bio; 133 struct folio *folio; 134 unsigned int nr_pages; 135 bool is_readahead; 136 sector_t last_block_in_bio; 137 struct buffer_head map_bh; 138 unsigned long first_logical_block; 139 get_block_t *get_block; 140 }; 141 142 /* 143 * This is the worker routine which does all the work of mapping the disk 144 * blocks and constructs largest possible bios, submits them for IO if the 145 * blocks are not contiguous on the disk. 146 * 147 * We pass a buffer_head back and forth and use its buffer_mapped() flag to 148 * represent the validity of its disk mapping and to decide when to do the next 149 * get_block() call. 150 */ 151 static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) 152 { 153 struct folio *folio = args->folio; 154 struct inode *inode = folio->mapping->host; 155 const unsigned blkbits = inode->i_blkbits; 156 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 157 const unsigned blocksize = 1 << blkbits; 158 struct buffer_head *map_bh = &args->map_bh; 159 sector_t block_in_file; 160 sector_t last_block; 161 sector_t last_block_in_file; 162 sector_t first_block; 163 unsigned page_block; 164 unsigned first_hole = blocks_per_page; 165 struct block_device *bdev = NULL; 166 int length; 167 int fully_mapped = 1; 168 blk_opf_t opf = REQ_OP_READ; 169 unsigned nblocks; 170 unsigned relative_block; 171 gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); 172 173 /* MAX_BUF_PER_PAGE, for example */ 174 VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 175 176 if (args->is_readahead) { 177 opf |= REQ_RAHEAD; 178 gfp |= __GFP_NORETRY | __GFP_NOWARN; 179 } 180 181 if (folio_buffers(folio)) 182 goto confused; 183 184 block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); 185 last_block = block_in_file + args->nr_pages * blocks_per_page; 186 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; 187 if (last_block > last_block_in_file) 188 last_block = last_block_in_file; 189 page_block = 0; 190 191 /* 192 * Map blocks using the result from the previous get_blocks call first. 193 */ 194 nblocks = map_bh->b_size >> blkbits; 195 if (buffer_mapped(map_bh) && 196 block_in_file > args->first_logical_block && 197 block_in_file < (args->first_logical_block + nblocks)) { 198 unsigned map_offset = block_in_file - args->first_logical_block; 199 unsigned last = nblocks - map_offset; 200 201 first_block = map_bh->b_blocknr + map_offset; 202 for (relative_block = 0; ; relative_block++) { 203 if (relative_block == last) { 204 clear_buffer_mapped(map_bh); 205 break; 206 } 207 if (page_block == blocks_per_page) 208 break; 209 page_block++; 210 block_in_file++; 211 } 212 bdev = map_bh->b_bdev; 213 } 214 215 /* 216 * Then do more get_blocks calls until we are done with this folio. 217 */ 218 map_bh->b_folio = folio; 219 while (page_block < blocks_per_page) { 220 map_bh->b_state = 0; 221 map_bh->b_size = 0; 222 223 if (block_in_file < last_block) { 224 map_bh->b_size = (last_block-block_in_file) << blkbits; 225 if (args->get_block(inode, block_in_file, map_bh, 0)) 226 goto confused; 227 args->first_logical_block = block_in_file; 228 } 229 230 if (!buffer_mapped(map_bh)) { 231 fully_mapped = 0; 232 if (first_hole == blocks_per_page) 233 first_hole = page_block; 234 page_block++; 235 block_in_file++; 236 continue; 237 } 238 239 /* some filesystems will copy data into the page during 240 * the get_block call, in which case we don't want to 241 * read it again. map_buffer_to_folio copies the data 242 * we just collected from get_block into the folio's buffers 243 * so read_folio doesn't have to repeat the get_block call 244 */ 245 if (buffer_uptodate(map_bh)) { 246 map_buffer_to_folio(folio, map_bh, page_block); 247 goto confused; 248 } 249 250 if (first_hole != blocks_per_page) 251 goto confused; /* hole -> non-hole */ 252 253 /* Contiguous blocks? */ 254 if (!page_block) 255 first_block = map_bh->b_blocknr; 256 else if (first_block + page_block != map_bh->b_blocknr) 257 goto confused; 258 nblocks = map_bh->b_size >> blkbits; 259 for (relative_block = 0; ; relative_block++) { 260 if (relative_block == nblocks) { 261 clear_buffer_mapped(map_bh); 262 break; 263 } else if (page_block == blocks_per_page) 264 break; 265 page_block++; 266 block_in_file++; 267 } 268 bdev = map_bh->b_bdev; 269 } 270 271 if (first_hole != blocks_per_page) { 272 folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE); 273 if (first_hole == 0) { 274 folio_mark_uptodate(folio); 275 folio_unlock(folio); 276 goto out; 277 } 278 } else if (fully_mapped) { 279 folio_set_mappedtodisk(folio); 280 } 281 282 /* 283 * This folio will go to BIO. Do we need to send this BIO off first? 284 */ 285 if (args->bio && (args->last_block_in_bio != first_block - 1)) 286 args->bio = mpage_bio_submit_read(args->bio); 287 288 alloc_new: 289 if (args->bio == NULL) { 290 args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, 291 gfp); 292 if (args->bio == NULL) 293 goto confused; 294 args->bio->bi_iter.bi_sector = first_block << (blkbits - 9); 295 } 296 297 length = first_hole << blkbits; 298 if (!bio_add_folio(args->bio, folio, length, 0)) { 299 args->bio = mpage_bio_submit_read(args->bio); 300 goto alloc_new; 301 } 302 303 relative_block = block_in_file - args->first_logical_block; 304 nblocks = map_bh->b_size >> blkbits; 305 if ((buffer_boundary(map_bh) && relative_block == nblocks) || 306 (first_hole != blocks_per_page)) 307 args->bio = mpage_bio_submit_read(args->bio); 308 else 309 args->last_block_in_bio = first_block + blocks_per_page - 1; 310 out: 311 return args->bio; 312 313 confused: 314 if (args->bio) 315 args->bio = mpage_bio_submit_read(args->bio); 316 if (!folio_test_uptodate(folio)) 317 block_read_full_folio(folio, args->get_block); 318 else 319 folio_unlock(folio); 320 goto out; 321 } 322 323 /** 324 * mpage_readahead - start reads against pages 325 * @rac: Describes which pages to read. 326 * @get_block: The filesystem's block mapper function. 327 * 328 * This function walks the pages and the blocks within each page, building and 329 * emitting large BIOs. 330 * 331 * If anything unusual happens, such as: 332 * 333 * - encountering a page which has buffers 334 * - encountering a page which has a non-hole after a hole 335 * - encountering a page with non-contiguous blocks 336 * 337 * then this code just gives up and calls the buffer_head-based read function. 338 * It does handle a page which has holes at the end - that is a common case: 339 * the end-of-file on blocksize < PAGE_SIZE setups. 340 * 341 * BH_Boundary explanation: 342 * 343 * There is a problem. The mpage read code assembles several pages, gets all 344 * their disk mappings, and then submits them all. That's fine, but obtaining 345 * the disk mappings may require I/O. Reads of indirect blocks, for example. 346 * 347 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be 348 * submitted in the following order: 349 * 350 * 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 351 * 352 * because the indirect block has to be read to get the mappings of blocks 353 * 13,14,15,16. Obviously, this impacts performance. 354 * 355 * So what we do it to allow the filesystem's get_block() function to set 356 * BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block 357 * after this one will require I/O against a block which is probably close to 358 * this one. So you should push what I/O you have currently accumulated. 359 * 360 * This all causes the disk requests to be issued in the correct order. 361 */ 362 void mpage_readahead(struct readahead_control *rac, get_block_t get_block) 363 { 364 struct folio *folio; 365 struct mpage_readpage_args args = { 366 .get_block = get_block, 367 .is_readahead = true, 368 }; 369 370 while ((folio = readahead_folio(rac))) { 371 prefetchw(&folio->flags); 372 args.folio = folio; 373 args.nr_pages = readahead_count(rac); 374 args.bio = do_mpage_readpage(&args); 375 } 376 if (args.bio) 377 mpage_bio_submit_read(args.bio); 378 } 379 EXPORT_SYMBOL(mpage_readahead); 380 381 /* 382 * This isn't called much at all 383 */ 384 int mpage_read_folio(struct folio *folio, get_block_t get_block) 385 { 386 struct mpage_readpage_args args = { 387 .folio = folio, 388 .nr_pages = 1, 389 .get_block = get_block, 390 }; 391 392 args.bio = do_mpage_readpage(&args); 393 if (args.bio) 394 mpage_bio_submit_read(args.bio); 395 return 0; 396 } 397 EXPORT_SYMBOL(mpage_read_folio); 398 399 /* 400 * Writing is not so simple. 401 * 402 * If the page has buffers then they will be used for obtaining the disk 403 * mapping. We only support pages which are fully mapped-and-dirty, with a 404 * special case for pages which are unmapped at the end: end-of-file. 405 * 406 * If the page has no buffers (preferred) then the page is mapped here. 407 * 408 * If all blocks are found to be contiguous then the page can go into the 409 * BIO. Otherwise fall back to the mapping's writepage(). 410 * 411 * FIXME: This code wants an estimate of how many pages are still to be 412 * written, so it can intelligently allocate a suitably-sized BIO. For now, 413 * just allocate full-size (16-page) BIOs. 414 */ 415 416 struct mpage_data { 417 struct bio *bio; 418 sector_t last_block_in_bio; 419 get_block_t *get_block; 420 }; 421 422 /* 423 * We have our BIO, so we can now mark the buffers clean. Make 424 * sure to only clean buffers which we know we'll be writing. 425 */ 426 static void clean_buffers(struct folio *folio, unsigned first_unmapped) 427 { 428 unsigned buffer_counter = 0; 429 struct buffer_head *bh, *head = folio_buffers(folio); 430 431 if (!head) 432 return; 433 bh = head; 434 435 do { 436 if (buffer_counter++ == first_unmapped) 437 break; 438 clear_buffer_dirty(bh); 439 bh = bh->b_this_page; 440 } while (bh != head); 441 442 /* 443 * we cannot drop the bh if the page is not uptodate or a concurrent 444 * read_folio would fail to serialize with the bh and it would read from 445 * disk before we reach the platter. 446 */ 447 if (buffer_heads_over_limit && folio_test_uptodate(folio)) 448 try_to_free_buffers(folio); 449 } 450 451 static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, 452 void *data) 453 { 454 struct mpage_data *mpd = data; 455 struct bio *bio = mpd->bio; 456 struct address_space *mapping = folio->mapping; 457 struct inode *inode = mapping->host; 458 const unsigned blkbits = inode->i_blkbits; 459 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 460 sector_t last_block; 461 sector_t block_in_file; 462 sector_t first_block; 463 unsigned page_block; 464 unsigned first_unmapped = blocks_per_page; 465 struct block_device *bdev = NULL; 466 int boundary = 0; 467 sector_t boundary_block = 0; 468 struct block_device *boundary_bdev = NULL; 469 size_t length; 470 struct buffer_head map_bh; 471 loff_t i_size = i_size_read(inode); 472 int ret = 0; 473 struct buffer_head *head = folio_buffers(folio); 474 475 if (head) { 476 struct buffer_head *bh = head; 477 478 /* If they're all mapped and dirty, do it */ 479 page_block = 0; 480 do { 481 BUG_ON(buffer_locked(bh)); 482 if (!buffer_mapped(bh)) { 483 /* 484 * unmapped dirty buffers are created by 485 * block_dirty_folio -> mmapped data 486 */ 487 if (buffer_dirty(bh)) 488 goto confused; 489 if (first_unmapped == blocks_per_page) 490 first_unmapped = page_block; 491 continue; 492 } 493 494 if (first_unmapped != blocks_per_page) 495 goto confused; /* hole -> non-hole */ 496 497 if (!buffer_dirty(bh) || !buffer_uptodate(bh)) 498 goto confused; 499 if (page_block) { 500 if (bh->b_blocknr != first_block + page_block) 501 goto confused; 502 } else { 503 first_block = bh->b_blocknr; 504 } 505 page_block++; 506 boundary = buffer_boundary(bh); 507 if (boundary) { 508 boundary_block = bh->b_blocknr; 509 boundary_bdev = bh->b_bdev; 510 } 511 bdev = bh->b_bdev; 512 } while ((bh = bh->b_this_page) != head); 513 514 if (first_unmapped) 515 goto page_is_mapped; 516 517 /* 518 * Page has buffers, but they are all unmapped. The page was 519 * created by pagein or read over a hole which was handled by 520 * block_read_full_folio(). If this address_space is also 521 * using mpage_readahead then this can rarely happen. 522 */ 523 goto confused; 524 } 525 526 /* 527 * The page has no buffers: map it to disk 528 */ 529 BUG_ON(!folio_test_uptodate(folio)); 530 block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); 531 /* 532 * Whole page beyond EOF? Skip allocating blocks to avoid leaking 533 * space. 534 */ 535 if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) 536 goto page_is_mapped; 537 last_block = (i_size - 1) >> blkbits; 538 map_bh.b_folio = folio; 539 for (page_block = 0; page_block < blocks_per_page; ) { 540 541 map_bh.b_state = 0; 542 map_bh.b_size = 1 << blkbits; 543 if (mpd->get_block(inode, block_in_file, &map_bh, 1)) 544 goto confused; 545 if (!buffer_mapped(&map_bh)) 546 goto confused; 547 if (buffer_new(&map_bh)) 548 clean_bdev_bh_alias(&map_bh); 549 if (buffer_boundary(&map_bh)) { 550 boundary_block = map_bh.b_blocknr; 551 boundary_bdev = map_bh.b_bdev; 552 } 553 if (page_block) { 554 if (map_bh.b_blocknr != first_block + page_block) 555 goto confused; 556 } else { 557 first_block = map_bh.b_blocknr; 558 } 559 page_block++; 560 boundary = buffer_boundary(&map_bh); 561 bdev = map_bh.b_bdev; 562 if (block_in_file == last_block) 563 break; 564 block_in_file++; 565 } 566 BUG_ON(page_block == 0); 567 568 first_unmapped = page_block; 569 570 page_is_mapped: 571 /* Don't bother writing beyond EOF, truncate will discard the folio */ 572 if (folio_pos(folio) >= i_size) 573 goto confused; 574 length = folio_size(folio); 575 if (folio_pos(folio) + length > i_size) { 576 /* 577 * The page straddles i_size. It must be zeroed out on each 578 * and every writepage invocation because it may be mmapped. 579 * "A file is mapped in multiples of the page size. For a file 580 * that is not a multiple of the page size, the remaining memory 581 * is zeroed when mapped, and writes to that region are not 582 * written out to the file." 583 */ 584 length = i_size - folio_pos(folio); 585 folio_zero_segment(folio, length, folio_size(folio)); 586 } 587 588 /* 589 * This page will go to BIO. Do we need to send this BIO off first? 590 */ 591 if (bio && mpd->last_block_in_bio != first_block - 1) 592 bio = mpage_bio_submit_write(bio); 593 594 alloc_new: 595 if (bio == NULL) { 596 bio = bio_alloc(bdev, BIO_MAX_VECS, 597 REQ_OP_WRITE | wbc_to_write_flags(wbc), 598 GFP_NOFS); 599 bio->bi_iter.bi_sector = first_block << (blkbits - 9); 600 wbc_init_bio(wbc, bio); 601 bio->bi_write_hint = inode->i_write_hint; 602 } 603 604 /* 605 * Must try to add the page before marking the buffer clean or 606 * the confused fail path above (OOM) will be very confused when 607 * it finds all bh marked clean (i.e. it will not write anything) 608 */ 609 wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); 610 length = first_unmapped << blkbits; 611 if (!bio_add_folio(bio, folio, length, 0)) { 612 bio = mpage_bio_submit_write(bio); 613 goto alloc_new; 614 } 615 616 clean_buffers(folio, first_unmapped); 617 618 BUG_ON(folio_test_writeback(folio)); 619 folio_start_writeback(folio); 620 folio_unlock(folio); 621 if (boundary || (first_unmapped != blocks_per_page)) { 622 bio = mpage_bio_submit_write(bio); 623 if (boundary_block) { 624 write_boundary_block(boundary_bdev, 625 boundary_block, 1 << blkbits); 626 } 627 } else { 628 mpd->last_block_in_bio = first_block + blocks_per_page - 1; 629 } 630 goto out; 631 632 confused: 633 if (bio) 634 bio = mpage_bio_submit_write(bio); 635 636 /* 637 * The caller has a ref on the inode, so *mapping is stable 638 */ 639 ret = block_write_full_folio(folio, wbc, mpd->get_block); 640 mapping_set_error(mapping, ret); 641 out: 642 mpd->bio = bio; 643 return ret; 644 } 645 646 /** 647 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 648 * @mapping: address space structure to write 649 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 650 * @get_block: the filesystem's block mapper function. 651 * 652 * This is a library function, which implements the writepages() 653 * address_space_operation. 654 */ 655 int 656 mpage_writepages(struct address_space *mapping, 657 struct writeback_control *wbc, get_block_t get_block) 658 { 659 struct mpage_data mpd = { 660 .get_block = get_block, 661 }; 662 struct blk_plug plug; 663 int ret; 664 665 blk_start_plug(&plug); 666 ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); 667 if (mpd.bio) 668 mpage_bio_submit_write(mpd.bio); 669 blk_finish_plug(&plug); 670 return ret; 671 } 672 EXPORT_SYMBOL(mpage_writepages); 673