1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2010 Red Hat, Inc. 4 * Copyright (C) 2016-2023 Christoph Hellwig. 5 */ 6 #include <linux/iomap.h> 7 #include <linux/buffer_head.h> 8 #include <linux/writeback.h> 9 #include <linux/swap.h> 10 #include <linux/migrate.h> 11 #include <linux/fserror.h> 12 #include <linux/fsverity.h> 13 #include "internal.h" 14 #include "trace.h" 15 16 #include "../internal.h" 17 18 /* 19 * Structure allocated for each folio to track per-block uptodate, dirty state 20 * and I/O completions. 21 */ 22 struct iomap_folio_state { 23 spinlock_t state_lock; 24 unsigned int read_bytes_pending; 25 atomic_t write_bytes_pending; 26 27 /* 28 * Each block has two bits in this bitmap: 29 * Bits [0..blocks_per_folio) has the uptodate status. 30 * Bits [b_p_f...(2*b_p_f)) has the dirty status. 31 */ 32 unsigned long state[]; 33 }; 34 35 static inline bool ifs_is_fully_uptodate(struct folio *folio, 36 struct iomap_folio_state *ifs) 37 { 38 struct inode *inode = folio->mapping->host; 39 40 return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); 41 } 42 43 /* 44 * Find the next uptodate block in the folio. end_blk is inclusive. 45 * If no uptodate block is found, this will return end_blk + 1. 46 */ 47 static unsigned ifs_next_uptodate_block(struct folio *folio, 48 unsigned start_blk, unsigned end_blk) 49 { 50 struct iomap_folio_state *ifs = folio->private; 51 52 return find_next_bit(ifs->state, end_blk + 1, start_blk); 53 } 54 55 /* 56 * Find the next non-uptodate block in the folio. end_blk is inclusive. 57 * If no non-uptodate block is found, this will return end_blk + 1. 
58 */ 59 static unsigned ifs_next_nonuptodate_block(struct folio *folio, 60 unsigned start_blk, unsigned end_blk) 61 { 62 struct iomap_folio_state *ifs = folio->private; 63 64 return find_next_zero_bit(ifs->state, end_blk + 1, start_blk); 65 } 66 67 static bool ifs_set_range_uptodate(struct folio *folio, 68 struct iomap_folio_state *ifs, size_t off, size_t len) 69 { 70 struct inode *inode = folio->mapping->host; 71 unsigned int first_blk = off >> inode->i_blkbits; 72 unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; 73 unsigned int nr_blks = last_blk - first_blk + 1; 74 75 bitmap_set(ifs->state, first_blk, nr_blks); 76 return ifs_is_fully_uptodate(folio, ifs); 77 } 78 79 static void iomap_set_range_uptodate(struct folio *folio, size_t off, 80 size_t len) 81 { 82 struct iomap_folio_state *ifs = folio->private; 83 unsigned long flags; 84 bool mark_uptodate = true; 85 86 if (folio_test_uptodate(folio)) 87 return; 88 89 if (ifs) { 90 spin_lock_irqsave(&ifs->state_lock, flags); 91 /* 92 * If a read with bytes pending is in progress, we must not call 93 * folio_mark_uptodate(). The read completion path 94 * (iomap_read_end()) will call folio_end_read(), which uses XOR 95 * semantics to set the uptodate bit. If we set it here, the XOR 96 * in folio_end_read() will clear it, leaving the folio not 97 * uptodate. 98 */ 99 mark_uptodate = ifs_set_range_uptodate(folio, ifs, off, len) && 100 !ifs->read_bytes_pending; 101 spin_unlock_irqrestore(&ifs->state_lock, flags); 102 } 103 104 if (mark_uptodate) 105 folio_mark_uptodate(folio); 106 } 107 108 /* 109 * Find the next dirty block in the folio. end_blk is inclusive. 110 * If no dirty block is found, this will return end_blk + 1. 
 */
static unsigned ifs_next_dirty_block(struct folio *folio,
		unsigned start_blk, unsigned end_blk)
{
	struct iomap_folio_state *ifs = folio->private;
	struct inode *inode = folio->mapping->host;
	unsigned int blks = i_blocks_per_folio(inode, folio);

	/*
	 * The dirty bits follow the uptodate bits in ifs->state, so search
	 * with every index shifted up by blks and shift the result back down.
	 */
	return find_next_bit(ifs->state, blks + end_blk + 1,
			blks + start_blk) - blks;
}

/*
 * Find the next clean block in the folio. end_blk is inclusive.
 * If no clean block is found, this will return end_blk + 1.
 */
static unsigned ifs_next_clean_block(struct folio *folio,
		unsigned start_blk, unsigned end_blk)
{
	struct iomap_folio_state *ifs = folio->private;
	struct inode *inode = folio->mapping->host;
	unsigned int blks = i_blocks_per_folio(inode, folio);

	/* Same index shift into the dirty half of the bitmap as above. */
	return find_next_zero_bit(ifs->state, blks + end_blk + 1,
			blks + start_blk) - blks;
}

/*
 * Find the first run of contiguous dirty blocks of @folio that starts at or
 * after *range_start and ends before range_end.
 *
 * Returns the length of that run in bytes and updates *range_start to the file
 * position of its first block.  Returns 0 and leaves *range_start untouched
 * when no dirty block is found in the range.
 */
static unsigned ifs_find_dirty_range(struct folio *folio,
		struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
{
	struct inode *inode = folio->mapping->host;
	unsigned start_blk =
		offset_in_folio(folio, *range_start) >> inode->i_blkbits;
	/*
	 * end_blk is inclusive.  A block index of 0 for range_end is taken by
	 * min_not_zero() to mean "up to the last block of the folio".
	 */
	unsigned end_blk = min_not_zero(
		offset_in_folio(folio, range_end) >> inode->i_blkbits,
		i_blocks_per_folio(inode, folio)) - 1;
	unsigned nblks;

	start_blk = ifs_next_dirty_block(folio, start_blk, end_blk);
	if (start_blk > end_blk)
		return 0;
	if (start_blk == end_blk)
		nblks = 1;
	else
		nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) -
				start_blk;

	*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
	return nblks << inode->i_blkbits;
}

/*
 * Folio-level front end for ifs_find_dirty_range().  An empty range yields 0;
 * a folio without per-block state reports the whole of
 * [*range_start, range_end) as dirty.
 */
static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
		u64 range_end)
{
	struct iomap_folio_state *ifs = folio->private;

	if (*range_start >= range_end)
		return 0;

	if (ifs)
		return ifs_find_dirty_range(folio, ifs, range_start, range_end);
	return range_end - *range_start;
}

/*
 * Clear the dirty bit of every block touched by the byte range
 * [off, off + len) of @folio, under ifs->state_lock.
 */
static void ifs_clear_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
	unsigned int first_blk = (off >> inode->i_blkbits);
	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
	unsigned int nr_blks = last_blk - first_blk + 1;
	unsigned long flags;

	spin_lock_irqsave(&ifs->state_lock, flags);
	/* + blks_per_folio selects the dirty half of the bitmap. */
	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
	spin_unlock_irqrestore(&ifs->state_lock, flags);
}

/* Clear per-block dirty state for the range; a no-op without an ifs. */
static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (ifs)
		ifs_clear_range_dirty(folio, ifs, off, len);
}

/*
 * Set the dirty bit of every block touched by the byte range
 * [off, off + len) of @folio, under ifs->state_lock.
 */
static void ifs_set_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
	unsigned int first_blk = (off >> inode->i_blkbits);
	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
	unsigned int nr_blks = last_blk - first_blk + 1;
	unsigned long flags;

	spin_lock_irqsave(&ifs->state_lock, flags);
	/* + blks_per_folio selects the dirty half of the bitmap. */
	bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
	spin_unlock_irqrestore(&ifs->state_lock, flags);
}

/* Set per-block dirty state for the range; a no-op without an ifs. */
static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (ifs)
		ifs_set_range_dirty(folio, ifs, off, len);
}

/*
 * Attach per-block state to @folio if it does not have one yet.
 *
 * Returns the ifs already attached to the folio if there is one, NULL if the
 * folio holds at most one block (no per-block tracking needed) or if the
 * allocation failed, and the newly attached ifs otherwise.  IOMAP_NOWAIT in
 * @flags selects GFP_NOWAIT; every other caller allocates with
 * GFP_NOFS | __GFP_NOFAIL.
 */
static struct iomap_folio_state *ifs_alloc(struct inode *inode,
		struct folio *folio, unsigned int flags)
{
	struct iomap_folio_state *ifs = folio->private;
	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
	gfp_t gfp;

	if (ifs || nr_blocks <= 1)
		return ifs;

	if (flags & IOMAP_NOWAIT)
		gfp = GFP_NOWAIT;
	else
		gfp = GFP_NOFS | __GFP_NOFAIL;

	/*
	 * ifs->state tracks two sets of state flags when the
	 * filesystem block size is smaller than the folio size.
	 * The first state tracks per-block uptodate and the
	 * second tracks per-block dirty state.
	 */
	ifs = kzalloc_flex(*ifs, state, BITS_TO_LONGS(2 * nr_blocks), gfp);
	if (!ifs)
		return ifs;

	spin_lock_init(&ifs->state_lock);
	/* Seed the per-block bits from the folio-wide flags before attaching. */
	if (folio_test_uptodate(folio))
		bitmap_set(ifs->state, 0, nr_blocks);
	if (folio_test_dirty(folio))
		bitmap_set(ifs->state, nr_blocks, nr_blocks);
	folio_attach_private(folio, ifs);

	return ifs;
}

/*
 * Detach and free the per-block state of @folio, if any.  Warns once if reads
 * or writes are still accounted as pending, or if the per-block uptodate bits
 * disagree with the folio's uptodate flag.
 */
static void ifs_free(struct folio *folio)
{
	struct iomap_folio_state *ifs = folio_detach_private(folio);

	if (!ifs)
		return;
	WARN_ON_ONCE(ifs->read_bytes_pending != 0);
	WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
	WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
			folio_test_uptodate(folio));
	kfree(ifs);
}

/*
 * Calculate how many bytes to truncate based off the number of blocks to
 * truncate and the end position to start truncating from.
 *
 * When end_pos is not block aligned, the last of the truncated blocks only
 * contributes the bytes up to end_pos rather than a full block.
 */
static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits,
		unsigned blocks_truncated)
{
	unsigned block_size = 1 << block_bits;
	unsigned block_offset = end_pos & (block_size - 1);

	if (!block_offset)
		return blocks_truncated << block_bits;

	return ((blocks_truncated - 1) << block_bits) + block_offset;
}

/*
 * Calculate the range inside the folio that we actually need to read.
 */
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
		loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
{
	struct iomap_folio_state *ifs = folio->private;
	loff_t orig_pos = *pos;
	loff_t isize = i_size_read(inode);
	unsigned block_bits = inode->i_blkbits;
	unsigned block_size = (1 << block_bits);
	/* Candidate range: from *pos, clamped to the end of the folio. */
	size_t poff = offset_in_folio(folio, *pos);
	size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
	size_t orig_plen = plen;
	/* First and last (inclusive) block of the candidate range. */
	unsigned first = poff >> block_bits;
	unsigned last = (poff + plen - 1) >> block_bits;

	/*
	 * If the block size is smaller than the page size, we need to check the
	 * per-block uptodate status and adjust the offset and length if needed
	 * to avoid reading in already uptodate ranges.
	 */
	if (ifs) {
		unsigned int next, blocks_skipped;

		/* Skip leading blocks that are already uptodate. */
		next = ifs_next_nonuptodate_block(folio, first, last);
		blocks_skipped = next - first;

		if (blocks_skipped) {
			/*
			 * *pos may start part-way into the first block, so the
			 * first skipped block only accounts for its remainder.
			 */
			unsigned long block_offset = *pos & (block_size - 1);
			unsigned bytes_skipped =
				(blocks_skipped << block_bits) - block_offset;

			*pos += bytes_skipped;
			poff += bytes_skipped;
			plen -= bytes_skipped;
		}
		first = next;

		/* truncate len if we find any trailing uptodate block(s) */
		if (++next <= last) {
			next = ifs_next_uptodate_block(folio, next, last);
			if (next <= last) {
				plen -= iomap_bytes_to_truncate(*pos + plen,
						block_bits, last - next + 1);
				last = next - 1;
			}
		}
	}

	/*
	 * If the extent spans the block that contains the i_size, we need to
	 * handle both halves separately so that we properly zero data in the
	 * page cache for blocks that are entirely outside of i_size.
	 */
	if (orig_pos <= isize && orig_pos + orig_plen > isize) {
		/* Block that holds the last byte of the file. */
		unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;

		if (first <= end && last > end)
			plen -= iomap_bytes_to_truncate(*pos + plen, block_bits,
					last - end);
	}

	*offp = poff;
	*lenp = plen;
}

/*
 * Decide whether the block at @pos should be zero-filled in the page cache
 * rather than read from the source mapping of @iter.
 */
static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
		loff_t pos)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);

	/*
	 * If this block has not been written, there's nothing to read
	 */
	if (srcmap->type != IOMAP_MAPPED)
		return true;

	/*
	 * Newly allocated blocks have not been written
	 */
	if (srcmap->flags & IOMAP_F_NEW)
		return true;

	/*
	 * fsverity metadata is stored past i_size, we need to read it instead
	 * of zeroing
	 */
	if (srcmap->flags & IOMAP_F_FSVERITY)
		return false;

	/* Anything at or beyond i_size is zeroed instead of read. */
	return pos >= i_size_read(iter->inode);
}

/**
 * iomap_read_inline_data - copy inline data into the page cache
 * @iter: iteration structure
 * @folio: folio to copy to
 *
 * Copy the inline data in @iter into @folio and zero out the rest of the folio.
 * Only a single IOMAP_INLINE extent is allowed at the end of each file.
 * Returns zero for success to complete the read, or the usual negative errno.
387 */ 388 static int iomap_read_inline_data(const struct iomap_iter *iter, 389 struct folio *folio) 390 { 391 const struct iomap *iomap = iomap_iter_srcmap(iter); 392 size_t size = i_size_read(iter->inode) - iomap->offset; 393 size_t offset = offset_in_folio(folio, iomap->offset); 394 395 if (WARN_ON_ONCE(!iomap->inline_data)) 396 return -EIO; 397 398 if (folio_test_uptodate(folio)) 399 return 0; 400 401 if (WARN_ON_ONCE(size > iomap->length)) { 402 fserror_report_io(iter->inode, FSERR_BUFFERED_READ, 403 iomap->offset, size, -EIO, GFP_NOFS); 404 return -EIO; 405 } 406 if (offset > 0) 407 ifs_alloc(iter->inode, folio, iter->flags); 408 409 folio_fill_tail(folio, offset, iomap->inline_data, size); 410 iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset); 411 return 0; 412 } 413 414 void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, 415 int error) 416 { 417 struct iomap_folio_state *ifs = folio->private; 418 bool uptodate = !error; 419 bool finished = true; 420 421 if (error) 422 fserror_report_io(folio->mapping->host, FSERR_BUFFERED_READ, 423 folio_pos(folio) + off, len, error, 424 GFP_ATOMIC); 425 426 if (ifs) { 427 unsigned long flags; 428 429 spin_lock_irqsave(&ifs->state_lock, flags); 430 if (!error) 431 uptodate = ifs_set_range_uptodate(folio, ifs, off, len); 432 ifs->read_bytes_pending -= len; 433 finished = !ifs->read_bytes_pending; 434 spin_unlock_irqrestore(&ifs->state_lock, flags); 435 } 436 437 if (finished) 438 folio_end_read(folio, uptodate); 439 } 440 EXPORT_SYMBOL_GPL(iomap_finish_folio_read); 441 442 static void iomap_read_init(struct folio *folio) 443 { 444 struct iomap_folio_state *ifs = folio->private; 445 446 if (ifs) { 447 /* 448 * ifs->read_bytes_pending is used to track how many bytes are 449 * read in asynchronously by the IO helper. We need to track 450 * this so that we can know when the IO helper has finished 451 * reading in all the necessary ranges of the folio and can end 452 * the read. 
453 * 454 * Increase ->read_bytes_pending by the folio size to start. 455 * We'll subtract any uptodate / zeroed ranges that did not 456 * require IO in iomap_read_end() after we're done processing 457 * the folio. 458 * 459 * We do this because otherwise, we would have to increment 460 * ifs->read_bytes_pending every time a range in the folio needs 461 * to be read in, which can get expensive since the spinlock 462 * needs to be held whenever modifying ifs->read_bytes_pending. 463 */ 464 spin_lock_irq(&ifs->state_lock); 465 WARN_ON_ONCE(ifs->read_bytes_pending != 0); 466 ifs->read_bytes_pending = folio_size(folio); 467 spin_unlock_irq(&ifs->state_lock); 468 } 469 } 470 471 /* 472 * This ends IO if no bytes were submitted to an IO helper. 473 * 474 * Otherwise, this calibrates ifs->read_bytes_pending to represent only the 475 * submitted bytes (see comment in iomap_read_init()). If all bytes submitted 476 * have already been completed by the IO helper, then this will end the read. 477 * Else the IO helper will end the read after all submitted ranges have been 478 * read. 479 */ 480 static void iomap_read_end(struct folio *folio, size_t bytes_submitted) 481 { 482 struct iomap_folio_state *ifs = folio->private; 483 484 if (ifs) { 485 bool end_read, uptodate; 486 487 spin_lock_irq(&ifs->state_lock); 488 if (!ifs->read_bytes_pending) { 489 WARN_ON_ONCE(bytes_submitted); 490 spin_unlock_irq(&ifs->state_lock); 491 folio_unlock(folio); 492 return; 493 } 494 495 /* 496 * Subtract any bytes that were initially accounted to 497 * read_bytes_pending but skipped for IO. 498 */ 499 ifs->read_bytes_pending -= folio_size(folio) - bytes_submitted; 500 501 /* 502 * If !ifs->read_bytes_pending, this means all pending reads by 503 * the IO helper have already completed, which means we need to 504 * end the folio read here. If ifs->read_bytes_pending != 0, 505 * the IO helper will end the folio read. 
506 */ 507 end_read = !ifs->read_bytes_pending; 508 if (end_read) 509 uptodate = ifs_is_fully_uptodate(folio, ifs); 510 spin_unlock_irq(&ifs->state_lock); 511 if (end_read) 512 folio_end_read(folio, uptodate); 513 } else { 514 /* 515 * If a folio without an ifs is submitted to the IO helper, the 516 * read must be on the entire folio and the IO helper takes 517 * ownership of the folio. This means we should only enter 518 * iomap_read_end() for the !ifs case if no bytes were submitted 519 * to the IO helper, in which case we are responsible for 520 * unlocking the folio here. 521 */ 522 WARN_ON_ONCE(bytes_submitted); 523 folio_unlock(folio); 524 } 525 } 526 527 static int iomap_read_folio_iter(struct iomap_iter *iter, 528 struct iomap_read_folio_ctx *ctx, size_t *bytes_submitted) 529 { 530 const struct iomap *iomap = &iter->iomap; 531 loff_t pos = iter->pos; 532 loff_t length = iomap_length(iter); 533 struct folio *folio = ctx->cur_folio; 534 size_t folio_len = folio_size(folio); 535 struct iomap_folio_state *ifs; 536 size_t poff, plen; 537 loff_t pos_diff; 538 int ret; 539 540 if (iomap->type == IOMAP_INLINE) { 541 ret = iomap_read_inline_data(iter, folio); 542 if (ret) 543 return ret; 544 return iomap_iter_advance(iter, length); 545 } 546 547 ifs = ifs_alloc(iter->inode, folio, iter->flags); 548 549 length = min_t(loff_t, length, folio_len - offset_in_folio(folio, pos)); 550 while (length) { 551 iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, 552 &plen); 553 554 pos_diff = pos - iter->pos; 555 if (WARN_ON_ONCE(pos_diff + plen > length)) 556 return -EIO; 557 558 ret = iomap_iter_advance(iter, pos_diff); 559 if (ret) 560 return ret; 561 562 if (plen == 0) 563 return 0; 564 565 /* 566 * Handling of fsverity "holes". We hit this for two case: 567 * 1. No need to go further, the hole after fsverity 568 * descriptor is the end of the fsverity metadata. 569 * 570 * 2. This folio contains merkle tree blocks which need to be 571 * synthesized. 
If we already have fsverity info (ctx->vi) 572 * synthesize these blocks. 573 */ 574 if ((iomap->flags & IOMAP_F_FSVERITY) && 575 iomap->type == IOMAP_HOLE) { 576 if (ctx->vi) 577 fsverity_fill_zerohash(folio, poff, plen, 578 ctx->vi); 579 iomap_set_range_uptodate(folio, poff, plen); 580 } else if (iomap_block_needs_zeroing(iter, pos)) { 581 /* zero post-eof blocks as the page may be mapped */ 582 folio_zero_range(folio, poff, plen); 583 if (ctx->vi && 584 !fsverity_verify_blocks(ctx->vi, folio, plen, poff)) 585 return -EIO; 586 iomap_set_range_uptodate(folio, poff, plen); 587 } else { 588 if (!*bytes_submitted) 589 iomap_read_init(folio); 590 ret = ctx->ops->read_folio_range(iter, ctx, plen); 591 if (ret < 0) 592 fserror_report_io(iter->inode, 593 FSERR_BUFFERED_READ, pos, 594 plen, ret, GFP_NOFS); 595 if (ret) 596 return ret; 597 598 *bytes_submitted += plen; 599 /* 600 * Hand off folio ownership to the IO helper when: 601 * 1) The entire folio has been submitted for IO, or 602 * 2) There is no ifs attached to the folio 603 * 604 * Case (2) occurs when 1 << i_blkbits matches the folio 605 * size but the underlying filesystem or block device 606 * uses a smaller granularity for IO. 
607 */ 608 if (*bytes_submitted == folio_len || !ifs) 609 ctx->cur_folio = NULL; 610 } 611 612 ret = iomap_iter_advance(iter, plen); 613 if (ret) 614 return ret; 615 length -= pos_diff + plen; 616 pos = iter->pos; 617 } 618 return 0; 619 } 620 621 void iomap_read_folio(const struct iomap_ops *ops, 622 struct iomap_read_folio_ctx *ctx, void *private) 623 { 624 struct folio *folio = ctx->cur_folio; 625 struct iomap_iter iter = { 626 .inode = folio->mapping->host, 627 .pos = folio_pos(folio), 628 .len = folio_size(folio), 629 .private = private, 630 }; 631 size_t bytes_submitted = 0; 632 int ret; 633 634 trace_iomap_readpage(iter.inode, 1); 635 636 /* 637 * Fetch fsverity_info for both data and fsverity metadata, as iomap 638 * needs zeroed hash for merkle tree block synthesis 639 */ 640 ctx->vi = fsverity_get_info(iter.inode); 641 if (ctx->vi && iter.pos < i_size_read(iter.inode)) 642 fsverity_readahead(ctx->vi, folio->index, 643 folio_nr_pages(folio)); 644 645 while ((ret = iomap_iter(&iter, ops)) > 0) 646 iter.status = iomap_read_folio_iter(&iter, ctx, 647 &bytes_submitted); 648 649 if (ctx->read_ctx && ctx->ops->submit_read) 650 ctx->ops->submit_read(&iter, ctx); 651 652 if (ctx->cur_folio) 653 iomap_read_end(ctx->cur_folio, bytes_submitted); 654 } 655 EXPORT_SYMBOL_GPL(iomap_read_folio); 656 657 static int iomap_readahead_iter(struct iomap_iter *iter, 658 struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_submitted) 659 { 660 int ret; 661 662 while (iomap_length(iter)) { 663 if (ctx->cur_folio && 664 offset_in_folio(ctx->cur_folio, iter->pos) == 0) { 665 iomap_read_end(ctx->cur_folio, *cur_bytes_submitted); 666 ctx->cur_folio = NULL; 667 } 668 if (!ctx->cur_folio) { 669 ctx->cur_folio = readahead_folio(ctx->rac); 670 if (WARN_ON_ONCE(!ctx->cur_folio)) 671 return -EINVAL; 672 *cur_bytes_submitted = 0; 673 } 674 ret = iomap_read_folio_iter(iter, ctx, cur_bytes_submitted); 675 if (ret) 676 return ret; 677 } 678 679 return 0; 680 } 681 682 /** 683 * iomap_readahead 
- Attempt to read pages from a file. 684 * @ops: The operations vector for the filesystem. 685 * @ctx: The ctx used for issuing readahead. 686 * @private: The filesystem-specific information for issuing iomap_iter. 687 * 688 * This function is for filesystems to call to implement their readahead 689 * address_space operation. 690 * 691 * Context: The @ops callbacks may submit I/O (eg to read the addresses of 692 * blocks from disc), and may wait for it. The caller may be trying to 693 * access a different page, and so sleeping excessively should be avoided. 694 * It may allocate memory, but should avoid costly allocations. This 695 * function is called with memalloc_nofs set, so allocations will not cause 696 * the filesystem to be reentered. 697 */ 698 void iomap_readahead(const struct iomap_ops *ops, 699 struct iomap_read_folio_ctx *ctx, void *private) 700 { 701 struct readahead_control *rac = ctx->rac; 702 struct iomap_iter iter = { 703 .inode = rac->mapping->host, 704 .pos = readahead_pos(rac), 705 .len = readahead_length(rac), 706 .private = private, 707 }; 708 size_t cur_bytes_submitted; 709 710 trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); 711 712 /* 713 * Fetch fsverity_info for both data and fsverity metadata, as iomap 714 * needs zeroed hash for merkle tree block synthesis 715 */ 716 ctx->vi = fsverity_get_info(iter.inode); 717 if (ctx->vi && iter.pos < i_size_read(iter.inode)) 718 fsverity_readahead(ctx->vi, readahead_index(rac), 719 readahead_count(rac)); 720 721 while (iomap_iter(&iter, ops) > 0) 722 iter.status = iomap_readahead_iter(&iter, ctx, 723 &cur_bytes_submitted); 724 725 if (ctx->read_ctx && ctx->ops->submit_read) 726 ctx->ops->submit_read(&iter, ctx); 727 728 if (ctx->cur_folio) 729 iomap_read_end(ctx->cur_folio, cur_bytes_submitted); 730 } 731 EXPORT_SYMBOL_GPL(iomap_readahead); 732 733 /* 734 * iomap_is_partially_uptodate checks whether blocks within a folio are 735 * uptodate or not. 
 *
 * Returns true if all blocks which correspond to the specified part
 * of the folio are uptodate.
 */
bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
	struct iomap_folio_state *ifs = folio->private;
	struct inode *inode = folio->mapping->host;
	unsigned first, last;

	/* Without per-block state there is nothing finer than the folio flag. */
	if (!ifs)
		return false;

	/* Caller's range may extend past the end of this folio */
	count = min(folio_size(folio) - from, count);

	/* First and last blocks in range within folio */
	first = from >> inode->i_blkbits;
	last = (from + count - 1) >> inode->i_blkbits;

	return ifs_next_nonuptodate_block(folio, first, last) > last;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);

/**
 * iomap_get_folio - get a folio reference for writing
 * @iter: iteration structure
 * @pos: start offset of write
 * @len: Suggested size of folio to create.
 *
 * Returns a locked reference to the folio at @pos, or an error pointer if the
 * folio could not be obtained.
 */
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
{
	fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;

	/* Translate the iomap iteration flags into page cache lookup flags. */
	if (iter->flags & IOMAP_NOWAIT)
		fgp |= FGP_NOWAIT;
	if (iter->flags & IOMAP_DONTCACHE)
		fgp |= FGP_DONTCACHE;
	fgp |= fgf_set_order(len);

	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
			fgp, mapping_gfp_mask(iter->inode->i_mapping));
}
EXPORT_SYMBOL_GPL(iomap_get_folio);

/*
 * Release the per-block state of @folio.  Returns false, keeping the state,
 * if the folio is dirty; otherwise frees the state and returns true.
 * @gfp_flags is not used.
 */
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
			folio_size(folio));

	/*
	 * If the folio is dirty, we refuse to release our metadata because
	 * it may be partially dirty, and which blocks are dirty is only
	 * recorded in the per-block state.
	 *
	 * NOTE(review): this comment used to say "Once we track per-block
	 * dirty state, we can release the metadata if every block is dirty."
	 * struct iomap_folio_state now has per-block dirty bits, so that
	 * remark looks stale; confirm whether releasing a fully dirty folio's
	 * state is wanted before relaxing this check.
	 */
	if (folio_test_dirty(folio))
		return false;
	ifs_free(folio);
	return true;
}
EXPORT_SYMBOL_GPL(iomap_release_folio);

/*
 * Invalidate @len bytes at @offset of @folio.  Only an invalidation of the
 * whole folio has any effect here: the dirty flag is cancelled and the
 * per-block state is freed.
 */
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
	trace_iomap_invalidate_folio(folio->mapping->host,
					folio_pos(folio) + offset, len);

	/*
	 * If we're invalidating the entire folio, clear the dirty state
	 * from it and release it to avoid unnecessary buildup of the LRU.
	 */
	if (offset == 0 && len == folio_size(folio)) {
		WARN_ON_ONCE(folio_test_writeback(folio));
		folio_cancel_dirty(folio);
		ifs_free(folio);
	}
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);

/*
 * Mark the whole of @folio dirty: make sure per-block state exists where it
 * is needed, set every per-block dirty bit, then dirty the folio in @mapping.
 * Returns the result of filemap_dirty_folio().
 */
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	struct inode *inode = mapping->host;
	size_t len = folio_size(folio);

	ifs_alloc(inode, folio, 0);
	iomap_set_range_dirty(folio, 0, len);
	return filemap_dirty_folio(mapping, folio);
}
EXPORT_SYMBOL_GPL(iomap_dirty_folio);

/*
 * Clean up the page cache after a failed write of @len bytes at @pos.
 */
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size),
					 pos + len - 1);
}

/*
 * Prepare @folio for a write of @len bytes at iter->pos: every block that the
 * write only partially covers is zeroed or read in around the written range,
 * so that the block can be marked uptodate.
 *
 * Returns 0 on success, -EAGAIN if IOMAP_NOWAIT is set and the work would
 * need an allocation that failed or a read, or another negative errno.
 */
static int __iomap_write_begin(const struct iomap_iter *iter,
		const struct iomap_write_ops *write_ops, size_t len,
		struct folio *folio)
{
	struct iomap_folio_state *ifs;
	loff_t pos = iter->pos;
	loff_t block_size = i_blocksize(iter->inode);
	/* Block-aligned range that encloses the write. */
	loff_t block_start = round_down(pos, block_size);
	loff_t block_end = round_up(pos + len, block_size);
	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
	/* The write itself, as offsets into the folio. */
	size_t from = offset_in_folio(folio, pos), to = from + len;
	size_t poff, plen;

	/*
	 * If the write or zeroing completely overlaps the current folio, then
	 * entire folio will be dirtied so there is no need for
	 * per-block state tracking structures to be attached to this folio.
	 * For the unshare case, we must read in the ondisk contents because we
	 * are not changing pagecache contents.
	 */
	if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
	    pos + len >= folio_next_pos(folio))
		return 0;

	ifs = ifs_alloc(iter->inode, folio, iter->flags);
	/* A NULL ifs is only a failure when per-block state was needed. */
	if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
		return -EAGAIN;

	if (folio_test_uptodate(folio))
		return 0;

	do {
		/* May move block_start forward past already uptodate blocks. */
		iomap_adjust_read_range(iter->inode, folio, &block_start,
				block_end - block_start, &poff, &plen);
		if (plen == 0)
			break;

		/*
		 * If the read range will be entirely overwritten by the write,
		 * we can skip having to zero/read it in.
		 */
		if (!(iter->flags & IOMAP_UNSHARE) && from <= poff &&
		    to >= poff + plen)
			continue;

		if (iomap_block_needs_zeroing(iter, block_start)) {
			if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
				return -EIO;
			/* Zero [poff, from) and [to, poff + plen). */
			folio_zero_segments(folio, poff, from, to, poff + plen);
		} else {
			const struct iomap *iomap = iomap_iter_srcmap(iter);
			int status;

			if (iter->flags & IOMAP_NOWAIT)
				return -EAGAIN;

			if (write_ops && write_ops->read_folio_range)
				status = write_ops->read_folio_range(iter,
						folio, block_start, plen);
			else
				status = iomap_bio_read_folio_range_sync(iter,
						folio, block_start, plen);
			/*
			 * NOTE(review): the failed read covered
			 * [block_start, block_start + plen), but the report
			 * below pairs the write position pos with plen.
			 * Confirm whether block_start was intended.
			 */
			if (status < 0)
				fserror_report_io(iter->inode,
						  FSERR_BUFFERED_READ, pos,
						  plen, status, GFP_NOFS);
			if (status)
				return status;

			if (iomap->flags & IOMAP_F_ZERO_TAIL)
				folio_zero_segment(folio, to, poff + plen);
		}
		iomap_set_range_uptodate(folio, poff, plen);
	} while ((block_start += plen) < block_end);

	return 0;
}

/*
 * Get the folio to write into at iter->pos.
 *
 * With IOMAP_F_FOLIO_BATCH set the next folio of iter->fbatch is locked and
 * returned with an extra reference; NULL is returned when the batch is
 * exhausted or the folio no longer belongs to this mapping (the iomap is then
 * flagged stale).  Otherwise the lookup goes through write_ops->get_folio if
 * provided, else iomap_get_folio().
 */
static struct folio *__iomap_get_folio(struct iomap_iter *iter,
		const struct iomap_write_ops *write_ops, size_t len)
{
	loff_t pos = iter->pos;

	/* Without large folio support, stay within the page that holds pos. */
	if (!mapping_large_folio_support(iter->inode->i_mapping))
		len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));

	if (iter->iomap.flags & IOMAP_F_FOLIO_BATCH) {
		struct folio *folio = folio_batch_next(iter->fbatch);

		if (!folio)
			return NULL;

		/*
		 * The folio mapping generally shouldn't have changed based on
		 * fs locks, but be consistent with filemap lookup and retry
		 * the iter if it does.
		 */
		folio_lock(folio);
		if (unlikely(folio->mapping != iter->inode->i_mapping)) {
			iter->iomap.flags |= IOMAP_F_STALE;
			folio_unlock(folio);
			return NULL;
		}

		folio_get(folio);
		folio_wait_stable(folio);
		return folio;
	}

	if (write_ops && write_ops->get_folio)
		return write_ops->get_folio(iter, pos, len);
	return iomap_get_folio(iter, pos, len);
}

/*
 * Release a folio obtained by __iomap_get_folio(): through
 * write_ops->put_folio if provided, otherwise by unlocking it and dropping
 * the reference.  @ret is passed on to ->put_folio.
 */
static void __iomap_put_folio(struct iomap_iter *iter,
		const struct iomap_write_ops *write_ops, size_t ret,
		struct folio *folio)
{
	loff_t pos = iter->pos;

	if (write_ops && write_ops->put_folio) {
		write_ops->put_folio(iter->inode, pos, ret, folio);
	} else {
		folio_unlock(folio);
		folio_put(folio);
	}
}

/* trim pos and bytes to within a given folio */
static loff_t iomap_trim_folio_range(struct iomap_iter *iter,
		struct folio *folio, size_t *offset, u64 *bytes)
{
	loff_t pos = iter->pos;
	size_t fsize = folio_size(folio);

	/* iter->pos is expected to lie inside the folio. */
	WARN_ON_ONCE(pos < folio_pos(folio));
	WARN_ON_ONCE(pos >= folio_pos(folio) + fsize);

	*offset = offset_in_folio(folio, pos);
	*bytes = min(*bytes, fsize - *offset);

	return pos;
}

/*
 * Prepare @folio for a write into an IOMAP_INLINE mapping by copying the
 * inline data into it.  Only mappings that start at file offset 0 are
 * supported.
 */
static int iomap_write_begin_inline(const struct iomap_iter *iter,
		struct folio *folio)
{
	/* needs more work for the tailpacking case; disable for now */
	if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
		return -EIO;
	return iomap_read_inline_data(iter, folio);
}

/*
 * Grab and prepare a folio for write based on iter state.  Returns the folio,
 * offset, and length. Callers can optionally pass a max length *plen,
 * otherwise init to zero.
 */
static int iomap_write_begin(struct iomap_iter *iter,
		const struct iomap_write_ops *write_ops, struct folio **foliop,
		size_t *poffset, u64 *plen)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos;
	u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
	struct folio *folio;
	int status = 0;

	/* A zero *plen means "no caller limit". */
	len = min_not_zero(len, *plen);
	*foliop = NULL;
	*plen = 0;

	if (fatal_signal_pending(current))
		return -EINTR;

	folio = __iomap_get_folio(iter, write_ops, len);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/*
	 * No folio means we're done with a batch. We still have range to
	 * process so return and let the caller iterate and refill the batch.
	 */
	if (!folio) {
		WARN_ON_ONCE(!(iter->iomap.flags & IOMAP_F_FOLIO_BATCH));
		return 0;
	}

	/*
	 * Now we have a locked folio, before we do anything with it we need to
	 * check that the iomap we have cached is not stale. The inode extent
	 * mapping can change due to concurrent IO in flight (e.g.
	 * IOMAP_UNWRITTEN state can change and memory reclaim could have
	 * reclaimed a previously partially written page at this index after IO
	 * completion before this write reaches this file offset) and hence we
	 * could do the wrong thing here (zero a page range incorrectly or fail
	 * to zero) and corrupt data.
	 */
	if (write_ops && write_ops->iomap_valid) {
		bool iomap_valid = write_ops->iomap_valid(iter->inode,
							 &iter->iomap);
		if (!iomap_valid) {
			/* Return 0 with *foliop still NULL and the iomap flagged stale. */
			iter->iomap.flags |= IOMAP_F_STALE;
			status = 0;
			goto out_unlock;
		}
	}

	/*
	 * The folios in a batch may not be contiguous. If we've skipped
	 * forward, advance the iter to the pos of the current folio. If the
	 * folio starts beyond the end of the mapping, it may have been trimmed
	 * since the lookup for whatever reason. Return a NULL folio to
	 * terminate the op.
	 */
	if (folio_pos(folio) > iter->pos) {
		len = min_t(u64, folio_pos(folio) - iter->pos,
				 iomap_length(iter));
		status = iomap_iter_advance(iter, len);
		len = iomap_length(iter);
		if (status || !len)
			goto out_unlock;
	}

	pos = iomap_trim_folio_range(iter, folio, poffset, &len);

	/* Pick the preparation path that matches the source mapping. */
	if (srcmap->type == IOMAP_INLINE)
		status = iomap_write_begin_inline(iter, folio);
	else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
		status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
	else
		status = __iomap_write_begin(iter, write_ops, len, folio);

	if (unlikely(status))
		goto out_unlock;

	*foliop = folio;
	*plen = len;
	return 0;

out_unlock:
	__iomap_put_folio(iter, write_ops, 0, folio);
	return status;
}

/*
 * Finish a write of @copied out of @len requested bytes at @pos in @folio.
 * Returns false, leaving the per-block state untouched, for a short write
 * into a folio that is not uptodate; otherwise updates the uptodate and dirty
 * state, dirties the folio and returns true.
 */
static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
		size_t copied, struct folio *folio)
{
	flush_dcache_folio(folio);

	/*
	 * The blocks that were entirely written will now be uptodate, so we
	 * don't have to worry about a read_folio reading them and overwriting a
	 * partial write.  However, if we've encountered a short write and only
	 * partially written into a block, it will not be marked uptodate, so a
	 * read_folio might come in and destroy our partial write.
	 *
	 * Do the simplest thing and just treat any short write to a
	 * non-uptodate page as a zero-length write, and force the caller to
	 * redo the whole thing.
	 */
	if (unlikely(copied < len && !folio_test_uptodate(folio)))
		return false;
	/* The uptodate range uses len, the dirty range only what was copied. */
	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
	iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
	filemap_dirty_folio(inode->i_mapping, folio);
	return true;
}

/*
 * Finish a write into an IOMAP_INLINE mapping: copy the @copied bytes at @pos
 * from @folio back into the inline data area and mark the inode dirty.
 * Returns false if the mapping has no inline data pointer.
 */
static bool iomap_write_end_inline(const struct iomap_iter *iter,
		struct folio *folio, loff_t pos, size_t copied)
{
	const struct iomap *iomap = &iter->iomap;
	void *addr;

	WARN_ON_ONCE(!folio_test_uptodate(folio));

	if (WARN_ON_ONCE(!iomap->inline_data))
		return false;

	flush_dcache_folio(folio);
	addr = kmap_local_folio(folio, pos);
	memcpy(iomap_inline_data(iomap, pos), addr, copied);
	kunmap_local(addr);

	mark_inode_dirty(iter->inode);
	return true;
}

/*
 * Returns true if all copied bytes have been written to the pagecache,
 * otherwise return false.
 */
static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
		struct folio *folio)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;

	if (srcmap->type == IOMAP_INLINE)
		return iomap_write_end_inline(iter, folio, pos, copied);

	if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
		size_t bh_written;

		bh_written = block_write_end(pos, len, copied, folio);
		/* Anything other than all of @copied or nothing is unexpected. */
		WARN_ON_ONCE(bh_written != copied && bh_written != 0);
		return bh_written == copied;
	}

	return __iomap_write_end(iter->inode, pos, len, copied, folio);
}

static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i,
		const struct iomap_write_ops *write_ops)
{
	ssize_t total_written = 0;
	int status = 0;
	struct address_space *mapping = iter->inode->i_mapping;
	size_t chunk = mapping_max_folio_size(mapping);
	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ?
BDP_ASYNC : 0; 1164 1165 do { 1166 struct folio *folio; 1167 loff_t old_size; 1168 size_t offset; /* Offset into folio */ 1169 u64 bytes; /* Bytes to write to folio */ 1170 size_t copied; /* Bytes copied from user */ 1171 u64 written; /* Bytes have been written */ 1172 loff_t pos; 1173 1174 bytes = iov_iter_count(i); 1175 retry: 1176 offset = iter->pos & (chunk - 1); 1177 bytes = min(chunk - offset, bytes); 1178 status = balance_dirty_pages_ratelimited_flags(mapping, 1179 bdp_flags); 1180 if (unlikely(status)) 1181 break; 1182 1183 if (bytes > iomap_length(iter)) 1184 bytes = iomap_length(iter); 1185 1186 /* 1187 * Bring in the user page that we'll copy from _first_. 1188 * Otherwise there's a nasty deadlock on copying from the 1189 * same page as we're writing to, without it being marked 1190 * up-to-date. 1191 * 1192 * For async buffered writes the assumption is that the user 1193 * page has already been faulted in. This can be optimized by 1194 * faulting the user page. 1195 */ 1196 if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { 1197 status = -EFAULT; 1198 break; 1199 } 1200 1201 status = iomap_write_begin(iter, write_ops, &folio, &offset, 1202 &bytes); 1203 if (unlikely(status)) { 1204 iomap_write_failed(iter->inode, iter->pos, bytes); 1205 break; 1206 } 1207 if (iter->iomap.flags & IOMAP_F_STALE) 1208 break; 1209 1210 pos = iter->pos; 1211 1212 if (mapping_writably_mapped(mapping)) 1213 flush_dcache_folio(folio); 1214 1215 copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); 1216 written = iomap_write_end(iter, bytes, copied, folio) ? 1217 copied : 0; 1218 1219 /* 1220 * Update the in-memory inode size after copying the data into 1221 * the page cache. It's up to the file system to write the 1222 * updated size to disk, preferably after I/O completion so that 1223 * no stale data is exposed. Only once that's done can we 1224 * unlock and release the folio. 
1225 */ 1226 old_size = iter->inode->i_size; 1227 if (pos + written > old_size && 1228 !(iter->iomap.flags & IOMAP_F_FSVERITY)) { 1229 i_size_write(iter->inode, pos + written); 1230 iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; 1231 } 1232 __iomap_put_folio(iter, write_ops, written, folio); 1233 1234 if (old_size < pos && !(iter->iomap.flags & IOMAP_F_FSVERITY)) 1235 pagecache_isize_extended(iter->inode, old_size, pos); 1236 1237 cond_resched(); 1238 if (unlikely(written == 0)) { 1239 /* 1240 * A short copy made iomap_write_end() reject the 1241 * thing entirely. Might be memory poisoning 1242 * halfway through, might be a race with munmap, 1243 * might be severe memory pressure. 1244 */ 1245 iomap_write_failed(iter->inode, pos, bytes); 1246 iov_iter_revert(i, copied); 1247 1248 if (chunk > PAGE_SIZE) 1249 chunk /= 2; 1250 if (copied) { 1251 bytes = copied; 1252 goto retry; 1253 } 1254 } else { 1255 total_written += written; 1256 iomap_iter_advance(iter, written); 1257 } 1258 } while (iov_iter_count(i) && iomap_length(iter)); 1259 1260 return total_written ? 
0 : status; 1261 } 1262 1263 ssize_t 1264 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, 1265 const struct iomap_ops *ops, 1266 const struct iomap_write_ops *write_ops, void *private) 1267 { 1268 struct iomap_iter iter = { 1269 .inode = iocb->ki_filp->f_mapping->host, 1270 .pos = iocb->ki_pos, 1271 .len = iov_iter_count(i), 1272 .flags = IOMAP_WRITE, 1273 .private = private, 1274 }; 1275 ssize_t ret; 1276 1277 if (iocb->ki_flags & IOCB_NOWAIT) 1278 iter.flags |= IOMAP_NOWAIT; 1279 if (iocb->ki_flags & IOCB_DONTCACHE) 1280 iter.flags |= IOMAP_DONTCACHE; 1281 1282 while ((ret = iomap_iter(&iter, ops)) > 0) 1283 iter.status = iomap_write_iter(&iter, i, write_ops); 1284 1285 if (unlikely(iter.pos == iocb->ki_pos)) 1286 return ret; 1287 ret = iter.pos - iocb->ki_pos; 1288 iocb->ki_pos = iter.pos; 1289 return ret; 1290 } 1291 EXPORT_SYMBOL_GPL(iomap_file_buffered_write); 1292 1293 int iomap_fsverity_write(struct file *file, loff_t pos, size_t length, 1294 const void *buf, const struct iomap_ops *ops, 1295 const struct iomap_write_ops *write_ops) 1296 { 1297 int ret; 1298 struct iov_iter iiter; 1299 struct kvec kvec = { 1300 .iov_base = (void *)buf, 1301 .iov_len = length, 1302 }; 1303 struct kiocb iocb = { 1304 .ki_filp = file, 1305 .ki_ioprio = get_current_ioprio(), 1306 .ki_pos = pos, 1307 }; 1308 1309 iov_iter_kvec(&iiter, WRITE, &kvec, 1, length); 1310 1311 ret = iomap_file_buffered_write(&iocb, &iiter, ops, write_ops, NULL); 1312 if (ret < 0) 1313 return ret; 1314 return ret == length ? 
0 : -EIO; 1315 } 1316 EXPORT_SYMBOL_GPL(iomap_fsverity_write); 1317 1318 static void iomap_write_delalloc_ifs_punch(struct inode *inode, 1319 struct folio *folio, loff_t start_byte, loff_t end_byte, 1320 struct iomap *iomap, iomap_punch_t punch) 1321 { 1322 unsigned int first_blk, last_blk; 1323 loff_t last_byte; 1324 u8 blkbits = inode->i_blkbits; 1325 struct iomap_folio_state *ifs; 1326 1327 /* 1328 * When we have per-block dirty tracking, there can be 1329 * blocks within a folio which are marked uptodate 1330 * but not dirty. In that case it is necessary to punch 1331 * out such blocks to avoid leaking any delalloc blocks. 1332 */ 1333 ifs = folio->private; 1334 if (!ifs) 1335 return; 1336 1337 last_byte = min_t(loff_t, end_byte - 1, folio_next_pos(folio) - 1); 1338 first_blk = offset_in_folio(folio, start_byte) >> blkbits; 1339 last_blk = offset_in_folio(folio, last_byte) >> blkbits; 1340 while ((first_blk = ifs_next_clean_block(folio, first_blk, last_blk)) 1341 <= last_blk) { 1342 punch(inode, folio_pos(folio) + (first_blk << blkbits), 1343 1 << blkbits, iomap); 1344 first_blk++; 1345 } 1346 } 1347 1348 static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, 1349 loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, 1350 struct iomap *iomap, iomap_punch_t punch) 1351 { 1352 if (!folio_test_dirty(folio)) 1353 return; 1354 1355 /* if dirty, punch up to offset */ 1356 if (start_byte > *punch_start_byte) { 1357 punch(inode, *punch_start_byte, start_byte - *punch_start_byte, 1358 iomap); 1359 } 1360 1361 /* Punch non-dirty blocks within folio */ 1362 iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte, 1363 iomap, punch); 1364 1365 /* 1366 * Make sure the next punch start is correctly bound to 1367 * the end of this data range, not the end of the folio. 
1368 */ 1369 *punch_start_byte = min_t(loff_t, end_byte, folio_next_pos(folio)); 1370 } 1371 1372 /* 1373 * Scan the data range passed to us for dirty page cache folios. If we find a 1374 * dirty folio, punch out the preceding range and update the offset from which 1375 * the next punch will start from. 1376 * 1377 * We can punch out storage reservations under clean pages because they either 1378 * contain data that has been written back - in which case the delalloc punch 1379 * over that range is a no-op - or they have been read faults in which case they 1380 * contain zeroes and we can remove the delalloc backing range and any new 1381 * writes to those pages will do the normal hole filling operation... 1382 * 1383 * This makes the logic simple: we only need to keep the delalloc extents only 1384 * over the dirty ranges of the page cache. 1385 * 1386 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to 1387 * simplify range iterations. 1388 */ 1389 static void iomap_write_delalloc_scan(struct inode *inode, 1390 loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, 1391 struct iomap *iomap, iomap_punch_t punch) 1392 { 1393 while (start_byte < end_byte) { 1394 struct folio *folio; 1395 1396 /* grab locked page */ 1397 folio = filemap_lock_folio(inode->i_mapping, 1398 start_byte >> PAGE_SHIFT); 1399 if (IS_ERR(folio)) { 1400 start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + 1401 PAGE_SIZE; 1402 continue; 1403 } 1404 1405 iomap_write_delalloc_punch(inode, folio, punch_start_byte, 1406 start_byte, end_byte, iomap, punch); 1407 1408 /* move offset to start of next folio in range */ 1409 start_byte = folio_next_pos(folio); 1410 folio_unlock(folio); 1411 folio_put(folio); 1412 } 1413 } 1414 1415 /* 1416 * When a short write occurs, the filesystem might need to use ->iomap_end 1417 * to remove space reservations created in ->iomap_begin. 
 *
 * For filesystems that use delayed allocation, there can be dirty pages over
 * the delalloc extent outside the range of a short write but still within the
 * delalloc extent allocated for this iomap if the write raced with page
 * faults.
 *
 * Punch out all the delalloc blocks in the range given except for those that
 * have dirty data still pending in the page cache - those are going to be
 * written and so must still retain the delalloc backing for writeback.
 *
 * The punch() callback *must* only punch delalloc extents in the range passed
 * to it. It must skip over all other types of extents in the range and leave
 * them completely unchanged. It must do this punch atomically with respect to
 * other extent modifications.
 *
 * The punch() callback may be called with a folio locked to prevent writeback
 * extent allocation racing at the edge of the range we are currently punching.
 * The locked folio may or may not cover the range being punched, so it is not
 * safe for the punch() callback to lock folios itself.
 *
 * Lock order is:
 *
 * inode->i_rwsem (shared or exclusive)
 *   inode->i_mapping->invalidate_lock (exclusive)
 *     folio_lock()
 *       ->punch
 *         internal filesystem allocation lock
 *
 * As we are scanning the page cache for data, we don't need to reimplement the
 * wheel - mapping_seek_hole_data() does exactly what we need to identify the
 * start and end of data ranges correctly even for sub-folio block sizes. This
 * byte range based iteration is especially convenient because it means we
 * don't have to care about variable size folios, nor where the start or end of
 * the data range lies within a folio, if they lie within the same folio or even
 * if there are multiple discontiguous data ranges within the folio.
1453 * 1454 * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so 1455 * can return data ranges that exist in the cache beyond EOF. e.g. a page fault 1456 * spanning EOF will initialise the post-EOF data to zeroes and mark it up to 1457 * date. A write page fault can then mark it dirty. If we then fail a write() 1458 * beyond EOF into that up to date cached range, we allocate a delalloc block 1459 * beyond EOF and then have to punch it out. Because the range is up to date, 1460 * mapping_seek_hole_data() will return it, and we will skip the punch because 1461 * the folio is dirty. THis is incorrect - we always need to punch out delalloc 1462 * beyond EOF in this case as writeback will never write back and covert that 1463 * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, 1464 * resulting in always punching out the range from the EOF to the end of the 1465 * range the iomap spans. 1466 * 1467 * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it 1468 * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA 1469 * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) 1470 * returns the end of the data range (data_end). Using closed intervals would 1471 * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose 1472 * the code to subtle off-by-one bugs.... 1473 */ 1474 void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, 1475 loff_t end_byte, unsigned flags, struct iomap *iomap, 1476 iomap_punch_t punch) 1477 { 1478 loff_t punch_start_byte = start_byte; 1479 loff_t scan_end_byte = min(i_size_read(inode), end_byte); 1480 1481 /* 1482 * The caller must hold invalidate_lock to avoid races with page faults 1483 * re-instantiating folios and dirtying them via ->page_mkwrite whilst 1484 * we walk the cache and perform delalloc extent removal. 
Failing to do 1485 * this can leave dirty pages with no space reservation in the cache. 1486 */ 1487 lockdep_assert_held_write(&inode->i_mapping->invalidate_lock); 1488 1489 while (start_byte < scan_end_byte) { 1490 loff_t data_end; 1491 1492 start_byte = mapping_seek_hole_data(inode->i_mapping, 1493 start_byte, scan_end_byte, SEEK_DATA); 1494 /* 1495 * If there is no more data to scan, all that is left is to 1496 * punch out the remaining range. 1497 * 1498 * Note that mapping_seek_hole_data is only supposed to return 1499 * either an offset or -ENXIO, so WARN on any other error as 1500 * that would be an API change without updating the callers. 1501 */ 1502 if (start_byte == -ENXIO || start_byte == scan_end_byte) 1503 break; 1504 if (WARN_ON_ONCE(start_byte < 0)) 1505 return; 1506 WARN_ON_ONCE(start_byte < punch_start_byte); 1507 WARN_ON_ONCE(start_byte > scan_end_byte); 1508 1509 /* 1510 * We find the end of this contiguous cached data range by 1511 * seeking from start_byte to the beginning of the next hole. 1512 */ 1513 data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, 1514 scan_end_byte, SEEK_HOLE); 1515 if (WARN_ON_ONCE(data_end < 0)) 1516 return; 1517 1518 /* 1519 * If we race with post-direct I/O invalidation of the page cache, 1520 * there might be no data left at start_byte. 1521 */ 1522 if (data_end == start_byte) 1523 continue; 1524 1525 WARN_ON_ONCE(data_end < start_byte); 1526 WARN_ON_ONCE(data_end > scan_end_byte); 1527 1528 iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte, 1529 data_end, iomap, punch); 1530 1531 /* The next data search starts at the end of this one. 
*/ 1532 start_byte = data_end; 1533 } 1534 1535 if (punch_start_byte < end_byte) 1536 punch(inode, punch_start_byte, end_byte - punch_start_byte, 1537 iomap); 1538 } 1539 EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); 1540 1541 static int iomap_unshare_iter(struct iomap_iter *iter, 1542 const struct iomap_write_ops *write_ops) 1543 { 1544 struct iomap *iomap = &iter->iomap; 1545 u64 bytes = iomap_length(iter); 1546 int status; 1547 1548 if (!iomap_want_unshare_iter(iter)) 1549 return iomap_iter_advance(iter, bytes); 1550 1551 do { 1552 struct folio *folio; 1553 size_t offset; 1554 bool ret; 1555 1556 bytes = min_t(u64, SIZE_MAX, bytes); 1557 status = iomap_write_begin(iter, write_ops, &folio, &offset, 1558 &bytes); 1559 if (unlikely(status)) 1560 return status; 1561 if (iomap->flags & IOMAP_F_STALE) 1562 break; 1563 1564 ret = iomap_write_end(iter, bytes, bytes, folio); 1565 __iomap_put_folio(iter, write_ops, bytes, folio); 1566 if (WARN_ON_ONCE(!ret)) 1567 return -EIO; 1568 1569 cond_resched(); 1570 1571 balance_dirty_pages_ratelimited(iter->inode->i_mapping); 1572 1573 status = iomap_iter_advance(iter, bytes); 1574 if (status) 1575 break; 1576 } while ((bytes = iomap_length(iter)) > 0); 1577 1578 return status; 1579 } 1580 1581 int 1582 iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, 1583 const struct iomap_ops *ops, 1584 const struct iomap_write_ops *write_ops) 1585 { 1586 struct iomap_iter iter = { 1587 .inode = inode, 1588 .pos = pos, 1589 .flags = IOMAP_WRITE | IOMAP_UNSHARE, 1590 }; 1591 loff_t size = i_size_read(inode); 1592 int ret; 1593 1594 if (pos < 0 || pos >= size) 1595 return 0; 1596 1597 iter.len = min(len, size - pos); 1598 while ((ret = iomap_iter(&iter, ops)) > 0) 1599 iter.status = iomap_unshare_iter(&iter, write_ops); 1600 return ret; 1601 } 1602 EXPORT_SYMBOL_GPL(iomap_file_unshare); 1603 1604 /* 1605 * Flush the remaining range of the iter and mark the current mapping stale. 
1606 * This is used when zero range sees an unwritten mapping that may have had 1607 * dirty pagecache over it. 1608 */ 1609 static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) 1610 { 1611 struct address_space *mapping = i->inode->i_mapping; 1612 loff_t end = i->pos + i->len - 1; 1613 1614 i->iomap.flags |= IOMAP_F_STALE; 1615 return filemap_write_and_wait_range(mapping, i->pos, end); 1616 } 1617 1618 static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, 1619 const struct iomap_write_ops *write_ops) 1620 { 1621 u64 bytes = iomap_length(iter); 1622 int status; 1623 1624 do { 1625 struct folio *folio; 1626 size_t offset; 1627 bool ret; 1628 1629 balance_dirty_pages_ratelimited(iter->inode->i_mapping); 1630 1631 bytes = min_t(u64, SIZE_MAX, bytes); 1632 status = iomap_write_begin(iter, write_ops, &folio, &offset, 1633 &bytes); 1634 if (status) 1635 return status; 1636 if (iter->iomap.flags & IOMAP_F_STALE) 1637 break; 1638 1639 /* a NULL folio means we're done with a folio batch */ 1640 if (!folio) { 1641 status = iomap_iter_advance_full(iter); 1642 break; 1643 } 1644 1645 /* warn about zeroing folios beyond eof that won't write back */ 1646 WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); 1647 1648 trace_iomap_zero_iter(iter->inode, folio_pos(folio) + offset, 1649 bytes); 1650 1651 folio_zero_range(folio, offset, bytes); 1652 folio_mark_accessed(folio); 1653 1654 ret = iomap_write_end(iter, bytes, bytes, folio); 1655 __iomap_put_folio(iter, write_ops, bytes, folio); 1656 if (WARN_ON_ONCE(!ret)) 1657 return -EIO; 1658 1659 status = iomap_iter_advance(iter, bytes); 1660 if (status) 1661 break; 1662 } while ((bytes = iomap_length(iter)) > 0); 1663 1664 if (did_zero) 1665 *did_zero = true; 1666 return status; 1667 } 1668 1669 /** 1670 * iomap_fill_dirty_folios - fill a folio batch with dirty folios 1671 * @iter: Iteration structure 1672 * @start: Start offset of range. Updated based on lookup progress. 
1673 * @end: End offset of range 1674 * @iomap_flags: Flags to set on the associated iomap to track the batch. 1675 * 1676 * Returns the folio count directly. Also returns the associated control flag if 1677 * the the batch lookup is performed and the expected offset of a subsequent 1678 * lookup via out params. The caller is responsible to set the flag on the 1679 * associated iomap. 1680 */ 1681 unsigned int 1682 iomap_fill_dirty_folios( 1683 struct iomap_iter *iter, 1684 loff_t *start, 1685 loff_t end, 1686 unsigned int *iomap_flags) 1687 { 1688 struct address_space *mapping = iter->inode->i_mapping; 1689 pgoff_t pstart = *start >> PAGE_SHIFT; 1690 pgoff_t pend = (end - 1) >> PAGE_SHIFT; 1691 unsigned int count; 1692 1693 if (!iter->fbatch) { 1694 *start = end; 1695 return 0; 1696 } 1697 1698 count = filemap_get_folios_dirty(mapping, &pstart, pend, iter->fbatch); 1699 *start = (pstart << PAGE_SHIFT); 1700 *iomap_flags |= IOMAP_F_FOLIO_BATCH; 1701 return count; 1702 } 1703 EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios); 1704 1705 int 1706 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, 1707 const struct iomap_ops *ops, 1708 const struct iomap_write_ops *write_ops, void *private) 1709 { 1710 struct folio_batch fbatch; 1711 struct iomap_iter iter = { 1712 .inode = inode, 1713 .pos = pos, 1714 .len = len, 1715 .flags = IOMAP_ZERO, 1716 .private = private, 1717 .fbatch = &fbatch, 1718 }; 1719 struct address_space *mapping = inode->i_mapping; 1720 int ret; 1721 bool range_dirty; 1722 1723 folio_batch_init(&fbatch); 1724 1725 /* 1726 * To avoid an unconditional flush, check pagecache state and only flush 1727 * if dirty and the fs returns a mapping that might convert on 1728 * writeback. 
1729 */ 1730 range_dirty = filemap_range_needs_writeback(mapping, iter.pos, 1731 iter.pos + iter.len - 1); 1732 while ((ret = iomap_iter(&iter, ops)) > 0) { 1733 const struct iomap *srcmap = iomap_iter_srcmap(&iter); 1734 1735 if (!(iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && 1736 (srcmap->type == IOMAP_HOLE || 1737 srcmap->type == IOMAP_UNWRITTEN)) { 1738 s64 status; 1739 1740 if (range_dirty && srcmap->type == IOMAP_UNWRITTEN) { 1741 range_dirty = false; 1742 status = iomap_zero_iter_flush_and_stale(&iter); 1743 } else { 1744 status = iomap_iter_advance_full(&iter); 1745 } 1746 iter.status = status; 1747 continue; 1748 } 1749 1750 iter.status = iomap_zero_iter(&iter, did_zero, write_ops); 1751 } 1752 return ret; 1753 } 1754 EXPORT_SYMBOL_GPL(iomap_zero_range); 1755 1756 int 1757 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 1758 const struct iomap_ops *ops, 1759 const struct iomap_write_ops *write_ops, void *private) 1760 { 1761 unsigned int blocksize = i_blocksize(inode); 1762 unsigned int off = pos & (blocksize - 1); 1763 1764 /* Block boundary? 
Nothing to do */ 1765 if (!off) 1766 return 0; 1767 return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, 1768 write_ops, private); 1769 } 1770 EXPORT_SYMBOL_GPL(iomap_truncate_page); 1771 1772 static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, 1773 struct folio *folio) 1774 { 1775 loff_t length = iomap_length(iter); 1776 int ret; 1777 1778 if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { 1779 ret = __block_write_begin_int(folio, iter->pos, length, NULL, 1780 &iter->iomap); 1781 if (ret) 1782 return ret; 1783 block_commit_write(folio, 0, length); 1784 } else { 1785 WARN_ON_ONCE(!folio_test_uptodate(folio)); 1786 folio_mark_dirty(folio); 1787 } 1788 1789 return iomap_iter_advance(iter, length); 1790 } 1791 1792 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, 1793 void *private) 1794 { 1795 struct iomap_iter iter = { 1796 .inode = file_inode(vmf->vma->vm_file), 1797 .flags = IOMAP_WRITE | IOMAP_FAULT, 1798 .private = private, 1799 }; 1800 struct folio *folio = page_folio(vmf->page); 1801 ssize_t ret; 1802 1803 folio_lock(folio); 1804 ret = folio_mkwrite_check_truncate(folio, iter.inode); 1805 if (ret < 0) 1806 goto out_unlock; 1807 iter.pos = folio_pos(folio); 1808 iter.len = ret; 1809 while ((ret = iomap_iter(&iter, ops)) > 0) 1810 iter.status = iomap_folio_mkwrite_iter(&iter, folio); 1811 1812 if (ret < 0) 1813 goto out_unlock; 1814 folio_wait_stable(folio); 1815 return VM_FAULT_LOCKED; 1816 out_unlock: 1817 folio_unlock(folio); 1818 return vmf_fs_error(ret); 1819 } 1820 EXPORT_SYMBOL_GPL(iomap_page_mkwrite); 1821 1822 static void iomap_writeback_init(struct inode *inode, struct folio *folio) 1823 { 1824 struct iomap_folio_state *ifs = folio->private; 1825 1826 WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); 1827 if (ifs) { 1828 WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); 1829 /* 1830 * Set this to the folio size. 
After processing the folio for 1831 * writeback in iomap_writeback_folio(), we'll subtract any 1832 * ranges not written back. 1833 * 1834 * We do this because otherwise, we would have to atomically 1835 * increment ifs->write_bytes_pending every time a range in the 1836 * folio needs to be written back. 1837 */ 1838 atomic_set(&ifs->write_bytes_pending, folio_size(folio)); 1839 } 1840 } 1841 1842 void iomap_finish_folio_write(struct inode *inode, struct folio *folio, 1843 size_t len) 1844 { 1845 struct iomap_folio_state *ifs = folio->private; 1846 1847 WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); 1848 WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); 1849 1850 if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) 1851 folio_end_writeback(folio); 1852 } 1853 EXPORT_SYMBOL_GPL(iomap_finish_folio_write); 1854 1855 static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, 1856 struct folio *folio, u64 pos, u32 rlen, u64 end_pos, 1857 size_t *bytes_submitted) 1858 { 1859 do { 1860 ssize_t ret; 1861 1862 ret = wpc->ops->writeback_range(wpc, folio, pos, rlen, end_pos); 1863 if (WARN_ON_ONCE(ret == 0 || ret > rlen)) 1864 return -EIO; 1865 if (ret < 0) 1866 return ret; 1867 rlen -= ret; 1868 pos += ret; 1869 1870 /* 1871 * Holes are not written back by ->writeback_range, so track 1872 * if we did handle anything that is not a hole here. 1873 */ 1874 if (wpc->iomap.type != IOMAP_HOLE) 1875 *bytes_submitted += ret; 1876 } while (rlen); 1877 1878 return 0; 1879 } 1880 1881 /* 1882 * Check interaction of the folio with the file end. 1883 * 1884 * If the folio is entirely beyond i_size, return false. If it straddles 1885 * i_size, adjust end_pos and zero all data beyond i_size. Don't skip fsverity 1886 * folios as those are beyond i_size. 
1887 */ 1888 static bool iomap_writeback_handle_eof(struct folio *folio, 1889 struct iomap_writepage_ctx *wpc, u64 *end_pos) 1890 { 1891 struct inode *inode = wpc->inode; 1892 u64 isize = i_size_read(inode); 1893 1894 if (wpc->iomap.flags & IOMAP_F_FSVERITY) { 1895 WARN_ON_ONCE(folio_pos(folio) < isize); 1896 return true; 1897 } 1898 1899 if (*end_pos > isize) { 1900 size_t poff = offset_in_folio(folio, isize); 1901 pgoff_t end_index = isize >> PAGE_SHIFT; 1902 1903 /* 1904 * If the folio is entirely ouside of i_size, skip it. 1905 * 1906 * This can happen due to a truncate operation that is in 1907 * progress and in that case truncate will finish it off once 1908 * we've dropped the folio lock. 1909 * 1910 * Note that the pgoff_t used for end_index is an unsigned long. 1911 * If the given offset is greater than 16TB on a 32-bit system, 1912 * then if we checked if the folio is fully outside i_size with 1913 * "if (folio->index >= end_index + 1)", "end_index + 1" would 1914 * overflow and evaluate to 0. Hence this folio would be 1915 * redirtied and written out repeatedly, which would result in 1916 * an infinite loop; the user program performing this operation 1917 * would hang. Instead, we can detect this situation by 1918 * checking if the folio is totally beyond i_size or if its 1919 * offset is just equal to the EOF. 1920 */ 1921 if (folio->index > end_index || 1922 (folio->index == end_index && poff == 0)) 1923 return false; 1924 1925 /* 1926 * The folio straddles i_size. 1927 * 1928 * It must be zeroed out on each and every writepage invocation 1929 * because it may be mmapped: 1930 * 1931 * A file is mapped in multiples of the page size. For a 1932 * file that is not a multiple of the page size, the 1933 * remaining memory is zeroed when mapped, and writes to that 1934 * region are not written out to the file. 1935 * 1936 * Also adjust the end_pos to the end of file and skip writeback 1937 * for all blocks entirely beyond i_size. 
1938 */ 1939 folio_zero_segment(folio, poff, folio_size(folio)); 1940 *end_pos = isize; 1941 } 1942 1943 return true; 1944 } 1945 1946 int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) 1947 { 1948 struct iomap_folio_state *ifs = folio->private; 1949 struct inode *inode = wpc->inode; 1950 u64 pos = folio_pos(folio); 1951 u64 end_pos = pos + folio_size(folio); 1952 u64 end_aligned = 0; 1953 loff_t orig_pos = pos; 1954 size_t bytes_submitted = 0; 1955 int error = 0; 1956 u32 rlen; 1957 1958 WARN_ON_ONCE(!folio_test_locked(folio)); 1959 WARN_ON_ONCE(folio_test_dirty(folio)); 1960 WARN_ON_ONCE(folio_test_writeback(folio)); 1961 1962 trace_iomap_writeback_folio(inode, pos, folio_size(folio)); 1963 1964 if (!iomap_writeback_handle_eof(folio, wpc, &end_pos)) 1965 return 0; 1966 WARN_ON_ONCE(end_pos <= pos); 1967 1968 if (i_blocks_per_folio(inode, folio) > 1) { 1969 if (!ifs) { 1970 ifs = ifs_alloc(inode, folio, 0); 1971 iomap_set_range_dirty(folio, 0, end_pos - pos); 1972 } 1973 1974 iomap_writeback_init(inode, folio); 1975 } 1976 1977 /* 1978 * Set the writeback bit ASAP, as the I/O completion for the single 1979 * block per folio case happen hit as soon as we're submitting the bio. 1980 */ 1981 folio_start_writeback(folio); 1982 1983 /* 1984 * Walk through the folio to find dirty areas to write back. 1985 */ 1986 end_aligned = round_up(end_pos, i_blocksize(inode)); 1987 while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { 1988 error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, 1989 &bytes_submitted); 1990 if (error) 1991 break; 1992 pos += rlen; 1993 } 1994 1995 if (bytes_submitted) 1996 wpc->nr_folios++; 1997 if (error && pos > orig_pos) 1998 fserror_report_io(inode, FSERR_BUFFERED_WRITE, orig_pos, 0, 1999 error, GFP_NOFS); 2000 2001 /* 2002 * We can have dirty bits set past end of file in page_mkwrite path 2003 * while mapping the last partial folio. 
Hence it's better to clear 2004 * all the dirty bits in the folio here. 2005 */ 2006 iomap_clear_range_dirty(folio, 0, folio_size(folio)); 2007 2008 /* 2009 * Usually the writeback bit is cleared by the I/O completion handler. 2010 * But we may end up either not actually writing any blocks, or (when 2011 * there are multiple blocks in a folio) all I/O might have finished 2012 * already at this point. In that case we need to clear the writeback 2013 * bit ourselves right after unlocking the page. 2014 */ 2015 if (ifs) { 2016 /* 2017 * Subtract any bytes that were initially accounted to 2018 * write_bytes_pending but skipped for writeback. 2019 */ 2020 size_t bytes_not_submitted = folio_size(folio) - 2021 bytes_submitted; 2022 2023 if (bytes_not_submitted) 2024 iomap_finish_folio_write(inode, folio, 2025 bytes_not_submitted); 2026 } else if (!bytes_submitted) { 2027 folio_end_writeback(folio); 2028 } 2029 2030 mapping_set_error(inode->i_mapping, error); 2031 return error; 2032 } 2033 EXPORT_SYMBOL_GPL(iomap_writeback_folio); 2034 2035 int 2036 iomap_writepages(struct iomap_writepage_ctx *wpc) 2037 { 2038 struct address_space *mapping = wpc->inode->i_mapping; 2039 struct folio *folio = NULL; 2040 int error; 2041 2042 /* 2043 * Writeback from reclaim context should never happen except in the case 2044 * of a VM regression so warn about it and refuse to write the data. 2045 */ 2046 if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == 2047 PF_MEMALLOC)) 2048 return -EIO; 2049 2050 while ((folio = writeback_iter(mapping, wpc->wbc, folio, &error))) { 2051 error = iomap_writeback_folio(wpc, folio); 2052 folio_unlock(folio); 2053 } 2054 2055 /* 2056 * If @error is non-zero, it means that we have a situation where some 2057 * part of the submission process has failed after we've marked pages 2058 * for writeback. 
2059 * 2060 * We cannot cancel the writeback directly in that case, so always call 2061 * ->writeback_submit to run the I/O completion handler to clear the 2062 * writeback bit and let the file system proess the errors. 2063 */ 2064 if (wpc->wb_ctx) 2065 return wpc->ops->writeback_submit(wpc, error); 2066 return error; 2067 } 2068 EXPORT_SYMBOL_GPL(iomap_writepages); 2069