// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2016-2025 Christoph Hellwig.
 */
#include <linux/iomap.h>
#include <linux/list_sort.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/fserror.h>
#include "internal.h"
#include "trace.h"

/* bio_set used for all ioend bios; the ioend is embedded in front of the bio */
struct bio_set iomap_ioend_bioset;
EXPORT_SYMBOL_GPL(iomap_ioend_bioset);

/*
 * Initialize the ioend embedded in @bio (allocated from iomap_ioend_bioset)
 * for a new I/O starting at @file_offset.  The ioend starts out with a single
 * io_remaining reference; splitting (iomap_split_ioend) takes extra ones.
 */
struct iomap_ioend *iomap_init_ioend(struct inode *inode,
		struct bio *bio, loff_t file_offset, u16 ioend_flags)
{
	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);

	atomic_set(&ioend->io_remaining, 1);
	ioend->io_error = 0;
	ioend->io_parent = NULL;
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_flags = ioend_flags;
	ioend->io_inode = inode;
	ioend->io_offset = file_offset;
	/* size and start sector are snapshotted from the bio at init time */
	ioend->io_size = bio->bi_iter.bi_size;
	ioend->io_sector = bio->bi_iter.bi_sector;
	ioend->io_private = NULL;
	return ioend;
}
EXPORT_SYMBOL_GPL(iomap_init_ioend);

/*
 * We're now finished for good with this ioend structure. Update the folio
 * state, release holds on bios, and finally free up memory. Do not use the
 * ioend after this.
 */
static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
	struct inode *inode = ioend->io_inode;
	struct bio *bio = &ioend->io_bio;
	struct folio_iter fi;
	u32 folio_count = 0;

	if (ioend->io_error) {
		mapping_set_error(inode->i_mapping, ioend->io_error);
		if (!bio_flagged(bio, BIO_QUIET)) {
			pr_err_ratelimited(
"%s: writeback error on inode %lu, offset %lld, sector %llu",
				inode->i_sb->s_id, inode->i_ino,
				ioend->io_offset, ioend->io_sector);
		}
	}

	/* walk all folios in bio, ending page IO on them */
	bio_for_each_folio_all(fi, bio) {
		if (ioend->io_error)
			/*
			 * GFP_ATOMIC: this may run with locks held in the
			 * deferred-failure worker path, so no sleeping
			 * allocations here.
			 */
			fserror_report_io(inode, FSERR_BUFFERED_WRITE,
					folio_pos(fi.folio) + fi.offset,
					fi.length, ioend->io_error,
					GFP_ATOMIC);
		iomap_finish_folio_write(inode, fi.folio, fi.length);
		folio_count++;
	}

	bio_put(bio);	/* frees the ioend */
	return folio_count;
}

/* protects failed_ioend_list; taken from bio end_io (irq) context */
static DEFINE_SPINLOCK(failed_ioend_lock);
static LIST_HEAD(failed_ioend_list);

/*
 * Work item that drains failed_ioend_list and runs buffered completion for
 * each failed ioend in task context.
 */
static void iomap_fail_ioends(struct work_struct *work)
{
	struct iomap_ioend *ioend;
	struct list_head tmp;
	unsigned long flags;

	/* splice the whole list out under the lock, then process unlocked */
	spin_lock_irqsave(&failed_ioend_lock, flags);
	list_replace_init(&failed_ioend_list, &tmp);
	spin_unlock_irqrestore(&failed_ioend_lock, flags);

	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
			io_list))) {
		list_del_init(&ioend->io_list);
		iomap_finish_ioend_buffered(ioend);
		/* completions can be long-running; avoid scheduler holdoffs */
		cond_resched();
	}
}

/*
 * NOTE(review): this uses the system workqueue via schedule_work().  As
 * writeback completion may be needed to make forward progress under memory
 * pressure, a dedicated WQ_MEM_RECLAIM workqueue may be required here —
 * confirm against the workqueue forward-progress guarantees.
 */
static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends);

static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend)
{
	unsigned long flags;

	/*
	 * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
	 * in the fserror code. The caller no longer owns the ioend reference
	 * after the spinlock drops.
	 */
	spin_lock_irqsave(&failed_ioend_lock, flags);
	/*
	 * Only kick the worker on the empty->non-empty transition; it drains
	 * the whole list in one pass.  The worker cannot splice the list
	 * before we add the ioend below because we still hold the lock.
	 */
	if (list_empty(&failed_ioend_list))
		WARN_ON_ONCE(!schedule_work(&failed_ioend_work));
	list_add_tail(&ioend->io_list, &failed_ioend_list);
	spin_unlock_irqrestore(&failed_ioend_lock, flags);
}

/* bio end_io handler for plain (non-split, non-merged) buffered writeback */
static void ioend_writeback_end_bio(struct bio *bio)
{
	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);

	ioend->io_error = blk_status_to_errno(bio->bi_status);
	if (ioend->io_error) {
		/* defers completion to task context; ioend ownership moves */
		iomap_fail_ioend_buffered(ioend);
		return;
	}

	iomap_finish_ioend_buffered(ioend);
}

/*
 * We cannot cancel the ioend directly in case of an error, so call the bio end
 * I/O handler with the error status here to run the normal I/O completion
 * handler.
 */
int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
{
	struct iomap_ioend *ioend = wpc->wb_ctx;

	if (!ioend->io_bio.bi_end_io)
		ioend->io_bio.bi_end_io = ioend_writeback_end_bio;

	/* anonymous writes need a filesystem-provided submit path */
	if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
		error = -EIO;

	if (error) {
		ioend->io_bio.bi_status = errno_to_blk_status(error);
		bio_endio(&ioend->io_bio);
		return error;
	}

	submit_bio(&ioend->io_bio);
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit);

/*
 * Allocate a new ioend (with embedded bio) for writeback starting at @pos,
 * and reset the per-ioend folio counter on the writepage context.
 */
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
		loff_t pos, u16 ioend_flags)
{
	struct bio *bio;

	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
			REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
			GFP_NOFS, &iomap_ioend_bioset);
	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
	bio->bi_write_hint = wpc->inode->i_write_hint;
	wbc_init_bio(wpc->wbc, bio);
	wpc->nr_folios = 0;
	return iomap_init_ioend(wpc->inode, bio, pos, ioend_flags);
}

/*
 * Decide whether the range starting at @pos can be appended to the cached
 * ioend, or whether a new ioend has to be started.
 */
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
		u16 ioend_flags)
{
	struct iomap_ioend *ioend = wpc->wb_ctx;

	if (ioend_flags & IOMAP_IOEND_BOUNDARY)
		return false;
	/* completion-relevant flags must match for the ranges to merge */
	if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
	    (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
		return false;
	/* must be logically contiguous */
	if (pos != ioend->io_offset + ioend->io_size)
		return false;
	/* and physically contiguous, unless the FS maps sectors itself */
	if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
	    iomap_sector(&wpc->iomap, pos) != bio_end_sector(&ioend->io_bio))
		return false;
	/*
	 * Limit ioend bio chain lengths to minimise IO completion latency. This
	 * also prevents long tight loops ending page writeback on all the
	 * folios in the ioend.
	 */
	if (wpc->nr_folios >= IOEND_BATCH_SIZE)
		return false;
	return true;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first; otherwise finish off the current ioend and start another.
 *
 * If a new ioend is created and cached, the old ioend is submitted to the block
 * layer instantly. Batching optimisations are provided by higher level block
 * plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
		loff_t pos, loff_t end_pos, unsigned int dirty_len)
{
	struct iomap_ioend *ioend = wpc->wb_ctx;
	size_t poff = offset_in_folio(folio, pos);
	unsigned int ioend_flags = 0;
	/* clamp the dirty range to the extent of the current mapping */
	unsigned int map_len = min_t(u64, dirty_len,
			wpc->iomap.offset + wpc->iomap.length - pos);
	int error;

	trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);

	WARN_ON_ONCE(!folio->private && map_len < dirty_len);

	switch (wpc->iomap.type) {
	case IOMAP_INLINE:
		/* inline data must never reach the writeback I/O path */
		WARN_ON_ONCE(1);
		return -EIO;
	case IOMAP_HOLE:
		/* nothing to write for holes, report the range as done */
		return map_len;
	default:
		break;
	}

	if (wpc->iomap.type == IOMAP_UNWRITTEN)
		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
	if (wpc->iomap.flags & IOMAP_F_SHARED)
		ioend_flags |= IOMAP_IOEND_SHARED;
	if (folio_test_dropbehind(folio))
		ioend_flags |= IOMAP_IOEND_DONTCACHE;
	if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
		ioend_flags |= IOMAP_IOEND_BOUNDARY;

	if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
		/* submit the previous ioend before starting a fresh one */
		if (ioend) {
			error = wpc->ops->writeback_submit(wpc, 0);
			if (error)
				return error;
		}
		wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
	}

	/* if the bio is full, submit it and retry with a new ioend */
	if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
		goto new_ioend;

	/*
	 * Clamp io_offset and io_size to the incore EOF so that ondisk
	 * file size updates in the ioend completion are byte-accurate.
	 * This avoids recovering files with zeroed tail regions when
	 * writeback races with appending writes:
	 *
	 * Thread 1:                  Thread 2:
	 * ------------               -----------
	 * write [A, A+B]
	 * update inode size to A+B
	 * submit I/O [A, A+BS]
	 *                            write [A+B, A+B+C]
	 *                            update inode size to A+B+C
	 * <I/O completes, updates disk size to min(A+B+C, A+BS)>
	 * <power failure>
	 *
	 * After reboot:
	 *   1) with A+B+C < A+BS, the file has zero padding in range
	 *      [A+B, A+B+C]
	 *
	 *	|<     Block Size (BS)   >|
	 *	|DDDDDDDDDDDD0000000000000|
	 *	^           ^        ^
	 *	A          A+B     A+B+C
	 *	                   (EOF)
	 *
	 *   2) with A+B+C > A+BS, the file has zero padding in range
	 *      [A+B, A+BS]
	 *
	 *	|<     Block Size (BS)   >|<     Block Size (BS)    >|
	 *	|DDDDDDDDDDDD0000000000000|00000000000000000000000000|
	 *	^           ^        ^           ^
	 *	A          A+B     A+BS       A+B+C
	 *	                   (EOF)
	 *
	 *	D = Valid Data
	 *	0 = Zero Padding
	 *
	 * Note that this defeats the ability to chain the ioends of
	 * appending writes.
	 */
	ioend->io_size += map_len;
	if (ioend->io_offset + ioend->io_size > end_pos)
		ioend->io_size = end_pos - ioend->io_offset;

	wbc_account_cgroup_owner(wpc->wbc, folio, map_len);
	return map_len;
}
EXPORT_SYMBOL_GPL(iomap_add_to_ioend);

/*
 * Drop one io_remaining reference on @ioend (resolving split children to
 * their parent first) and run the final completion when it hits zero.
 * Returns the number of folios completed, 0 if the ioend is still pending.
 */
static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
	if (ioend->io_parent) {
		/* split child: drop its bio and complete against the parent */
		struct bio *bio = &ioend->io_bio;

		ioend = ioend->io_parent;
		bio_put(bio);
	}

	/* record the first error only; later errors do not overwrite it */
	if (error)
		cmpxchg(&ioend->io_error, 0, error);

	if (!atomic_dec_and_test(&ioend->io_remaining))
		return 0;
	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
		return iomap_finish_ioend_direct(ioend);
	return iomap_finish_ioend_buffered(ioend);
}

/*
 * Ioend completion routine for merged bios. This can only be called from task
 * contexts as merged ioends can be of unbound length. Hence we have to break up
 * the writeback completions into manageable chunks to avoid long scheduler
 * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
 * good batch processing throughput without creating adverse scheduler latency
 * conditions.
 */
void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
	struct list_head tmp;
	u32 completions;

	might_sleep();

	list_replace_init(&ioend->io_list, &tmp);
	completions = iomap_finish_ioend(ioend, error);

	while (!list_empty(&tmp)) {
		if (completions > IOEND_BATCH_SIZE * 8) {
			cond_resched();
			completions = 0;
		}
		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
		list_del_init(&ioend->io_list);
		completions += iomap_finish_ioend(ioend, error);
	}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);

/*
 * We can merge two adjacent ioends if they have the same set of work to do.
 */
static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
		struct iomap_ioend *next)
{
	/*
	 * There is no point in merging reads as there is no completion
	 * processing that can be easily batched up for them.
	 */
	if (bio_op(&ioend->io_bio) == REQ_OP_READ ||
	    bio_op(&next->io_bio) == REQ_OP_READ)
		return false;

	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
		return false;
	if (next->io_flags & IOMAP_IOEND_BOUNDARY)
		return false;
	if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
	    (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
		return false;
	if (ioend->io_offset + ioend->io_size != next->io_offset)
		return false;
	/*
	 * Do not merge physically discontiguous ioends. The filesystem
	 * completion functions will have to iterate the physical
	 * discontiguities even if we merge the ioends at a logical level, so
	 * we don't gain anything by merging physical discontiguities here.
	 *
	 * We cannot use bio->bi_iter.bi_sector here as it is modified during
	 * submission so does not point to the start sector of the bio at
	 * completion.
	 */
	if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
	    next->io_sector)
		return false;
	return true;
}

/*
 * Merge as many leading entries of @more_ioends into @ioend as are logically
 * and physically contiguous with it.  @more_ioends is expected to be sorted
 * (see iomap_sort_ioends); merged entries move onto @ioend's io_list.
 */
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
		struct list_head *more_ioends)
{
	struct iomap_ioend *next;

	INIT_LIST_HEAD(&ioend->io_list);

	while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
			io_list))) {
		if (!iomap_ioend_can_merge(ioend, next))
			break;
		list_move_tail(&next->io_list, &ioend->io_list);
		ioend->io_size += next->io_size;
	}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);

/* list_sort() comparator: order ioends by ascending file offset */
static int iomap_ioend_compare(void *priv, const struct list_head *a,
		const struct list_head *b)
{
	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);

	if (ia->io_offset < ib->io_offset)
		return -1;
	if (ia->io_offset > ib->io_offset)
		return 1;
	return 0;
}

/* Sort a list of ioends by file offset so that merging can find neighbours */
void iomap_sort_ioends(struct list_head *ioend_list)
{
	list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);

/*
 * Split up to the first @max_len bytes from @ioend if the ioend covers more
 * than @max_len bytes.
 *
 * If @is_append is set, the split will be based on the hardware limits for
 * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
 * limits don't allow the entire @max_len length.
 *
 * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
 * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to
 * switch the operation after this call, but before submitting the bio.
 */
struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
		unsigned int max_len, bool is_append)
{
	struct bio *bio = &ioend->io_bio;
	struct iomap_ioend *split_ioend;
	unsigned int nr_segs;
	int sector_offset;
	struct bio *split;

	if (is_append) {
		struct queue_limits *lim = bdev_limits(bio->bi_bdev);

		max_len = min(max_len,
			lim->max_zone_append_sectors << SECTOR_SHIFT);

		sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
		if (unlikely(sector_offset < 0))
			return ERR_PTR(sector_offset);
		/* sector_offset == 0 means the whole bio fits, nothing to do */
		if (!sector_offset)
			return NULL;
	} else {
		if (bio->bi_iter.bi_size <= max_len)
			return NULL;
		sector_offset = max_len >> SECTOR_SHIFT;
	}

	/* ensure the split ioend is still block size aligned */
	sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
			i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;

	split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
	if (IS_ERR(split))
		return ERR_CAST(split);
	split->bi_private = bio->bi_private;
	split->bi_end_io = bio->bi_end_io;

	split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
			ioend->io_flags);
	/* the child holds a reference on the parent, dropped at completion */
	split_ioend->io_parent = ioend;

	atomic_inc(&ioend->io_remaining);
	ioend->io_offset += split_ioend->io_size;
	ioend->io_size -= split_ioend->io_size;

	split_ioend->io_sector = ioend->io_sector;
	/* zone append sectors are assigned by the hardware at dispatch */
	if (!is_append)
		ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
	return split_ioend;
}
EXPORT_SYMBOL_GPL(iomap_split_ioend);

/* initcall: size the bioset for 4 pages worth of sectors of front-pad ioends */
static int __init iomap_ioend_init(void)
{
	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
			offsetof(struct iomap_ioend, io_bio),
			BIOSET_NEED_BVECS);
}
fs_initcall(iomap_ioend_init);