1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2016-2025 Christoph Hellwig. 4 */ 5 #include <linux/iomap.h> 6 #include <linux/list_sort.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/fserror.h> 10 #include "internal.h" 11 #include "trace.h" 12 13 struct bio_set iomap_ioend_bioset; 14 EXPORT_SYMBOL_GPL(iomap_ioend_bioset); 15 16 struct iomap_ioend *iomap_init_ioend(struct inode *inode, 17 struct bio *bio, loff_t file_offset, u16 ioend_flags) 18 { 19 struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); 20 21 atomic_set(&ioend->io_remaining, 1); 22 ioend->io_error = 0; 23 ioend->io_parent = NULL; 24 INIT_LIST_HEAD(&ioend->io_list); 25 ioend->io_flags = ioend_flags; 26 ioend->io_inode = inode; 27 ioend->io_offset = file_offset; 28 ioend->io_size = bio->bi_iter.bi_size; 29 ioend->io_sector = bio->bi_iter.bi_sector; 30 ioend->io_private = NULL; 31 return ioend; 32 } 33 EXPORT_SYMBOL_GPL(iomap_init_ioend); 34 35 /* 36 * We're now finished for good with this ioend structure. Update the folio 37 * state, release holds on bios, and finally free up memory. Do not use the 38 * ioend after this. 39 */ 40 static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) 41 { 42 struct inode *inode = ioend->io_inode; 43 struct bio *bio = &ioend->io_bio; 44 struct folio_iter fi; 45 u32 folio_count = 0; 46 47 if (ioend->io_error) { 48 mapping_set_error(inode->i_mapping, ioend->io_error); 49 if (!bio_flagged(bio, BIO_QUIET)) { 50 pr_err_ratelimited( 51 "%s: writeback error on inode %lu, offset %lld, sector %llu", 52 inode->i_sb->s_id, inode->i_ino, 53 ioend->io_offset, ioend->io_sector); 54 } 55 } 56 57 /* walk all folios in bio, ending page IO on them */ 58 bio_for_each_folio_all(fi, bio) { 59 if (ioend->io_error) 60 fserror_report_io(inode, FSERR_BUFFERED_WRITE, 61 folio_pos(fi.folio) + fi.offset, 62 fi.length, ioend->io_error, 63 GFP_ATOMIC); 64 iomap_finish_folio_write(inode, fi.folio, fi.length); 65 folio_count++; 66 } 67 68 bio_put(bio); /* frees the ioend */ 69 return folio_count; 70 } 71 72 static DEFINE_SPINLOCK(failed_ioend_lock); 73 static LIST_HEAD(failed_ioend_list); 74 75 static void 76 iomap_fail_ioends( 77 struct work_struct *work) 78 { 79 struct iomap_ioend *ioend; 80 struct list_head tmp; 81 unsigned long flags; 82 83 spin_lock_irqsave(&failed_ioend_lock, flags); 84 list_replace_init(&failed_ioend_list, &tmp); 85 spin_unlock_irqrestore(&failed_ioend_lock, flags); 86 87 while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, 88 io_list))) { 89 list_del_init(&ioend->io_list); 90 iomap_finish_ioend_buffered(ioend); 91 cond_resched(); 92 } 93 } 94 95 static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends); 96 97 static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend) 98 { 99 unsigned long flags; 100 101 /* 102 * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions 103 * in the fserror code. The caller no longer owns the ioend reference 104 * after the spinlock drops. 105 */ 106 spin_lock_irqsave(&failed_ioend_lock, flags); 107 if (list_empty(&failed_ioend_list)) 108 WARN_ON_ONCE(!schedule_work(&failed_ioend_work)); 109 list_add_tail(&ioend->io_list, &failed_ioend_list); 110 spin_unlock_irqrestore(&failed_ioend_lock, flags); 111 } 112 113 static void ioend_writeback_end_bio(struct bio *bio) 114 { 115 struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); 116 117 ioend->io_error = blk_status_to_errno(bio->bi_status); 118 if (ioend->io_error) { 119 iomap_fail_ioend_buffered(ioend); 120 return; 121 } 122 123 iomap_finish_ioend_buffered(ioend); 124 } 125 126 /* 127 * We cannot cancel the ioend directly in case of an error, so call the bio end 128 * I/O handler with the error status here to run the normal I/O completion 129 * handler. 130 */ 131 int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error) 132 { 133 struct iomap_ioend *ioend = wpc->wb_ctx; 134 135 if (!ioend->io_bio.bi_end_io) 136 ioend->io_bio.bi_end_io = ioend_writeback_end_bio; 137 138 if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) 139 error = -EIO; 140 141 if (error) { 142 ioend->io_bio.bi_status = errno_to_blk_status(error); 143 bio_endio(&ioend->io_bio); 144 return error; 145 } 146 147 submit_bio(&ioend->io_bio); 148 return 0; 149 } 150 EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit); 151 152 static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, 153 loff_t pos, u16 ioend_flags) 154 { 155 struct bio *bio; 156 157 bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, 158 REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc), 159 GFP_NOFS, &iomap_ioend_bioset); 160 bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); 161 bio->bi_write_hint = wpc->inode->i_write_hint; 162 wbc_init_bio(wpc->wbc, bio); 163 wpc->nr_folios = 0; 164 return iomap_init_ioend(wpc->inode, bio, pos, ioend_flags); 165 } 166 167 static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, 168 u16 ioend_flags) 169 { 170 struct iomap_ioend *ioend = wpc->wb_ctx; 171 172 if (ioend_flags & IOMAP_IOEND_BOUNDARY) 173 return false; 174 if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != 175 (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) 176 return false; 177 if (pos != ioend->io_offset + ioend->io_size) 178 return false; 179 if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && 180 iomap_sector(&wpc->iomap, pos) != bio_end_sector(&ioend->io_bio)) 181 return false; 182 /* 183 * Limit ioend bio chain lengths to minimise IO completion latency. This 184 * also prevents long tight loops ending page writeback on all the 185 * folios in the ioend. 186 */ 187 if (wpc->nr_folios >= IOEND_BATCH_SIZE) 188 return false; 189 return true; 190 } 191 192 /* 193 * Test to see if we have an existing ioend structure that we could append to 194 * first; otherwise finish off the current ioend and start another. 195 * 196 * If a new ioend is created and cached, the old ioend is submitted to the block 197 * layer instantly. Batching optimisations are provided by higher level block 198 * plugging. 199 * 200 * At the end of a writeback pass, there will be a cached ioend remaining on the 201 * writepage context that the caller will need to submit. 202 */ 203 ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, 204 loff_t pos, loff_t end_pos, unsigned int dirty_len) 205 { 206 struct iomap_ioend *ioend = wpc->wb_ctx; 207 size_t poff = offset_in_folio(folio, pos); 208 unsigned int ioend_flags = 0; 209 unsigned int map_len = min_t(u64, dirty_len, 210 wpc->iomap.offset + wpc->iomap.length - pos); 211 int error; 212 213 trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap); 214 215 WARN_ON_ONCE(!folio->private && map_len < dirty_len); 216 217 switch (wpc->iomap.type) { 218 case IOMAP_UNWRITTEN: 219 ioend_flags |= IOMAP_IOEND_UNWRITTEN; 220 break; 221 case IOMAP_MAPPED: 222 break; 223 case IOMAP_HOLE: 224 return map_len; 225 default: 226 WARN_ON_ONCE(1); 227 return -EIO; 228 } 229 230 if (wpc->iomap.flags & IOMAP_F_SHARED) 231 ioend_flags |= IOMAP_IOEND_SHARED; 232 if (folio_test_dropbehind(folio)) 233 ioend_flags |= IOMAP_IOEND_DONTCACHE; 234 if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) 235 ioend_flags |= IOMAP_IOEND_BOUNDARY; 236 237 if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { 238 new_ioend: 239 if (ioend) { 240 error = wpc->ops->writeback_submit(wpc, 0); 241 if (error) 242 return error; 243 } 244 wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags); 245 } 246 247 if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) 248 goto new_ioend; 249 250 /* 251 * Clamp io_offset and io_size to the incore EOF so that ondisk 252 * file size updates in the ioend completion are byte-accurate. 253 * This avoids recovering files with zeroed tail regions when 254 * writeback races with appending writes: 255 * 256 * Thread 1: Thread 2: 257 * ------------ ----------- 258 * write [A, A+B] 259 * update inode size to A+B 260 * submit I/O [A, A+BS] 261 * write [A+B, A+B+C] 262 * update inode size to A+B+C 263 * <I/O completes, updates disk size to min(A+B+C, A+BS)> 264 * <power failure> 265 * 266 * After reboot: 267 * 1) with A+B+C < A+BS, the file has zero padding in range 268 * [A+B, A+B+C] 269 * 270 * |< Block Size (BS) >| 271 * |DDDDDDDDDDDD0000000000000| 272 * ^ ^ ^ 273 * A A+B A+B+C 274 * (EOF) 275 * 276 * 2) with A+B+C > A+BS, the file has zero padding in range 277 * [A+B, A+BS] 278 * 279 * |< Block Size (BS) >|< Block Size (BS) >| 280 * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| 281 * ^ ^ ^ ^ 282 * A A+B A+BS A+B+C 283 * (EOF) 284 * 285 * D = Valid Data 286 * 0 = Zero Padding 287 * 288 * Note that this defeats the ability to chain the ioends of 289 * appending writes. 290 */ 291 ioend->io_size += map_len; 292 if (ioend->io_offset + ioend->io_size > end_pos) 293 ioend->io_size = end_pos - ioend->io_offset; 294 295 wbc_account_cgroup_owner(wpc->wbc, folio, map_len); 296 return map_len; 297 } 298 EXPORT_SYMBOL_GPL(iomap_add_to_ioend); 299 300 static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) 301 { 302 if (ioend->io_parent) { 303 struct bio *bio = &ioend->io_bio; 304 305 ioend = ioend->io_parent; 306 bio_put(bio); 307 } 308 309 if (error) 310 cmpxchg(&ioend->io_error, 0, error); 311 312 if (!atomic_dec_and_test(&ioend->io_remaining)) 313 return 0; 314 if (ioend->io_flags & IOMAP_IOEND_DIRECT) 315 return iomap_finish_ioend_direct(ioend); 316 return iomap_finish_ioend_buffered(ioend); 317 } 318 319 /* 320 * Ioend completion routine for merged bios. This can only be called from task 321 * contexts as merged ioends can be of unbound length. Hence we have to break up 322 * the writeback completions into manageable chunks to avoid long scheduler 323 * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get 324 * good batch processing throughput without creating adverse scheduler latency 325 * conditions. 326 */ 327 void iomap_finish_ioends(struct iomap_ioend *ioend, int error) 328 { 329 struct list_head tmp; 330 u32 completions; 331 332 might_sleep(); 333 334 list_replace_init(&ioend->io_list, &tmp); 335 completions = iomap_finish_ioend(ioend, error); 336 337 while (!list_empty(&tmp)) { 338 if (completions > IOEND_BATCH_SIZE * 8) { 339 cond_resched(); 340 completions = 0; 341 } 342 ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); 343 list_del_init(&ioend->io_list); 344 completions += iomap_finish_ioend(ioend, error); 345 } 346 } 347 EXPORT_SYMBOL_GPL(iomap_finish_ioends); 348 349 /* 350 * We can merge two adjacent ioends if they have the same set of work to do. 351 */ 352 static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, 353 struct iomap_ioend *next) 354 { 355 /* 356 * There is no point in merging reads as there is no completion 357 * processing that can be easily batched up for them. 358 */ 359 if (bio_op(&ioend->io_bio) == REQ_OP_READ || 360 bio_op(&next->io_bio) == REQ_OP_READ) 361 return false; 362 363 if (ioend->io_bio.bi_status != next->io_bio.bi_status) 364 return false; 365 if (next->io_flags & IOMAP_IOEND_BOUNDARY) 366 return false; 367 if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != 368 (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) 369 return false; 370 if (ioend->io_offset + ioend->io_size != next->io_offset) 371 return false; 372 /* 373 * Do not merge physically discontiguous ioends. The filesystem 374 * completion functions will have to iterate the physical 375 * discontiguities even if we merge the ioends at a logical level, so 376 * we don't gain anything by merging physical discontiguities here. 377 * 378 * We cannot use bio->bi_iter.bi_sector here as it is modified during 379 * submission so does not point to the start sector of the bio at 380 * completion. 381 */ 382 if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) != 383 next->io_sector) 384 return false; 385 return true; 386 } 387 388 void iomap_ioend_try_merge(struct iomap_ioend *ioend, 389 struct list_head *more_ioends) 390 { 391 struct iomap_ioend *next; 392 393 INIT_LIST_HEAD(&ioend->io_list); 394 395 while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, 396 io_list))) { 397 if (!iomap_ioend_can_merge(ioend, next)) 398 break; 399 list_move_tail(&next->io_list, &ioend->io_list); 400 ioend->io_size += next->io_size; 401 } 402 } 403 EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); 404 405 static int iomap_ioend_compare(void *priv, const struct list_head *a, 406 const struct list_head *b) 407 { 408 struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); 409 struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); 410 411 if (ia->io_offset < ib->io_offset) 412 return -1; 413 if (ia->io_offset > ib->io_offset) 414 return 1; 415 return 0; 416 } 417 418 void iomap_sort_ioends(struct list_head *ioend_list) 419 { 420 list_sort(NULL, ioend_list, iomap_ioend_compare); 421 } 422 EXPORT_SYMBOL_GPL(iomap_sort_ioends); 423 424 /* 425 * Split up to the first @max_len bytes from @ioend if the ioend covers more 426 * than @max_len bytes. 427 * 428 * If @is_append is set, the split will be based on the hardware limits for 429 * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware 430 * limits don't allow the entire @max_len length. 431 * 432 * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer 433 * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to 434 * switch the operation after this call, but before submitting the bio. 435 */ 436 struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, 437 unsigned int max_len, bool is_append) 438 { 439 struct bio *bio = &ioend->io_bio; 440 struct iomap_ioend *split_ioend; 441 unsigned int nr_segs; 442 int sector_offset; 443 struct bio *split; 444 445 if (is_append) { 446 struct queue_limits *lim = bdev_limits(bio->bi_bdev); 447 448 max_len = min(max_len, 449 lim->max_zone_append_sectors << SECTOR_SHIFT); 450 451 sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); 452 if (unlikely(sector_offset < 0)) 453 return ERR_PTR(sector_offset); 454 if (!sector_offset) 455 return NULL; 456 } else { 457 if (bio->bi_iter.bi_size <= max_len) 458 return NULL; 459 sector_offset = max_len >> SECTOR_SHIFT; 460 } 461 462 /* ensure the split ioend is still block size aligned */ 463 sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, 464 i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; 465 466 split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); 467 if (IS_ERR(split)) 468 return ERR_CAST(split); 469 split->bi_private = bio->bi_private; 470 split->bi_end_io = bio->bi_end_io; 471 472 split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, 473 ioend->io_flags); 474 split_ioend->io_parent = ioend; 475 476 atomic_inc(&ioend->io_remaining); 477 ioend->io_offset += split_ioend->io_size; 478 ioend->io_size -= split_ioend->io_size; 479 480 split_ioend->io_sector = ioend->io_sector; 481 if (!is_append) 482 ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); 483 return split_ioend; 484 } 485 EXPORT_SYMBOL_GPL(iomap_split_ioend); 486 487 static int __init iomap_ioend_init(void) 488 { 489 return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), 490 offsetof(struct iomap_ioend, io_bio), 491 BIOSET_NEED_BVECS); 492 } 493 fs_initcall(iomap_ioend_init); 494