// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

struct btrfs_failed_bio {
	struct btrfs_bio *bbio;
	int num_copies;
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(const struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(bbio->inode);
}

static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
		    btrfs_bio_end_io_t end_io, void *private)
{
	/* @inode parameter is mandatory. */
	ASSERT(inode);

	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->inode = inode;
	bbio->end_io = end_io;
	bbio->private = private;
	bbio->file_offset = file_offset;
	atomic_set(&bbio->pending_ios, 1);
	WRITE_ONCE(bbio->status, BLK_STS_OK);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_inode *inode, u64 file_offset,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, inode, file_offset, end_io, private);
	return bbio;
}

static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
			&btrfs_clone_bioset);
	if (IS_ERR(bio))
		return ERR_CAST(bio);

	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
		bbio->orig_logical = orig_bbio->orig_logical;
		orig_bbio->orig_logical += map_length;
	}
	bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}
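
/*
 * Illustrative sketch only, not wired into any btrfs code path: a minimal
 * example of how a submitter is expected to use btrfs_bio_alloc() above
 * together with btrfs_submit_bbio() (declared in bio.h), assuming a single
 * data block that fits into one page.  The names example_read_end_io() and
 * example_submit_block_read() are hypothetical.
 */
static void example_read_end_io(struct btrfs_bio *bbio)
{
	/*
	 * Called exactly once for the original bbio; the first error of any
	 * split part is found in bbio->bio.bi_status.
	 */
	bio_put(&bbio->bio);
}

static void __maybe_unused example_submit_block_read(struct btrfs_inode *inode,
						     u64 file_offset, u64 logical,
						     struct page *page)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_bio *bbio;
	int ret;

	bbio = btrfs_bio_alloc(1, REQ_OP_READ, inode, file_offset,
			       example_read_end_io, NULL);
	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, 0);
	ASSERT(ret == fs_info->sectorsize);

	/* Mirror number 0 lets the chunk mapping code pick a mirror. */
	btrfs_submit_bbio(bbio, 0);
}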

void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	/* Make sure we're already in task context. */
	ASSERT(in_task());

	if (bbio->async_csum)
		wait_for_completion(&bbio->csum_done);

	bbio->bio.bi_status = status;
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		/* Free bio that was never submitted to the underlying device. */
		if (bbio_has_ordered_extent(bbio))
			btrfs_put_ordered_extent(bbio->ordered);
		bio_put(&bbio->bio);

		bbio = orig_bbio;
	}

	/*
	 * At this point, bbio always points to the original btrfs_bio. Save
	 * the first error in it.
	 */
	if (status != BLK_STS_OK)
		cmpxchg(&bbio->status, BLK_STS_OK, status);

	if (atomic_dec_and_test(&bbio->pending_ios)) {
		/* Load split bio's error which might be set above. */
		if (status == BLK_STS_OK)
			bbio->bio.bi_status = READ_ONCE(bbio->status);

		if (bbio_has_ordered_extent(bbio)) {
			struct btrfs_ordered_extent *ordered = bbio->ordered;

			bbio->end_io(bbio);
			btrfs_put_ordered_extent(ordered);
		} else {
			bbio->end_io(bbio);
		}
	}
}

static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}

static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}
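
/*
 * Illustrative sketch only: how the rotation helpers above walk every other
 * copy.  With num_copies == 3 and a read that failed on mirror 2,
 * next_repair_mirror() yields 3, then 1, then wraps back to 2, at which point
 * the repair code gives up.  example_count_repair_candidates() is hypothetical
 * and not used by the real repair path; @failed_mirror must be in
 * [1, num_copies].
 */
static int __maybe_unused example_count_repair_candidates(int num_copies,
							   int failed_mirror)
{
	struct btrfs_failed_bio fbio = { .num_copies = num_copies };
	int mirror = failed_mirror;
	int candidates = 0;

	while ((mirror = next_repair_mirror(&fbio, mirror)) != failed_mirror)
		candidates++;

	/* Every copy except the failed one is a candidate for the repair read. */
	return candidates;
}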

static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	/*
	 * We can not advance the saved_iter itself, as it will be used by
	 * repair_bbio again later.
	 */
	struct bvec_iter saved_iter = repair_bbio->saved_iter;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
	const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
	int mirror = repair_bbio->mirror_num;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	unsigned int slot = 0;

	/* The repair bbio should be exactly one block sized. */
	ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);

	btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
		ASSERT(slot < nr_steps);
		paddrs[slot] = paddr;
		slot++;
	}

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bbio(repair_bbio, mirror);
		return;
	}

	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
					repair_bbio->file_offset, fs_info->sectorsize,
					logical, paddrs, step, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but also tries to write the good data back to the bad mirror(s) when the
 * read succeeds, to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  phys_addr_t paddrs[],
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	/*
	 * For bs > ps cases, the saved_iter can be partially moved forward.
	 * In that case we should round it down to the block boundary.
	 */
246 */ 247 const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT, 248 sectorsize); 249 struct btrfs_bio *repair_bbio; 250 struct bio *repair_bio; 251 int num_copies; 252 int mirror; 253 254 btrfs_debug(fs_info, "repair read error: read error at %llu", 255 failed_bbio->file_offset + bio_offset); 256 257 num_copies = btrfs_num_copies(fs_info, logical, sectorsize); 258 if (num_copies == 1) { 259 btrfs_debug(fs_info, "no copy to repair from"); 260 failed_bbio->bio.bi_status = BLK_STS_IOERR; 261 return fbio; 262 } 263 264 if (!fbio) { 265 fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS); 266 fbio->bbio = failed_bbio; 267 fbio->num_copies = num_copies; 268 atomic_set(&fbio->repair_count, 1); 269 } 270 271 atomic_inc(&fbio->repair_count); 272 273 repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS, 274 &btrfs_repair_bioset); 275 repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT; 276 for (int i = 0; i < nr_steps; i++) { 277 int ret; 278 279 ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE); 280 281 ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step, 282 offset_in_page(paddrs[i])); 283 ASSERT(ret == step); 284 } 285 286 repair_bbio = btrfs_bio(repair_bio); 287 btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset, 288 NULL, fbio); 289 290 mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); 291 btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); 292 btrfs_submit_bbio(repair_bbio, mirror); 293 return fbio; 294 } 295 296 static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev) 297 { 298 struct btrfs_inode *inode = bbio->inode; 299 struct btrfs_fs_info *fs_info = inode->root->fs_info; 300 const u32 sectorsize = fs_info->sectorsize; 301 const u32 step = min(sectorsize, PAGE_SIZE); 302 const u32 nr_steps = sectorsize / step; 303 struct bvec_iter *iter = &bbio->saved_iter; 304 blk_status_t status = bbio->bio.bi_status; 305 struct btrfs_failed_bio *fbio = NULL; 306 phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; 307 phys_addr_t paddr; 308 u32 offset = 0; 309 310 /* Read-repair requires the inode field to be set by the submitter. */ 311 ASSERT(inode); 312 313 /* 314 * Hand off repair bios to the repair code as there is no upper level 315 * submitter for them. 316 */ 317 if (bbio->bio.bi_pool == &btrfs_repair_bioset) { 318 btrfs_end_repair_bio(bbio, dev); 319 return; 320 } 321 322 /* Clear the I/O error. A failed repair will reset it. 
	bbio->bio.bi_status = BLK_STS_OK;

	btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
		paddrs[(offset / step) % nr_steps] = paddr;
		offset += step;

		if (IS_ALIGNED(offset, sectorsize)) {
			if (status ||
			    !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
				fbio = repair_one_sector(bbio, offset - sectorsize,
							 paddrs, fbio);
		}
	}
	if (bbio->csum != bbio->csum_inline)
		kvfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
						const struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

static void simple_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;

	if (bio_op(bio) == REQ_OP_READ) {
		/* Metadata reads are checked and repaired by the submitter. */
		if (is_data_bbio(bbio))
			return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
		return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	}
	if (bio_is_zone_append(bio) && !bio->bi_status)
		btrfs_record_physical_zoned(bbio);
	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	INIT_WORK(&bbio->end_io_work, simple_end_io_work);
	queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	/* RAID56 endio is always handled in a workqueue. */
	ASSERT(in_task());

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);

	btrfs_put_bioc(bioc);
}

static void orig_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
434 */ 435 if (atomic_read(&bioc->error) > bioc->max_errors) 436 bio->bi_status = BLK_STS_IOERR; 437 else 438 bio->bi_status = BLK_STS_OK; 439 440 if (bio_is_zone_append(bio) && !bio->bi_status) 441 stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 442 443 btrfs_bio_end_io(bbio, bbio->bio.bi_status); 444 btrfs_put_bioc(bioc); 445 } 446 447 static void btrfs_orig_write_end_io(struct bio *bio) 448 { 449 struct btrfs_bio *bbio = btrfs_bio(bio); 450 451 INIT_WORK(&bbio->end_io_work, orig_write_end_io_work); 452 queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work); 453 } 454 455 static void clone_write_end_io_work(struct work_struct *work) 456 { 457 struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); 458 struct bio *bio = &bbio->bio; 459 struct btrfs_io_stripe *stripe = bio->bi_private; 460 461 if (bio->bi_status) { 462 atomic_inc(&stripe->bioc->error); 463 btrfs_log_dev_io_error(bio, stripe->dev); 464 } else if (bio_is_zone_append(bio)) { 465 stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 466 } 467 468 /* Pass on control to the original bio this one was cloned from */ 469 bio_endio(stripe->bioc->orig_bio); 470 bio_put(bio); 471 } 472 473 static void btrfs_clone_write_end_io(struct bio *bio) 474 { 475 struct btrfs_bio *bbio = btrfs_bio(bio); 476 477 INIT_WORK(&bbio->end_io_work, clone_write_end_io_work); 478 queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work); 479 } 480 481 static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) 482 { 483 if (!dev || !dev->bdev || 484 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 485 (btrfs_op(bio) == BTRFS_MAP_WRITE && 486 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 487 bio_io_error(bio); 488 return; 489 } 490 491 bio_set_dev(bio, dev->bdev); 492 493 /* 494 * For zone append writing, bi_sector must point the beginning of the 495 * zone 496 */ 497 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 498 u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 499 u64 zone_start = round_down(physical, dev->fs_info->zone_size); 500 501 ASSERT(btrfs_dev_is_sequential(dev, physical)); 502 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 503 } 504 btrfs_debug(dev->fs_info, 505 "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 506 __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 507 (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), 508 dev->devid, bio->bi_iter.bi_size); 509 510 /* 511 * Track reads if tracking is enabled; ignore I/O operations before the 512 * filesystem is fully initialized. 513 */ 514 if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) 515 percpu_counter_add(&dev->fs_info->stats_read_blocks, 516 bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); 517 518 if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) 519 blkcg_punt_bio_submit(bio); 520 else 521 submit_bio(bio); 522 } 523 524 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) 525 { 526 struct bio *orig_bio = bioc->orig_bio, *bio; 527 struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio); 528 529 ASSERT(bio_op(orig_bio) != REQ_OP_READ); 530 531 /* Reuse the bio embedded into the btrfs_bio for the last mirror */ 532 if (dev_nr == bioc->num_stripes - 1) { 533 bio = orig_bio; 534 bio->bi_end_io = btrfs_orig_write_end_io; 535 } else { 536 /* We need to use endio_work to run end_io in task context. 

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;
	struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror. */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		/* We need to use endio_work to run end_io in task context. */
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
		bio_inc_remaining(orig_bio);
		btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
			       orig_bbio->file_offset, NULL, NULL);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			     struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
#ifdef CONFIG_BTRFS_EXPERIMENTAL
	return btrfs_csum_one_bio(bbio, true);
#else
	return btrfs_csum_one_bio(bbio, false);
#endif
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	int ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = errno_to_blk_status(ret);
}
632 */ 633 static void run_one_async_done(struct btrfs_work *work, bool do_free) 634 { 635 struct async_submit_bio *async = 636 container_of(work, struct async_submit_bio, work); 637 struct bio *bio = &async->bbio->bio; 638 639 if (do_free) { 640 kfree(container_of(work, struct async_submit_bio, work)); 641 return; 642 } 643 644 /* If an error occurred we just want to clean up the bio and move on. */ 645 if (bio->bi_status) { 646 btrfs_bio_end_io(async->bbio, bio->bi_status); 647 return; 648 } 649 650 /* 651 * All of the bios that pass through here are from async helpers. 652 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's 653 * context. This changes nothing when cgroups aren't in use. 654 */ 655 bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; 656 btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); 657 } 658 659 static bool should_async_write(struct btrfs_bio *bbio) 660 { 661 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 662 bool auto_csum_mode = true; 663 664 #ifdef CONFIG_BTRFS_EXPERIMENTAL 665 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 666 enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); 667 668 if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON) 669 return true; 670 /* 671 * Write bios will calculate checksum and submit bio at the same time. 672 * Unless explicitly required don't offload serial csum calculate and bio 673 * submit into a workqueue. 674 */ 675 return false; 676 #endif 677 678 /* Submit synchronously if the checksum implementation is fast. */ 679 if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) 680 return false; 681 682 /* 683 * Try to defer the submission to a workqueue to parallelize the 684 * checksum calculation unless the I/O is issued synchronously. 685 */ 686 if (op_is_sync(bbio->bio.bi_opf)) 687 return false; 688 689 /* Zoned devices require I/O to be submitted in order. */ 690 if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info)) 691 return false; 692 693 return true; 694 } 695 696 /* 697 * Submit bio to an async queue. 698 * 699 * Return true if the work has been successfully submitted, else false. 700 */ 701 static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, 702 struct btrfs_io_context *bioc, 703 struct btrfs_io_stripe *smap, int mirror_num) 704 { 705 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 706 struct async_submit_bio *async; 707 708 async = kmalloc(sizeof(*async), GFP_NOFS); 709 if (!async) 710 return false; 711 712 async->bbio = bbio; 713 async->bioc = bioc; 714 async->smap = *smap; 715 async->mirror_num = mirror_num; 716 717 btrfs_init_work(&async->work, run_one_async_start, run_one_async_done); 718 btrfs_queue_work(fs_info->workers, &async->work); 719 return true; 720 } 721 722 static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) 723 { 724 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 725 unsigned int nr_segs; 726 int sector_offset; 727 728 map_length = min(map_length, fs_info->max_zone_append_size); 729 sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits, 730 &nr_segs, map_length); 731 if (sector_offset) { 732 /* 733 * bio_split_rw_at() could split at a size smaller than our 734 * sectorsize and thus cause unaligned I/Os. Fix that by 735 * always rounding down to the nearest boundary. 
736 */ 737 return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize); 738 } 739 return map_length; 740 } 741 742 static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) 743 { 744 struct btrfs_inode *inode = bbio->inode; 745 struct btrfs_fs_info *fs_info = inode->root->fs_info; 746 struct bio *bio = &bbio->bio; 747 u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 748 u64 length = bio->bi_iter.bi_size; 749 u64 map_length = length; 750 bool use_append = btrfs_use_zone_append(bbio); 751 struct btrfs_io_context *bioc = NULL; 752 struct btrfs_io_stripe smap; 753 blk_status_t status; 754 int ret; 755 756 if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root)) 757 smap.rst_search_commit_root = true; 758 else 759 smap.rst_search_commit_root = false; 760 761 btrfs_bio_counter_inc_blocked(fs_info); 762 ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, 763 &bioc, &smap, &mirror_num); 764 if (ret) { 765 status = errno_to_blk_status(ret); 766 btrfs_bio_counter_dec(fs_info); 767 goto end_bbio; 768 } 769 770 /* 771 * For fscrypt writes we will get the encrypted bio after we've remapped 772 * our bio to the physical disk location, so we need to save the 773 * original bytenr so we know what we're checksumming. 774 */ 775 if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio)) 776 bbio->orig_logical = logical; 777 778 map_length = min(map_length, length); 779 if (use_append) 780 map_length = btrfs_append_map_length(bbio, map_length); 781 782 if (map_length < length) { 783 struct btrfs_bio *split; 784 785 split = btrfs_split_bio(fs_info, bbio, map_length); 786 if (IS_ERR(split)) { 787 status = errno_to_blk_status(PTR_ERR(split)); 788 btrfs_bio_counter_dec(fs_info); 789 goto end_bbio; 790 } 791 bbio = split; 792 bio = &bbio->bio; 793 } 794 795 /* 796 * Save the iter for the end_io handler and preload the checksums for 797 * data reads. 798 */ 799 if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { 800 bbio->saved_iter = bio->bi_iter; 801 ret = btrfs_lookup_bio_sums(bbio); 802 status = errno_to_blk_status(ret); 803 if (status) 804 goto fail; 805 } 806 807 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 808 if (use_append) { 809 bio->bi_opf &= ~REQ_OP_WRITE; 810 bio->bi_opf |= REQ_OP_ZONE_APPEND; 811 } 812 813 if (is_data_bbio(bbio) && bioc && bioc->use_rst) { 814 /* 815 * No locking for the list update, as we only add to 816 * the list in the I/O submission path, and list 817 * iteration only happens in the completion path, which 818 * can't happen until after the last submission. 819 */ 820 btrfs_get_bioc(bioc); 821 list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list); 822 } 823 824 /* 825 * Csum items for reloc roots have already been cloned at this 826 * point, so they are handled as part of the no-checksum case. 
827 */ 828 if (!(inode->flags & BTRFS_INODE_NODATASUM) && 829 !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && 830 !btrfs_is_data_reloc_root(inode->root)) { 831 if (should_async_write(bbio) && 832 btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) 833 goto done; 834 835 ret = btrfs_bio_csum(bbio); 836 status = errno_to_blk_status(ret); 837 if (status) 838 goto fail; 839 } else if (use_append || 840 (btrfs_is_zoned(fs_info) && inode && 841 inode->flags & BTRFS_INODE_NODATASUM)) { 842 ret = btrfs_alloc_dummy_sum(bbio); 843 status = errno_to_blk_status(ret); 844 if (status) 845 goto fail; 846 } 847 } 848 849 btrfs_submit_bio(bio, bioc, &smap, mirror_num); 850 done: 851 return map_length == length; 852 853 fail: 854 btrfs_bio_counter_dec(fs_info); 855 /* 856 * We have split the original bbio, now we have to end both the current 857 * @bbio and remaining one, as the remaining one will never be submitted. 858 */ 859 if (map_length < length) { 860 struct btrfs_bio *remaining = bbio->private; 861 862 ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); 863 ASSERT(remaining); 864 865 btrfs_bio_end_io(remaining, status); 866 } 867 end_bbio: 868 btrfs_bio_end_io(bbio, status); 869 /* Do not submit another chunk */ 870 return true; 871 } 872 873 static void assert_bbio_alignment(struct btrfs_bio *bbio) 874 { 875 #ifdef CONFIG_BTRFS_ASSERT 876 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 877 struct bio_vec bvec; 878 struct bvec_iter iter; 879 const u32 blocksize = fs_info->sectorsize; 880 const u32 alignment = min(blocksize, PAGE_SIZE); 881 const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; 882 const u32 length = bbio->bio.bi_iter.bi_size; 883 884 /* The logical and length should still be aligned to blocksize. */ 885 ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) && 886 length != 0, "root=%llu inode=%llu logical=%llu length=%u", 887 btrfs_root_id(bbio->inode->root), 888 btrfs_ino(bbio->inode), logical, length); 889 890 bio_for_each_bvec(bvec, &bbio->bio, iter) 891 ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) && 892 IS_ALIGNED(bvec.bv_len, alignment), 893 "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u", 894 btrfs_root_id(bbio->inode->root), 895 btrfs_ino(bbio->inode), logical, length, iter.bi_idx, 896 bvec.bv_offset, bvec.bv_len); 897 #endif 898 } 899 900 void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) 901 { 902 /* If bbio->inode is not populated, its file_offset must be 0. */ 903 ASSERT(bbio->inode || bbio->file_offset == 0); 904 905 assert_bbio_alignment(bbio); 906 907 while (!btrfs_submit_chunk(bbio, mirror_num)) 908 ; 909 } 910 911 /* 912 * Submit a repair write. 913 * 914 * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a 915 * RAID setup. Here we only want to write the one bad copy, so we do the 916 * mapping ourselves and submit the bio directly. 917 * 918 * The I/O is issued synchronously to block the repair read completion from 919 * freeing the bio. 920 * 921 * @ino: Offending inode number 922 * @fileoff: File offset inside the inode 923 * @length: Length of the repair write 924 * @logical: Logical address of the range 925 * @paddrs: Physical address array of the content 926 * @step: Length of for each paddrs 927 * @mirror_num: Mirror number to write to. 

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in
 * a RAID setup.  Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 *
 * @ino:		Offending inode number
 * @fileoff:		File offset inside the inode
 * @length:		Length of the repair write
 * @logical:		Logical address of the range
 * @paddrs:		Physical address array of the content
 * @step:		Length of each @paddrs entry
 * @mirror_num:		Mirror number to write to.  Must not be zero.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
			    u32 length, u64 logical, const phys_addr_t paddrs[],
			    unsigned int step, int mirror_num)
{
	const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
	struct btrfs_io_stripe smap = { 0 };
	struct bio *bio = NULL;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	/* Basic alignment checks. */
	ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
	/* Either it's a single data or metadata block. */
	ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
	ASSERT(step <= length);
	ASSERT(is_power_of_2(step));

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	if (unlikely(!smap.dev->bdev ||
		     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
	bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	for (int i = 0; i < nr_steps; i++) {
		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
		/* We should have allocated enough slots to contain all the different pages. */
		ASSERT(ret == step);
	}
	ret = submit_bio_wait(bio);
	bio_put(bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_counter_dec;
	}

	btrfs_info_rl(fs_info,
		      "read error corrected: ino %llu off %llu (dev %s sector %llu)",
		      ino, fileoff, btrfs_dev_name(smap.dev),
		      smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}
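
/*
 * Illustrative sketch only: how a caller holding a verified good copy of a
 * single block in memory could write it back over a known bad mirror with
 * btrfs_repair_io_failure().  It assumes a block size not larger than
 * PAGE_SIZE, so one page covers the whole block; example_rewrite_block() is
 * hypothetical and not part of the btrfs repair paths.
 */
static int __maybe_unused example_rewrite_block(struct btrfs_inode *inode,
						u64 file_offset, u64 logical,
						struct page *good_copy,
						int bad_mirror)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	phys_addr_t paddr = page_to_phys(good_copy);

	/* One step covers the whole block, so a single-entry array suffices. */
	return btrfs_repair_io_failure(fs_info, btrfs_ino(inode), file_offset,
				       fs_info->sectorsize, logical, &paddr,
				       fs_info->sectorsize, bad_mirror);
}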
999 */ 1000 void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) 1001 { 1002 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 1003 u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; 1004 u64 length = bbio->bio.bi_iter.bi_size; 1005 struct btrfs_io_stripe smap = { 0 }; 1006 int ret; 1007 1008 ASSERT(mirror_num > 0); 1009 ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); 1010 ASSERT(!is_data_inode(bbio->inode)); 1011 ASSERT(bbio->is_scrub); 1012 1013 btrfs_bio_counter_inc_blocked(fs_info); 1014 ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); 1015 if (ret < 0) 1016 goto fail; 1017 1018 if (dev_replace) { 1019 ASSERT(smap.dev == fs_info->dev_replace.srcdev); 1020 smap.dev = fs_info->dev_replace.tgtdev; 1021 } 1022 btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); 1023 return; 1024 1025 fail: 1026 btrfs_bio_counter_dec(fs_info); 1027 btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); 1028 } 1029 1030 int __init btrfs_bioset_init(void) 1031 { 1032 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, 1033 offsetof(struct btrfs_bio, bio), 1034 BIOSET_NEED_BVECS)) 1035 return -ENOMEM; 1036 if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, 1037 offsetof(struct btrfs_bio, bio), 0)) 1038 goto out; 1039 if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, 1040 offsetof(struct btrfs_bio, bio), 1041 BIOSET_NEED_BVECS)) 1042 goto out; 1043 if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, 1044 sizeof(struct btrfs_failed_bio))) 1045 goto out; 1046 return 0; 1047 1048 out: 1049 btrfs_bioset_exit(); 1050 return -ENOMEM; 1051 } 1052 1053 void __cold btrfs_bioset_exit(void) 1054 { 1055 mempool_exit(&btrfs_failed_bio_pool); 1056 bioset_exit(&btrfs_repair_bioset); 1057 bioset_exit(&btrfs_clone_bioset); 1058 bioset_exit(&btrfs_bioset); 1059 } 1060