// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

/* Bio sets backing the various btrfs_bio allocations below. */
static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

/*
 * State tracking one failed read bio while its sectors are retried against
 * the remaining mirrors.
 */
struct btrfs_failed_bio {
	/* The original bio that failed checksum verification or I/O. */
	struct btrfs_bio *bbio;
	/* Number of copies available for this block group profile. */
	int num_copies;
	/* Outstanding repair reads plus one for the submitter. */
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(const struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(bbio->inode);
}

/* Data writes carry an ordered extent reference; see btrfs_split_bio(). */
static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
		    btrfs_bio_end_io_t end_io, void *private)
{
	/* @inode parameter is mandatory. */
	ASSERT(inode);

	/* Zero everything up to, but excluding, the embedded struct bio. */
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->inode = inode;
	bbio->end_io = end_io;
	bbio->private = private;
	bbio->file_offset = file_offset;
	/* One reference held by the submitter until the last chunk is issued. */
	atomic_set(&bbio->pending_ios, 1);
	WRITE_ONCE(bbio->status, BLK_STS_OK);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_inode *inode, u64 file_offset,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, inode, file_offset, end_io, private);
	return bbio;
}

/*
 * Split off the first @map_length bytes of @orig_bbio into a new btrfs_bio
 * allocated from btrfs_clone_bioset, advancing @orig_bbio past the split
 * point.  The split bio points back at @orig_bbio via ->private and pins it
 * through ->pending_ios until completion.
 */
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
			&btrfs_clone_bioset);
	if (IS_ERR(bio))
		return ERR_CAST(bio);

	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		/* Both halves share the ordered extent; take an extra ref. */
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
		bbio->orig_logical = orig_bbio->orig_logical;
		orig_bbio->orig_logical += map_length;
	}

	/* Propagate the submission flags the split half needs to carry. */
	bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
	bbio->can_use_append = orig_bbio->can_use_append;
	bbio->is_scrub = orig_bbio->is_scrub;
	bbio->is_remap = orig_bbio->is_remap;
	bbio->async_csum = orig_bbio->async_csum;

	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}

/*
 * Complete a btrfs_bio with the given status.
 *
 * For a split bio (allocated from btrfs_clone_bioset) this frees the split
 * half, propagates the first error into the original bio, and only invokes
 * the end_io callback once the last outstanding chunk has finished.
 */
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	/* Make sure we're already in task context. */
	ASSERT(in_task());

	/* Wait for an async checksum calculation still in flight to finish. */
	if (bbio->async_csum)
		wait_for_completion(&bbio->csum_done);

	bbio->bio.bi_status = status;
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		/* Free bio that was never submitted to the underlying device. */
		if (bbio_has_ordered_extent(bbio))
			btrfs_put_ordered_extent(bbio->ordered);
		bio_put(&bbio->bio);

		bbio = orig_bbio;
	}

	/*
	 * At this point, bbio always points to the original btrfs_bio. Save
	 * the first error in it.
	 */
	if (status != BLK_STS_OK)
		cmpxchg(&bbio->status, BLK_STS_OK, status);

	if (atomic_dec_and_test(&bbio->pending_ios)) {
		/* Load split bio's error which might be set above. */
		if (status == BLK_STS_OK)
			bbio->bio.bi_status = READ_ONCE(bbio->status);

		if (bbio_has_ordered_extent(bbio)) {
			struct btrfs_ordered_extent *ordered = bbio->ordered;

			/* Drop the ordered extent ref only after end_io ran. */
			bbio->end_io(bbio);
			btrfs_put_ordered_extent(ordered);
		} else {
			bbio->end_io(bbio);
		}
	}
}

/* Next mirror to try, wrapping around from num_copies back to 1. */
static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

/* Previous mirror in the rotation, wrapping around from 1 to num_copies. */
static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}

/*
 * Drop one reference on the failed-bio tracker; the last reference completes
 * the original bio and frees the tracker back to its mempool.
 */
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}

/*
 * Completion handler for a single-block repair read.
 *
 * If the read from the current mirror failed (I/O error or checksum
 * mismatch), the bio is reset and resubmitted to the next mirror.  Once a
 * good copy was read, it is written back to every previously failed mirror
 * via btrfs_repair_io_failure() to restore redundancy.
 */
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	/*
	 * We can not move forward the saved_iter, as it will be later
	 * utilized by repair_bbio again.
	 */
	struct bvec_iter saved_iter = repair_bbio->saved_iter;
	/* For bs > ps one block spans several pages; iterate page-sized steps. */
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
	const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
	int mirror = repair_bbio->mirror_num;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	unsigned int slot = 0;

	/* Repair bbio should be exactly one block sized. */
	ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);

	/* Collect the physical address of every step of the block. */
	btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
		ASSERT(slot < nr_steps);
		paddrs[slot] = paddr;
		slot++;
	}

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
		/* This mirror is bad too; retry the next one with a fresh bio. */
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			/* We exhausted all mirrors without a good copy. */
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bbio(repair_bbio, mirror);
		return;
	}

	/* Good copy read; write it back to every mirror that failed before. */
	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				repair_bbio->file_offset, fs_info->sectorsize,
				logical, paddrs, step, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but also tries to write the good data back to the bad mirror(s) when a
 * read succeeded to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  phys_addr_t paddrs[],
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	/*
	 * For bs > ps cases, the saved_iter can be partially moved forward.
	 * In that case we should round it down to the block boundary.
	 */
	const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				       sectorsize);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		/* Single copy profile: nothing to repair from. */
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	/* Lazily allocate the tracker on the first bad sector of this bio. */
	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	/* Build a one-block read targeting the same physical pages. */
	repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	for (int i = 0; i < nr_steps; i++) {
		int ret;

		ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE);

		ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step,
				   offset_in_page(paddrs[i]));
		ASSERT(ret == step);
	}

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset,
		       NULL, fbio);

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bbio(repair_bbio, mirror);
	return fbio;
}

/*
 * Verify checksums of a completed data read and kick off repair for every
 * block that failed, one sector at a time.
 */
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u32 step = min(sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	/* Gather the per-step physical addresses of each full block, then verify. */
	btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
		paddrs[(offset / step) % nr_steps] = paddr;
		offset += step;

		if (IS_ALIGNED(offset, sectorsize)) {
			if (status ||
			    !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
				fbio = repair_one_sector(bbio, offset - sectorsize,
							 paddrs, fbio);
		}
	}
	/* Free the checksum buffer if it did not fit in the inline array. */
	if (bbio->csum != bbio->csum_inline)
		kvfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

/* Bump the per-device error statistics for a failed bio, if applicable. */
static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

/* Pick the end-io workqueue: metadata and data completions are separated. */
static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
						const struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

/* Workqueue completion for the single-device (no bioc) submission path. */
static void simple_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;

	if (bio_op(bio) == REQ_OP_READ) {
		/* Metadata reads are checked and repaired by the submitter. */
		if (is_data_bbio(bbio))
			return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
		return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	}
	/* Record where a successful zone append actually landed. */
	if (bio_is_zone_append(bio) && !bio->bi_status)
		btrfs_record_physical_zoned(bbio);
	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

/* bi_end_io for the single-device path; defers real work to a workqueue. */
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	INIT_WORK(&bbio->end_io_work, simple_end_io_work);
	queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
}

/* Completion for RAID56 reads/writes handled by the raid56 layer. */
static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	/* RAID56 endio is always handled in workqueue. */
	ASSERT(in_task());

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);

	btrfs_put_bioc(bioc);
}

/*
 * Workqueue completion for the last mirror of a mirrored write (the one that
 * reused the original bio).  Accumulates stripe errors and only reports
 * failure when the tolerance threshold is exceeded.
 */
static void orig_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	/* For zone append, remember the physical location that was written. */
	if (bio_is_zone_append(bio) && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	btrfs_put_bioc(bioc);
}

/* bi_end_io for the original bio of a mirrored write. */
static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);

	INIT_WORK(&bbio->end_io_work, orig_write_end_io_work);
	queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
}

/* Workqueue completion for a cloned mirror write (all but the last mirror). */
static void clone_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_is_zone_append(bio)) {
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

/* bi_end_io for cloned mirror writes; defers to a workqueue. */
static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);

	INIT_WORK(&bbio->end_io_work, clone_write_end_io_work);
	queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
}

/*
 * Final submission of a bio to one concrete device, converting writes to
 * zone appends where required and failing fast on missing/unwritable
 * devices.
 */
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point the beginning of the
	 * zone
	 */
	if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) {
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
		/* Rewrite the op from a plain write to a zone append. */
		bio->bi_opf &= ~REQ_OP_WRITE;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}
	btrfs_debug(dev->fs_info,
		"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	/*
	 * Track reads if tracking is enabled; ignore I/O operations before the
	 * filesystem is fully initialized.
	 */
	if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
		percpu_counter_add(&dev->fs_info->stats_read_blocks,
				   bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}

/*
 * Submit one stripe of a mirrored write.  The original bio is reused for the
 * last mirror; all earlier mirrors get a clone that forwards its completion
 * to the original.
 */
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;
	struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		/* We need to use endio_work to run end_io in task context. */
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
		bio_inc_remaining(orig_bio);
		btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
			       orig_bbio->file_offset, NULL, NULL);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

/*
 * Dispatch a mapped bio: single-device fast path, RAID56, or writes fanned
 * out to multiple mirrors.
 */
static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			     struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

/* Calculate checksums for a write bio, metadata or data. Returns -errno. */
static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
#ifdef CONFIG_BTRFS_EXPERIMENTAL
	return btrfs_csum_one_bio(bbio, true);
#else
	return btrfs_csum_one_bio(bbio, false);
#endif
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	int ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = errno_to_blk_status(ret);
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		kfree(container_of(work, struct async_submit_bio, work));
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_bio_end_io(async->bbio, bio->bi_status);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

/*
 * Decide whether the checksum calculation for a write should be deferred to
 * a worker thread rather than done inline in the submission path.
 */
static bool should_async_write(struct btrfs_bio *bbio)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	bool auto_csum_mode = true;

#ifdef CONFIG_BTRFS_EXPERIMENTAL
	/*
	 * Write bios will calculate checksum and submit bio at the same time.
	 * Unless explicitly required don't offload serial csum calculate and bio
	 * submit into a workqueue.
	 */
	return false;
#endif

	/* Submit synchronously if the checksum implementation is fast. */
	if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info))
		return false;

	return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}

/*
 * Limit a zone-append write to what the device and block layer allow in a
 * single append, rounded down to the filesystem block size.
 */
static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	unsigned int nr_segs;
	int sector_offset;

	map_length = min(map_length, fs_info->max_zone_append_size);
	sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits,
					&nr_segs, map_length);
	if (sector_offset) {
		/*
		 * bio_split_rw_at() could split at a size smaller than our
		 * sectorsize and thus cause unaligned I/Os. Fix that by
		 * always rounding down to the nearest boundary.
		 */
		return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize);
	}
	return map_length;
}

/*
 * Map and submit the next chunk of @bbio, splitting it when the mapping does
 * not cover the whole bio.
 *
 * Returns true when the whole bio has been consumed (no further chunks need
 * submitting), false when the caller must call again for the remainder.
 */
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t status;
	int ret;

	if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root))
		smap.rst_search_commit_root = true;
	else
		smap.rst_search_commit_root = false;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
			      &bioc, &smap, &mirror_num);
	if (ret) {
		status = errno_to_blk_status(ret);
		btrfs_bio_counter_dec(fs_info);
		goto end_bbio;
	}

	/*
	 * For fscrypt writes we will get the encrypted bio after we've remapped
	 * our bio to the physical disk location, so we need to save the
	 * original bytenr so we know what we're checksumming.
	 */
	if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
		bbio->orig_logical = logical;

	bbio->can_use_append = btrfs_use_zone_append(bbio);

	map_length = min(map_length, length);
	if (bbio->can_use_append)
		map_length = btrfs_append_map_length(bbio, map_length);

	if (map_length < length) {
		struct btrfs_bio *split;

		split = btrfs_split_bio(fs_info, bbio, map_length);
		if (IS_ERR(split)) {
			status = errno_to_blk_status(PTR_ERR(split));
			btrfs_bio_counter_dec(fs_info);
			goto end_bbio;
		}
		/* From here on, work on the split front half only. */
		bbio = split;
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		status = errno_to_blk_status(ret);
		if (status)
			goto fail;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		} else if (bbio->can_use_append ||
			   (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) {
			/* Zoned writes still need a (dummy) csum item placeholder. */
			ret = btrfs_alloc_dummy_sum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		}
	}

	btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail:
	btrfs_bio_counter_dec(fs_info);
	/*
	 * We have split the original bbio, now we have to end both the current
	 * @bbio and remaining one, as the remaining one will never be submitted.
	 */
	if (map_length < length) {
		struct btrfs_bio *remaining = bbio->private;

		ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
		ASSERT(remaining);

		btrfs_bio_end_io(remaining, status);
	}
end_bbio:
	btrfs_bio_end_io(bbio, status);
	/* Do not submit another chunk */
	return true;
}

/* Debug-only sanity checks on logical address, length, and bvec alignment. */
static void assert_bbio_alignment(struct btrfs_bio *bbio)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct bio_vec bvec;
	struct bvec_iter iter;
	const u32 blocksize = fs_info->sectorsize;
	const u32 alignment = min(blocksize, PAGE_SIZE);
	const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	const u32 length = bbio->bio.bi_iter.bi_size;

	/* The logical and length should still be aligned to blocksize. */
	ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) &&
	       length != 0, "root=%llu inode=%llu logical=%llu length=%u",
	       btrfs_root_id(bbio->inode->root),
	       btrfs_ino(bbio->inode), logical, length);

	bio_for_each_bvec(bvec, &bbio->bio, iter)
		ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) &&
		       IS_ALIGNED(bvec.bv_len, alignment),
		       "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
		       btrfs_root_id(bbio->inode->root),
		       btrfs_ino(bbio->inode), logical, length, iter.bi_idx,
		       bvec.bv_offset, bvec.bv_len);
#endif
}

/*
 * Main entry point to submit a btrfs_bio.  Loops submitting one mapped chunk
 * at a time until the whole bio has been consumed.
 */
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
	/* If bbio->inode is not populated, its file_offset must be 0. */
	ASSERT(bbio->inode || bbio->file_offset == 0);

	assert_bbio_alignment(bbio);

	while (!btrfs_submit_chunk(bbio, mirror_num))
		;
}

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 *
 * @ino:        Offending inode number
 * @fileoff:    File offset inside the inode
 * @length:     Length of the repair write
 * @logical:    Logical address of the range
 * @paddrs:     Physical address array of the content
 * @step:       Length of each entry in @paddrs
 * @mirror_num: Mirror number to write to. Must not be zero
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
			    u32 length, u64 logical, const phys_addr_t paddrs[],
			    unsigned int step, int mirror_num)
{
	const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
	struct btrfs_io_stripe smap = { 0 };
	struct bio *bio = NULL;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	/* Basic alignment checks. */
	ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
	/* Either it's a single data or metadata block. */
	ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
	ASSERT(step <= length);
	ASSERT(is_power_of_2(step));

	/* Zoned filesystems repair by relocating the whole zone instead. */
	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	if (unlikely(!smap.dev->bdev ||
		     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
	bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	for (int i = 0; i < nr_steps; i++) {
		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
		/* We should have allocated enough slots to contain all the different pages. */
		ASSERT(ret == step);
	}
	ret = submit_bio_wait(bio);
	bio_put(bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_counter_dec;
	}

	btrfs_info_rl(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
		      ino, fileoff, btrfs_dev_name(smap.dev),
		      smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}

/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write would be submitted to dev-replace target.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	/* Only scrub-initiated metadata writes are expected here. */
	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!is_data_inode(bbio->inode));
	ASSERT(bbio->is_scrub);

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto fail;

	if (dev_replace) {
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}

/* Set up all bio sets and mempools used by this file. */
int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio), 0))
		goto out;
	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto out;
	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto out;
	return 0;

out:
	/* btrfs_bioset_exit() safely tears down whatever was initialized. */
	btrfs_bioset_exit();
	return -ENOMEM;
}

/* Tear down everything btrfs_bioset_init() set up; safe on partial init. */
void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}