// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"

static struct bio_set btrfs_bioset;

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
static inline void btrfs_bio_init(struct btrfs_bio *bbio,
				  btrfs_bio_end_io_t end_io, void *private)
{
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->end_io = end_io;
	bbio->private = private;
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed
 * by a mempool.
 */
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
			    btrfs_bio_end_io_t end_io, void *private)
{
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	btrfs_bio_init(btrfs_bio(bio), end_io, private);
	return bio;
}
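
/*
 * Illustrative sketch only (kept as a comment, not compiled): a typical
 * caller allocates the bio, fills in the logical address and pages, and
 * hands it to btrfs_submit_bio(). The end_io callback, page and fs_info
 * variables below are hypothetical placeholders, not existing btrfs helpers.
 *
 *	static void example_end_io(struct btrfs_bio *bbio)
 *	{
 *		if (bbio->bio.bi_status)
 *			pr_debug("btrfs: example read failed\n");
 *		bio_put(&bbio->bio);
 *	}
 *
 *	bio = btrfs_bio_alloc(1, REQ_OP_READ, example_end_io, NULL);
 *	bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	__bio_add_page(bio, page, PAGE_SIZE, 0);
 *	btrfs_submit_bio(fs_info, bio, 0);
 */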

struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
				    btrfs_bio_end_io_t end_io, void *private)
{
	struct bio *bio;
	struct btrfs_bio *bbio;

	ASSERT(offset <= UINT_MAX && size <= UINT_MAX);

	bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, end_io, private);

	bio_trim(bio, offset >> 9, size >> 9);
	bbio->iter = bio->bi_iter;
	return bio;
}

static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
						struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	bbio->end_io(bbio);
}

/*
 * Completion for the single device fast path. Read completions are punted to
 * a workqueue, while writes call the end_io handler directly.
 */
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_fs_info *fs_info = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, bbio->device);

	if (bio_op(bio) == REQ_OP_READ) {
		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
	} else {
		bbio->end_io(bbio);
	}
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	bbio->end_io(bbio);

	btrfs_put_bioc(bioc);
}

/*
 * Completion for the original bio of a mirrored write, called once all cloned
 * bios have completed and their errors have been accounted.
 */
static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the
	 * tolerance threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	bbio->end_io(bbio);
	btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point to the beginning of
	 * the zone.
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

		if (btrfs_dev_is_sequential(dev, physical)) {
			u64 zone_start = round_down(physical,
						    dev->fs_info->zone_size);

			bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
		} else {
			bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
			bio->bi_opf |= REQ_OP_WRITE;
		}
	}
	btrfs_debug_in_rcu(dev->fs_info,
		"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	btrfsic_check_bio(bio);
	submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
		bio_inc_remaining(orig_bio);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
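
/*
 * Map a logical bio to its physical location(s) and submit it.
 *
 * Depending on how the extent is mapped this either takes the single device
 * fast path, hands the bio to the raid56 code for parity RAID writes and
 * read recovery, or clones the bio once per stripe for writes to multiple
 * mirrors.
 */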
void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
	u64 logical = bio->bi_iter.bi_sector << 9;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	int ret;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
				&bioc, &smap, &mirror_num, 1);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
		return;
	}

	if (map_length < length) {
		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
		BUG();
	}

	if (!bioc) {
		/* Single mirror read/write fast path */
		btrfs_bio(bio)->mirror_num = mirror_num;
		btrfs_bio(bio)->device = smap.dev;
		bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
		bio->bi_private = fs_info;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap.dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors */
		int total_devs = bioc->num_stripes;
		int dev_nr;

		bioc->orig_bio = bio;
		for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
			    u64 length, u64 logical, struct page *page,
			    unsigned int pg_offset, int mirror_num)
{
	struct btrfs_device *dev;
	struct bio_vec bvec;
	struct bio bio;
	u64 map_length = 0;
	u64 sector;
	struct btrfs_io_context *bioc = NULL;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	map_length = length;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
		/*
		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
		 * to update all raid stripes, but here we just want to correct
		 * the bad stripe, thus BTRFS_MAP_READ is abused to only get
		 * the bad stripe's dev and sector.
		 */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
				      &map_length, &bioc, 0);
		if (ret)
			goto out_counter_dec;
		ASSERT(bioc->mirror_num == 1);
	} else {
		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
				      &map_length, &bioc, mirror_num);
		if (ret)
			goto out_counter_dec;
		/*
		 * This happens when dev-replace is also running, and the
		 * mirror_num indicates the dev-replace target.
		 *
		 * In this case, we don't need to do anything, as the read
		 * error just means the replace progress hasn't reached our
		 * read range, and the later replace routine will handle it.
		 */
		if (mirror_num != bioc->mirror_num)
			goto out_counter_dec;
	}

	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
	dev = bioc->stripes[bioc->mirror_num - 1].dev;
	btrfs_put_bioc(bioc);

	if (!dev || !dev->bdev ||
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = sector;
	__bio_add_page(&bio, page, length, pg_offset);

	btrfsic_check_bio(&bio);
	ret = submit_bio_wait(&bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_bio_uninit;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
		ino, start, btrfs_dev_name(dev), sector);
	ret = 0;

out_bio_uninit:
	bio_uninit(&bio);
out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	return 0;
}

void __cold btrfs_bioset_exit(void)
{
	bioset_exit(&btrfs_bioset);
}