// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head rcu;
	u64 start_time;

	struct rhash_head hash;
	struct bpos pos;

	struct work_struct work;
	struct data_update write;
	struct bio_vec bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset = offsetof(struct promote_op, hash),
	.key_offset = offsetof(struct promote_op, pos),
	.key_len = sizeof(struct bpos),
	.automatic_shrinking = true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

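/*
 * If this read is being done on behalf of a data update, check whether the
 * update is already rewriting the pointer on @dev, so an error-triggered
 * rewrite doesn't redundantly rewrite a pointer that's already being replaced.
 */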
static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	bch2_data_update_exit(&op->write);

	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_hash;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce = true;
	op->write.rbio.promote = true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce = true;
	*read_full = promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func = fn;
		rbio->context = context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		percpu_ref_put(&ca->io_ref[READ]);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = -BCH_ERR_data_read_key_overwritten;
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status = BLK_STS_IOERR;
		rbio->ret = ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

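/*
 * Retry worker: re-issues the read after an error. Failures recorded in
 * @failed steer bch2_bkey_pick_read_device() away from replicas that already
 * failed, so a different copy is tried on this pass.
 */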
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bvec_iter iter = rbio->bvec_iter;
	unsigned flags = rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum = rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };
	struct btree_trans *trans = bch2_trans_get(c);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status = 0;
		rbio->ret = 0;
	}

	unsigned subvol = rbio->subvol;
	struct bpos read_pos = rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	} else {
		struct printbuf buf = PRINTBUF;

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");
		prt_str(&buf, "successful retry");

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret = ret;
	rbio->bio.bi_status = blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret = ret;
		rbio->bio.bi_status = blk_error;

		bch2_rbio_done(rbio);
	}
}

static void bch2_read_io_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bio *bio = &rbio->bio;
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));

	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	printbuf_exit(&buf);
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}

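/*
 * Checksum narrowing: this read covered the full checksummed region of the
 * extent, so recompute a checksum for just the live portion and update the
 * key with it - future partial reads then won't have to read (and bounce) the
 * whole region just to verify the checksum.
 */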
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_csum_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bio *src = &rbio->bio;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent = bch2_rbio_parent(rbio);
	struct bio *src = &rbio->bio;
	struct bio *dst = &parent->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size = crc.compressed_size << 9;
		src->bi_iter.bi_idx = 0;
		src->bi_iter.bi_bvec_done = 0;
	} else {
		src->bi_iter = rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset += rbio->offset_into_extent;
		crc.live_size = bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

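/*
 * Bio completion handler: account the I/O, catch device errors and stale
 * cached pointers, then punt the rest of completion (checksum verification,
 * decryption, decompression) to a workqueue context suited to the work left.
 */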
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

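/*
 * Read a single extent: pick a replica, decide whether the read needs to be
 * bounced or cloned (compression, encryption, checksum granularity, promote),
 * then submit the bio - or attempt erasure coding reconstruction when the
 * chosen pointer requires it.
 */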
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = -BCH_ERR_data_read_no_encryption_key;
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		percpu_ref_put(&ca->io_ref[READ]);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				percpu_ref_put(&ca->io_ref[READ]);
			orig->ret = -BCH_ERR_data_read_buffer_too_small;
			goto out_read_done;
		}

		iter.bi_size = pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent = 0;
		pick.crc.compressed_size = bvec_iter_sectors(iter);
		pick.crc.uncompressed_size = bvec_iter_sectors(iter);
		pick.crc.offset = 0;
		pick.crc.live_size = bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						DIV_ROUND_UP(sectors, PAGE_SECTORS),
						0,
						GFP_NOFS,
						&c->bio_read_split),
					  orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce = true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						&c->bio_read_split),
					  orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time = local_clock();
	if (!rbio->split)
		rbio->end_io = orig->bio.bi_end_io;
	rbio->bvec_iter = iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags = flags;
	rbio->have_ioref = ca != NULL;
	rbio->narrow_crcs = narrow_crcs;
	rbio->ret = 0;
	rbio->context = 0;
	rbio->pick = pick;
	rbio->subvol = orig->subvol;
	rbio->read_pos = read_pos;
	rbio->data_btree = data_btree;
	rbio->data_pos = data_pos;
	rbio->version = k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf = orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io = bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			struct printbuf buf = PRINTBUF;
			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
			prt_printf(&buf, "no device to read from:\n ");
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);

			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status = BLK_STS_IOERR;
	orig->ret = ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = -BCH_ERR_data_read_key_overwritten;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

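/*
 * Top level read path: walk the extents btree over the requested range,
 * resolving reflink indirection, and issue a read for each extent fragment.
 */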
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		struct printbuf buf = PRINTBUF;
		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf, inum,
						       bvec_iter.bi_sector << 9));
		prt_printf(&buf, "read error: %s", bch2_err_str(ret));
		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);

		rbio->bio.bi_status = BLK_STS_IOERR;
		rbio->ret = ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}