// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct work_struct	work;
	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

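/*
 * Check whether @dev is already slated to be rewritten by the data update this
 * read is servicing, so the IO-error recovery path doesn't schedule a second
 * rewrite of the same pointer:
 */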
static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				  struct bpos pos,
				  struct bch_io_opts opts,
				  unsigned flags,
				  struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	bch2_data_update_exit(&op->write);

	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

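	/*
	 * A promote is a write: take a write ref so it can't race with the
	 * filesystem going read-only (dropped in promote_free()):
	 */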
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_hash;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce	= true;
	op->write.rbio.promote	= true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

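	/* Indirect (reflink'd) extents live in the reflink btree: */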
	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		percpu_ref_put(&ca->io_ref[READ]);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = -BCH_ERR_data_read_key_overwritten;
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

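/*
 * Retry a failed read: reread the extent, marking devices that have already
 * failed so a different replica is picked where possible.
 */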
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };
	int orig_error = rbio->ret;

	struct btree_trans *trans = bch2_trans_get(c);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status	= 0;
		rbio->ret		= 0;
	}

	unsigned subvol		= rbio->subvol;
	struct bpos read_pos	= rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	} else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace &&
		   orig_error != -BCH_ERR_data_read_ptr_stale_race &&
		   !failed.nr) {
		struct printbuf buf = PRINTBUF;

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");
		prt_str(&buf, "successful retry");

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret		= ret;
	rbio->bio.bi_status	= blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret		= ret;
		rbio->bio.bi_status	= blk_error;

		bch2_rbio_done(rbio);
	}
}

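/* Device level IO error: log which device failed and kick off a retry: */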
static void bch2_read_io_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bio *bio = &rbio->bio;
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));

	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	printbuf_exit(&buf);
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}

static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

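/* Checksum error: log the expected vs. computed checksum and kick off a retry: */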
static void bch2_read_csum_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bio *src		= &rbio->bio;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

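/*
 * Read completion: verify the checksum, then decrypt/decompress and copy the
 * data into the parent bio (or hand it to the data update/promote paths).
 */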
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
	struct bio *src			= &rbio->bio;
	struct bio *dst			= &parent->bio;
	struct bvec_iter dst_iter	= rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset     += rbio->offset_into_extent;
		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

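/*
 * Bio completion handler: decides whether the rest of completion can run here
 * or needs to be punted to a workqueue, and with what priority:
 */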
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

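	/* Inline extents: the data lives in the key itself, just copy it out: */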
	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
	    !c->chacha20_key_set) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n  ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = -BCH_ERR_data_read_no_encryption_key;
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		percpu_ref_put(&ca->io_ref[READ]);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				percpu_ref_put(&ca->io_ref[READ]);
			/* rbio hasn't been allocated yet; report on the original: */
			orig->ret = -BCH_ERR_data_read_buffer_too_small;
			goto out_read_done;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

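	/*
	 * Not reading the whole extent: narrow the pointer and crc so the
	 * fragment looks like a self-contained uncompressed extent starting at
	 * offset 0:
	 */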
	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						DIV_ROUND_UP(sectors, PAGE_SECTORS),
						0,
						GFP_NOFS,
						&c->bio_read_split),
				 orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time	= local_clock();
	if (!rbio->split)
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->ret		= 0;
	rbio->context		= 0;
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

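	/*
	 * Normal reads are submitted directly to the device; reads from
	 * erasure coded extents with a failed or missing replica go through
	 * reconstruct instead:
	 */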
	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			struct printbuf buf = PRINTBUF;
			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
			prt_printf(&buf, "no device to read from:\n  ");
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);

			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status	= BLK_STS_IOERR;
	orig->ret		= ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = -BCH_ERR_data_read_key_overwritten;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

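	/*
	 * Walk the extents covering this read, issuing one read fragment per
	 * extent; the last fragment carries BCH_READ_last_fragment so
	 * completion is only delivered once:
	 */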
	while (1) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);

	if (unlikely(ret)) {
		if (ret != -BCH_ERR_extent_poisoned) {
			struct printbuf buf = PRINTBUF;
			lockrestart_do(trans,
				bch2_inum_offset_err_msg_trans(trans, &buf, inum,
							       bvec_iter.bi_sector << 9));
			prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}