// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "async_objs.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

static bool bch2_poison_extents_on_checksum_error;
module_param_named(poison_extents_on_checksum_error,
		   bch2_poison_extents_on_checksum_error, bool, 0644);
MODULE_PARM_DESC(poison_extents_on_checksum_error,
		 "Extents with checksum errors are marked as poisoned - unsafe without read fua support");

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	guard(rcu)();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return bch_err_throw(c, nopromote_may_not);

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return bch_err_throw(c, nopromote_already_promoted);

		if (bkey_extent_is_unwritten(k))
			return bch_err_throw(c, nopromote_unwritten);

		if (bch2_target_congested(c, opts.promote_target))
			return bch_err_throw(c, nopromote_congested);
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return bch_err_throw(c, nopromote_in_flight);

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	async_object_list_del(c, promote, op->list_idx);

	bch2_data_update_exit(&op->write);

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = bch_err_throw(c, nopromote_enomem);
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = bch_err_throw(c, nopromote_in_flight);
		goto err;
	}

	ret = async_object_list_add(c, promote, op, &op->list_idx);
	if (ret < 0)
		goto err_remove_hash;

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_list;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce	= true;
	op->write.rbio.promote	= true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_list:
	async_object_list_del(c, promote, op->list_idx);
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	/*
	 * We're in the retry path, but we don't know what to repair yet, and we
	 * don't want to do a promote here:
	 */
	if (failed && !failed->nr)
		return NULL;

	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
{
	if (!op->write.read_done) {
		prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
		printbuf_indent_add(out, 2);
		bch2_read_bio_to_text(out, op->write.rbio.parent);
		printbuf_indent_sub(out, 2);
	}

	bch2_data_update_to_text(out, &op->write);
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func	= fn;
		rbio->context	= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			async_object_list_del(rbio->c, rbio, rbio->list_idx);

			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static void get_rbio_extent(struct btree_trans *trans,
			    struct bch_read_bio *rbio,
			    struct bkey_buf *sk)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
			bkey_err(k = bch2_bkey_get_iter(trans, &iter,
					rbio->data_btree, rbio->data_pos, 0)));
	if (ret)
		return;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr)
		if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
			bch2_bkey_buf_reassemble(sk, trans->c, k);
			break;
		}

	bch2_trans_iter_exit(trans, &iter);
}

static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
					enum btree_id btree, struct bkey_s_c read_k)
{
	if (!bch2_poison_extents_on_checksum_error)
		return 0;

	struct bch_fs *c = trans->c;

	struct data_update *u = rbio_data_update(rbio);
	if (u)
		read_k = bkey_i_to_s_c(u->k.k);

	u64 flags = bch2_bkey_extent_flags(read_k);
	if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
					       BTREE_ITER_intent);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (!bkey_and_val_eq(k, read_k))
		goto out;

	struct bkey_i *new = bch2_trans_kmalloc(trans,
				bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
	ret =   PTR_ERR_OR_ZERO(new) ?:
		(bkey_reassemble(new, k), 0) ?:
		bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
		bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);

	/*
	 * Propagate key change back to data update path, in particular so it
	 * knows the extent has been poisoned and it's safe to change the
	 * checksum
	 */
	if (u && !ret)
		bch2_bkey_buf_copy(&u->k, c, new);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	struct btree_trans *trans = bch2_trans_get(c);

	struct bkey_buf sk;
	bch2_bkey_buf_init(&sk);
	bkey_init(&sk.k->k);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	get_rbio_extent(trans, rbio, &sk);

	if (!bkey_deleted(&sk.k->k) &&
	    bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status	= 0;
		rbio->ret		= 0;
	}

	unsigned subvol		= rbio->subvol;
	struct bpos read_pos	= rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	}

	if (failed.nr || ret) {
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");

		prt_str(&buf, "data read error, ");
		if (!ret)
			prt_str(&buf, "successful retry");
		else
			prt_str(&buf, bch2_err_str(ret));
		prt_newline(&buf);

		if (!bkey_deleted(&sk.k->k)) {
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
			prt_newline(&buf);
		}

		bch2_io_failures_to_text(&buf, c, &failed);

		bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_bkey_buf_exit(&sk, c);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret		= ret;
	rbio->bio.bi_status	= blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret		= ret;
		rbio->bio.bi_status	= blk_error;

		bch2_rbio_done(rbio);
	}
}

static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ?
		bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
	struct bio *src			= &rbio->bio;
	struct bio *dst			= &parent->bio;
	struct bvec_iter dst_iter	= rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset     += rbio->offset_into_extent;
		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio,
		       bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u =
		rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}

	if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
	    !orig->data_update)
		return bch_err_throw(c, extent_poisoned);
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		if (ret == -BCH_ERR_data_read_csum_err) {
			int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
			if (ret2) {
				ret = ret2;
				goto err;
			}

			trace_and_count(c, io_read_fail_and_poison, &orig->bio);
		}

		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
	    !c->chacha20_key_set) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = bch_err_throw(c, data_read_no_encryption_key);
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
						BCH_DEV_READ_REF_io_read);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size >
		    u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				enumerated_ref_put(&ca->io_ref[READ],
						   BCH_DEV_READ_REF_io_read);
			rbio->ret = bch_err_throw(c, data_read_buffer_too_small);
			goto out_read_done;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						DIV_ROUND_UP(sectors, PAGE_SECTORS),
						0,
						GFP_NOFS,
						&c->bio_read_split),
				orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						&c->bio_read_split),
				orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time	= local_clock();
	if (!rbio->split)
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->ret		= 0;
	rbio->context		= 0;
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	async_object_list_add(c, rbio, rbio, &rbio->list_idx);

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read],
			     bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status	= BLK_STS_IOERR;
	orig->ret		= ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = bch_err_throw(c, data_read_key_overwritten);

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed,
		struct bkey_buf *prev_read,
		unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	enum btree_id data_btree;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		if (unlikely(flags & BCH_READ_in_retry)) {
			if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
				failed->nr = 0;
			bch2_bkey_buf_copy(prev_read, c, sk.k);
		}

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	if (unlikely(ret)) {
		if (ret != -BCH_ERR_extent_poisoned) {
			struct printbuf buf = PRINTBUF;
			lockrestart_do(trans,
				bch2_inum_offset_err_msg_trans(trans, &buf, inum,
							       bvec_iter.bi_sector << 9));
			prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

static const char * const bch2_read_bio_flags[] = {
#define x(n)	#n,
	BCH_READ_FLAGS()
#undef x
	NULL
};

void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
{
	u64 now = local_clock();
	prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
	prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ?
		   now - rbio->submit_time : 0);

	if (!rbio->split)
		prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
	else
		prt_printf(out, "parent:\t%px\n", rbio->parent);

	prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);

	prt_printf(out, "promote:\t%u\n", rbio->promote);
	prt_printf(out, "bounce:\t%u\n", rbio->bounce);
	prt_printf(out, "split:\t%u\n", rbio->split);
	prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
	prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
	prt_printf(out, "context:\t%u\n", rbio->context);
	prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret));

	prt_printf(out, "flags:\t");
	bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
	prt_newline(out);

	bch2_bio_to_text(out, &rbio->bio);
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
	mempool_exit(&c->bio_bounce_pages);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (mempool_init_page_pool(&c->bio_bounce_pages,
				   max_t(unsigned,
					 c->opts.btree_node_size,
					 c->opts.encoded_extent_max) /
				   PAGE_SIZE, 0))
		return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);

	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return bch_err_throw(c, ENOMEM_bio_read_init);

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return bch_err_throw(c, ENOMEM_bio_read_split_init);

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return bch_err_throw(c, ENOMEM_promote_table_init);

	return 0;
}