// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "async_objs.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

static bool bch2_poison_extents_on_checksum_error;
module_param_named(poison_extents_on_checksum_error,
		   bch2_poison_extents_on_checksum_error, bool, 0644);
MODULE_PARM_DESC(poison_extents_on_checksum_error,
		 "Extents with checksum errors are marked as poisoned - unsafe without read fua support");

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	guard(rcu)();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return bch_err_throw(c, nopromote_may_not);

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return bch_err_throw(c, nopromote_already_promoted);

		if (bkey_extent_is_unwritten(k))
			return bch_err_throw(c, nopromote_unwritten);

		if (bch2_target_congested(c, opts.promote_target))
			return bch_err_throw(c, nopromote_congested);
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return bch_err_throw(c, nopromote_in_flight);

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	async_object_list_del(c, promote, op->list_idx);
	async_object_list_del(c, rbio, rbio->list_idx);

	bch2_data_update_exit(&op->write);

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = bch_err_throw(c, nopromote_enomem);
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = bch_err_throw(c, nopromote_in_flight);
		goto err;
	}

	ret = async_object_list_add(c, promote, op, &op->list_idx);
	if (ret < 0)
		goto err_remove_hash;

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_list;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce = true;
	op->write.rbio.promote = true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_list:
	async_object_list_del(c, promote, op->list_idx);
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	/*
	 * We're in the retry path, but we don't know what to repair yet, and we
	 * don't want to do a promote here:
	 */
	if (failed && !failed->nr)
		return NULL;

	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;

	if (have_io_error(failed))
		orig->self_healing = true;

	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
{
	if (!op->write.read_done) {
		prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
		printbuf_indent_add(out, 2);
		bch2_read_bio_to_text(out, op->write.rbio.parent);
		printbuf_indent_sub(out, 2);
	}

	bch2_data_update_to_text(out, &op->write);
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			async_object_list_del(rbio->c, rbio, rbio->list_idx);

			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
	if (rbio->list_idx)
		async_object_list_del(rbio->c, rbio, rbio->list_idx);
#endif
	bio_endio(&rbio->bio);
}

static void get_rbio_extent(struct btree_trans *trans,
			    struct bch_read_bio *rbio,
			    struct bkey_buf *sk)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
			bkey_err(k = bch2_bkey_get_iter(trans, &iter,
						rbio->data_btree, rbio->data_pos, 0)));
	if (ret)
		return;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr)
		if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
			bch2_bkey_buf_reassemble(sk, trans->c, k);
			break;
		}

	bch2_trans_iter_exit(trans, &iter);
}

static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
					enum btree_id btree, struct bkey_s_c read_k)
{
	if (!bch2_poison_extents_on_checksum_error)
		return 0;

	struct bch_fs *c = trans->c;

	struct data_update *u = rbio_data_update(rbio);
	if (u)
		read_k = bkey_i_to_s_c(u->k.k);

	u64 flags = bch2_bkey_extent_flags(read_k);
	if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
					       BTREE_ITER_intent);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (!bkey_and_val_eq(k, read_k))
		goto out;

	struct bkey_i *new = bch2_trans_kmalloc(trans,
					bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
	ret =   PTR_ERR_OR_ZERO(new) ?:
		(bkey_reassemble(new, k), 0) ?:
		bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
		bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);

	/*
	 * Propagate key change back to data update path, in particular so it
	 * knows the extent has been poisoned and it's safe to change the
	 * checksum
	 */
	if (u && !ret)
		bch2_bkey_buf_copy(&u->k, c, new);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	struct btree_trans *trans = bch2_trans_get(c);

	struct bkey_buf sk;
	bch2_bkey_buf_init(&sk);
	bkey_init(&sk.k->k);

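	/* Trace and account the sectors being retried: */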
	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	get_rbio_extent(trans, rbio, &sk);

	if (!bkey_deleted(&sk.k->k) &&
	    bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status	= 0;
		rbio->ret		= 0;
	}

	unsigned subvol		= rbio->subvol;
	struct bpos read_pos	= rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	}

	if (failed.nr || ret) {
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");

		prt_str(&buf, "data read error, ");
		if (!ret) {
			prt_str(&buf, "successful retry");
			if (rbio->self_healing)
				prt_str(&buf, ", self healing");
		} else
			prt_str(&buf, bch2_err_str(ret));
		prt_newline(&buf);

		if (!bkey_deleted(&sk.k->k)) {
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
			prt_newline(&buf);
		}

		bch2_io_failures_to_text(&buf, c, &failed);

		bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_bkey_buf_exit(&sk, c);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret		= ret;
	rbio->bio.bi_status	= blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret		= ret;
		rbio->bio.bi_status	= blk_error;

		bch2_rbio_done(rbio);
	}
}

static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
	struct bio *src		= &rbio->bio;
	struct bio *dst		= &parent->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset     += rbio->offset_into_extent;
		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}

	if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
	    !orig->data_update)
		return bch_err_throw(c, extent_poisoned);
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		if (ret == -BCH_ERR_data_read_csum_err) {
			int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
			if (ret2) {
				ret = ret2;
				goto err;
			}

			trace_and_count(c, io_read_fail_and_poison, &orig->bio);
		}

		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
	    !c->chacha20_key_set) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = bch_err_throw(c, data_read_no_encryption_key);
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
						BCH_DEV_READ_REF_io_read);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				enumerated_ref_put(&ca->io_ref[READ],
						   BCH_DEV_READ_REF_io_read);
			rbio->ret = bch_err_throw(c, data_read_buffer_too_small);
			goto out_read_done;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		read_full	= true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time	= local_clock();
	if (!rbio->split)
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->ret		= 0;
	rbio->context		= 0;
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	async_object_list_add(c, rbio, rbio, &rbio->list_idx);

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status	= BLK_STS_IOERR;
	orig->ret		= ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = bch_err_throw(c, data_read_key_overwritten);

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed,
		struct bkey_buf *prev_read,
		unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	enum btree_id data_btree;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		if (unlikely(flags & BCH_READ_in_retry)) {
			if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
				failed->nr = 0;
			bch2_bkey_buf_copy(prev_read, c, sk.k);
		}

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	if (unlikely(ret)) {
		if (ret != -BCH_ERR_extent_poisoned) {
			struct printbuf buf = PRINTBUF;
			lockrestart_do(trans,
				bch2_inum_offset_err_msg_trans(trans, &buf, inum,
							       bvec_iter.bi_sector << 9));
			prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

static const char * const bch2_read_bio_flags[] = {
#define x(n)	#n,
	BCH_READ_FLAGS()
#undef x
	NULL
};

void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
{
	u64 now = local_clock();
	prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
	prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);

	if (!rbio->split)
		prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
	else
		prt_printf(out, "parent:\t%px\n", rbio->parent);

	prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);

	prt_printf(out, "promote:\t%u\n", rbio->promote);
	prt_printf(out, "bounce:\t%u\n", rbio->bounce);
	prt_printf(out, "split:\t%u\n", rbio->split);
	prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
	prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
	prt_printf(out, "context:\t%u\n", rbio->context);

	int ret = READ_ONCE(rbio->ret);
	if (ret < 0)
		prt_printf(out, "ret:\t%s\n", bch2_err_str(ret));
	else
		prt_printf(out, "ret:\t%i\n", ret);

	prt_printf(out, "flags:\t");
	bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
	prt_newline(out);

	bch2_bio_to_text(out, &rbio->bio);
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
	mempool_exit(&c->bio_bounce_pages);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (mempool_init_page_pool(&c->bio_bounce_pages,
				   max_t(unsigned,
					 c->opts.btree_node_size,
					 c->opts.encoded_extent_max) /
				   PAGE_SIZE, 0))
		return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);

	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return bch_err_throw(c, ENOMEM_bio_read_init);

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return bch_err_throw(c, ENOMEM_bio_read_split_init);

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return bch_err_throw(c, ENOMEM_promote_table_init);

	return 0;
}