1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Some low level IO code, and hacks for various block layer limitations 4 * 5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> 6 * Copyright 2012 Google, Inc. 7 */ 8 9 #include "bcachefs.h" 10 #include "alloc_background.h" 11 #include "alloc_foreground.h" 12 #include "btree_update.h" 13 #include "buckets.h" 14 #include "checksum.h" 15 #include "clock.h" 16 #include "compress.h" 17 #include "data_update.h" 18 #include "disk_groups.h" 19 #include "ec.h" 20 #include "error.h" 21 #include "io_read.h" 22 #include "io_misc.h" 23 #include "io_write.h" 24 #include "subvolume.h" 25 #include "trace.h" 26 27 #include <linux/sched/mm.h> 28 29 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT 30 31 static bool bch2_target_congested(struct bch_fs *c, u16 target) 32 { 33 const struct bch_devs_mask *devs; 34 unsigned d, nr = 0, total = 0; 35 u64 now = local_clock(), last; 36 s64 congested; 37 struct bch_dev *ca; 38 39 if (!target) 40 return false; 41 42 rcu_read_lock(); 43 devs = bch2_target_to_mask(c, target) ?: 44 &c->rw_devs[BCH_DATA_user]; 45 46 for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { 47 ca = rcu_dereference(c->devs[d]); 48 if (!ca) 49 continue; 50 51 congested = atomic_read(&ca->congested); 52 last = READ_ONCE(ca->congested_last); 53 if (time_after64(now, last)) 54 congested -= (now - last) >> 12; 55 56 total += max(congested, 0LL); 57 nr++; 58 } 59 rcu_read_unlock(); 60 61 return bch2_rand_range(nr * CONGESTED_MAX) < total; 62 } 63 64 #else 65 66 static bool bch2_target_congested(struct bch_fs *c, u16 target) 67 { 68 return false; 69 } 70 71 #endif 72 73 /* Cache promotion on read */ 74 75 struct promote_op { 76 struct rcu_head rcu; 77 u64 start_time; 78 79 struct rhash_head hash; 80 struct bpos pos; 81 82 struct data_update write; 83 struct bio_vec bi_inline_vecs[]; /* must be last */ 84 }; 85 86 static const struct rhashtable_params bch_promote_params = { 87 .head_offset = offsetof(struct promote_op, hash), 88 .key_offset = offsetof(struct promote_op, pos), 89 .key_len = sizeof(struct bpos), 90 .automatic_shrinking = true, 91 }; 92 93 static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, 94 struct bpos pos, 95 struct bch_io_opts opts, 96 unsigned flags, 97 struct bch_io_failures *failed) 98 { 99 if (!failed) { 100 BUG_ON(!opts.promote_target); 101 102 if (!(flags & BCH_READ_MAY_PROMOTE)) 103 return -BCH_ERR_nopromote_may_not; 104 105 if (bch2_bkey_has_target(c, k, opts.promote_target)) 106 return -BCH_ERR_nopromote_already_promoted; 107 108 if (bkey_extent_is_unwritten(k)) 109 return -BCH_ERR_nopromote_unwritten; 110 111 if (bch2_target_congested(c, opts.promote_target)) 112 return -BCH_ERR_nopromote_congested; 113 } 114 115 if (rhashtable_lookup_fast(&c->promote_table, &pos, 116 bch_promote_params)) 117 return -BCH_ERR_nopromote_in_flight; 118 119 return 0; 120 } 121 122 static void promote_free(struct bch_fs *c, struct promote_op *op) 123 { 124 int ret; 125 126 bch2_data_update_exit(&op->write); 127 128 ret = rhashtable_remove_fast(&c->promote_table, &op->hash, 129 bch_promote_params); 130 BUG_ON(ret); 131 bch2_write_ref_put(c, BCH_WRITE_REF_promote); 132 kfree_rcu(op, rcu); 133 } 134 135 static void promote_done(struct bch_write_op *wop) 136 { 137 struct promote_op *op = 138 container_of(wop, struct promote_op, write.op); 139 struct bch_fs *c = op->write.op.c; 140 141 bch2_time_stats_update(&c->times[BCH_TIME_data_promote], 142 op->start_time); 143 promote_free(c, op); 144 } 145 146 static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) 147 { 148 struct bio *bio = &op->write.op.wbio.bio; 149 150 trace_and_count(op->write.op.c, read_promote, &rbio->bio); 151 152 /* we now own pages: */ 153 BUG_ON(!rbio->bounce); 154 BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); 155 156 memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, 157 sizeof(struct bio_vec) * rbio->bio.bi_vcnt); 158 swap(bio->bi_vcnt, rbio->bio.bi_vcnt); 159 160 bch2_data_update_read_done(&op->write, rbio->pick.crc); 161 } 162 163 static struct promote_op *__promote_alloc(struct btree_trans *trans, 164 enum btree_id btree_id, 165 struct bkey_s_c k, 166 struct bpos pos, 167 struct extent_ptr_decoded *pick, 168 struct bch_io_opts opts, 169 unsigned sectors, 170 struct bch_read_bio **rbio, 171 struct bch_io_failures *failed) 172 { 173 struct bch_fs *c = trans->c; 174 struct promote_op *op = NULL; 175 struct bio *bio; 176 unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); 177 int ret; 178 179 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) 180 return ERR_PTR(-BCH_ERR_nopromote_no_writes); 181 182 op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); 183 if (!op) { 184 ret = -BCH_ERR_nopromote_enomem; 185 goto err; 186 } 187 188 op->start_time = local_clock(); 189 op->pos = pos; 190 191 /* 192 * We don't use the mempool here because extents that aren't 193 * checksummed or compressed can be too big for the mempool: 194 */ 195 *rbio = kzalloc(sizeof(struct bch_read_bio) + 196 sizeof(struct bio_vec) * pages, 197 GFP_KERNEL); 198 if (!*rbio) { 199 ret = -BCH_ERR_nopromote_enomem; 200 goto err; 201 } 202 203 rbio_init(&(*rbio)->bio, opts); 204 bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); 205 206 if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { 207 ret = -BCH_ERR_nopromote_enomem; 208 goto err; 209 } 210 211 (*rbio)->bounce = true; 212 (*rbio)->split = true; 213 (*rbio)->kmalloc = true; 214 215 if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, 216 bch_promote_params)) { 217 ret = -BCH_ERR_nopromote_in_flight; 218 goto err; 219 } 220 221 bio = &op->write.op.wbio.bio; 222 bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); 223 224 struct data_update_opts update_opts = {}; 225 226 if (!failed) { 227 update_opts.target = opts.promote_target; 228 update_opts.extra_replicas = 1; 229 update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; 230 } else { 231 update_opts.target = opts.foreground_target; 232 233 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 234 unsigned i = 0; 235 bkey_for_each_ptr(ptrs, ptr) { 236 if (bch2_dev_io_failures(failed, ptr->dev)) 237 update_opts.rewrite_ptrs |= BIT(i); 238 i++; 239 } 240 } 241 242 ret = bch2_data_update_init(trans, NULL, NULL, &op->write, 243 writepoint_hashed((unsigned long) current), 244 opts, 245 update_opts, 246 btree_id, k); 247 /* 248 * possible errors: -BCH_ERR_nocow_lock_blocked, 249 * -BCH_ERR_ENOSPC_disk_reservation: 250 */ 251 if (ret) { 252 BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, 253 bch_promote_params)); 254 goto err; 255 } 256 257 op->write.op.end_io = promote_done; 258 259 return op; 260 err: 261 if (*rbio) 262 bio_free_pages(&(*rbio)->bio); 263 kfree(*rbio); 264 *rbio = NULL; 265 /* We may have added to the rhashtable and thus need rcu freeing: */ 266 kfree_rcu(op, rcu); 267 bch2_write_ref_put(c, BCH_WRITE_REF_promote); 268 return ERR_PTR(ret); 269 } 270 271 noinline 272 static struct promote_op *promote_alloc(struct btree_trans *trans, 273 struct bvec_iter iter, 274 struct bkey_s_c k, 275 struct extent_ptr_decoded *pick, 276 struct bch_io_opts opts, 277 unsigned flags, 278 struct bch_read_bio **rbio, 279 bool *bounce, 280 bool *read_full, 281 struct bch_io_failures *failed) 282 { 283 struct bch_fs *c = trans->c; 284 /* 285 * if failed != NULL we're not actually doing a promote, we're 286 * recovering from an io/checksum error 287 */ 288 bool promote_full = (failed || 289 *read_full || 290 READ_ONCE(c->opts.promote_whole_extents)); 291 /* data might have to be decompressed in the write path: */ 292 unsigned sectors = promote_full 293 ? max(pick->crc.compressed_size, pick->crc.live_size) 294 : bvec_iter_sectors(iter); 295 struct bpos pos = promote_full 296 ? bkey_start_pos(k.k) 297 : POS(k.k->p.inode, iter.bi_sector); 298 struct promote_op *promote; 299 int ret; 300 301 ret = should_promote(c, k, pos, opts, flags, failed); 302 if (ret) 303 goto nopromote; 304 305 promote = __promote_alloc(trans, 306 k.k->type == KEY_TYPE_reflink_v 307 ? BTREE_ID_reflink 308 : BTREE_ID_extents, 309 k, pos, pick, opts, sectors, rbio, failed); 310 ret = PTR_ERR_OR_ZERO(promote); 311 if (ret) 312 goto nopromote; 313 314 *bounce = true; 315 *read_full = promote_full; 316 return promote; 317 nopromote: 318 trace_read_nopromote(c, ret); 319 return NULL; 320 } 321 322 /* Read */ 323 324 #define READ_RETRY_AVOID 1 325 #define READ_RETRY 2 326 #define READ_ERR 3 327 328 enum rbio_context { 329 RBIO_CONTEXT_NULL, 330 RBIO_CONTEXT_HIGHPRI, 331 RBIO_CONTEXT_UNBOUND, 332 }; 333 334 static inline struct bch_read_bio * 335 bch2_rbio_parent(struct bch_read_bio *rbio) 336 { 337 return rbio->split ? rbio->parent : rbio; 338 } 339 340 __always_inline 341 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, 342 enum rbio_context context, 343 struct workqueue_struct *wq) 344 { 345 if (context <= rbio->context) { 346 fn(&rbio->work); 347 } else { 348 rbio->work.func = fn; 349 rbio->context = context; 350 queue_work(wq, &rbio->work); 351 } 352 } 353 354 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) 355 { 356 BUG_ON(rbio->bounce && !rbio->split); 357 358 if (rbio->promote) 359 promote_free(rbio->c, rbio->promote); 360 rbio->promote = NULL; 361 362 if (rbio->bounce) 363 bch2_bio_free_pages_pool(rbio->c, &rbio->bio); 364 365 if (rbio->split) { 366 struct bch_read_bio *parent = rbio->parent; 367 368 if (rbio->kmalloc) 369 kfree(rbio); 370 else 371 bio_put(&rbio->bio); 372 373 rbio = parent; 374 } 375 376 return rbio; 377 } 378 379 /* 380 * Only called on a top level bch_read_bio to complete an entire read request, 381 * not a split: 382 */ 383 static void bch2_rbio_done(struct bch_read_bio *rbio) 384 { 385 if (rbio->start_time) 386 bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], 387 rbio->start_time); 388 bio_endio(&rbio->bio); 389 } 390 391 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, 392 struct bvec_iter bvec_iter, 393 struct bch_io_failures *failed, 394 unsigned flags) 395 { 396 struct btree_trans *trans = bch2_trans_get(c); 397 struct btree_iter iter; 398 struct bkey_buf sk; 399 struct bkey_s_c k; 400 int ret; 401 402 flags &= ~BCH_READ_LAST_FRAGMENT; 403 flags |= BCH_READ_MUST_CLONE; 404 405 bch2_bkey_buf_init(&sk); 406 407 bch2_trans_iter_init(trans, &iter, rbio->data_btree, 408 rbio->read_pos, BTREE_ITER_slots); 409 retry: 410 bch2_trans_begin(trans); 411 rbio->bio.bi_status = 0; 412 413 ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); 414 if (ret) 415 goto err; 416 417 bch2_bkey_buf_reassemble(&sk, c, k); 418 k = bkey_i_to_s_c(sk.k); 419 420 if (!bch2_bkey_matches_ptr(c, k, 421 rbio->pick.ptr, 422 rbio->data_pos.offset - 423 rbio->pick.crc.offset)) { 424 /* extent we wanted to read no longer exists: */ 425 rbio->hole = true; 426 goto out; 427 } 428 429 ret = __bch2_read_extent(trans, rbio, bvec_iter, 430 rbio->read_pos, 431 rbio->data_btree, 432 k, 0, failed, flags); 433 if (ret == READ_RETRY) 434 goto retry; 435 if (ret) 436 goto err; 437 out: 438 bch2_rbio_done(rbio); 439 bch2_trans_iter_exit(trans, &iter); 440 bch2_trans_put(trans); 441 bch2_bkey_buf_exit(&sk, c); 442 return; 443 err: 444 rbio->bio.bi_status = BLK_STS_IOERR; 445 goto out; 446 } 447 448 static void bch2_rbio_retry(struct work_struct *work) 449 { 450 struct bch_read_bio *rbio = 451 container_of(work, struct bch_read_bio, work); 452 struct bch_fs *c = rbio->c; 453 struct bvec_iter iter = rbio->bvec_iter; 454 unsigned flags = rbio->flags; 455 subvol_inum inum = { 456 .subvol = rbio->subvol, 457 .inum = rbio->read_pos.inode, 458 }; 459 struct bch_io_failures failed = { .nr = 0 }; 460 461 trace_and_count(c, read_retry, &rbio->bio); 462 463 if (rbio->retry == READ_RETRY_AVOID) 464 bch2_mark_io_failure(&failed, &rbio->pick); 465 466 rbio->bio.bi_status = 0; 467 468 rbio = bch2_rbio_free(rbio); 469 470 flags |= BCH_READ_IN_RETRY; 471 flags &= ~BCH_READ_MAY_PROMOTE; 472 473 if (flags & BCH_READ_NODECODE) { 474 bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); 475 } else { 476 flags &= ~BCH_READ_LAST_FRAGMENT; 477 flags |= BCH_READ_MUST_CLONE; 478 479 __bch2_read(c, rbio, iter, inum, &failed, flags); 480 } 481 } 482 483 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, 484 blk_status_t error) 485 { 486 rbio->retry = retry; 487 488 if (rbio->flags & BCH_READ_IN_RETRY) 489 return; 490 491 if (retry == READ_ERR) { 492 rbio = bch2_rbio_free(rbio); 493 494 rbio->bio.bi_status = error; 495 bch2_rbio_done(rbio); 496 } else { 497 bch2_rbio_punt(rbio, bch2_rbio_retry, 498 RBIO_CONTEXT_UNBOUND, system_unbound_wq); 499 } 500 } 501 502 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, 503 struct bch_read_bio *rbio) 504 { 505 struct bch_fs *c = rbio->c; 506 u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; 507 struct bch_extent_crc_unpacked new_crc; 508 struct btree_iter iter; 509 struct bkey_i *new; 510 struct bkey_s_c k; 511 int ret = 0; 512 513 if (crc_is_compressed(rbio->pick.crc)) 514 return 0; 515 516 k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, 517 BTREE_ITER_slots|BTREE_ITER_intent); 518 if ((ret = bkey_err(k))) 519 goto out; 520 521 if (bversion_cmp(k.k->bversion, rbio->version) || 522 !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) 523 goto out; 524 525 /* Extent was merged? */ 526 if (bkey_start_offset(k.k) < data_offset || 527 k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) 528 goto out; 529 530 if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, 531 rbio->pick.crc, NULL, &new_crc, 532 bkey_start_offset(k.k) - data_offset, k.k->size, 533 rbio->pick.crc.csum_type)) { 534 bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); 535 ret = 0; 536 goto out; 537 } 538 539 /* 540 * going to be temporarily appending another checksum entry: 541 */ 542 new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 543 sizeof(struct bch_extent_crc128)); 544 if ((ret = PTR_ERR_OR_ZERO(new))) 545 goto out; 546 547 bkey_reassemble(new, k); 548 549 if (!bch2_bkey_narrow_crcs(new, new_crc)) 550 goto out; 551 552 ret = bch2_trans_update(trans, &iter, new, 553 BTREE_UPDATE_internal_snapshot_node); 554 out: 555 bch2_trans_iter_exit(trans, &iter); 556 return ret; 557 } 558 559 static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) 560 { 561 bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 562 __bch2_rbio_narrow_crcs(trans, rbio)); 563 } 564 565 /* Inner part that may run in process context */ 566 static void __bch2_read_endio(struct work_struct *work) 567 { 568 struct bch_read_bio *rbio = 569 container_of(work, struct bch_read_bio, work); 570 struct bch_fs *c = rbio->c; 571 struct bio *src = &rbio->bio; 572 struct bio *dst = &bch2_rbio_parent(rbio)->bio; 573 struct bvec_iter dst_iter = rbio->bvec_iter; 574 struct bch_extent_crc_unpacked crc = rbio->pick.crc; 575 struct nonce nonce = extent_nonce(rbio->version, crc); 576 unsigned nofs_flags; 577 struct bch_csum csum; 578 int ret; 579 580 nofs_flags = memalloc_nofs_save(); 581 582 /* Reset iterator for checksumming and copying bounced data: */ 583 if (rbio->bounce) { 584 src->bi_iter.bi_size = crc.compressed_size << 9; 585 src->bi_iter.bi_idx = 0; 586 src->bi_iter.bi_bvec_done = 0; 587 } else { 588 src->bi_iter = rbio->bvec_iter; 589 } 590 591 csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); 592 if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) 593 goto csum_err; 594 595 /* 596 * XXX 597 * We need to rework the narrow_crcs path to deliver the read completion 598 * first, and then punt to a different workqueue, otherwise we're 599 * holding up reads while doing btree updates which is bad for memory 600 * reclaim. 601 */ 602 if (unlikely(rbio->narrow_crcs)) 603 bch2_rbio_narrow_crcs(rbio); 604 605 if (rbio->flags & BCH_READ_NODECODE) 606 goto nodecode; 607 608 /* Adjust crc to point to subset of data we want: */ 609 crc.offset += rbio->offset_into_extent; 610 crc.live_size = bvec_iter_sectors(rbio->bvec_iter); 611 612 if (crc_is_compressed(crc)) { 613 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); 614 if (ret) 615 goto decrypt_err; 616 617 if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && 618 !c->opts.no_data_io) 619 goto decompression_err; 620 } else { 621 /* don't need to decrypt the entire bio: */ 622 nonce = nonce_add(nonce, crc.offset << 9); 623 bio_advance(src, crc.offset << 9); 624 625 BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); 626 src->bi_iter.bi_size = dst_iter.bi_size; 627 628 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); 629 if (ret) 630 goto decrypt_err; 631 632 if (rbio->bounce) { 633 struct bvec_iter src_iter = src->bi_iter; 634 635 bio_copy_data_iter(dst, &dst_iter, src, &src_iter); 636 } 637 } 638 639 if (rbio->promote) { 640 /* 641 * Re encrypt data we decrypted, so it's consistent with 642 * rbio->crc: 643 */ 644 ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); 645 if (ret) 646 goto decrypt_err; 647 648 promote_start(rbio->promote, rbio); 649 rbio->promote = NULL; 650 } 651 nodecode: 652 if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { 653 rbio = bch2_rbio_free(rbio); 654 bch2_rbio_done(rbio); 655 } 656 out: 657 memalloc_nofs_restore(nofs_flags); 658 return; 659 csum_err: 660 /* 661 * Checksum error: if the bio wasn't bounced, we may have been 662 * reading into buffers owned by userspace (that userspace can 663 * scribble over) - retry the read, bouncing it this time: 664 */ 665 if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { 666 rbio->flags |= BCH_READ_MUST_BOUNCE; 667 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); 668 goto out; 669 } 670 671 struct printbuf buf = PRINTBUF; 672 buf.atomic++; 673 prt_str(&buf, "data "); 674 bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); 675 676 struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; 677 if (ca) { 678 bch_err_inum_offset_ratelimited(ca, 679 rbio->read_pos.inode, 680 rbio->read_pos.offset << 9, 681 "data %s", buf.buf); 682 bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); 683 } 684 printbuf_exit(&buf); 685 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); 686 goto out; 687 decompression_err: 688 bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, 689 rbio->read_pos.offset << 9, 690 "decompression error"); 691 bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); 692 goto out; 693 decrypt_err: 694 bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, 695 rbio->read_pos.offset << 9, 696 "decrypt error"); 697 bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); 698 goto out; 699 } 700 701 static void bch2_read_endio(struct bio *bio) 702 { 703 struct bch_read_bio *rbio = 704 container_of(bio, struct bch_read_bio, bio); 705 struct bch_fs *c = rbio->c; 706 struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; 707 struct workqueue_struct *wq = NULL; 708 enum rbio_context context = RBIO_CONTEXT_NULL; 709 710 if (rbio->have_ioref) { 711 bch2_latency_acct(ca, rbio->submit_time, READ); 712 percpu_ref_put(&ca->io_ref); 713 } 714 715 if (!rbio->split) 716 rbio->bio.bi_end_io = rbio->end_io; 717 718 if (bio->bi_status) { 719 if (ca) { 720 bch_err_inum_offset_ratelimited(ca, 721 rbio->read_pos.inode, 722 rbio->read_pos.offset, 723 "data read error: %s", 724 bch2_blk_status_to_str(bio->bi_status)); 725 bch2_io_error(ca, BCH_MEMBER_ERROR_read); 726 } 727 bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); 728 return; 729 } 730 731 if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || 732 (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { 733 trace_and_count(c, read_reuse_race, &rbio->bio); 734 735 if (rbio->flags & BCH_READ_RETRY_IF_STALE) 736 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); 737 else 738 bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); 739 return; 740 } 741 742 if (rbio->narrow_crcs || 743 rbio->promote || 744 crc_is_compressed(rbio->pick.crc) || 745 bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) 746 context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; 747 else if (rbio->pick.crc.csum_type) 748 context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; 749 750 bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); 751 } 752 753 int __bch2_read_indirect_extent(struct btree_trans *trans, 754 unsigned *offset_into_extent, 755 struct bkey_buf *orig_k) 756 { 757 struct btree_iter iter; 758 struct bkey_s_c k; 759 u64 reflink_offset; 760 int ret; 761 762 reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + 763 *offset_into_extent; 764 765 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, 766 POS(0, reflink_offset), 0); 767 ret = bkey_err(k); 768 if (ret) 769 goto err; 770 771 if (k.k->type != KEY_TYPE_reflink_v && 772 k.k->type != KEY_TYPE_indirect_inline_data) { 773 bch_err_inum_offset_ratelimited(trans->c, 774 orig_k->k->k.p.inode, 775 orig_k->k->k.p.offset << 9, 776 "%llu len %u points to nonexistent indirect extent %llu", 777 orig_k->k->k.p.offset, 778 orig_k->k->k.size, 779 reflink_offset); 780 bch2_inconsistent_error(trans->c); 781 ret = -BCH_ERR_missing_indirect_extent; 782 goto err; 783 } 784 785 *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); 786 bch2_bkey_buf_reassemble(orig_k, trans->c, k); 787 err: 788 bch2_trans_iter_exit(trans, &iter); 789 return ret; 790 } 791 792 static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, 793 struct bch_dev *ca, 794 struct bkey_s_c k, 795 struct bch_extent_ptr ptr) 796 { 797 struct bch_fs *c = trans->c; 798 struct btree_iter iter; 799 struct printbuf buf = PRINTBUF; 800 int ret; 801 802 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, 803 PTR_BUCKET_POS(ca, &ptr), 804 BTREE_ITER_cached); 805 806 int gen = bucket_gen_get(ca, iter.pos.offset); 807 if (gen >= 0) { 808 prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); 809 printbuf_indent_add(&buf, 2); 810 811 bch2_bkey_val_to_text(&buf, c, k); 812 prt_newline(&buf); 813 814 prt_printf(&buf, "memory gen: %u", gen); 815 816 ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); 817 if (!ret) { 818 prt_newline(&buf); 819 bch2_bkey_val_to_text(&buf, c, k); 820 } 821 } else { 822 prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", 823 iter.pos.inode, iter.pos.offset); 824 printbuf_indent_add(&buf, 2); 825 826 prt_printf(&buf, "first bucket %u nbuckets %llu\n", 827 ca->mi.first_bucket, ca->mi.nbuckets); 828 829 bch2_bkey_val_to_text(&buf, c, k); 830 prt_newline(&buf); 831 } 832 833 bch2_fs_inconsistent(c, "%s", buf.buf); 834 835 bch2_trans_iter_exit(trans, &iter); 836 printbuf_exit(&buf); 837 } 838 839 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, 840 struct bvec_iter iter, struct bpos read_pos, 841 enum btree_id data_btree, struct bkey_s_c k, 842 unsigned offset_into_extent, 843 struct bch_io_failures *failed, unsigned flags) 844 { 845 struct bch_fs *c = trans->c; 846 struct extent_ptr_decoded pick; 847 struct bch_read_bio *rbio = NULL; 848 struct promote_op *promote = NULL; 849 bool bounce = false, read_full = false, narrow_crcs = false; 850 struct bpos data_pos = bkey_start_pos(k.k); 851 int pick_ret; 852 853 if (bkey_extent_is_inline_data(k.k)) { 854 unsigned bytes = min_t(unsigned, iter.bi_size, 855 bkey_inline_data_bytes(k.k)); 856 857 swap(iter.bi_size, bytes); 858 memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); 859 swap(iter.bi_size, bytes); 860 bio_advance_iter(&orig->bio, &iter, bytes); 861 zero_fill_bio_iter(&orig->bio, iter); 862 goto out_read_done; 863 } 864 retry_pick: 865 pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); 866 867 /* hole or reservation - just zero fill: */ 868 if (!pick_ret) 869 goto hole; 870 871 if (pick_ret < 0) { 872 struct printbuf buf = PRINTBUF; 873 bch2_bkey_val_to_text(&buf, c, k); 874 875 bch_err_inum_offset_ratelimited(c, 876 read_pos.inode, read_pos.offset << 9, 877 "no device to read from: %s\n %s", 878 bch2_err_str(pick_ret), 879 buf.buf); 880 printbuf_exit(&buf); 881 goto err; 882 } 883 884 struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); 885 886 /* 887 * Stale dirty pointers are treated as IO errors, but @failed isn't 888 * allocated unless we're in the retry path - so if we're not in the 889 * retry path, don't check here, it'll be caught in bch2_read_endio() 890 * and we'll end up in the retry path: 891 */ 892 if ((flags & BCH_READ_IN_RETRY) && 893 !pick.ptr.cached && 894 ca && 895 unlikely(dev_ptr_stale(ca, &pick.ptr))) { 896 read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); 897 bch2_mark_io_failure(failed, &pick); 898 percpu_ref_put(&ca->io_ref); 899 goto retry_pick; 900 } 901 902 /* 903 * Unlock the iterator while the btree node's lock is still in 904 * cache, before doing the IO: 905 */ 906 bch2_trans_unlock(trans); 907 908 if (flags & BCH_READ_NODECODE) { 909 /* 910 * can happen if we retry, and the extent we were going to read 911 * has been merged in the meantime: 912 */ 913 if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { 914 if (ca) 915 percpu_ref_put(&ca->io_ref); 916 goto hole; 917 } 918 919 iter.bi_size = pick.crc.compressed_size << 9; 920 goto get_bio; 921 } 922 923 if (!(flags & BCH_READ_LAST_FRAGMENT) || 924 bio_flagged(&orig->bio, BIO_CHAIN)) 925 flags |= BCH_READ_MUST_CLONE; 926 927 narrow_crcs = !(flags & BCH_READ_IN_RETRY) && 928 bch2_can_narrow_extent_crcs(k, pick.crc); 929 930 if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) 931 flags |= BCH_READ_MUST_BOUNCE; 932 933 EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); 934 935 if (crc_is_compressed(pick.crc) || 936 (pick.crc.csum_type != BCH_CSUM_none && 937 (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || 938 (bch2_csum_type_is_encryption(pick.crc.csum_type) && 939 (flags & BCH_READ_USER_MAPPED)) || 940 (flags & BCH_READ_MUST_BOUNCE)))) { 941 read_full = true; 942 bounce = true; 943 } 944 945 if (orig->opts.promote_target)// || failed) 946 promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, 947 &rbio, &bounce, &read_full, failed); 948 949 if (!read_full) { 950 EBUG_ON(crc_is_compressed(pick.crc)); 951 EBUG_ON(pick.crc.csum_type && 952 (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || 953 bvec_iter_sectors(iter) != pick.crc.live_size || 954 pick.crc.offset || 955 offset_into_extent)); 956 957 data_pos.offset += offset_into_extent; 958 pick.ptr.offset += pick.crc.offset + 959 offset_into_extent; 960 offset_into_extent = 0; 961 pick.crc.compressed_size = bvec_iter_sectors(iter); 962 pick.crc.uncompressed_size = bvec_iter_sectors(iter); 963 pick.crc.offset = 0; 964 pick.crc.live_size = bvec_iter_sectors(iter); 965 } 966 get_bio: 967 if (rbio) { 968 /* 969 * promote already allocated bounce rbio: 970 * promote needs to allocate a bio big enough for uncompressing 971 * data in the write path, but we're not going to use it all 972 * here: 973 */ 974 EBUG_ON(rbio->bio.bi_iter.bi_size < 975 pick.crc.compressed_size << 9); 976 rbio->bio.bi_iter.bi_size = 977 pick.crc.compressed_size << 9; 978 } else if (bounce) { 979 unsigned sectors = pick.crc.compressed_size; 980 981 rbio = rbio_init(bio_alloc_bioset(NULL, 982 DIV_ROUND_UP(sectors, PAGE_SECTORS), 983 0, 984 GFP_NOFS, 985 &c->bio_read_split), 986 orig->opts); 987 988 bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); 989 rbio->bounce = true; 990 rbio->split = true; 991 } else if (flags & BCH_READ_MUST_CLONE) { 992 /* 993 * Have to clone if there were any splits, due to error 994 * reporting issues (if a split errored, and retrying didn't 995 * work, when it reports the error to its parent (us) we don't 996 * know if the error was from our bio, and we should retry, or 997 * from the whole bio, in which case we don't want to retry and 998 * lose the error) 999 */ 1000 rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, 1001 &c->bio_read_split), 1002 orig->opts); 1003 rbio->bio.bi_iter = iter; 1004 rbio->split = true; 1005 } else { 1006 rbio = orig; 1007 rbio->bio.bi_iter = iter; 1008 EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); 1009 } 1010 1011 EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); 1012 1013 rbio->c = c; 1014 rbio->submit_time = local_clock(); 1015 if (rbio->split) 1016 rbio->parent = orig; 1017 else 1018 rbio->end_io = orig->bio.bi_end_io; 1019 rbio->bvec_iter = iter; 1020 rbio->offset_into_extent= offset_into_extent; 1021 rbio->flags = flags; 1022 rbio->have_ioref = ca != NULL; 1023 rbio->narrow_crcs = narrow_crcs; 1024 rbio->hole = 0; 1025 rbio->retry = 0; 1026 rbio->context = 0; 1027 /* XXX: only initialize this if needed */ 1028 rbio->devs_have = bch2_bkey_devs(k); 1029 rbio->pick = pick; 1030 rbio->subvol = orig->subvol; 1031 rbio->read_pos = read_pos; 1032 rbio->data_btree = data_btree; 1033 rbio->data_pos = data_pos; 1034 rbio->version = k.k->bversion; 1035 rbio->promote = promote; 1036 INIT_WORK(&rbio->work, NULL); 1037 1038 if (flags & BCH_READ_NODECODE) 1039 orig->pick = pick; 1040 1041 rbio->bio.bi_opf = orig->bio.bi_opf; 1042 rbio->bio.bi_iter.bi_sector = pick.ptr.offset; 1043 rbio->bio.bi_end_io = bch2_read_endio; 1044 1045 if (rbio->bounce) 1046 trace_and_count(c, read_bounce, &rbio->bio); 1047 1048 this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); 1049 bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); 1050 1051 /* 1052 * If it's being moved internally, we don't want to flag it as a cache 1053 * hit: 1054 */ 1055 if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) 1056 bch2_bucket_io_time_reset(trans, pick.ptr.dev, 1057 PTR_BUCKET_NR(ca, &pick.ptr), READ); 1058 1059 if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { 1060 bio_inc_remaining(&orig->bio); 1061 trace_and_count(c, read_split, &orig->bio); 1062 } 1063 1064 if (!rbio->pick.idx) { 1065 if (!rbio->have_ioref) { 1066 bch_err_inum_offset_ratelimited(c, 1067 read_pos.inode, 1068 read_pos.offset << 9, 1069 "no device to read from"); 1070 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); 1071 goto out; 1072 } 1073 1074 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], 1075 bio_sectors(&rbio->bio)); 1076 bio_set_dev(&rbio->bio, ca->disk_sb.bdev); 1077 1078 if (unlikely(c->opts.no_data_io)) { 1079 if (likely(!(flags & BCH_READ_IN_RETRY))) 1080 bio_endio(&rbio->bio); 1081 } else { 1082 if (likely(!(flags & BCH_READ_IN_RETRY))) 1083 submit_bio(&rbio->bio); 1084 else 1085 submit_bio_wait(&rbio->bio); 1086 } 1087 1088 /* 1089 * We just submitted IO which may block, we expect relock fail 1090 * events and shouldn't count them: 1091 */ 1092 trans->notrace_relock_fail = true; 1093 } else { 1094 /* Attempting reconstruct read: */ 1095 if (bch2_ec_read_extent(trans, rbio, k)) { 1096 bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); 1097 goto out; 1098 } 1099 1100 if (likely(!(flags & BCH_READ_IN_RETRY))) 1101 bio_endio(&rbio->bio); 1102 } 1103 out: 1104 if (likely(!(flags & BCH_READ_IN_RETRY))) { 1105 return 0; 1106 } else { 1107 int ret; 1108 1109 rbio->context = RBIO_CONTEXT_UNBOUND; 1110 bch2_read_endio(&rbio->bio); 1111 1112 ret = rbio->retry; 1113 rbio = bch2_rbio_free(rbio); 1114 1115 if (ret == READ_RETRY_AVOID) { 1116 bch2_mark_io_failure(failed, &pick); 1117 ret = READ_RETRY; 1118 } 1119 1120 if (!ret) 1121 goto out_read_done; 1122 1123 return ret; 1124 } 1125 1126 err: 1127 if (flags & BCH_READ_IN_RETRY) 1128 return READ_ERR; 1129 1130 orig->bio.bi_status = BLK_STS_IOERR; 1131 goto out_read_done; 1132 1133 hole: 1134 /* 1135 * won't normally happen in the BCH_READ_NODECODE 1136 * (bch2_move_extent()) path, but if we retry and the extent we wanted 1137 * to read no longer exists we have to signal that: 1138 */ 1139 if (flags & BCH_READ_NODECODE) 1140 orig->hole = true; 1141 1142 zero_fill_bio_iter(&orig->bio, iter); 1143 out_read_done: 1144 if (flags & BCH_READ_LAST_FRAGMENT) 1145 bch2_rbio_done(orig); 1146 return 0; 1147 } 1148 1149 void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, 1150 struct bvec_iter bvec_iter, subvol_inum inum, 1151 struct bch_io_failures *failed, unsigned flags) 1152 { 1153 struct btree_trans *trans = bch2_trans_get(c); 1154 struct btree_iter iter; 1155 struct bkey_buf sk; 1156 struct bkey_s_c k; 1157 int ret; 1158 1159 BUG_ON(flags & BCH_READ_NODECODE); 1160 1161 bch2_bkey_buf_init(&sk); 1162 bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 1163 POS(inum.inum, bvec_iter.bi_sector), 1164 BTREE_ITER_slots); 1165 1166 while (1) { 1167 unsigned bytes, sectors, offset_into_extent; 1168 enum btree_id data_btree = BTREE_ID_extents; 1169 1170 bch2_trans_begin(trans); 1171 1172 u32 snapshot; 1173 ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 1174 if (ret) 1175 goto err; 1176 1177 bch2_btree_iter_set_snapshot(&iter, snapshot); 1178 1179 bch2_btree_iter_set_pos(&iter, 1180 POS(inum.inum, bvec_iter.bi_sector)); 1181 1182 k = bch2_btree_iter_peek_slot(&iter); 1183 ret = bkey_err(k); 1184 if (ret) 1185 goto err; 1186 1187 offset_into_extent = iter.pos.offset - 1188 bkey_start_offset(k.k); 1189 sectors = k.k->size - offset_into_extent; 1190 1191 bch2_bkey_buf_reassemble(&sk, c, k); 1192 1193 ret = bch2_read_indirect_extent(trans, &data_btree, 1194 &offset_into_extent, &sk); 1195 if (ret) 1196 goto err; 1197 1198 k = bkey_i_to_s_c(sk.k); 1199 1200 /* 1201 * With indirect extents, the amount of data to read is the min 1202 * of the original extent and the indirect extent: 1203 */ 1204 sectors = min(sectors, k.k->size - offset_into_extent); 1205 1206 bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; 1207 swap(bvec_iter.bi_size, bytes); 1208 1209 if (bvec_iter.bi_size == bytes) 1210 flags |= BCH_READ_LAST_FRAGMENT; 1211 1212 ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, 1213 data_btree, k, 1214 offset_into_extent, failed, flags); 1215 if (ret) 1216 goto err; 1217 1218 if (flags & BCH_READ_LAST_FRAGMENT) 1219 break; 1220 1221 swap(bvec_iter.bi_size, bytes); 1222 bio_advance_iter(&rbio->bio, &bvec_iter, bytes); 1223 err: 1224 if (ret && 1225 !bch2_err_matches(ret, BCH_ERR_transaction_restart) && 1226 ret != READ_RETRY && 1227 ret != READ_RETRY_AVOID) 1228 break; 1229 } 1230 1231 bch2_trans_iter_exit(trans, &iter); 1232 bch2_trans_put(trans); 1233 bch2_bkey_buf_exit(&sk, c); 1234 1235 if (ret) { 1236 bch_err_inum_offset_ratelimited(c, inum.inum, 1237 bvec_iter.bi_sector << 9, 1238 "read error %i from btree lookup", ret); 1239 rbio->bio.bi_status = BLK_STS_IOERR; 1240 bch2_rbio_done(rbio); 1241 } 1242 } 1243 1244 void bch2_fs_io_read_exit(struct bch_fs *c) 1245 { 1246 if (c->promote_table.tbl) 1247 rhashtable_destroy(&c->promote_table); 1248 bioset_exit(&c->bio_read_split); 1249 bioset_exit(&c->bio_read); 1250 } 1251 1252 int bch2_fs_io_read_init(struct bch_fs *c) 1253 { 1254 if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), 1255 BIOSET_NEED_BVECS)) 1256 return -BCH_ERR_ENOMEM_bio_read_init; 1257 1258 if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), 1259 BIOSET_NEED_BVECS)) 1260 return -BCH_ERR_ENOMEM_bio_read_split_init; 1261 1262 if (rhashtable_init(&c->promote_table, &bch_promote_params)) 1263 return -BCH_ERR_ENOMEM_promote_table_init; 1264 1265 return 0; 1266 } 1267