// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!failed) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_MAY_PROMOTE))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}

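/*
 * The read side of a promote finished: the data we wanted now sits in the
 * read bio's bounce pages, so hand those pages over to the data update
 * machinery and kick off the background write that caches the extent.
 */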
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}

static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio,
					  struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_KERNEL);
	if (!*rbio) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	(*rbio)->bounce		= true;
	(*rbio)->split		= true;
	(*rbio)->kmalloc	= true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	struct data_update_opts update_opts = {};

	if (!failed) {
		update_opts.target = opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
	} else {
		update_opts.target = opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned i = 0;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev))
				update_opts.rewrite_ptrs |= BIT(i);
			i++;
		}
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			update_opts,
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
					      bch_promote_params));
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	kfree(op);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

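/*
 * promote_alloc() decides whether the data we're about to read should also be
 * written out - to the promote target, or, when @failed is set, to healthy
 * devices to recover from an IO error - and if so allocates the promote op
 * and forces the read to bounce so we have a stable copy to write from.
 */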
noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full,
					struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (failed ||
			     *read_full ||
			     READ_ONCE(c->promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags, failed);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio, failed);
	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}

/* Read */

#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

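/*
 * Retry path for BCH_READ_NODECODE reads (the data move path): re-walk the
 * btree to find the extent we were reading and reissue the IO.  If the extent
 * we wanted no longer exists, report a hole instead of an error.
 */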
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_slots);
retry:
	rbio->bio.bi_status = 0;

	k = bch2_btree_iter_peek_slot(&iter);
	if (bkey_err(k))
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}

static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}

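/*
 * CRC "narrowing": when the extent's existing checksum covers a larger range
 * than is currently live and we've just read and verified that whole range,
 * we can rewrite the key with a checksum covering only the live portion, so
 * that future reads don't have to read and checksum the extra data.
 */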
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->version, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		      __bch2_rbio_narrow_crcs(trans, rbio));
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bio *src		= &rbio->bio;
	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset	+= rbio->offset_into_extent;
	crc.live_size	 = bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	struct printbuf buf = PRINTBUF;
	buf.atomic++;
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca) {
		bch_err_inum_offset_ratelimited(ca,
			rbio->read_pos.inode,
			rbio->read_pos.offset << 9,
			"data %s", buf.buf);
		bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	}
	printbuf_exit(&buf);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}

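/*
 * Hardware completion handler: account device latency, handle IO errors and
 * stale (reused) cached pointers, then punt the rest of the work - checksum
 * verification, decryption, decompression - to an appropriate context.
 */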
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref
		? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	if (rbio->have_ioref) {
		bch2_latency_acct(ca, rbio->submit_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (bio->bi_status) {
		if (ca) {
			bch_err_inum_offset_ratelimited(ca,
				rbio->read_pos.inode,
				rbio->read_pos.offset,
				"data read error: %s",
				bch2_blk_status_to_str(bio->bi_status));
			bch2_io_error(ca, BCH_MEMBER_ERROR_read);
		}
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

int __bch2_read_indirect_extent(struct btree_trans *trans,
				unsigned *offset_into_extent,
				struct bkey_buf *orig_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 reflink_offset;
	int ret;

	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
		*offset_into_extent;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
			       POS(0, reflink_offset), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_reflink_v &&
	    k.k->type != KEY_TYPE_indirect_inline_data) {
		bch_err_inum_offset_ratelimited(trans->c,
			orig_k->k->k.p.inode,
			orig_k->k->k.p.offset << 9,
			"%llu len %u points to nonexistent indirect extent %llu",
			orig_k->k->k.p.offset,
			orig_k->k->k.size,
			reflink_offset);
		bch2_inconsistent_error(trans->c);
		ret = -EIO;
		goto err;
	}

	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

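/*
 * We should never see a dirty (non-cached) pointer to a bucket whose gen has
 * moved on; if we do, the filesystem is inconsistent, so gather as much
 * context as we can - the extent, the in-memory gen, the alloc key - before
 * reporting it.
 */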
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	u8 *gen = bucket_gen(ca, iter.pos.offset);
	if (gen) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", *gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

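/*
 * Read a single extent (or a fragment of one): pick a device to read from,
 * decide whether we need to bounce (for checksums, compression, encryption or
 * promotion), clone or split the bio as needed, then submit the IO.  Called
 * from the main read path and from the retry paths.
 */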
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	int pick_ret;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		goto out_read_done;
	}
retry_pick:
	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

	/* hole or reservation - just zero fill: */
	if (!pick_ret)
		goto hole;

	if (pick_ret < 0) {
		bch_err_inum_offset_ratelimited(c,
				read_pos.inode, read_pos.offset << 9,
				"no device to read from");
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_IN_RETRY) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick);
		percpu_ref_put(&ca->io_ref);
		goto retry_pick;
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	bch2_trans_unlock(trans);

	if (flags & BCH_READ_NODECODE) {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
			if (ca)
				percpu_ref_put(&ca->io_ref);
			goto hole;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		goto get_bio;
	}

	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
	    bio_flagged(&orig->bio, BIO_CHAIN))
		flags |= BCH_READ_MUST_CLONE;

	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
		bch2_can_narrow_extent_crcs(k, pick.crc);

	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
		flags |= BCH_READ_MUST_BOUNCE;

	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

	if (crc_is_compressed(pick.crc) ||
	    (pick.crc.csum_type != BCH_CSUM_none &&
	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
	       (flags & BCH_READ_USER_MAPPED)) ||
	      (flags & BCH_READ_MUST_BOUNCE)))) {
		read_full = true;
		bounce = true;
	}

	if (orig->opts.promote_target)// || failed)
		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
					&rbio, &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}
get_bio:
	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig->opts);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
		rbio->split	= true;
	} else if (flags & BCH_READ_MUST_CLONE) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig->opts);
		rbio->bio.bi_iter = iter;
		rbio->split	= true;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->c			= c;
	rbio->submit_time	= local_clock();
	if (rbio->split)
		rbio->parent	= orig;
	else
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->hole		= 0;
	rbio->retry		= 0;
	rbio->context		= 0;
	/* XXX: only initialize this if needed */
	rbio->devs_have		= bch2_bkey_devs(k);
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->version;
	rbio->promote		= promote;
	INIT_WORK(&rbio->work, NULL);

	if (flags & BCH_READ_NODECODE)
		orig->pick = pick;

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, read_bounce, &rbio->bio);

	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, read_split, &orig->bio);
	}

	if (!rbio->pick.idx) {
		if (!rbio->have_ioref) {
			bch_err_inum_offset_ratelimited(c,
					read_pos.inode,
					read_pos.offset << 9,
					"no device to read from");
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio)) {
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_IN_RETRY)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_IN_RETRY))) {
		return 0;
	} else {
		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->retry;
		rbio = bch2_rbio_free(rbio);

		if (ret == READ_RETRY_AVOID) {
			bch2_mark_io_failure(failed, &pick);
			ret = READ_RETRY;
		}

		if (!ret)
			goto out_read_done;

		return ret;
	}

err:
	if (flags & BCH_READ_IN_RETRY)
		return READ_ERR;

	orig->bio.bi_status = BLK_STS_IOERR;
	goto out_read_done;

hole:
	/*
	 * won't normally happen in the BCH_READ_NODECODE
	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
	 * to read no longer exists we have to signal that:
	 */
	if (flags & BCH_READ_NODECODE)
		orig->hole = true;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if (flags & BCH_READ_LAST_FRAGMENT)
		bch2_rbio_done(orig);
	return 0;
}

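/*
 * Main read entry point: iterate over the requested range in the extents
 * btree, following reflink pointers to the indirect extent when necessary,
 * and issue a read for each extent fragment until the request is satisfied.
 */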
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
		 struct bvec_iter bvec_iter, subvol_inum inum,
		 struct bch_io_failures *failed, unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	BUG_ON(flags & BCH_READ_NODECODE);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min(sectors, k.k->size - offset_into_extent);

		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags);
		if (ret)
			goto err;

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			goto err;
err:
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    ret != READ_RETRY &&
		    ret != READ_RETRY_AVOID)
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	if (ret) {
		bch_err_inum_offset_ratelimited(c, inum.inum,
						bvec_iter.bi_sector << 9,
						"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bch2_rbio_done(rbio);
	}
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}