// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags)
{
	BUG_ON(!opts.promote_target);

	if (!(flags & BCH_READ_MAY_PROMOTE))
		return -BCH_ERR_nopromote_may_not;

	if (bch2_bkey_has_target(c, k, opts.promote_target))
		return -BCH_ERR_nopromote_already_promoted;

	if (bkey_extent_is_unwritten(k))
		return -BCH_ERR_nopromote_unwritten;

	if (bch2_target_congested(c, opts.promote_target))
		return -BCH_ERR_nopromote_congested;

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}

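/*
 * The read we're promoting has completed: hand the data off to the promote's
 * data_update.  The bounce pages are transferred from the read bio to the
 * write bio, so the promote writes out the data we just read without copying
 * it.
 */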
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}

static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_KERNEL);
	if (!*rbio) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	(*rbio)->bounce		= true;
	(*rbio)->split		= true;
	(*rbio)->kmalloc	= true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			(struct data_update_opts) {
				.target		= opts.promote_target,
				.extra_replicas	= 1,
				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
			},
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
					      bch_promote_params));
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	kfree(op);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

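/*
 * Decide whether this read should also be written out to the promote target,
 * and if so allocate and initialize the promote operation.  A successful
 * promote forces the read to be bounced, and may widen it to the whole extent
 * so that the promoted copy is complete.
 */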
noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full)
{
	struct bch_fs *c = trans->c;
	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio);
	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}

/* Read */

#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_slots);
retry:
	rbio->bio.bi_status = 0;

	k = bch2_btree_iter_peek_slot(&iter);
	if (bkey_err(k))
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}

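/*
 * Worker for retrying a failed read: if the failure was device specific
 * (READ_RETRY_AVOID), the device is recorded in @failed so that a different
 * replica gets picked this time around.  Promotion is disabled on the retry
 * path.
 */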
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}

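/*
 * CRC narrowing: when only part of an extent is still live, its checksum
 * still covers the original, larger range.  Having just read and verified the
 * whole checksummed region, we can replace that checksum with one covering
 * just the live portion, so future reads don't have to fetch the dead data.
 * Only possible for uncompressed extents.
 */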
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->version, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		      __bch2_rbio_narrow_crcs(trans, rbio));
}

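/*
 * Main read completion path: verify the checksum, then decrypt and/or
 * decompress if needed, copy bounced data back into the caller's buffer, and
 * hand the data off to any pending promote.
 */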
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bio *src		= &rbio->bio;
	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset     += rbio->offset_into_extent;
	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	struct printbuf buf = PRINTBUF;
	buf.atomic++;
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca) {
		bch_err_inum_offset_ratelimited(ca,
			rbio->read_pos.inode,
			rbio->read_pos.offset << 9,
			"data %s", buf.buf);
		bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	}
	printbuf_exit(&buf);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}

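/*
 * Completion handler for the bio we submitted (may run in interrupt context):
 * account IO latency, drop the device ref, check for IO errors and stale
 * pointer races, then punt the rest of the work - checksum verification,
 * decryption, decompression - to a suitable context via bch2_rbio_punt().
 */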
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	if (rbio->have_ioref) {
		bch2_latency_acct(ca, rbio->submit_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (bio->bi_status) {
		if (ca) {
			bch_err_inum_offset_ratelimited(ca,
				rbio->read_pos.inode,
				rbio->read_pos.offset,
				"data read error: %s",
				bch2_blk_status_to_str(bio->bi_status));
			bch2_io_error(ca, BCH_MEMBER_ERROR_read);
		}
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

int __bch2_read_indirect_extent(struct btree_trans *trans,
				unsigned *offset_into_extent,
				struct bkey_buf *orig_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 reflink_offset;
	int ret;

	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
		*offset_into_extent;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
			       POS(0, reflink_offset), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_reflink_v &&
	    k.k->type != KEY_TYPE_indirect_inline_data) {
		bch_err_inum_offset_ratelimited(trans->c,
			orig_k->k->k.p.inode,
			orig_k->k->k.p.offset << 9,
			"%llu len %u points to nonexistent indirect extent %llu",
			orig_k->k->k.p.offset,
			orig_k->k->k.size,
			reflink_offset);
		bch2_inconsistent_error(trans->c);
		ret = -EIO;
		goto err;
	}

	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

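/*
 * A non-cached ("dirty") pointer whose bucket generation doesn't match is a
 * filesystem inconsistency - unlike cached pointers, dirty pointers must
 * never go stale.  Log as much detail as we can and flag the filesystem as
 * inconsistent.
 */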
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	u8 *gen = bucket_gen(ca, iter.pos.offset);
	if (gen) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", *gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

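/*
 * Read a single extent, or a fragment of one: pick a replica to read from,
 * decide whether the data needs to be bounced (for checksum verification,
 * decryption, decompression or promotion), clone or reuse the caller's bio
 * accordingly, and submit the IO.  In the retry path (BCH_READ_IN_RETRY) the
 * read is performed synchronously and a READ_RETRY/READ_ERR code is returned
 * on failure instead of completing asynchronously.
 */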
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	int pick_ret;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		goto out_read_done;
	}
retry_pick:
	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

	/* hole or reservation - just zero fill: */
	if (!pick_ret)
		goto hole;

	if (pick_ret < 0) {
		bch_err_inum_offset_ratelimited(c,
				read_pos.inode, read_pos.offset << 9,
				"no device to read from");
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_IN_RETRY) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick);
		percpu_ref_put(&ca->io_ref);
		goto retry_pick;
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	bch2_trans_unlock(trans);

	if (flags & BCH_READ_NODECODE) {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
			if (ca)
				percpu_ref_put(&ca->io_ref);
			goto hole;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		goto get_bio;
	}

	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
	    bio_flagged(&orig->bio, BIO_CHAIN))
		flags |= BCH_READ_MUST_CLONE;

	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
		bch2_can_narrow_extent_crcs(k, pick.crc);

	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
		flags |= BCH_READ_MUST_BOUNCE;

	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

	if (crc_is_compressed(pick.crc) ||
	    (pick.crc.csum_type != BCH_CSUM_none &&
	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
	       (flags & BCH_READ_USER_MAPPED)) ||
	      (flags & BCH_READ_MUST_BOUNCE)))) {
		read_full = true;
		bounce = true;
	}

	if (orig->opts.promote_target)
		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
					&rbio, &bounce, &read_full);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}
get_bio:
	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig->opts);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
		rbio->split	= true;
	} else if (flags & BCH_READ_MUST_CLONE) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig->opts);
		rbio->bio.bi_iter = iter;
		rbio->split	= true;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->c			= c;
	rbio->submit_time	= local_clock();
	if (rbio->split)
		rbio->parent	= orig;
	else
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->hole		= 0;
	rbio->retry		= 0;
	rbio->context		= 0;
	/* XXX: only initialize this if needed */
	rbio->devs_have		= bch2_bkey_devs(k);
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->version;
	rbio->promote		= promote;
	INIT_WORK(&rbio->work, NULL);

	if (flags & BCH_READ_NODECODE)
		orig->pick = pick;

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, read_bounce, &rbio->bio);

	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

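	/*
	 * If this isn't the last fragment of the request (and we're not in the
	 * synchronous retry path), the read is being split: take an extra
	 * reference on the original bio so it isn't completed until every
	 * fragment has finished:
	 */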
	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, read_split, &orig->bio);
	}

	if (!rbio->pick.idx) {
		if (!rbio->have_ioref) {
			bch_err_inum_offset_ratelimited(c,
					read_pos.inode,
					read_pos.offset << 9,
					"no device to read from");
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio)) {
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_IN_RETRY)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_IN_RETRY))) {
		return 0;
	} else {
		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->retry;
		rbio = bch2_rbio_free(rbio);

		if (ret == READ_RETRY_AVOID) {
			bch2_mark_io_failure(failed, &pick);
			ret = READ_RETRY;
		}

		if (!ret)
			goto out_read_done;

		return ret;
	}

err:
	if (flags & BCH_READ_IN_RETRY)
		return READ_ERR;

	orig->bio.bi_status = BLK_STS_IOERR;
	goto out_read_done;

hole:
	/*
	 * won't normally happen in the BCH_READ_NODECODE
	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
	 * to read no longer exists we have to signal that:
	 */
	if (flags & BCH_READ_NODECODE)
		orig->hole = true;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if (flags & BCH_READ_LAST_FRAGMENT)
		bch2_rbio_done(orig);
	return 0;
}

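/*
 * Top level entry point for a read: walk the extents btree over the range
 * covered by @bvec_iter, resolving indirect (reflink) extents as we go, and
 * issue a read for each extent the request overlaps; the fragment flagged
 * BCH_READ_LAST_FRAGMENT completes the original request.
 */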
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
		 struct bvec_iter bvec_iter, subvol_inum inum,
		 struct bch_io_failures *failed, unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	BUG_ON(flags & BCH_READ_NODECODE);

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
			     BTREE_ITER_slots);
	while (1) {
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min(sectors, k.k->size - offset_into_extent);

		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags);
		if (ret)
			break;

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    ret == READ_RETRY ||
	    ret == READ_RETRY_AVOID)
		goto retry;

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	if (ret) {
		bch_err_inum_offset_ratelimited(c, inum.inum,
						bvec_iter.bi_sector << 9,
						"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bch2_rbio_done(rbio);
	}
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}