// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct data_update	write;
	struct bio_vec		bi_inline_vecs[0]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset	= offsetof(struct promote_op, hash),
	.key_offset	= offsetof(struct promote_op, pos),
	.key_len	= sizeof(struct bpos),
};

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				  struct bpos pos,
				  struct bch_io_opts opts,
				  unsigned flags)
{
	BUG_ON(!opts.promote_target);

	if (!(flags & BCH_READ_MAY_PROMOTE))
		return -BCH_ERR_nopromote_may_not;

	if (bch2_bkey_has_target(c, k, opts.promote_target))
		return -BCH_ERR_nopromote_already_promoted;

	if (bkey_extent_is_unwritten(k))
		return -BCH_ERR_nopromote_unwritten;

	if (bch2_target_congested(c, opts.promote_target))
		return -BCH_ERR_nopromote_congested;

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}
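
/*
 * The read we're promoting from has completed and its data is sitting in the
 * bounce buffer: hand the pages off to the data update machinery, which will
 * write them out to the promote target.
 */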
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}

static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return NULL;

	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
	if (!op)
		goto err;

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_NOFS);
	if (!*rbio)
		goto err;

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
				 GFP_NOFS))
		goto err;

	(*rbio)->bounce		= true;
	(*rbio)->split		= true;
	(*rbio)->kmalloc	= true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params))
		goto err;

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			(struct data_update_opts) {
				.target		= opts.promote_target,
				.extra_replicas	= 1,
				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
			},
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					     bch_promote_params);
		BUG_ON(ret);
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	kfree(op);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return NULL;
}
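
/*
 * Decide whether this read should be promoted and, if so, allocate the promote
 * op and its bounce rbio. A successful promote forces the read to bounce (and
 * possibly to read the full extent), so *bounce and *read_full are updated
 * accordingly.
 */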
noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full)
{
	struct bch_fs *c = trans->c;
	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio);
	if (!promote) {
		ret = -BCH_ERR_nopromote_enomem;
		goto nopromote;
	}

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}

/* Read */

#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_SLOTS);
retry:
	rbio->bio.bi_status = 0;

	k = bch2_btree_iter_peek_slot(&iter);
	if (bkey_err(k))
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);
	bch2_trans_unlock(trans);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}
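
/*
 * Work item that retries a failed read: if the failure looked device specific
 * (READ_RETRY_AVOID), the bad device is recorded in the failure list so that
 * the retry picks a different replica.
 */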
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}
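
/*
 * CRC narrowing: if the extent's checksum covers more data than the extent
 * currently references (e.g. after a partial overwrite trimmed it), rewrite it
 * with a checksum covering just the live range, computed from the data we just
 * read and verified - so future reads don't have to read and checksum the
 * extra sectors.
 */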
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->version, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
		      __bch2_rbio_narrow_crcs(trans, rbio));
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
	struct bio *src		= &rbio->bio;
	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset     += rbio->offset_into_extent;
	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	bch_err_inum_offset_ratelimited(ca,
		rbio->read_pos.inode,
		rbio->read_pos.offset << 9,
		"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
		rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
		csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
	bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}
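
/*
 * Bio completion handler: checks for IO errors and stale pointers, then hands
 * the rest of the work (checksum verification, decrypt, decompress, promote)
 * off to an appropriate context via bch2_rbio_punt().
 */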
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	if (rbio->have_ioref) {
		bch2_latency_acct(ca, rbio->submit_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
				    rbio->read_pos.inode,
				    rbio->read_pos.offset,
				    "data read error: %s",
				    bch2_blk_status_to_str(bio->bi_status))) {
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
	    ptr_stale(ca, &rbio->pick.ptr)) {
		trace_and_count(c, read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

int __bch2_read_indirect_extent(struct btree_trans *trans,
				unsigned *offset_into_extent,
				struct bkey_buf *orig_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 reflink_offset;
	int ret;

	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
		*offset_into_extent;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
			       POS(0, reflink_offset), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_reflink_v &&
	    k.k->type != KEY_TYPE_indirect_inline_data) {
		bch_err_inum_offset_ratelimited(trans->c,
			orig_k->k->k.p.inode,
			orig_k->k->k.p.offset << 9,
			"%llu len %u points to nonexistent indirect extent %llu",
			orig_k->k->k.p.offset,
			orig_k->k->k.size,
			reflink_offset);
		bch2_inconsistent_error(trans->c);
		ret = -EIO;
		goto err;
	}

	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(c, &ptr),
			     BTREE_ITER_CACHED);

	prt_printf(&buf, "Attempting to read from stale dirty pointer:");
	printbuf_indent_add(&buf, 2);
	prt_newline(&buf);

	bch2_bkey_val_to_text(&buf, c, k);
	prt_newline(&buf);

	prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));

	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	if (!ret) {
		prt_newline(&buf);
		bch2_bkey_val_to_text(&buf, c, k);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}
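
/*
 * Read a single extent (or a fragment of one): pick a replica to read from,
 * set up bouncing, checksum narrowing and promotion as needed, then submit the
 * IO. In the retry path (BCH_READ_IN_RETRY) the read is performed
 * synchronously and the return value indicates whether and how to retry.
 */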
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	struct bch_dev *ca = NULL;
	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	int pick_ret;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		goto out_read_done;
	}
retry_pick:
	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

	/* hole or reservation - just zero fill: */
	if (!pick_ret)
		goto hole;

	if (pick_ret < 0) {
		bch_err_inum_offset_ratelimited(c,
				read_pos.inode, read_pos.offset << 9,
				"no device to read from");
		goto err;
	}

	ca = bch_dev_bkey_exists(c, pick.ptr.dev);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_IN_RETRY) &&
	    !pick.ptr.cached &&
	    unlikely(ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick);
		goto retry_pick;
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	bch2_trans_unlock(trans);

	if (flags & BCH_READ_NODECODE) {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
			goto hole;

		iter.bi_size	= pick.crc.compressed_size << 9;
		goto get_bio;
	}

	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
	    bio_flagged(&orig->bio, BIO_CHAIN))
		flags |= BCH_READ_MUST_CLONE;

	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
		bch2_can_narrow_extent_crcs(k, pick.crc);

	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
		flags |= BCH_READ_MUST_BOUNCE;

	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

	if (crc_is_compressed(pick.crc) ||
	    (pick.crc.csum_type != BCH_CSUM_none &&
	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
	       (flags & BCH_READ_USER_MAPPED)) ||
	      (flags & BCH_READ_MUST_BOUNCE)))) {
		read_full = true;
		bounce = true;
	}

	if (orig->opts.promote_target)
		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
					&rbio, &bounce, &read_full);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}
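	/*
	 * Set up the bio we'll actually submit: the promote op's bounce bio if
	 * one was allocated, otherwise a fresh bounce bio, a clone of the
	 * original, or the original bio itself:
	 */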
get_bio:
	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig->opts);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
		rbio->split	= true;
	} else if (flags & BCH_READ_MUST_CLONE) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig->opts);
		rbio->bio.bi_iter = iter;
		rbio->split	= true;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->c			= c;
	rbio->submit_time	= local_clock();
	if (rbio->split)
		rbio->parent	= orig;
	else
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
	rbio->narrow_crcs	= narrow_crcs;
	rbio->hole		= 0;
	rbio->retry		= 0;
	rbio->context		= 0;
	/* XXX: only initialize this if needed */
	rbio->devs_have		= bch2_bkey_devs(k);
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->version;
	rbio->promote		= promote;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, read_bounce, &rbio->bio);

	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, read_split, &orig->bio);
	}

	if (!rbio->pick.idx) {
		if (!rbio->have_ioref) {
			bch_err_inum_offset_ratelimited(c,
					read_pos.inode,
					read_pos.offset << 9,
					"no device to read from");
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio)) {
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_IN_RETRY)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_IN_RETRY))) {
		return 0;
	} else {
		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->retry;
		rbio = bch2_rbio_free(rbio);

		if (ret == READ_RETRY_AVOID) {
			bch2_mark_io_failure(failed, &pick);
			ret = READ_RETRY;
		}

		if (!ret)
			goto out_read_done;

		return ret;
	}

err:
	if (flags & BCH_READ_IN_RETRY)
		return READ_ERR;

	orig->bio.bi_status = BLK_STS_IOERR;
	goto out_read_done;

hole:
	/*
	 * won't normally happen in the BCH_READ_NODECODE
	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
	 * to read no longer exists we have to signal that:
	 */
	if (flags & BCH_READ_NODECODE)
		orig->hole = true;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if (flags & BCH_READ_LAST_FRAGMENT)
		bch2_rbio_done(orig);
	return 0;
}
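
/*
 * Top of the normal read path: walk the extents btree for the requested range,
 * handing each extent (or fragment thereof) to __bch2_read_extent().
 */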
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
		 struct bvec_iter bvec_iter, subvol_inum inum,
		 struct bch_io_failures *failed, unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	BUG_ON(flags & BCH_READ_NODECODE);

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
			     BTREE_ITER_SLOTS);
	while (1) {
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min(sectors, k.k->size - offset_into_extent);

		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags);
		if (ret)
			break;

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    ret == READ_RETRY ||
	    ret == READ_RETRY_AVOID)
		goto retry;

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	if (ret) {
		bch_err_inum_offset_ratelimited(c, inum.inum,
						bvec_iter.bi_sector << 9,
						"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bch2_rbio_done(rbio);
	}
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}