// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head rcu;
	u64 start_time;

	struct rhash_head hash;
	struct bpos pos;

	struct data_update write;
	struct bio_vec bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags)
{
	BUG_ON(!opts.promote_target);

	if (!(flags & BCH_READ_MAY_PROMOTE))
		return -BCH_ERR_nopromote_may_not;

	if (bch2_bkey_has_target(c, k, opts.promote_target))
		return -BCH_ERR_nopromote_already_promoted;

	if (bkey_extent_is_unwritten(k))
		return -BCH_ERR_nopromote_unwritten;

	if (bch2_target_congested(c, opts.promote_target))
		return -BCH_ERR_nopromote_congested;

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}

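/*
 * Hand the data we just read off to the promote (background write) path: the
 * bounce pages allocated for the read are transferred to the write bio rather
 * than copied, so the read bio no longer owns them after this.
 */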
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}

static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_KERNEL);
	if (!*rbio) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	(*rbio)->bounce		= true;
	(*rbio)->split		= true;
	(*rbio)->kmalloc	= true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			(struct data_update_opts) {
				.target		= opts.promote_target,
				.extra_replicas	= 1,
				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
			},
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
					      bch_promote_params));
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	kfree(op);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

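/*
 * Decide whether the extent we're about to read should be promoted (cached on
 * the promote target), and if so set up the background write; this may force
 * the read to be bounced and/or widened to the full extent so the promote has
 * the whole thing to write.
 */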
noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full)
{
	struct bch_fs *c = trans->c;
	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio);
	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}

/* Read */

#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_slots);
retry:
	rbio->bio.bi_status = 0;

	k = bch2_btree_iter_peek_slot(&iter);
	if (bkey_err(k))
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);
	bch2_trans_unlock(trans);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}

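/*
 * Retry a failed read from workqueue context: clear the error, mark the device
 * that failed (for READ_RETRY_AVOID) so the next pick skips it, then re-walk
 * the btree and resubmit the read.
 */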
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}

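/*
 * Checksum narrowing: if we only read (and verified) part of a checksummed
 * extent, rewrite the extent's crc entry to cover just that part, so future
 * reads don't have to read and checksum the entire extent again. Runs as a
 * btree transaction.
 */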
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->version, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		      __bch2_rbio_narrow_crcs(trans, rbio));
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bio *src		= &rbio->bio;
	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset     += rbio->offset_into_extent;
	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re-encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	struct printbuf buf = PRINTBUF;
	buf.atomic++;
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca) {
		bch_err_inum_offset_ratelimited(ca,
			rbio->read_pos.inode,
			rbio->read_pos.offset << 9,
			"data %s", buf.buf);
		bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	}
	printbuf_exit(&buf);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}

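/*
 * Bio completion handler: check for IO errors and stale (reused) cached
 * pointers, then punt the rest of completion - checksum verification,
 * decryption, decompression - to a workqueue when it can't be done in this
 * context.
 */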
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	if (rbio->have_ioref) {
		bch2_latency_acct(ca, rbio->submit_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (bio->bi_status) {
		if (ca) {
			bch_err_inum_offset_ratelimited(ca,
				rbio->read_pos.inode,
				rbio->read_pos.offset,
				"data read error: %s",
				bch2_blk_status_to_str(bio->bi_status));
			bch2_io_error(ca, BCH_MEMBER_ERROR_read);
		}
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

int __bch2_read_indirect_extent(struct btree_trans *trans,
				unsigned *offset_into_extent,
				struct bkey_buf *orig_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 reflink_offset;
	int ret;

	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
		*offset_into_extent;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
			       POS(0, reflink_offset), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_reflink_v &&
	    k.k->type != KEY_TYPE_indirect_inline_data) {
		bch_err_inum_offset_ratelimited(trans->c,
			orig_k->k->k.p.inode,
			orig_k->k->k.p.offset << 9,
			"%llu len %u points to nonexistent indirect extent %llu",
			orig_k->k->k.p.offset,
			orig_k->k->k.size,
			reflink_offset);
		bch2_inconsistent_error(trans->c);
		ret = -EIO;
		goto err;
	}

	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

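/*
 * A dirty (non-cached) pointer should never point to a bucket whose gen has
 * moved on. Dump the extent, the in-memory bucket gen, and the alloc key to
 * the log, then flag the filesystem as inconsistent.
 */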
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	u8 *gen = bucket_gen(ca, iter.pos.offset);
	if (gen) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", *gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	int pick_ret;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		goto out_read_done;
	}
retry_pick:
	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

	/* hole or reservation - just zero fill: */
	if (!pick_ret)
		goto hole;

	if (pick_ret < 0) {
		bch_err_inum_offset_ratelimited(c,
				read_pos.inode, read_pos.offset << 9,
				"no device to read from");
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_IN_RETRY) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick);
		percpu_ref_put(&ca->io_ref);
		goto retry_pick;
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	bch2_trans_unlock(trans);

	if (flags & BCH_READ_NODECODE) {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
			if (ca)
				percpu_ref_put(&ca->io_ref);
			goto hole;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		goto get_bio;
	}

	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
	    bio_flagged(&orig->bio, BIO_CHAIN))
		flags |= BCH_READ_MUST_CLONE;

	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
		bch2_can_narrow_extent_crcs(k, pick.crc);

	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
		flags |= BCH_READ_MUST_BOUNCE;

	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

	if (crc_is_compressed(pick.crc) ||
	    (pick.crc.csum_type != BCH_CSUM_none &&
	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
	       (flags & BCH_READ_USER_MAPPED)) ||
	      (flags & BCH_READ_MUST_BOUNCE)))) {
		read_full = true;
		bounce = true;
	}

	if (orig->opts.promote_target)
		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
					&rbio, &bounce, &read_full);

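	/*
	 * Not reading the whole extent: adjust pick.ptr and pick.crc so the
	 * device read covers exactly the range being requested. The EBUG_ONs
	 * assert this is only legal when the data isn't compressed and any
	 * checksum already covers precisely that range.
	 */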
	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}
get_bio:
	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig->opts);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
		rbio->split	= true;
	} else if (flags & BCH_READ_MUST_CLONE) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig->opts);
		rbio->bio.bi_iter = iter;
		rbio->split	= true;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->c			= c;
	rbio->submit_time	= local_clock();
	if (rbio->split)
		rbio->parent	= orig;
	else
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->hole		= 0;
	rbio->retry		= 0;
	rbio->context		= 0;
	/* XXX: only initialize this if needed */
	rbio->devs_have		= bch2_bkey_devs(k);
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->version;
	rbio->promote		= promote;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, read_bounce, &rbio->bio);

	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, read_split, &orig->bio);
	}

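	/*
	 * Submit the IO: pick.idx is only nonzero when the data can't be read
	 * directly and has to be reconstructed via the erasure coding path
	 * (the else branch below).
	 */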
	if (!rbio->pick.idx) {
		if (!rbio->have_ioref) {
			bch_err_inum_offset_ratelimited(c,
					read_pos.inode,
					read_pos.offset << 9,
					"no device to read from");
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio)) {
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_IN_RETRY)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_IN_RETRY))) {
		return 0;
	} else {
		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->retry;
		rbio = bch2_rbio_free(rbio);

		if (ret == READ_RETRY_AVOID) {
			bch2_mark_io_failure(failed, &pick);
			ret = READ_RETRY;
		}

		if (!ret)
			goto out_read_done;

		return ret;
	}

err:
	if (flags & BCH_READ_IN_RETRY)
		return READ_ERR;

	orig->bio.bi_status = BLK_STS_IOERR;
	goto out_read_done;

hole:
	/*
	 * won't normally happen in the BCH_READ_NODECODE
	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
	 * to read no longer exists we have to signal that:
	 */
	if (flags & BCH_READ_NODECODE)
		orig->hole = true;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if (flags & BCH_READ_LAST_FRAGMENT)
		bch2_rbio_done(orig);
	return 0;
}

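/*
 * Top level read path: walk the extents btree over the requested range and
 * hand each extent (following reflink indirection) to __bch2_read_extent(),
 * one fragment at a time.
 */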
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
		 struct bvec_iter bvec_iter, subvol_inum inum,
		 struct bch_io_failures *failed, unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	BUG_ON(flags & BCH_READ_NODECODE);

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
			     BTREE_ITER_slots);
	while (1) {
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min(sectors, k.k->size - offset_into_extent);

		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags);
		if (ret)
			break;

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    ret == READ_RETRY ||
	    ret == READ_RETRY_AVOID)
		goto retry;

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	if (ret) {
		bch_err_inum_offset_ratelimited(c, inum.inum,
						bvec_iter.bi_sector << 9,
						"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bch2_rbio_done(rbio);
	}
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}