// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset	= offsetof(struct promote_op, hash),
	.key_offset	= offsetof(struct promote_op, pos),
	.key_len	= sizeof(struct bpos),
};

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				  struct bpos pos,
				  struct bch_io_opts opts,
				  unsigned flags)
{
	BUG_ON(!opts.promote_target);

	if (!(flags & BCH_READ_MAY_PROMOTE))
		return -BCH_ERR_nopromote_may_not;

	if (bch2_bkey_has_target(c, k, opts.promote_target))
		return -BCH_ERR_nopromote_already_promoted;

	if (bkey_extent_is_unwritten(k))
		return -BCH_ERR_nopromote_unwritten;

	if (bch2_target_congested(c, opts.promote_target))
		return -BCH_ERR_nopromote_congested;

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}

static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;
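	/*
	 * The read has completed into pages we bounced into ourselves; steal
	 * them for the write bio and kick off the background write to the
	 * promote target via the data update machinery:
	 */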

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}

static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_KERNEL);
	if (!*rbio) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	(*rbio)->bounce		= true;
	(*rbio)->split		= true;
	(*rbio)->kmalloc	= true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			(struct data_update_opts) {
				.target		= opts.promote_target,
				.extra_replicas	= 1,
				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
			},
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
					      bch_promote_params));
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	kfree(op);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full)
{
	struct bch_fs *c = trans->c;
	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio);
	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}

/* Read */

#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_SLOTS);
retry:
	rbio->bio.bi_status = 0;

	k = bch2_btree_iter_peek_slot(&iter);
	if (bkey_err(k))
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);
	bch2_trans_unlock(trans);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}

static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}

static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->version, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		      __bch2_rbio_narrow_crcs(trans, rbio));
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
	struct bio *src		= &rbio->bio;
	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset     += rbio->offset_into_extent;
	crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	struct printbuf buf = PRINTBUF;
	buf.atomic++;
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	bch_err_inum_offset_ratelimited(ca,
		rbio->read_pos.inode,
		rbio->read_pos.offset << 9,
		"data %s", buf.buf);
	printbuf_exit(&buf);

	bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	if (rbio->have_ioref) {
		bch2_latency_acct(ca, rbio->submit_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
				    rbio->read_pos.inode,
				    rbio->read_pos.offset,
				    "data read error: %s",
				    bch2_blk_status_to_str(bio->bi_status))) {
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
	    ptr_stale(ca, &rbio->pick.ptr)) {
		trace_and_count(c, read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

int __bch2_read_indirect_extent(struct btree_trans *trans,
				unsigned *offset_into_extent,
				struct bkey_buf *orig_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 reflink_offset;
	int ret;

	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
		*offset_into_extent;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
			       POS(0, reflink_offset), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_reflink_v &&
	    k.k->type != KEY_TYPE_indirect_inline_data) {
		bch_err_inum_offset_ratelimited(trans->c,
			orig_k->k->k.p.inode,
			orig_k->k->k.p.offset << 9,
			"%llu len %u points to nonexistent indirect extent %llu",
			orig_k->k->k.p.offset,
			orig_k->k->k.size,
			reflink_offset);
		bch2_inconsistent_error(trans->c);
		ret = -EIO;
		goto err;
	}

	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(c, &ptr),
			     BTREE_ITER_CACHED);

	prt_printf(&buf, "Attempting to read from stale dirty pointer:");
	printbuf_indent_add(&buf, 2);
	prt_newline(&buf);

	bch2_bkey_val_to_text(&buf, c, k);
	prt_newline(&buf);

	prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));

	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	if (!ret) {
		prt_newline(&buf);
		bch2_bkey_val_to_text(&buf, c, k);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	struct bch_dev *ca = NULL;
	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	int pick_ret;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

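		/*
		 * Inline extents carry their data in the key itself: copy it
		 * straight into the destination bio, zero fill whatever is
		 * left, and skip device IO entirely.
		 */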
		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		goto out_read_done;
	}
retry_pick:
	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

	/* hole or reservation - just zero fill: */
	if (!pick_ret)
		goto hole;

	if (pick_ret < 0) {
		bch_err_inum_offset_ratelimited(c,
				read_pos.inode, read_pos.offset << 9,
				"no device to read from");
		goto err;
	}

	ca = bch_dev_bkey_exists(c, pick.ptr.dev);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_IN_RETRY) &&
	    !pick.ptr.cached &&
	    unlikely(ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick);
		goto retry_pick;
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	bch2_trans_unlock(trans);

	if (flags & BCH_READ_NODECODE) {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
			goto hole;

		iter.bi_size	= pick.crc.compressed_size << 9;
		goto get_bio;
	}

	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
	    bio_flagged(&orig->bio, BIO_CHAIN))
		flags |= BCH_READ_MUST_CLONE;

	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
		bch2_can_narrow_extent_crcs(k, pick.crc);

	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
		flags |= BCH_READ_MUST_BOUNCE;

	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

	if (crc_is_compressed(pick.crc) ||
	    (pick.crc.csum_type != BCH_CSUM_none &&
	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
	       (flags & BCH_READ_USER_MAPPED)) ||
	      (flags & BCH_READ_MUST_BOUNCE)))) {
		read_full = true;
		bounce = true;
	}

	if (orig->opts.promote_target)
		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
					&rbio, &bounce, &read_full);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}
get_bio:
	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig->opts);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
		rbio->split	= true;
	} else if (flags & BCH_READ_MUST_CLONE) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig->opts);
		rbio->bio.bi_iter = iter;
		rbio->split	= true;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->c			= c;
	rbio->submit_time	= local_clock();
	if (rbio->split)
		rbio->parent	= orig;
	else
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
	rbio->narrow_crcs	= narrow_crcs;
	rbio->hole		= 0;
	rbio->retry		= 0;
	rbio->context		= 0;
	/* XXX: only initialize this if needed */
	rbio->devs_have		= bch2_bkey_devs(k);
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->version;
	rbio->promote		= promote;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, read_bounce, &rbio->bio);

	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, read_split, &orig->bio);
	}

	if (!rbio->pick.idx) {
		if (!rbio->have_ioref) {
			bch_err_inum_offset_ratelimited(c,
					read_pos.inode,
					read_pos.offset << 9,
					"no device to read from");
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio)) {
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_IN_RETRY)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_IN_RETRY))) {
		return 0;
	} else {
		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->retry;
		rbio = bch2_rbio_free(rbio);

		if (ret == READ_RETRY_AVOID) {
			bch2_mark_io_failure(failed, &pick);
			ret = READ_RETRY;
		}

		if (!ret)
			goto out_read_done;

		return ret;
	}

err:
	if (flags & BCH_READ_IN_RETRY)
		return READ_ERR;

	orig->bio.bi_status = BLK_STS_IOERR;
	goto out_read_done;

hole:
	/*
	 * won't normally happen in the BCH_READ_NODECODE
	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
	 * to read no longer exists we have to signal that:
	 */
	if (flags & BCH_READ_NODECODE)
		orig->hole = true;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if (flags & BCH_READ_LAST_FRAGMENT)
		bch2_rbio_done(orig);
	return 0;
}

void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
		 struct bvec_iter bvec_iter, subvol_inum inum,
		 struct bch_io_failures *failed, unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	BUG_ON(flags & BCH_READ_NODECODE);

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
			     BTREE_ITER_SLOTS);
	while (1) {
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min(sectors, k.k->size - offset_into_extent);

		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags);
		if (ret)
			break;

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    ret == READ_RETRY ||
	    ret == READ_RETRY_AVOID)
		goto retry;

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	if (ret) {
		bch_err_inum_offset_ratelimited(c, inum.inum,
						bvec_iter.bi_sector << 9,
						"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bch2_rbio_done(rbio);
	}
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}