// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head rcu;
	u64 start_time;

	struct rhash_head hash;
	struct bpos pos;

	struct data_update write;
	struct bio_vec bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset = offsetof(struct promote_op, hash),
	.key_offset = offsetof(struct promote_op, pos),
	.key_len = sizeof(struct bpos),
};

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags)
{
	BUG_ON(!opts.promote_target);

	if (!(flags & BCH_READ_MAY_PROMOTE))
		return -BCH_ERR_nopromote_may_not;

	if (bch2_bkey_has_target(c, k, opts.promote_target))
		return -BCH_ERR_nopromote_already_promoted;

	if (bkey_extent_is_unwritten(k))
		return -BCH_ERR_nopromote_unwritten;

	if (bch2_target_congested(c, opts.promote_target))
		return -BCH_ERR_nopromote_congested;

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}

static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}

static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_KERNEL);
	if (!*rbio) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	(*rbio)->bounce = true;
	(*rbio)->split = true;
	(*rbio)->kmalloc = true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			(struct data_update_opts) {
				.target = opts.promote_target,
				.extra_replicas = 1,
				.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
			},
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
					      bch_promote_params));
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	kfree(op);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full)
{
	struct bch_fs *c = trans->c;
	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio);
	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce = true;
	*read_full = promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}

/* Read */

#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func = fn;
		rbio->context = context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_slots);
retry:
	rbio->bio.bi_status = 0;

	k = bch2_btree_iter_peek_slot(&iter);
	if (bkey_err(k))
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);
	bch2_trans_unlock(trans);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}

static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bvec_iter iter = rbio->bvec_iter;
	unsigned flags = rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum = rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}

static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->version, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		      __bch2_rbio_narrow_crcs(trans, rbio));
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bio *src = &rbio->bio;
	struct bio *dst = &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size = crc.compressed_size << 9;
		src->bi_iter.bi_idx = 0;
		src->bi_iter.bi_bvec_done = 0;
	} else {
		src->bi_iter = rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset += rbio->offset_into_extent;
	crc.live_size = bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	struct printbuf buf = PRINTBUF;
	buf.atomic++;
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca) {
		bch_err_inum_offset_ratelimited(ca,
			rbio->read_pos.inode,
			rbio->read_pos.offset << 9,
			"data %s", buf.buf);
		bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	}
	printbuf_exit(&buf);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	if (rbio->have_ioref) {
		bch2_latency_acct(ca, rbio->submit_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (bio->bi_status) {
		if (ca) {
			bch_err_inum_offset_ratelimited(ca,
				rbio->read_pos.inode,
				rbio->read_pos.offset,
				"data read error: %s",
				bch2_blk_status_to_str(bio->bi_status));
			bch2_io_error(ca, BCH_MEMBER_ERROR_read);
		}
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

int __bch2_read_indirect_extent(struct btree_trans *trans,
				unsigned *offset_into_extent,
				struct bkey_buf *orig_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 reflink_offset;
	int ret;

	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
		*offset_into_extent;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
			       POS(0, reflink_offset), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_reflink_v &&
	    k.k->type != KEY_TYPE_indirect_inline_data) {
		bch_err_inum_offset_ratelimited(trans->c,
			orig_k->k->k.p.inode,
			orig_k->k->k.p.offset << 9,
			"%llu len %u points to nonexistent indirect extent %llu",
			orig_k->k->k.p.offset,
			orig_k->k->k.size,
			reflink_offset);
		bch2_inconsistent_error(trans->c);
		ret = -EIO;
		goto err;
	}

	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
	printbuf_indent_add(&buf, 2);

	bch2_bkey_val_to_text(&buf, c, k);
	prt_newline(&buf);

	prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));

	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	if (!ret) {
		prt_newline(&buf);
		bch2_bkey_val_to_text(&buf, c, k);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	int pick_ret;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		goto out_read_done;
	}
retry_pick:
	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

	/* hole or reservation - just zero fill: */
	if (!pick_ret)
		goto hole;

	if (pick_ret < 0) {
		bch_err_inum_offset_ratelimited(c,
				read_pos.inode, read_pos.offset << 9,
				"no device to read from");
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_IN_RETRY) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick);
		percpu_ref_put(&ca->io_ref);
		goto retry_pick;
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	bch2_trans_unlock(trans);

	if (flags & BCH_READ_NODECODE) {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
			if (ca)
				percpu_ref_put(&ca->io_ref);
			goto hole;
		}

		iter.bi_size = pick.crc.compressed_size << 9;
		goto get_bio;
	}

	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
	    bio_flagged(&orig->bio, BIO_CHAIN))
		flags |= BCH_READ_MUST_CLONE;

	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
		bch2_can_narrow_extent_crcs(k, pick.crc);

	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
		flags |= BCH_READ_MUST_BOUNCE;

	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

	if (crc_is_compressed(pick.crc) ||
	    (pick.crc.csum_type != BCH_CSUM_none &&
	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
	       (flags & BCH_READ_USER_MAPPED)) ||
	      (flags & BCH_READ_MUST_BOUNCE)))) {
		read_full = true;
		bounce = true;
	}

	if (orig->opts.promote_target)
		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
					&rbio, &bounce, &read_full);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent = 0;
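		/* make pick.crc describe exactly the uncompressed range being read: */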
		pick.crc.compressed_size = bvec_iter_sectors(iter);
		pick.crc.uncompressed_size = bvec_iter_sectors(iter);
		pick.crc.offset = 0;
		pick.crc.live_size = bvec_iter_sectors(iter);
	}
get_bio:
	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig->opts);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce = true;
		rbio->split = true;
	} else if (flags & BCH_READ_MUST_CLONE) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig->opts);
		rbio->bio.bi_iter = iter;
		rbio->split = true;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->c = c;
	rbio->submit_time = local_clock();
	if (rbio->split)
		rbio->parent = orig;
	else
		rbio->end_io = orig->bio.bi_end_io;
	rbio->bvec_iter = iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags = flags;
	rbio->have_ioref = ca != NULL;
	rbio->narrow_crcs = narrow_crcs;
	rbio->hole = 0;
	rbio->retry = 0;
	rbio->context = 0;
	/* XXX: only initialize this if needed */
	rbio->devs_have = bch2_bkey_devs(k);
	rbio->pick = pick;
	rbio->subvol = orig->subvol;
	rbio->read_pos = read_pos;
	rbio->data_btree = data_btree;
	rbio->data_pos = data_pos;
	rbio->version = k.k->version;
	rbio->promote = promote;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf = orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io = bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, read_bounce, &rbio->bio);

	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, read_split, &orig->bio);
	}

	if (!rbio->pick.idx) {
		if (!rbio->have_ioref) {
			bch_err_inum_offset_ratelimited(c,
					read_pos.inode,
					read_pos.offset << 9,
					"no device to read from");
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio)) {
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_IN_RETRY)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_IN_RETRY))) {
		return 0;
	} else {
		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->retry;
		rbio = bch2_rbio_free(rbio);

		if (ret == READ_RETRY_AVOID) {
			bch2_mark_io_failure(failed, &pick);
			ret = READ_RETRY;
		}

		if (!ret)
			goto out_read_done;

		return ret;
	}

err:
	if (flags & BCH_READ_IN_RETRY)
		return READ_ERR;

	orig->bio.bi_status = BLK_STS_IOERR;
	goto out_read_done;

hole:
	/*
	 * won't normally happen in the BCH_READ_NODECODE
	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
	 * to read no longer exists we have to signal that:
	 */
	if (flags & BCH_READ_NODECODE)
		orig->hole = true;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if (flags & BCH_READ_LAST_FRAGMENT)
		bch2_rbio_done(orig);
	return 0;
}

void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
		 struct bvec_iter bvec_iter, subvol_inum inum,
		 struct bch_io_failures *failed, unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	BUG_ON(flags & BCH_READ_NODECODE);

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
			     BTREE_ITER_slots);
	while (1) {
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min(sectors, k.k->size - offset_into_extent);

		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags);
		if (ret)
			break;

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    ret == READ_RETRY ||
	    ret == READ_RETRY_AVOID)
		goto retry;

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	if (ret) {
		bch_err_inum_offset_ratelimited(c, inum.inum,
						bvec_iter.bi_sector << 9,
						"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bch2_rbio_done(rbio);
	}
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}