// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

/*
 * Probabilistic congestion check for a device target: sums the per-device
 * congestion counters (decayed by the time since they were last bumped) over
 * every device in @target, then returns true with probability proportional
 * to total congestion vs. nr_devices * CONGESTED_MAX.
 *
 * Devices are looked up under RCU; a target of 0 is never congested.
 */
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	/* fall back to all user-data rw devices if the target has no mask */
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		/* decay congestion with elapsed time (>> 12: clock-dependent rate) */
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

/* Latency accounting disabled: never report congestion. */
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head		rcu;		/* for kfree_rcu() in promote_free() */
	u64			start_time;	/* for BCH_TIME_data_promote stats */

	struct rhash_head	hash;		/* c->promote_table linkage */
	struct bpos		pos;		/* hash key: extent being promoted */

	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

/*
 * Decide whether the extent @k at @pos should be promoted.
 *
 * Returns 0 to promote, or a negative -BCH_ERR_nopromote_* code explaining
 * why not. When @failed is non-NULL we're rewriting after an io/checksum
 * error rather than doing a cache promote, so only the "already in flight"
 * check applies.
 */
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				  struct bpos pos,
				  struct bch_io_opts opts,
				  unsigned flags,
				  struct bch_io_failures *failed)
{
	if (!failed) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_MAY_PROMOTE))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	/* don't start a second promote of the same extent: */
	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}
/*
 * Tear down a promote op: release the data_update, remove it from the
 * in-flight table, drop the promote write ref, and free under RCU.
 */
static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

/* Write-side completion for a promote: record timing stats, then free. */
static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}

/*
 * Kick off the promote write once the read has completed: steal the bounced
 * read pages into the write bio and hand off to the data update machinery.
 */
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}

/*
 * Allocate and initialize a promote op plus the bounce read bio (*rbio) it
 * will consume. Registers the op in c->promote_table so duplicate promotes
 * of the same extent are refused, and initializes the data update.
 *
 * On failure everything is unwound and an ERR_PTR(-BCH_ERR_nopromote_*) or
 * data-update-init error is returned; *rbio is set to NULL.
 */
static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio,
					  struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_KERNEL);
	if (!*rbio) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	(*rbio)->bounce		= true;
	(*rbio)->split		= true;
	(*rbio)->kmalloc	= true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	struct data_update_opts update_opts = {};

	if (!failed) {
		/* normal promote: add a cached copy on the promote target */
		update_opts.target = opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
	} else {
		/* error recovery: rewrite the pointers that failed */
		update_opts.target = opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned i = 0;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev))
				update_opts.rewrite_ptrs |= BIT(i);
			i++;
		}
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			update_opts,
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
					      bch_promote_params));
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	kfree(op);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

/*
 * Try to set up a promote (or error-recovery rewrite, if @failed != NULL)
 * for the extent being read. On success also forces the read to bounce and
 * reports via *read_full whether the whole extent will be read.
 * Returns NULL (after tracing the reason) if no promote should happen.
 */
noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full,
					struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (failed ||
			     *read_full ||
			     READ_ONCE(c->promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags, failed);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio, failed);
	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}

/* Read */

/* retry disposition codes returned by/stored in rbio->retry: */
#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

/* Top-level rbio for a split, the rbio itself otherwise. */
static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

/*
 * Run @fn either inline (if we're already in a context at least as
 * permissive as @context) or punted to @wq.
 */
__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

/*
 * Free a split rbio (and any promote/bounce state hanging off it),
 * returning the parent; a non-split rbio is returned unchanged.
 */
static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

/*
 * Retry path for BCH_READ_NODECODE reads: re-look up the extent and, if it
 * still matches the pointer we originally read from, resubmit. If the extent
 * is gone, complete with rbio->hole set instead of an error.
 */
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_slots);
retry:
	rbio->bio.bi_status = 0;

	k = bch2_btree_iter_peek_slot(&iter);
	if (bkey_err(k))
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}

/*
 * Workqueue entry point for retrying a failed read; marks the failed device
 * (for READ_RETRY_AVOID) so the next attempt picks a different replica.
 */
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bvec_iter iter = rbio->bvec_iter;
	unsigned flags = rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}

/*
 * Record an error on an rbio: inside a retry we only record the disposition;
 * otherwise READ_ERR completes the request with @error and retries are
 * punted to the unbound workqueue.
 */
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}

/*
 * Opportunistically rewrite the extent's checksum to cover only the live
 * range we just read and verified ("narrowing"), so future partial reads
 * needn't read the whole checksummed region. Quietly bails (ret = 0) if the
 * extent changed under us or re-checksumming fails.
 */
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->version, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Transactional wrapper around __bch2_rbio_narrow_crcs(). */
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		      __bch2_rbio_narrow_crcs(trans, rbio));
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bio *src = &rbio->bio;
	struct bio *dst = &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	/* avoid fs-reclaim recursion while completing reads */
	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter = rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset += rbio->offset_into_extent;
	crc.live_size = bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	struct printbuf buf = PRINTBUF;
	buf.atomic++;
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca) {
		bch_err_inum_offset_ratelimited(ca,
			rbio->read_pos.inode,
			rbio->read_pos.offset << 9,
			"data %s", buf.buf);
		bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	}
	printbuf_exit(&buf);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ?
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; 7051809b8cbSKent Overstreet struct workqueue_struct *wq = NULL; 7061809b8cbSKent Overstreet enum rbio_context context = RBIO_CONTEXT_NULL; 7071809b8cbSKent Overstreet 7081809b8cbSKent Overstreet if (rbio->have_ioref) { 7091809b8cbSKent Overstreet bch2_latency_acct(ca, rbio->submit_time, READ); 7101809b8cbSKent Overstreet percpu_ref_put(&ca->io_ref); 7111809b8cbSKent Overstreet } 7121809b8cbSKent Overstreet 7131809b8cbSKent Overstreet if (!rbio->split) 7141809b8cbSKent Overstreet rbio->bio.bi_end_io = rbio->end_io; 7151809b8cbSKent Overstreet 71602b7fa4fSKent Overstreet if (bio->bi_status) { 71702b7fa4fSKent Overstreet if (ca) { 71802b7fa4fSKent Overstreet bch_err_inum_offset_ratelimited(ca, 7191809b8cbSKent Overstreet rbio->read_pos.inode, 7201809b8cbSKent Overstreet rbio->read_pos.offset, 7211809b8cbSKent Overstreet "data read error: %s", 72202b7fa4fSKent Overstreet bch2_blk_status_to_str(bio->bi_status)); 72302b7fa4fSKent Overstreet bch2_io_error(ca, BCH_MEMBER_ERROR_read); 72402b7fa4fSKent Overstreet } 7251809b8cbSKent Overstreet bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); 7261809b8cbSKent Overstreet return; 7271809b8cbSKent Overstreet } 7281809b8cbSKent Overstreet 7291809b8cbSKent Overstreet if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || 73002b7fa4fSKent Overstreet (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { 7311809b8cbSKent Overstreet trace_and_count(c, read_reuse_race, &rbio->bio); 7321809b8cbSKent Overstreet 7331809b8cbSKent Overstreet if (rbio->flags & BCH_READ_RETRY_IF_STALE) 7341809b8cbSKent Overstreet bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); 7351809b8cbSKent Overstreet else 7361809b8cbSKent Overstreet bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); 7371809b8cbSKent Overstreet return; 7381809b8cbSKent Overstreet } 7391809b8cbSKent Overstreet 7401809b8cbSKent Overstreet if (rbio->narrow_crcs || 7411809b8cbSKent Overstreet rbio->promote || 7421809b8cbSKent Overstreet 
crc_is_compressed(rbio->pick.crc) || 7431809b8cbSKent Overstreet bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) 7441809b8cbSKent Overstreet context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; 7451809b8cbSKent Overstreet else if (rbio->pick.crc.csum_type) 7461809b8cbSKent Overstreet context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; 7471809b8cbSKent Overstreet 7481809b8cbSKent Overstreet bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); 7491809b8cbSKent Overstreet } 7501809b8cbSKent Overstreet 7511809b8cbSKent Overstreet int __bch2_read_indirect_extent(struct btree_trans *trans, 7521809b8cbSKent Overstreet unsigned *offset_into_extent, 7531809b8cbSKent Overstreet struct bkey_buf *orig_k) 7541809b8cbSKent Overstreet { 7551809b8cbSKent Overstreet struct btree_iter iter; 7561809b8cbSKent Overstreet struct bkey_s_c k; 7571809b8cbSKent Overstreet u64 reflink_offset; 7581809b8cbSKent Overstreet int ret; 7591809b8cbSKent Overstreet 7601809b8cbSKent Overstreet reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + 7611809b8cbSKent Overstreet *offset_into_extent; 7621809b8cbSKent Overstreet 7631809b8cbSKent Overstreet k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, 7641809b8cbSKent Overstreet POS(0, reflink_offset), 0); 7651809b8cbSKent Overstreet ret = bkey_err(k); 7661809b8cbSKent Overstreet if (ret) 7671809b8cbSKent Overstreet goto err; 7681809b8cbSKent Overstreet 7691809b8cbSKent Overstreet if (k.k->type != KEY_TYPE_reflink_v && 7701809b8cbSKent Overstreet k.k->type != KEY_TYPE_indirect_inline_data) { 7711809b8cbSKent Overstreet bch_err_inum_offset_ratelimited(trans->c, 7721809b8cbSKent Overstreet orig_k->k->k.p.inode, 7731809b8cbSKent Overstreet orig_k->k->k.p.offset << 9, 7741809b8cbSKent Overstreet "%llu len %u points to nonexistent indirect extent %llu", 7751809b8cbSKent Overstreet orig_k->k->k.p.offset, 7761809b8cbSKent Overstreet orig_k->k->k.size, 7771809b8cbSKent Overstreet reflink_offset); 7781809b8cbSKent Overstreet 
bch2_inconsistent_error(trans->c); 7791809b8cbSKent Overstreet ret = -EIO; 7801809b8cbSKent Overstreet goto err; 7811809b8cbSKent Overstreet } 7821809b8cbSKent Overstreet 7831809b8cbSKent Overstreet *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); 7841809b8cbSKent Overstreet bch2_bkey_buf_reassemble(orig_k, trans->c, k); 7851809b8cbSKent Overstreet err: 7861809b8cbSKent Overstreet bch2_trans_iter_exit(trans, &iter); 7871809b8cbSKent Overstreet return ret; 7881809b8cbSKent Overstreet } 7891809b8cbSKent Overstreet 7901809b8cbSKent Overstreet static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, 791db39a35dSKent Overstreet struct bch_dev *ca, 7921809b8cbSKent Overstreet struct bkey_s_c k, 7931809b8cbSKent Overstreet struct bch_extent_ptr ptr) 7941809b8cbSKent Overstreet { 7951809b8cbSKent Overstreet struct bch_fs *c = trans->c; 7961809b8cbSKent Overstreet struct btree_iter iter; 7971809b8cbSKent Overstreet struct printbuf buf = PRINTBUF; 7981809b8cbSKent Overstreet int ret; 7991809b8cbSKent Overstreet 8001809b8cbSKent Overstreet bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, 8011f2f92ecSKent Overstreet PTR_BUCKET_POS(ca, &ptr), 8025dd8c60eSKent Overstreet BTREE_ITER_cached); 8031809b8cbSKent Overstreet 8049432e90dSKent Overstreet u8 *gen = bucket_gen(ca, iter.pos.offset); 8059432e90dSKent Overstreet if (gen) { 8069432e90dSKent Overstreet 8077423330eSKent Overstreet prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); 8081809b8cbSKent Overstreet printbuf_indent_add(&buf, 2); 8091809b8cbSKent Overstreet 8101809b8cbSKent Overstreet bch2_bkey_val_to_text(&buf, c, k); 8111809b8cbSKent Overstreet prt_newline(&buf); 8121809b8cbSKent Overstreet 8139432e90dSKent Overstreet prt_printf(&buf, "memory gen: %u", *gen); 8141809b8cbSKent Overstreet 8151809b8cbSKent Overstreet ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); 8161809b8cbSKent Overstreet if (!ret) { 8171809b8cbSKent Overstreet 
prt_newline(&buf); 8181809b8cbSKent Overstreet bch2_bkey_val_to_text(&buf, c, k); 8191809b8cbSKent Overstreet } 8209432e90dSKent Overstreet } else { 8219432e90dSKent Overstreet prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", 8229432e90dSKent Overstreet iter.pos.inode, iter.pos.offset); 8239432e90dSKent Overstreet printbuf_indent_add(&buf, 2); 8249432e90dSKent Overstreet 8259432e90dSKent Overstreet prt_printf(&buf, "first bucket %u nbuckets %llu\n", 8269432e90dSKent Overstreet ca->mi.first_bucket, ca->mi.nbuckets); 8279432e90dSKent Overstreet 8289432e90dSKent Overstreet bch2_bkey_val_to_text(&buf, c, k); 8299432e90dSKent Overstreet prt_newline(&buf); 8309432e90dSKent Overstreet } 8311809b8cbSKent Overstreet 8321809b8cbSKent Overstreet bch2_fs_inconsistent(c, "%s", buf.buf); 8331809b8cbSKent Overstreet 8341809b8cbSKent Overstreet bch2_trans_iter_exit(trans, &iter); 8351809b8cbSKent Overstreet printbuf_exit(&buf); 8361809b8cbSKent Overstreet } 8371809b8cbSKent Overstreet 8381809b8cbSKent Overstreet int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, 8391809b8cbSKent Overstreet struct bvec_iter iter, struct bpos read_pos, 8401809b8cbSKent Overstreet enum btree_id data_btree, struct bkey_s_c k, 8411809b8cbSKent Overstreet unsigned offset_into_extent, 8421809b8cbSKent Overstreet struct bch_io_failures *failed, unsigned flags) 8431809b8cbSKent Overstreet { 8441809b8cbSKent Overstreet struct bch_fs *c = trans->c; 8451809b8cbSKent Overstreet struct extent_ptr_decoded pick; 8461809b8cbSKent Overstreet struct bch_read_bio *rbio = NULL; 8471809b8cbSKent Overstreet struct promote_op *promote = NULL; 8481809b8cbSKent Overstreet bool bounce = false, read_full = false, narrow_crcs = false; 8491809b8cbSKent Overstreet struct bpos data_pos = bkey_start_pos(k.k); 8501809b8cbSKent Overstreet int pick_ret; 8511809b8cbSKent Overstreet 8521809b8cbSKent Overstreet if (bkey_extent_is_inline_data(k.k)) { 8531809b8cbSKent Overstreet unsigned 
bytes = min_t(unsigned, iter.bi_size, 8541809b8cbSKent Overstreet bkey_inline_data_bytes(k.k)); 8551809b8cbSKent Overstreet 8561809b8cbSKent Overstreet swap(iter.bi_size, bytes); 8571809b8cbSKent Overstreet memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); 8581809b8cbSKent Overstreet swap(iter.bi_size, bytes); 8591809b8cbSKent Overstreet bio_advance_iter(&orig->bio, &iter, bytes); 8601809b8cbSKent Overstreet zero_fill_bio_iter(&orig->bio, iter); 8611809b8cbSKent Overstreet goto out_read_done; 8621809b8cbSKent Overstreet } 8631809b8cbSKent Overstreet retry_pick: 8641809b8cbSKent Overstreet pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); 8651809b8cbSKent Overstreet 8661809b8cbSKent Overstreet /* hole or reservation - just zero fill: */ 8671809b8cbSKent Overstreet if (!pick_ret) 8681809b8cbSKent Overstreet goto hole; 8691809b8cbSKent Overstreet 8701809b8cbSKent Overstreet if (pick_ret < 0) { 8711809b8cbSKent Overstreet bch_err_inum_offset_ratelimited(c, 8721809b8cbSKent Overstreet read_pos.inode, read_pos.offset << 9, 8731809b8cbSKent Overstreet "no device to read from"); 8741809b8cbSKent Overstreet goto err; 8751809b8cbSKent Overstreet } 8761809b8cbSKent Overstreet 8772c91ab72SKent Overstreet struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); 8781809b8cbSKent Overstreet 8791809b8cbSKent Overstreet /* 8801809b8cbSKent Overstreet * Stale dirty pointers are treated as IO errors, but @failed isn't 8811809b8cbSKent Overstreet * allocated unless we're in the retry path - so if we're not in the 8821809b8cbSKent Overstreet * retry path, don't check here, it'll be caught in bch2_read_endio() 8831809b8cbSKent Overstreet * and we'll end up in the retry path: 8841809b8cbSKent Overstreet */ 8851809b8cbSKent Overstreet if ((flags & BCH_READ_IN_RETRY) && 8861809b8cbSKent Overstreet !pick.ptr.cached && 887465bf6f4SKent Overstreet ca && 8883858aa42SKent Overstreet unlikely(dev_ptr_stale(ca, &pick.ptr))) { 889db39a35dSKent Overstreet 
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); 8901809b8cbSKent Overstreet bch2_mark_io_failure(failed, &pick); 891465bf6f4SKent Overstreet percpu_ref_put(&ca->io_ref); 8921809b8cbSKent Overstreet goto retry_pick; 8931809b8cbSKent Overstreet } 8941809b8cbSKent Overstreet 8951809b8cbSKent Overstreet /* 8961809b8cbSKent Overstreet * Unlock the iterator while the btree node's lock is still in 8971809b8cbSKent Overstreet * cache, before doing the IO: 8981809b8cbSKent Overstreet */ 8991809b8cbSKent Overstreet bch2_trans_unlock(trans); 9001809b8cbSKent Overstreet 9011809b8cbSKent Overstreet if (flags & BCH_READ_NODECODE) { 9021809b8cbSKent Overstreet /* 9031809b8cbSKent Overstreet * can happen if we retry, and the extent we were going to read 9041809b8cbSKent Overstreet * has been merged in the meantime: 9051809b8cbSKent Overstreet */ 906465bf6f4SKent Overstreet if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { 907465bf6f4SKent Overstreet if (ca) 908465bf6f4SKent Overstreet percpu_ref_put(&ca->io_ref); 9091809b8cbSKent Overstreet goto hole; 910465bf6f4SKent Overstreet } 9111809b8cbSKent Overstreet 9121809b8cbSKent Overstreet iter.bi_size = pick.crc.compressed_size << 9; 9131809b8cbSKent Overstreet goto get_bio; 9141809b8cbSKent Overstreet } 9151809b8cbSKent Overstreet 9161809b8cbSKent Overstreet if (!(flags & BCH_READ_LAST_FRAGMENT) || 9171809b8cbSKent Overstreet bio_flagged(&orig->bio, BIO_CHAIN)) 9181809b8cbSKent Overstreet flags |= BCH_READ_MUST_CLONE; 9191809b8cbSKent Overstreet 9201809b8cbSKent Overstreet narrow_crcs = !(flags & BCH_READ_IN_RETRY) && 9211809b8cbSKent Overstreet bch2_can_narrow_extent_crcs(k, pick.crc); 9221809b8cbSKent Overstreet 9231809b8cbSKent Overstreet if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) 9241809b8cbSKent Overstreet flags |= BCH_READ_MUST_BOUNCE; 9251809b8cbSKent Overstreet 9261809b8cbSKent Overstreet EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); 9271809b8cbSKent Overstreet 
9281809b8cbSKent Overstreet if (crc_is_compressed(pick.crc) || 9291809b8cbSKent Overstreet (pick.crc.csum_type != BCH_CSUM_none && 9301809b8cbSKent Overstreet (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || 9311809b8cbSKent Overstreet (bch2_csum_type_is_encryption(pick.crc.csum_type) && 9321809b8cbSKent Overstreet (flags & BCH_READ_USER_MAPPED)) || 9331809b8cbSKent Overstreet (flags & BCH_READ_MUST_BOUNCE)))) { 9341809b8cbSKent Overstreet read_full = true; 9351809b8cbSKent Overstreet bounce = true; 9361809b8cbSKent Overstreet } 9371809b8cbSKent Overstreet 938a2cb8a62SKent Overstreet if (orig->opts.promote_target)// || failed) 9391809b8cbSKent Overstreet promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, 940a2cb8a62SKent Overstreet &rbio, &bounce, &read_full, failed); 9411809b8cbSKent Overstreet 9421809b8cbSKent Overstreet if (!read_full) { 9431809b8cbSKent Overstreet EBUG_ON(crc_is_compressed(pick.crc)); 9441809b8cbSKent Overstreet EBUG_ON(pick.crc.csum_type && 9451809b8cbSKent Overstreet (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || 9461809b8cbSKent Overstreet bvec_iter_sectors(iter) != pick.crc.live_size || 9471809b8cbSKent Overstreet pick.crc.offset || 9481809b8cbSKent Overstreet offset_into_extent)); 9491809b8cbSKent Overstreet 9501809b8cbSKent Overstreet data_pos.offset += offset_into_extent; 9511809b8cbSKent Overstreet pick.ptr.offset += pick.crc.offset + 9521809b8cbSKent Overstreet offset_into_extent; 9531809b8cbSKent Overstreet offset_into_extent = 0; 9541809b8cbSKent Overstreet pick.crc.compressed_size = bvec_iter_sectors(iter); 9551809b8cbSKent Overstreet pick.crc.uncompressed_size = bvec_iter_sectors(iter); 9561809b8cbSKent Overstreet pick.crc.offset = 0; 9571809b8cbSKent Overstreet pick.crc.live_size = bvec_iter_sectors(iter); 9581809b8cbSKent Overstreet } 9591809b8cbSKent Overstreet get_bio: 9601809b8cbSKent Overstreet if (rbio) { 9611809b8cbSKent Overstreet /* 9621809b8cbSKent Overstreet * promote already allocated 
bounce rbio: 9631809b8cbSKent Overstreet * promote needs to allocate a bio big enough for uncompressing 9641809b8cbSKent Overstreet * data in the write path, but we're not going to use it all 9651809b8cbSKent Overstreet * here: 9661809b8cbSKent Overstreet */ 9671809b8cbSKent Overstreet EBUG_ON(rbio->bio.bi_iter.bi_size < 9681809b8cbSKent Overstreet pick.crc.compressed_size << 9); 9691809b8cbSKent Overstreet rbio->bio.bi_iter.bi_size = 9701809b8cbSKent Overstreet pick.crc.compressed_size << 9; 9711809b8cbSKent Overstreet } else if (bounce) { 9721809b8cbSKent Overstreet unsigned sectors = pick.crc.compressed_size; 9731809b8cbSKent Overstreet 9741809b8cbSKent Overstreet rbio = rbio_init(bio_alloc_bioset(NULL, 9751809b8cbSKent Overstreet DIV_ROUND_UP(sectors, PAGE_SECTORS), 9761809b8cbSKent Overstreet 0, 9771809b8cbSKent Overstreet GFP_NOFS, 9781809b8cbSKent Overstreet &c->bio_read_split), 9791809b8cbSKent Overstreet orig->opts); 9801809b8cbSKent Overstreet 9811809b8cbSKent Overstreet bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); 9821809b8cbSKent Overstreet rbio->bounce = true; 9831809b8cbSKent Overstreet rbio->split = true; 9841809b8cbSKent Overstreet } else if (flags & BCH_READ_MUST_CLONE) { 9851809b8cbSKent Overstreet /* 9861809b8cbSKent Overstreet * Have to clone if there were any splits, due to error 9871809b8cbSKent Overstreet * reporting issues (if a split errored, and retrying didn't 9881809b8cbSKent Overstreet * work, when it reports the error to its parent (us) we don't 9891809b8cbSKent Overstreet * know if the error was from our bio, and we should retry, or 9901809b8cbSKent Overstreet * from the whole bio, in which case we don't want to retry and 9911809b8cbSKent Overstreet * lose the error) 9921809b8cbSKent Overstreet */ 9931809b8cbSKent Overstreet rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, 9941809b8cbSKent Overstreet &c->bio_read_split), 9951809b8cbSKent Overstreet orig->opts); 9961809b8cbSKent Overstreet rbio->bio.bi_iter = 
iter; 9971809b8cbSKent Overstreet rbio->split = true; 9981809b8cbSKent Overstreet } else { 9991809b8cbSKent Overstreet rbio = orig; 10001809b8cbSKent Overstreet rbio->bio.bi_iter = iter; 10011809b8cbSKent Overstreet EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); 10021809b8cbSKent Overstreet } 10031809b8cbSKent Overstreet 10041809b8cbSKent Overstreet EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); 10051809b8cbSKent Overstreet 10061809b8cbSKent Overstreet rbio->c = c; 10071809b8cbSKent Overstreet rbio->submit_time = local_clock(); 10081809b8cbSKent Overstreet if (rbio->split) 10091809b8cbSKent Overstreet rbio->parent = orig; 10101809b8cbSKent Overstreet else 10111809b8cbSKent Overstreet rbio->end_io = orig->bio.bi_end_io; 10121809b8cbSKent Overstreet rbio->bvec_iter = iter; 10131809b8cbSKent Overstreet rbio->offset_into_extent= offset_into_extent; 10141809b8cbSKent Overstreet rbio->flags = flags; 1015465bf6f4SKent Overstreet rbio->have_ioref = ca != NULL; 10161809b8cbSKent Overstreet rbio->narrow_crcs = narrow_crcs; 10171809b8cbSKent Overstreet rbio->hole = 0; 10181809b8cbSKent Overstreet rbio->retry = 0; 10191809b8cbSKent Overstreet rbio->context = 0; 10201809b8cbSKent Overstreet /* XXX: only initialize this if needed */ 10211809b8cbSKent Overstreet rbio->devs_have = bch2_bkey_devs(k); 10221809b8cbSKent Overstreet rbio->pick = pick; 10231809b8cbSKent Overstreet rbio->subvol = orig->subvol; 10241809b8cbSKent Overstreet rbio->read_pos = read_pos; 10251809b8cbSKent Overstreet rbio->data_btree = data_btree; 10261809b8cbSKent Overstreet rbio->data_pos = data_pos; 10271809b8cbSKent Overstreet rbio->version = k.k->version; 10281809b8cbSKent Overstreet rbio->promote = promote; 10291809b8cbSKent Overstreet INIT_WORK(&rbio->work, NULL); 10301809b8cbSKent Overstreet 10311539bdf5SKent Overstreet if (flags & BCH_READ_NODECODE) 10321539bdf5SKent Overstreet orig->pick = pick; 10331539bdf5SKent Overstreet 10341809b8cbSKent Overstreet rbio->bio.bi_opf = orig->bio.bi_opf; 
10351809b8cbSKent Overstreet rbio->bio.bi_iter.bi_sector = pick.ptr.offset; 10361809b8cbSKent Overstreet rbio->bio.bi_end_io = bch2_read_endio; 10371809b8cbSKent Overstreet 10381809b8cbSKent Overstreet if (rbio->bounce) 10391809b8cbSKent Overstreet trace_and_count(c, read_bounce, &rbio->bio); 10401809b8cbSKent Overstreet 10411809b8cbSKent Overstreet this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); 10421809b8cbSKent Overstreet bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); 10431809b8cbSKent Overstreet 10441809b8cbSKent Overstreet /* 10451809b8cbSKent Overstreet * If it's being moved internally, we don't want to flag it as a cache 10461809b8cbSKent Overstreet * hit: 10471809b8cbSKent Overstreet */ 1048465bf6f4SKent Overstreet if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) 10491809b8cbSKent Overstreet bch2_bucket_io_time_reset(trans, pick.ptr.dev, 10501809b8cbSKent Overstreet PTR_BUCKET_NR(ca, &pick.ptr), READ); 10511809b8cbSKent Overstreet 10521809b8cbSKent Overstreet if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { 10531809b8cbSKent Overstreet bio_inc_remaining(&orig->bio); 10541809b8cbSKent Overstreet trace_and_count(c, read_split, &orig->bio); 10551809b8cbSKent Overstreet } 10561809b8cbSKent Overstreet 10571809b8cbSKent Overstreet if (!rbio->pick.idx) { 10581809b8cbSKent Overstreet if (!rbio->have_ioref) { 10591809b8cbSKent Overstreet bch_err_inum_offset_ratelimited(c, 10601809b8cbSKent Overstreet read_pos.inode, 10611809b8cbSKent Overstreet read_pos.offset << 9, 10621809b8cbSKent Overstreet "no device to read from"); 10631809b8cbSKent Overstreet bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); 10641809b8cbSKent Overstreet goto out; 10651809b8cbSKent Overstreet } 10661809b8cbSKent Overstreet 10671809b8cbSKent Overstreet this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], 10681809b8cbSKent Overstreet bio_sectors(&rbio->bio)); 10691809b8cbSKent Overstreet bio_set_dev(&rbio->bio, ca->disk_sb.bdev); 
10701809b8cbSKent Overstreet 10711809b8cbSKent Overstreet if (unlikely(c->opts.no_data_io)) { 10721809b8cbSKent Overstreet if (likely(!(flags & BCH_READ_IN_RETRY))) 10731809b8cbSKent Overstreet bio_endio(&rbio->bio); 10741809b8cbSKent Overstreet } else { 10751809b8cbSKent Overstreet if (likely(!(flags & BCH_READ_IN_RETRY))) 10761809b8cbSKent Overstreet submit_bio(&rbio->bio); 10771809b8cbSKent Overstreet else 10781809b8cbSKent Overstreet submit_bio_wait(&rbio->bio); 10791809b8cbSKent Overstreet } 10801809b8cbSKent Overstreet 10811809b8cbSKent Overstreet /* 10821809b8cbSKent Overstreet * We just submitted IO which may block, we expect relock fail 10831809b8cbSKent Overstreet * events and shouldn't count them: 10841809b8cbSKent Overstreet */ 10851809b8cbSKent Overstreet trans->notrace_relock_fail = true; 10861809b8cbSKent Overstreet } else { 10871809b8cbSKent Overstreet /* Attempting reconstruct read: */ 1088aa982665SKent Overstreet if (bch2_ec_read_extent(trans, rbio)) { 10891809b8cbSKent Overstreet bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); 10901809b8cbSKent Overstreet goto out; 10911809b8cbSKent Overstreet } 10921809b8cbSKent Overstreet 10931809b8cbSKent Overstreet if (likely(!(flags & BCH_READ_IN_RETRY))) 10941809b8cbSKent Overstreet bio_endio(&rbio->bio); 10951809b8cbSKent Overstreet } 10961809b8cbSKent Overstreet out: 10971809b8cbSKent Overstreet if (likely(!(flags & BCH_READ_IN_RETRY))) { 10981809b8cbSKent Overstreet return 0; 10991809b8cbSKent Overstreet } else { 11001809b8cbSKent Overstreet int ret; 11011809b8cbSKent Overstreet 11021809b8cbSKent Overstreet rbio->context = RBIO_CONTEXT_UNBOUND; 11031809b8cbSKent Overstreet bch2_read_endio(&rbio->bio); 11041809b8cbSKent Overstreet 11051809b8cbSKent Overstreet ret = rbio->retry; 11061809b8cbSKent Overstreet rbio = bch2_rbio_free(rbio); 11071809b8cbSKent Overstreet 11081809b8cbSKent Overstreet if (ret == READ_RETRY_AVOID) { 11091809b8cbSKent Overstreet bch2_mark_io_failure(failed, &pick); 
11101809b8cbSKent Overstreet ret = READ_RETRY; 11111809b8cbSKent Overstreet } 11121809b8cbSKent Overstreet 11131809b8cbSKent Overstreet if (!ret) 11141809b8cbSKent Overstreet goto out_read_done; 11151809b8cbSKent Overstreet 11161809b8cbSKent Overstreet return ret; 11171809b8cbSKent Overstreet } 11181809b8cbSKent Overstreet 11191809b8cbSKent Overstreet err: 11201809b8cbSKent Overstreet if (flags & BCH_READ_IN_RETRY) 11211809b8cbSKent Overstreet return READ_ERR; 11221809b8cbSKent Overstreet 11231809b8cbSKent Overstreet orig->bio.bi_status = BLK_STS_IOERR; 11241809b8cbSKent Overstreet goto out_read_done; 11251809b8cbSKent Overstreet 11261809b8cbSKent Overstreet hole: 11271809b8cbSKent Overstreet /* 11281809b8cbSKent Overstreet * won't normally happen in the BCH_READ_NODECODE 11291809b8cbSKent Overstreet * (bch2_move_extent()) path, but if we retry and the extent we wanted 11301809b8cbSKent Overstreet * to read no longer exists we have to signal that: 11311809b8cbSKent Overstreet */ 11321809b8cbSKent Overstreet if (flags & BCH_READ_NODECODE) 11331809b8cbSKent Overstreet orig->hole = true; 11341809b8cbSKent Overstreet 11351809b8cbSKent Overstreet zero_fill_bio_iter(&orig->bio, iter); 11361809b8cbSKent Overstreet out_read_done: 11371809b8cbSKent Overstreet if (flags & BCH_READ_LAST_FRAGMENT) 11381809b8cbSKent Overstreet bch2_rbio_done(orig); 11391809b8cbSKent Overstreet return 0; 11401809b8cbSKent Overstreet } 11411809b8cbSKent Overstreet 11421809b8cbSKent Overstreet void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, 11431809b8cbSKent Overstreet struct bvec_iter bvec_iter, subvol_inum inum, 11441809b8cbSKent Overstreet struct bch_io_failures *failed, unsigned flags) 11451809b8cbSKent Overstreet { 11466bd68ec2SKent Overstreet struct btree_trans *trans = bch2_trans_get(c); 11471809b8cbSKent Overstreet struct btree_iter iter; 11481809b8cbSKent Overstreet struct bkey_buf sk; 11491809b8cbSKent Overstreet struct bkey_s_c k; 11501809b8cbSKent Overstreet int ret; 
11511809b8cbSKent Overstreet 11521809b8cbSKent Overstreet BUG_ON(flags & BCH_READ_NODECODE); 11531809b8cbSKent Overstreet 11541809b8cbSKent Overstreet bch2_bkey_buf_init(&sk); 11556bd68ec2SKent Overstreet bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, 1156*1d18b5caSKent Overstreet POS(inum.inum, bvec_iter.bi_sector), 11575dd8c60eSKent Overstreet BTREE_ITER_slots); 1158*1d18b5caSKent Overstreet 11591809b8cbSKent Overstreet while (1) { 11601809b8cbSKent Overstreet unsigned bytes, sectors, offset_into_extent; 11611809b8cbSKent Overstreet enum btree_id data_btree = BTREE_ID_extents; 11621809b8cbSKent Overstreet 1163*1d18b5caSKent Overstreet bch2_trans_begin(trans); 1164*1d18b5caSKent Overstreet 1165*1d18b5caSKent Overstreet u32 snapshot; 1166*1d18b5caSKent Overstreet ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); 11671809b8cbSKent Overstreet if (ret) 1168*1d18b5caSKent Overstreet goto err; 1169*1d18b5caSKent Overstreet 1170*1d18b5caSKent Overstreet bch2_btree_iter_set_snapshot(&iter, snapshot); 11711809b8cbSKent Overstreet 11721809b8cbSKent Overstreet bch2_btree_iter_set_pos(&iter, 11731809b8cbSKent Overstreet POS(inum.inum, bvec_iter.bi_sector)); 11741809b8cbSKent Overstreet 11751809b8cbSKent Overstreet k = bch2_btree_iter_peek_slot(&iter); 11761809b8cbSKent Overstreet ret = bkey_err(k); 11771809b8cbSKent Overstreet if (ret) 1178*1d18b5caSKent Overstreet goto err; 11791809b8cbSKent Overstreet 11801809b8cbSKent Overstreet offset_into_extent = iter.pos.offset - 11811809b8cbSKent Overstreet bkey_start_offset(k.k); 11821809b8cbSKent Overstreet sectors = k.k->size - offset_into_extent; 11831809b8cbSKent Overstreet 11841809b8cbSKent Overstreet bch2_bkey_buf_reassemble(&sk, c, k); 11851809b8cbSKent Overstreet 11866bd68ec2SKent Overstreet ret = bch2_read_indirect_extent(trans, &data_btree, 11871809b8cbSKent Overstreet &offset_into_extent, &sk); 11881809b8cbSKent Overstreet if (ret) 1189*1d18b5caSKent Overstreet goto err; 11901809b8cbSKent Overstreet 
11911809b8cbSKent Overstreet k = bkey_i_to_s_c(sk.k); 11921809b8cbSKent Overstreet 11931809b8cbSKent Overstreet /* 11941809b8cbSKent Overstreet * With indirect extents, the amount of data to read is the min 11951809b8cbSKent Overstreet * of the original extent and the indirect extent: 11961809b8cbSKent Overstreet */ 11971809b8cbSKent Overstreet sectors = min(sectors, k.k->size - offset_into_extent); 11981809b8cbSKent Overstreet 11991809b8cbSKent Overstreet bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; 12001809b8cbSKent Overstreet swap(bvec_iter.bi_size, bytes); 12011809b8cbSKent Overstreet 12021809b8cbSKent Overstreet if (bvec_iter.bi_size == bytes) 12031809b8cbSKent Overstreet flags |= BCH_READ_LAST_FRAGMENT; 12041809b8cbSKent Overstreet 12056bd68ec2SKent Overstreet ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, 12061809b8cbSKent Overstreet data_btree, k, 12071809b8cbSKent Overstreet offset_into_extent, failed, flags); 12081809b8cbSKent Overstreet if (ret) 1209*1d18b5caSKent Overstreet goto err; 12101809b8cbSKent Overstreet 12111809b8cbSKent Overstreet if (flags & BCH_READ_LAST_FRAGMENT) 12121809b8cbSKent Overstreet break; 12131809b8cbSKent Overstreet 12141809b8cbSKent Overstreet swap(bvec_iter.bi_size, bytes); 12151809b8cbSKent Overstreet bio_advance_iter(&rbio->bio, &bvec_iter, bytes); 12161809b8cbSKent Overstreet 12176bd68ec2SKent Overstreet ret = btree_trans_too_many_iters(trans); 12181809b8cbSKent Overstreet if (ret) 1219*1d18b5caSKent Overstreet goto err; 1220*1d18b5caSKent Overstreet err: 1221*1d18b5caSKent Overstreet if (ret && 1222*1d18b5caSKent Overstreet !bch2_err_matches(ret, BCH_ERR_transaction_restart) && 1223*1d18b5caSKent Overstreet ret != READ_RETRY && 1224*1d18b5caSKent Overstreet ret != READ_RETRY_AVOID) 12251809b8cbSKent Overstreet break; 12261809b8cbSKent Overstreet } 1227*1d18b5caSKent Overstreet 12286bd68ec2SKent Overstreet bch2_trans_iter_exit(trans, &iter); 12296bd68ec2SKent Overstreet bch2_trans_put(trans); 
12301809b8cbSKent Overstreet bch2_bkey_buf_exit(&sk, c); 12311809b8cbSKent Overstreet 12321809b8cbSKent Overstreet if (ret) { 12331809b8cbSKent Overstreet bch_err_inum_offset_ratelimited(c, inum.inum, 12341809b8cbSKent Overstreet bvec_iter.bi_sector << 9, 12351809b8cbSKent Overstreet "read error %i from btree lookup", ret); 12361809b8cbSKent Overstreet rbio->bio.bi_status = BLK_STS_IOERR; 12371809b8cbSKent Overstreet bch2_rbio_done(rbio); 12381809b8cbSKent Overstreet } 12391809b8cbSKent Overstreet } 12401809b8cbSKent Overstreet 12411809b8cbSKent Overstreet void bch2_fs_io_read_exit(struct bch_fs *c) 12421809b8cbSKent Overstreet { 12431809b8cbSKent Overstreet if (c->promote_table.tbl) 12441809b8cbSKent Overstreet rhashtable_destroy(&c->promote_table); 12451809b8cbSKent Overstreet bioset_exit(&c->bio_read_split); 12461809b8cbSKent Overstreet bioset_exit(&c->bio_read); 12471809b8cbSKent Overstreet } 12481809b8cbSKent Overstreet 12491809b8cbSKent Overstreet int bch2_fs_io_read_init(struct bch_fs *c) 12501809b8cbSKent Overstreet { 12511809b8cbSKent Overstreet if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), 12521809b8cbSKent Overstreet BIOSET_NEED_BVECS)) 12531809b8cbSKent Overstreet return -BCH_ERR_ENOMEM_bio_read_init; 12541809b8cbSKent Overstreet 12551809b8cbSKent Overstreet if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), 12561809b8cbSKent Overstreet BIOSET_NEED_BVECS)) 12571809b8cbSKent Overstreet return -BCH_ERR_ENOMEM_bio_read_split_init; 12581809b8cbSKent Overstreet 12591809b8cbSKent Overstreet if (rhashtable_init(&c->promote_table, &bch_promote_params)) 12601809b8cbSKent Overstreet return -BCH_ERR_ENOMEM_promote_table_init; 12611809b8cbSKent Overstreet 12621809b8cbSKent Overstreet return 0; 12631809b8cbSKent Overstreet } 1264