// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

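/*
 * Check whether the devices in @target look congested: each device's
 * ->congested counter is decayed based on how long ago it was last updated
 * (elapsed ns >> 12), and we return true with probability proportional to
 * the summed congestion relative to nr_devices * CONGESTED_MAX.
 */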
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

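/*
 * A promote (or read-error recovery rewrite) in flight: the read is bounced
 * into the embedded data_update's rbio, and when the read completes the data
 * is written back out via the data update path. In-flight operations are
 * tracked in c->promote_table, keyed by position, so we don't start duplicate
 * promotes for the same extent.
 */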
struct promote_op {
	struct rcu_head rcu;
	u64 start_time;

	struct rhash_head hash;
	struct bpos pos;

	struct work_struct work;
	struct data_update write;
	struct bio_vec bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset = offsetof(struct promote_op, hash),
	.key_offset = offsetof(struct promote_op, pos),
	.key_len = sizeof(struct bpos),
	.automatic_shrinking = true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

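/*
 * If this rbio is the read half of a data update (move/rebalance), it's
 * embedded in a struct data_update - return the containing update, or NULL
 * for a normal read. Only valid on a top level rbio, not a split.
 */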
static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	bch2_data_update_exit(&op->write);

	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
				    writepoint_hashed((unsigned long) current),
				    &orig->opts,
				    update_opts,
				    btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_hash;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce = true;
	op->write.rbio.promote = true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce = true;
	*read_full = promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

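/*
 * Run @fn either inline or from a workqueue, depending on the context we're
 * currently in: contexts are ordered, and we only punt to @wq when @fn needs
 * a "heavier" context than the one this rbio already has.
 */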
__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func = fn;
		rbio->context = context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		percpu_ref_put(&ca->io_ref);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

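/*
 * Retry path for data update (move) reads: re-look up the extent being moved
 * and bail out with -BCH_ERR_data_read_key_overwritten if it no longer
 * matches, otherwise reissue the read of the full extent.
 */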
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = -BCH_ERR_data_read_key_overwritten;
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status = BLK_STS_IOERR;
		rbio->ret = ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

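/*
 * Retry worker: note which device failed (for errors where the next attempt
 * should avoid it), free the failed split, then reissue the read with
 * BCH_READ_in_retry set and promotion disabled.
 */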
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bvec_iter iter = rbio->bvec_iter;
	unsigned flags = rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum = rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };
	struct btree_trans *trans = bch2_trans_get(c);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status = 0;
		rbio->ret = 0;
	}

	unsigned subvol = rbio->subvol;
	struct bpos read_pos = rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	} else {
		struct printbuf buf = PRINTBUF;

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");
		prt_str(&buf, "successful retry");

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret = ret;
	rbio->bio.bi_status = blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret = ret;
		rbio->bio.bi_status = blk_error;

		bch2_rbio_done(rbio);
	}
}

static void bch2_read_io_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bio *bio = &rbio->bio;
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));

	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	printbuf_exit(&buf);
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}

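/*
 * "Narrowing" checksums: if we successfully read and verified an extent whose
 * checksum covers more data than is currently live (e.g. it was partially
 * overwritten), recompute the checksum over just the live range and update
 * the extent, so future reads don't have to read and checksum the extra data.
 */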
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_csum_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bio *src = &rbio->bio;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent = bch2_rbio_parent(rbio);
	struct bio *src = &rbio->bio;
	struct bio *dst = &parent->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size = crc.compressed_size << 9;
		src->bi_iter.bi_idx = 0;
		src->bi_iter.bi_bvec_done = 0;
	} else {
		src->bi_iter = rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset += rbio->offset_into_extent;
		crc.live_size = bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

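/*
 * We read from a pointer that should have been valid (dirty, i.e. not cached)
 * but whose bucket gen says it's stale - that's a filesystem inconsistency;
 * log the extent, the in-memory bucket gen and the alloc key, and mark the
 * filesystem inconsistent.
 */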
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

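/*
 * Read a single extent (or a fragment of one): pick a replica, decide whether
 * we need to bounce and/or clone the bio, optionally set up a promote, then
 * submit the IO. Outside of the retry path, completion is asynchronous via
 * bch2_read_endio(); with BCH_READ_in_retry the read is done synchronously
 * and the result is returned.
 */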
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n  ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = -BCH_ERR_data_read_no_encryption_key;
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		percpu_ref_put(&ca->io_ref);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				percpu_ref_put(&ca->io_ref);
			orig->ret = -BCH_ERR_data_read_buffer_too_small;
			goto out_read_done;
		}

		iter.bi_size = pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent = 0;
		pick.crc.compressed_size = bvec_iter_sectors(iter);
		pick.crc.uncompressed_size = bvec_iter_sectors(iter);
		pick.crc.offset = 0;
		pick.crc.live_size = bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						DIV_ROUND_UP(sectors, PAGE_SECTORS),
						0,
						GFP_NOFS,
						&c->bio_read_split),
					  orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce = true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						&c->bio_read_split),
					  orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time = local_clock();
	if (!rbio->split)
		rbio->end_io = orig->bio.bi_end_io;
	rbio->bvec_iter = iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags = flags;
	rbio->have_ioref = ca != NULL;
	rbio->narrow_crcs = narrow_crcs;
	rbio->ret = 0;
	rbio->context = 0;
	rbio->pick = pick;
	rbio->subvol = orig->subvol;
	rbio->read_pos = read_pos;
	rbio->data_btree = data_btree;
	rbio->data_pos = data_pos;
	rbio->version = k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf = orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io = bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
					  PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			struct printbuf buf = PRINTBUF;
			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
			prt_printf(&buf, "no device to read from:\n  ");
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);

			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					     ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status = BLK_STS_IOERR;
	orig->ret = ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = -BCH_ERR_data_read_key_overwritten;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

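/*
 * Top level read path: walk the extents btree across the requested range,
 * resolving indirect (reflink) extents, and issue __bch2_read_extent() for
 * each fragment; the fragment flagged BCH_READ_last_fragment completes the
 * original rbio.
 */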
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		struct printbuf buf = PRINTBUF;
		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf, inum,
					bvec_iter.bi_sector << 9));
		prt_printf(&buf, "read error: %s", bch2_err_str(ret));
		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);

		rbio->bio.bi_status = BLK_STS_IOERR;
		rbio->ret = ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}