xref: /linux/fs/bcachefs/io_read.c (revision 29d34a4d785bbf389d57bfdafe2a19dad6ced3a4)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8 
9 #include "bcachefs.h"
10 #include "alloc_background.h"
11 #include "alloc_foreground.h"
12 #include "btree_update.h"
13 #include "buckets.h"
14 #include "checksum.h"
15 #include "clock.h"
16 #include "compress.h"
17 #include "data_update.h"
18 #include "disk_groups.h"
19 #include "ec.h"
20 #include "error.h"
21 #include "io_read.h"
22 #include "io_misc.h"
23 #include "io_write.h"
24 #include "reflink.h"
25 #include "subvolume.h"
26 #include "trace.h"
27 
28 #include <linux/random.h>
29 #include <linux/sched/mm.h>
30 
31 #ifdef CONFIG_BCACHEFS_DEBUG
32 static unsigned bch2_read_corrupt_ratio;
33 module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
34 MODULE_PARM_DESC(read_corrupt_ratio, "Simulate read data corruption for testing (debug builds only)");
35 #endif
36 
37 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
38 
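/*
 * Decide probabilistically whether reads from @target should be considered
 * congested: sum each member device's congestion score, decayed by the time
 * since it was last updated, and compare the total against a random
 * threshold.
 */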
39 static bool bch2_target_congested(struct bch_fs *c, u16 target)
40 {
41 	const struct bch_devs_mask *devs;
42 	unsigned d, nr = 0, total = 0;
43 	u64 now = local_clock(), last;
44 	s64 congested;
45 	struct bch_dev *ca;
46 
47 	if (!target)
48 		return false;
49 
50 	rcu_read_lock();
51 	devs = bch2_target_to_mask(c, target) ?:
52 		&c->rw_devs[BCH_DATA_user];
53 
54 	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
55 		ca = rcu_dereference(c->devs[d]);
56 		if (!ca)
57 			continue;
58 
59 		congested = atomic_read(&ca->congested);
60 		last = READ_ONCE(ca->congested_last);
61 		if (time_after64(now, last))
62 			congested -= (now - last) >> 12;
63 
64 		total += max(congested, 0LL);
65 		nr++;
66 	}
67 	rcu_read_unlock();
68 
69 	return get_random_u32_below(nr * CONGESTED_MAX) < total;
70 }
71 
72 #else
73 
74 static bool bch2_target_congested(struct bch_fs *c, u16 target)
75 {
76 	return false;
77 }
78 
79 #endif
80 
81 /* Cache promotion on read */
82 
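/*
 * An in-flight promotion: a data update that writes a copy of data we just
 * read to the promote target (or, on the read error recovery path, rewrites
 * the pointers that failed). Ops are tracked in c->promote_table, keyed by
 * position, so only one promote per extent is in flight at a time.
 */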
83 struct promote_op {
84 	struct rcu_head		rcu;
85 	u64			start_time;
86 
87 	struct rhash_head	hash;
88 	struct bpos		pos;
89 
90 	struct work_struct	work;
91 	struct data_update	write;
92 	struct bio_vec		bi_inline_vecs[]; /* must be last */
93 };
94 
95 static const struct rhashtable_params bch_promote_params = {
96 	.head_offset		= offsetof(struct promote_op, hash),
97 	.key_offset		= offsetof(struct promote_op, pos),
98 	.key_len		= sizeof(struct bpos),
99 	.automatic_shrinking	= true,
100 };
101 
102 static inline bool have_io_error(struct bch_io_failures *failed)
103 {
104 	return failed && failed->nr;
105 }
106 
107 static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
108 {
109 	EBUG_ON(rbio->split);
110 
111 	return rbio->data_update
112 		? container_of(rbio, struct data_update, rbio)
113 		: NULL;
114 }
115 
116 static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
117 {
118 	struct data_update *u = rbio_data_update(orig);
119 	if (!u)
120 		return false;
121 
122 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
123 	unsigned i = 0;
124 	bkey_for_each_ptr(ptrs, ptr) {
125 		if (ptr->dev == dev &&
126 		    u->data_opts.rewrite_ptrs & BIT(i))
127 			return true;
128 		i++;
129 	}
130 
131 	return false;
132 }
133 
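/*
 * Should this read be promoted? For a normal promote we require the
 * may_promote flag, that the extent isn't already on the promote target,
 * isn't unwritten, and that the target isn't congested; when recovering from
 * an IO error those checks are skipped. Either way, bail if a promote for
 * this position is already in flight.
 */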
134 static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
135 				  struct bpos pos,
136 				  struct bch_io_opts opts,
137 				  unsigned flags,
138 				  struct bch_io_failures *failed)
139 {
140 	if (!have_io_error(failed)) {
141 		BUG_ON(!opts.promote_target);
142 
143 		if (!(flags & BCH_READ_may_promote))
144 			return -BCH_ERR_nopromote_may_not;
145 
146 		if (bch2_bkey_has_target(c, k, opts.promote_target))
147 			return -BCH_ERR_nopromote_already_promoted;
148 
149 		if (bkey_extent_is_unwritten(k))
150 			return -BCH_ERR_nopromote_unwritten;
151 
152 		if (bch2_target_congested(c, opts.promote_target))
153 			return -BCH_ERR_nopromote_congested;
154 	}
155 
156 	if (rhashtable_lookup_fast(&c->promote_table, &pos,
157 				   bch_promote_params))
158 		return -BCH_ERR_nopromote_in_flight;
159 
160 	return 0;
161 }
162 
163 static noinline void promote_free(struct bch_read_bio *rbio)
164 {
165 	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
166 	struct bch_fs *c = rbio->c;
167 
168 	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
169 					 bch_promote_params);
170 	BUG_ON(ret);
171 
172 	bch2_data_update_exit(&op->write);
173 
174 	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
175 	kfree_rcu(op, rcu);
176 }
177 
178 static void promote_done(struct bch_write_op *wop)
179 {
180 	struct promote_op *op = container_of(wop, struct promote_op, write.op);
181 	struct bch_fs *c = op->write.rbio.c;
182 
183 	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
184 	promote_free(&op->write.rbio);
185 }
186 
187 static void promote_start_work(struct work_struct *work)
188 {
189 	struct promote_op *op = container_of(work, struct promote_op, work);
190 
191 	bch2_data_update_read_done(&op->write);
192 }
193 
194 static noinline void promote_start(struct bch_read_bio *rbio)
195 {
196 	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
197 
198 	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
199 
200 	INIT_WORK(&op->work, promote_start_work);
201 	queue_work(rbio->c->write_ref_wq, &op->work);
202 }
203 
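/*
 * Allocate and set up a promote operation: on the normal path this is a
 * cached write of one extra replica to the promote target; on the error
 * recovery path we instead rewrite the pointers that saw IO failures (unless
 * they're already being rewritten). The op is registered in c->promote_table
 * before the data update is initialized.
 */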
204 static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
205 					    enum btree_id btree_id,
206 					    struct bkey_s_c k,
207 					    struct bpos pos,
208 					    struct extent_ptr_decoded *pick,
209 					    unsigned sectors,
210 					    struct bch_read_bio *orig,
211 					    struct bch_io_failures *failed)
212 {
213 	struct bch_fs *c = trans->c;
214 	int ret;
215 
216 	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
217 
218 	if (!have_io_error(failed)) {
219 		update_opts.target = orig->opts.promote_target;
220 		update_opts.extra_replicas = 1;
221 		update_opts.write_flags |= BCH_WRITE_cached;
222 		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
223 	} else {
224 		update_opts.target = orig->opts.foreground_target;
225 
226 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
227 		unsigned ptr_bit = 1;
228 		bkey_for_each_ptr(ptrs, ptr) {
229 			if (bch2_dev_io_failures(failed, ptr->dev) &&
230 			    !ptr_being_rewritten(orig, ptr->dev))
231 				update_opts.rewrite_ptrs |= ptr_bit;
232 			ptr_bit <<= 1;
233 		}
234 
235 		if (!update_opts.rewrite_ptrs)
236 			return NULL;
237 	}
238 
239 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
240 		return ERR_PTR(-BCH_ERR_nopromote_no_writes);
241 
242 	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
243 	if (!op) {
244 		ret = -BCH_ERR_nopromote_enomem;
245 		goto err_put;
246 	}
247 
248 	op->start_time = local_clock();
249 	op->pos = pos;
250 
251 	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
252 					  bch_promote_params)) {
253 		ret = -BCH_ERR_nopromote_in_flight;
254 		goto err;
255 	}
256 
257 	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
258 			writepoint_hashed((unsigned long) current),
259 			&orig->opts,
260 			update_opts,
261 			btree_id, k);
262 	op->write.type = BCH_DATA_UPDATE_promote;
263 	/*
264 	 * possible errors: -BCH_ERR_nocow_lock_blocked,
265 	 * -BCH_ERR_ENOSPC_disk_reservation:
266 	 */
267 	if (ret)
268 		goto err_remove_hash;
269 
270 	rbio_init_fragment(&op->write.rbio.bio, orig);
271 	op->write.rbio.bounce	= true;
272 	op->write.rbio.promote	= true;
273 	op->write.op.end_io = promote_done;
274 
275 	return &op->write.rbio;
276 err_remove_hash:
277 	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
278 				      bch_promote_params));
279 err:
280 	bio_free_pages(&op->write.op.wbio.bio);
281 	/* We may have added to the rhashtable and thus need rcu freeing: */
282 	kfree_rcu(op, rcu);
283 err_put:
284 	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
285 	return ERR_PTR(ret);
286 }
287 
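/*
 * Returns the bounce rbio to read into if we're promoting, NULL otherwise; on
 * success *bounce is set and *read_full reflects whether the whole extent
 * will be read.
 */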
288 noinline
289 static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
290 					struct bvec_iter iter,
291 					struct bkey_s_c k,
292 					struct extent_ptr_decoded *pick,
293 					unsigned flags,
294 					struct bch_read_bio *orig,
295 					bool *bounce,
296 					bool *read_full,
297 					struct bch_io_failures *failed)
298 {
299 	struct bch_fs *c = trans->c;
300 	/*
301 	 * if failed != NULL we're not actually doing a promote, we're
302 	 * recovering from an io/checksum error
303 	 */
304 	bool promote_full = (have_io_error(failed) ||
305 			     *read_full ||
306 			     READ_ONCE(c->opts.promote_whole_extents));
307 	/* data might have to be decompressed in the write path: */
308 	unsigned sectors = promote_full
309 		? max(pick->crc.compressed_size, pick->crc.live_size)
310 		: bvec_iter_sectors(iter);
311 	struct bpos pos = promote_full
312 		? bkey_start_pos(k.k)
313 		: POS(k.k->p.inode, iter.bi_sector);
314 	int ret;
315 
316 	ret = should_promote(c, k, pos, orig->opts, flags, failed);
317 	if (ret)
318 		goto nopromote;
319 
320 	struct bch_read_bio *promote =
321 		__promote_alloc(trans,
322 				k.k->type == KEY_TYPE_reflink_v
323 				? BTREE_ID_reflink
324 				: BTREE_ID_extents,
325 				k, pos, pick, sectors, orig, failed);
326 	if (!promote)
327 		return NULL;
328 
329 	ret = PTR_ERR_OR_ZERO(promote);
330 	if (ret)
331 		goto nopromote;
332 
333 	*bounce		= true;
334 	*read_full	= promote_full;
335 	return promote;
336 nopromote:
337 	trace_io_read_nopromote(c, ret);
338 	return NULL;
339 }
340 
341 /* Read */
342 
343 static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
344 				   struct bch_read_bio *rbio, struct bpos read_pos)
345 {
346 	int ret = lockrestart_do(trans,
347 		bch2_inum_offset_err_msg_trans(trans, out,
348 				(subvol_inum) { rbio->subvol, read_pos.inode },
349 				read_pos.offset << 9));
350 	if (ret)
351 		return ret;
352 
353 	if (rbio->data_update)
354 		prt_str(out, "(internal move) ");
355 
356 	return 0;
357 }
358 
359 static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
360 			      struct bch_read_bio *rbio, struct bpos read_pos)
361 {
362 	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
363 }
364 
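/*
 * Read completion work runs in one of these contexts, ordered from most to
 * least restrictive; bch2_rbio_punt() runs @fn immediately if the current
 * context is already sufficient, and otherwise queues it on @wq.
 */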
365 enum rbio_context {
366 	RBIO_CONTEXT_NULL,
367 	RBIO_CONTEXT_HIGHPRI,
368 	RBIO_CONTEXT_UNBOUND,
369 };
370 
371 static inline struct bch_read_bio *
372 bch2_rbio_parent(struct bch_read_bio *rbio)
373 {
374 	return rbio->split ? rbio->parent : rbio;
375 }
376 
377 __always_inline
378 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
379 			   enum rbio_context context,
380 			   struct workqueue_struct *wq)
381 {
382 	if (context <= rbio->context) {
383 		fn(&rbio->work);
384 	} else {
385 		rbio->work.func		= fn;
386 		rbio->context		= context;
387 		queue_work(wq, &rbio->work);
388 	}
389 }
390 
391 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
392 {
393 	BUG_ON(rbio->bounce && !rbio->split);
394 
395 	if (rbio->have_ioref) {
396 		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
397 		percpu_ref_put(&ca->io_ref[READ]);
398 	}
399 
400 	if (rbio->split) {
401 		struct bch_read_bio *parent = rbio->parent;
402 
403 		if (unlikely(rbio->promote)) {
404 			if (!rbio->bio.bi_status)
405 				promote_start(rbio);
406 			else
407 				promote_free(rbio);
408 		} else {
409 			if (rbio->bounce)
410 				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
411 
412 			bio_put(&rbio->bio);
413 		}
414 
415 		rbio = parent;
416 	}
417 
418 	return rbio;
419 }
420 
421 /*
422  * Only called on a top level bch_read_bio to complete an entire read request,
423  * not a split:
424  */
425 static void bch2_rbio_done(struct bch_read_bio *rbio)
426 {
427 	if (rbio->start_time)
428 		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
429 				       rbio->start_time);
430 	bio_endio(&rbio->bio);
431 }
432 
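/*
 * Retry path for reads done on behalf of a data update (internal moves):
 * these must re-read exactly the extent recorded in the data_update, so if
 * the key has since changed we give up with
 * -BCH_ERR_data_read_key_overwritten instead of reading something else.
 */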
433 static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
434 					struct bch_read_bio *rbio,
435 					struct bvec_iter bvec_iter,
436 					struct bch_io_failures *failed,
437 					unsigned flags)
438 {
439 	struct data_update *u = container_of(rbio, struct data_update, rbio);
440 retry:
441 	bch2_trans_begin(trans);
442 
443 	struct btree_iter iter;
444 	struct bkey_s_c k;
445 	int ret = lockrestart_do(trans,
446 		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
447 				u->btree_id, bkey_start_pos(&u->k.k->k),
448 				0)));
449 	if (ret)
450 		goto err;
451 
452 	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
453 		/* extent we wanted to read no longer exists: */
454 		rbio->ret = -BCH_ERR_data_read_key_overwritten;
455 		goto err;
456 	}
457 
458 	ret = __bch2_read_extent(trans, rbio, bvec_iter,
459 				 bkey_start_pos(&u->k.k->k),
460 				 u->btree_id,
461 				 bkey_i_to_s_c(u->k.k),
462 				 0, failed, flags, -1);
463 err:
464 	bch2_trans_iter_exit(trans, &iter);
465 
466 	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
467 		goto retry;
468 
469 	if (ret) {
470 		rbio->bio.bi_status	= BLK_STS_IOERR;
471 		rbio->ret		= ret;
472 	}
473 
474 	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
475 	return ret;
476 }
477 
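/*
 * Top level retry worker: if the error indicates this replica should be
 * avoided, record the failure so the retry picks a different one, then clear
 * the error state and rerun the read with promotion disabled and
 * BCH_READ_in_retry set.
 */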
478 static void bch2_rbio_retry(struct work_struct *work)
479 {
480 	struct bch_read_bio *rbio =
481 		container_of(work, struct bch_read_bio, work);
482 	struct bch_fs *c	= rbio->c;
483 	struct bvec_iter iter	= rbio->bvec_iter;
484 	unsigned flags		= rbio->flags;
485 	subvol_inum inum = {
486 		.subvol = rbio->subvol,
487 		.inum	= rbio->read_pos.inode,
488 	};
489 	struct bch_io_failures failed = { .nr = 0 };
490 	struct btree_trans *trans = bch2_trans_get(c);
491 
492 	trace_io_read_retry(&rbio->bio);
493 	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
494 		     bvec_iter_sectors(rbio->bvec_iter));
495 
496 	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
497 		bch2_mark_io_failure(&failed, &rbio->pick,
498 				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);
499 
500 	if (!rbio->split) {
501 		rbio->bio.bi_status	= 0;
502 		rbio->ret		= 0;
503 	}
504 
505 	unsigned subvol		= rbio->subvol;
506 	struct bpos read_pos	= rbio->read_pos;
507 
508 	rbio = bch2_rbio_free(rbio);
509 
510 	flags |= BCH_READ_in_retry;
511 	flags &= ~BCH_READ_may_promote;
512 	flags &= ~BCH_READ_last_fragment;
513 	flags |= BCH_READ_must_clone;
514 
515 	int ret = rbio->data_update
516 		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
517 		: __bch2_read(trans, rbio, iter, inum, &failed, flags);
518 
519 	if (ret) {
520 		rbio->ret = ret;
521 		rbio->bio.bi_status = BLK_STS_IOERR;
522 	} else {
523 		struct printbuf buf = PRINTBUF;
524 
525 		lockrestart_do(trans,
526 			bch2_inum_offset_err_msg_trans(trans, &buf,
527 					(subvol_inum) { subvol, read_pos.inode },
528 					read_pos.offset << 9));
529 		if (rbio->data_update)
530 			prt_str(&buf, "(internal move) ");
531 		prt_str(&buf, "successful retry");
532 
533 		bch_err_ratelimited(c, "%s", buf.buf);
534 		printbuf_exit(&buf);
535 	}
536 
537 	bch2_rbio_done(rbio);
538 	bch2_trans_put(trans);
539 }
540 
541 static void bch2_rbio_error(struct bch_read_bio *rbio,
542 			    int ret, blk_status_t blk_error)
543 {
544 	BUG_ON(ret >= 0);
545 
546 	rbio->ret		= ret;
547 	rbio->bio.bi_status	= blk_error;
548 
549 	bch2_rbio_parent(rbio)->saw_error = true;
550 
551 	if (rbio->flags & BCH_READ_in_retry)
552 		return;
553 
554 	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
555 		bch2_rbio_punt(rbio, bch2_rbio_retry,
556 			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
557 	} else {
558 		rbio = bch2_rbio_free(rbio);
559 
560 		rbio->ret		= ret;
561 		rbio->bio.bi_status	= blk_error;
562 
563 		bch2_rbio_done(rbio);
564 	}
565 }
566 
567 static void bch2_read_io_err(struct work_struct *work)
568 {
569 	struct bch_read_bio *rbio =
570 		container_of(work, struct bch_read_bio, work);
571 	struct bio *bio = &rbio->bio;
572 	struct bch_fs *c	= rbio->c;
573 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
574 	struct printbuf buf = PRINTBUF;
575 
576 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
577 	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
578 
579 	if (ca)
580 		bch_err_ratelimited(ca, "%s", buf.buf);
581 	else
582 		bch_err_ratelimited(c, "%s", buf.buf);
583 
584 	printbuf_exit(&buf);
585 	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
586 }
587 
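/*
 * Narrow crcs: the key's checksum may cover more data than the key still
 * references (e.g. after the extent was trimmed). Once we've read and
 * verified the full checksummed range, compute a new checksum over just the
 * live portion and update the key, so future reads don't have to read the
 * extra data.
 */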
588 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
589 				   struct bch_read_bio *rbio)
590 {
591 	struct bch_fs *c = rbio->c;
592 	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
593 	struct bch_extent_crc_unpacked new_crc;
594 	struct btree_iter iter;
595 	struct bkey_i *new;
596 	struct bkey_s_c k;
597 	int ret = 0;
598 
599 	if (crc_is_compressed(rbio->pick.crc))
600 		return 0;
601 
602 	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
603 			       BTREE_ITER_slots|BTREE_ITER_intent);
604 	if ((ret = bkey_err(k)))
605 		goto out;
606 
607 	if (bversion_cmp(k.k->bversion, rbio->version) ||
608 	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
609 		goto out;
610 
611 	/* Extent was merged? */
612 	if (bkey_start_offset(k.k) < data_offset ||
613 	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
614 		goto out;
615 
616 	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
617 			rbio->pick.crc, NULL, &new_crc,
618 			bkey_start_offset(k.k) - data_offset, k.k->size,
619 			rbio->pick.crc.csum_type)) {
620 		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
621 		ret = 0;
622 		goto out;
623 	}
624 
625 	/*
626 	 * we're temporarily appending another checksum entry, so allocate room for it:
627 	 */
628 	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
629 				 sizeof(struct bch_extent_crc128));
630 	if ((ret = PTR_ERR_OR_ZERO(new)))
631 		goto out;
632 
633 	bkey_reassemble(new, k);
634 
635 	if (!bch2_bkey_narrow_crcs(new, new_crc))
636 		goto out;
637 
638 	ret = bch2_trans_update(trans, &iter, new,
639 				BTREE_UPDATE_internal_snapshot_node);
640 out:
641 	bch2_trans_iter_exit(trans, &iter);
642 	return ret;
643 }
644 
645 static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
646 {
647 	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
648 			     __bch2_rbio_narrow_crcs(trans, rbio));
649 }
650 
651 static void bch2_read_csum_err(struct work_struct *work)
652 {
653 	struct bch_read_bio *rbio =
654 		container_of(work, struct bch_read_bio, work);
655 	struct bch_fs *c	= rbio->c;
656 	struct bio *src		= &rbio->bio;
657 	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
658 	struct nonce nonce = extent_nonce(rbio->version, crc);
659 	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
660 	struct printbuf buf = PRINTBUF;
661 
662 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
663 	prt_str(&buf, "data ");
664 	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
665 
666 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
667 	if (ca)
668 		bch_err_ratelimited(ca, "%s", buf.buf);
669 	else
670 		bch_err_ratelimited(c, "%s", buf.buf);
671 
672 	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
673 	printbuf_exit(&buf);
674 }
675 
676 static void bch2_read_decompress_err(struct work_struct *work)
677 {
678 	struct bch_read_bio *rbio =
679 		container_of(work, struct bch_read_bio, work);
680 	struct bch_fs *c	= rbio->c;
681 	struct printbuf buf = PRINTBUF;
682 
683 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
684 	prt_str(&buf, "decompression error");
685 
686 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
687 	if (ca)
688 		bch_err_ratelimited(ca, "%s", buf.buf);
689 	else
690 		bch_err_ratelimited(c, "%s", buf.buf);
691 
692 	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
693 	printbuf_exit(&buf);
694 }
695 
696 static void bch2_read_decrypt_err(struct work_struct *work)
697 {
698 	struct bch_read_bio *rbio =
699 		container_of(work, struct bch_read_bio, work);
700 	struct bch_fs *c	= rbio->c;
701 	struct printbuf buf = PRINTBUF;
702 
703 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
704 	prt_str(&buf, "decrypt error");
705 
706 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
707 	if (ca)
708 		bch_err_ratelimited(ca, "%s", buf.buf);
709 	else
710 		bch_err_ratelimited(c, "%s", buf.buf);
711 
712 	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
713 	printbuf_exit(&buf);
714 }
715 
716 /* Inner part of read completion that may run in process context: verify the checksum, then decrypt/decompress and copy out the data */
717 static void __bch2_read_endio(struct work_struct *work)
718 {
719 	struct bch_read_bio *rbio =
720 		container_of(work, struct bch_read_bio, work);
721 	struct bch_fs *c	= rbio->c;
722 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
723 	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
724 	struct bio *src			= &rbio->bio;
725 	struct bio *dst			= &parent->bio;
726 	struct bvec_iter dst_iter	= rbio->bvec_iter;
727 	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
728 	struct nonce nonce = extent_nonce(rbio->version, crc);
729 	unsigned nofs_flags;
730 	struct bch_csum csum;
731 	int ret;
732 
733 	nofs_flags = memalloc_nofs_save();
734 
735 	/* Reset iterator for checksumming and copying bounced data: */
736 	if (rbio->bounce) {
737 		src->bi_iter.bi_size		= crc.compressed_size << 9;
738 		src->bi_iter.bi_idx		= 0;
739 		src->bi_iter.bi_bvec_done	= 0;
740 	} else {
741 		src->bi_iter			= rbio->bvec_iter;
742 	}
743 
744 	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
745 
746 	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
747 	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
748 
749 	/*
750 	 * Checksum error: if the bio wasn't bounced, we may have been
751 	 * reading into buffers owned by userspace (that userspace can
752 	 * scribble over) - retry the read, bouncing it this time:
753 	 */
754 	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
755 		rbio->flags |= BCH_READ_must_bounce;
756 		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
757 				BLK_STS_IOERR);
758 		goto out;
759 	}
760 
761 	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
762 
763 	if (!csum_good)
764 		goto csum_err;
765 
766 	/*
767 	 * XXX
768 	 * We need to rework the narrow_crcs path to deliver the read completion
769 	 * first, and then punt to a different workqueue, otherwise we're
770 	 * holding up reads while doing btree updates which is bad for memory
771 	 * reclaim.
772 	 */
773 	if (unlikely(rbio->narrow_crcs))
774 		bch2_rbio_narrow_crcs(rbio);
775 
776 	if (likely(!parent->data_update)) {
777 		/* Adjust crc to point to subset of data we want: */
778 		crc.offset     += rbio->offset_into_extent;
779 		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);
780 
781 		if (crc_is_compressed(crc)) {
782 			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
783 			if (ret)
784 				goto decrypt_err;
785 
786 			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
787 			    !c->opts.no_data_io)
788 				goto decompression_err;
789 		} else {
790 			/* don't need to decrypt the entire bio: */
791 			nonce = nonce_add(nonce, crc.offset << 9);
792 			bio_advance(src, crc.offset << 9);
793 
794 			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
795 			src->bi_iter.bi_size = dst_iter.bi_size;
796 
797 			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
798 			if (ret)
799 				goto decrypt_err;
800 
801 			if (rbio->bounce) {
802 				struct bvec_iter src_iter = src->bi_iter;
803 
804 				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
805 			}
806 		}
807 	} else {
808 		if (rbio->split)
809 			rbio->parent->pick = rbio->pick;
810 
811 		if (rbio->bounce) {
812 			struct bvec_iter src_iter = src->bi_iter;
813 
814 			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
815 		}
816 	}
817 
818 	if (rbio->promote) {
819 		/*
820 		 * Re-encrypt the data we decrypted, so it's consistent with
821 		 * rbio->crc:
822 		 */
823 		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
824 		if (ret)
825 			goto decrypt_err;
826 	}
827 
828 	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
829 		rbio = bch2_rbio_free(rbio);
830 		bch2_rbio_done(rbio);
831 	}
832 out:
833 	memalloc_nofs_restore(nofs_flags);
834 	return;
835 csum_err:
836 	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
837 	goto out;
838 decompression_err:
839 	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
840 	goto out;
841 decrypt_err:
842 	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
843 	goto out;
844 }
845 
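/*
 * Main bio completion path: account the IO, turn device errors and stale
 * pointer races into the appropriate retryable errors, then punt the rest of
 * completion (checksum verification, decrypt, decompress, copying out) to a
 * context that's allowed to do that work.
 */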
846 static void bch2_read_endio(struct bio *bio)
847 {
848 	struct bch_read_bio *rbio =
849 		container_of(bio, struct bch_read_bio, bio);
850 	struct bch_fs *c	= rbio->c;
851 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
852 	struct workqueue_struct *wq = NULL;
853 	enum rbio_context context = RBIO_CONTEXT_NULL;
854 
855 	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
856 				   rbio->submit_time, !bio->bi_status);
857 
858 	if (!rbio->split)
859 		rbio->bio.bi_end_io = rbio->end_io;
860 
861 	if (unlikely(bio->bi_status)) {
862 		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
863 		return;
864 	}
865 
866 	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
867 	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
868 		trace_and_count(c, io_read_reuse_race, &rbio->bio);
869 
870 		if (rbio->flags & BCH_READ_retry_if_stale)
871 			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
872 		else
873 			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
874 		return;
875 	}
876 
877 	if (rbio->narrow_crcs ||
878 	    rbio->promote ||
879 	    crc_is_compressed(rbio->pick.crc) ||
880 	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
881 		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
882 	else if (rbio->pick.crc.csum_type)
883 		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;
884 
885 	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
886 }
887 
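/*
 * A stale pointer to dirty (non-cached) data indicates an inconsistency:
 * report it, along with the extent and the current alloc key (or the valid
 * bucket range, if the bucket itself is invalid) for debugging.
 */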
888 static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
889 						   struct bch_dev *ca,
890 						   struct bkey_s_c k,
891 						   struct bch_extent_ptr ptr)
892 {
893 	struct bch_fs *c = trans->c;
894 	struct btree_iter iter;
895 	struct printbuf buf = PRINTBUF;
896 	int ret;
897 
898 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
899 			     PTR_BUCKET_POS(ca, &ptr),
900 			     BTREE_ITER_cached);
901 
902 	int gen = bucket_gen_get(ca, iter.pos.offset);
903 	if (gen >= 0) {
904 		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
905 		printbuf_indent_add(&buf, 2);
906 
907 		bch2_bkey_val_to_text(&buf, c, k);
908 		prt_newline(&buf);
909 
910 		prt_printf(&buf, "memory gen: %u", gen);
911 
912 		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
913 		if (!ret) {
914 			prt_newline(&buf);
915 			bch2_bkey_val_to_text(&buf, c, k);
916 		}
917 	} else {
918 		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
919 			   iter.pos.inode, iter.pos.offset);
920 		printbuf_indent_add(&buf, 2);
921 
922 		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
923 			   ca->mi.first_bucket, ca->mi.nbuckets);
924 
925 		bch2_bkey_val_to_text(&buf, c, k);
926 		prt_newline(&buf);
927 	}
928 
929 	bch2_fs_inconsistent(c, "%s", buf.buf);
930 
931 	bch2_trans_iter_exit(trans, &iter);
932 	printbuf_exit(&buf);
933 }
934 
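/*
 * Read a single (possibly partial) extent: pick a device to read from, decide
 * whether to bounce, clone or promote, then set up the rbio and submit it.
 * With BCH_READ_in_retry the read is performed synchronously and its result
 * returned; otherwise completion is handled by bch2_read_endio().
 */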
935 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
936 		       struct bvec_iter iter, struct bpos read_pos,
937 		       enum btree_id data_btree, struct bkey_s_c k,
938 		       unsigned offset_into_extent,
939 		       struct bch_io_failures *failed, unsigned flags, int dev)
940 {
941 	struct bch_fs *c = trans->c;
942 	struct extent_ptr_decoded pick;
943 	struct bch_read_bio *rbio = NULL;
944 	bool bounce = false, read_full = false, narrow_crcs = false;
945 	struct bpos data_pos = bkey_start_pos(k.k);
946 	struct data_update *u = rbio_data_update(orig);
947 	int ret = 0;
948 
949 	if (bkey_extent_is_inline_data(k.k)) {
950 		unsigned bytes = min_t(unsigned, iter.bi_size,
951 				       bkey_inline_data_bytes(k.k));
952 
953 		swap(iter.bi_size, bytes);
954 		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
955 		swap(iter.bi_size, bytes);
956 		bio_advance_iter(&orig->bio, &iter, bytes);
957 		zero_fill_bio_iter(&orig->bio, iter);
958 		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
959 			     bvec_iter_sectors(iter));
960 		goto out_read_done;
961 	}
962 retry_pick:
963 	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
964 
965 	/* hole or reservation - just zero fill: */
966 	if (!ret)
967 		goto hole;
968 
969 	if (unlikely(ret < 0)) {
970 		struct printbuf buf = PRINTBUF;
971 		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
972 		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
973 		bch2_bkey_val_to_text(&buf, c, k);
974 
975 		bch_err_ratelimited(c, "%s", buf.buf);
976 		printbuf_exit(&buf);
977 		goto err;
978 	}
979 
980 	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
981 		struct printbuf buf = PRINTBUF;
982 		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
983 		prt_printf(&buf, "attempting to read encrypted data without encryption key\n  ");
984 		bch2_bkey_val_to_text(&buf, c, k);
985 
986 		bch_err_ratelimited(c, "%s", buf.buf);
987 		printbuf_exit(&buf);
988 		ret = -BCH_ERR_data_read_no_encryption_key;
989 		goto err;
990 	}
991 
992 	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
993 
994 	/*
995 	 * Stale dirty pointers are treated as IO errors, but @failed isn't
996 	 * allocated unless we're in the retry path - so if we're not in the
997 	 * retry path, don't check here, it'll be caught in bch2_read_endio()
998 	 * and we'll end up in the retry path:
999 	 */
1000 	if ((flags & BCH_READ_in_retry) &&
1001 	    !pick.ptr.cached &&
1002 	    ca &&
1003 	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
1004 		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
1005 		bch2_mark_io_failure(failed, &pick, false);
1006 		percpu_ref_put(&ca->io_ref[READ]);
1007 		goto retry_pick;
1008 	}
1009 
1010 	if (likely(!u)) {
1011 		if (!(flags & BCH_READ_last_fragment) ||
1012 		    bio_flagged(&orig->bio, BIO_CHAIN))
1013 			flags |= BCH_READ_must_clone;
1014 
1015 		narrow_crcs = !(flags & BCH_READ_in_retry) &&
1016 			bch2_can_narrow_extent_crcs(k, pick.crc);
1017 
1018 		if (narrow_crcs && (flags & BCH_READ_user_mapped))
1019 			flags |= BCH_READ_must_bounce;
1020 
1021 		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
1022 
1023 		if (crc_is_compressed(pick.crc) ||
1024 		    (pick.crc.csum_type != BCH_CSUM_none &&
1025 		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1026 		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
1027 		       (flags & BCH_READ_user_mapped)) ||
1028 		      (flags & BCH_READ_must_bounce)))) {
1029 			read_full = true;
1030 			bounce = true;
1031 		}
1032 	} else {
1033 		/*
1034 		 * can happen if we retry, and the extent we were going to read
1035 		 * has been merged in the meantime:
1036 		 */
1037 		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
1038 			if (ca)
1039 				percpu_ref_put(&ca->io_ref[READ]);
1040 			orig->ret = -BCH_ERR_data_read_buffer_too_small;
1041 			goto out_read_done;
1042 		}
1043 
1044 		iter.bi_size	= pick.crc.compressed_size << 9;
1045 		read_full = true;
1046 	}
1047 
1048 	if (orig->opts.promote_target || have_io_error(failed))
1049 		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
1050 				     &bounce, &read_full, failed);
1051 
1052 	if (!read_full) {
1053 		EBUG_ON(crc_is_compressed(pick.crc));
1054 		EBUG_ON(pick.crc.csum_type &&
1055 			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1056 			 bvec_iter_sectors(iter) != pick.crc.live_size ||
1057 			 pick.crc.offset ||
1058 			 offset_into_extent));
1059 
1060 		data_pos.offset += offset_into_extent;
1061 		pick.ptr.offset += pick.crc.offset +
1062 			offset_into_extent;
1063 		offset_into_extent		= 0;
1064 		pick.crc.compressed_size	= bvec_iter_sectors(iter);
1065 		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
1066 		pick.crc.offset			= 0;
1067 		pick.crc.live_size		= bvec_iter_sectors(iter);
1068 	}
1069 
1070 	if (rbio) {
1071 		/*
1072 		 * promote already allocated bounce rbio:
1073 		 * promote needs to allocate a bio big enough for uncompressing
1074 		 * data in the write path, but we're not going to use it all
1075 		 * here:
1076 		 */
1077 		EBUG_ON(rbio->bio.bi_iter.bi_size <
1078 		       pick.crc.compressed_size << 9);
1079 		rbio->bio.bi_iter.bi_size =
1080 			pick.crc.compressed_size << 9;
1081 	} else if (bounce) {
1082 		unsigned sectors = pick.crc.compressed_size;
1083 
1084 		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
1085 						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
1086 						  0,
1087 						  GFP_NOFS,
1088 						  &c->bio_read_split),
1089 				 orig);
1090 
1091 		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1092 		rbio->bounce	= true;
1093 	} else if (flags & BCH_READ_must_clone) {
1094 		/*
1095 		 * Have to clone if there were any splits, due to error
1096 		 * reporting issues: if a split errored and retrying didn't
1097 		 * work, then when it reports the error to its parent (us) we
1098 		 * can't tell whether the error was from our part of the bio
1099 		 * (and we should retry) or from the whole bio (in which case
1100 		 * we don't want to retry and lose the error).
1101 		 */
1102 		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
1103 						 &c->bio_read_split),
1104 				 orig);
1105 		rbio->bio.bi_iter = iter;
1106 	} else {
1107 		rbio = orig;
1108 		rbio->bio.bi_iter = iter;
1109 		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1110 	}
1111 
1112 	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
1113 
1114 	rbio->submit_time	= local_clock();
1115 	if (!rbio->split)
1116 		rbio->end_io	= orig->bio.bi_end_io;
1117 	rbio->bvec_iter		= iter;
1118 	rbio->offset_into_extent= offset_into_extent;
1119 	rbio->flags		= flags;
1120 	rbio->have_ioref	= ca != NULL;
1121 	rbio->narrow_crcs	= narrow_crcs;
1122 	rbio->ret		= 0;
1123 	rbio->context		= 0;
1124 	rbio->pick		= pick;
1125 	rbio->subvol		= orig->subvol;
1126 	rbio->read_pos		= read_pos;
1127 	rbio->data_btree	= data_btree;
1128 	rbio->data_pos		= data_pos;
1129 	rbio->version		= k.k->bversion;
1130 	INIT_WORK(&rbio->work, NULL);
1131 
1132 	rbio->bio.bi_opf	= orig->bio.bi_opf;
1133 	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
1134 	rbio->bio.bi_end_io	= bch2_read_endio;
1135 
1136 	if (rbio->bounce)
1137 		trace_and_count(c, io_read_bounce, &rbio->bio);
1138 
1139 	if (!u)
1140 		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
1141 	else
1142 		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
1143 	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1144 
1145 	/*
1146 	 * If it's being moved internally, we don't want to flag it as a cache
1147 	 * hit:
1148 	 */
1149 	if (ca && pick.ptr.cached && !u)
1150 		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
1151 			PTR_BUCKET_NR(ca, &pick.ptr), READ);
1152 
1153 	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
1154 		bio_inc_remaining(&orig->bio);
1155 		trace_and_count(c, io_read_split, &orig->bio);
1156 	}
1157 
1158 	/*
1159 	 * Unlock the iterator while the btree node's lock is still in
1160 	 * cache, before doing the IO:
1161 	 */
1162 	if (!(flags & BCH_READ_in_retry))
1163 		bch2_trans_unlock(trans);
1164 	else
1165 		bch2_trans_unlock_long(trans);
1166 
1167 	if (likely(!rbio->pick.do_ec_reconstruct)) {
1168 		if (unlikely(!rbio->have_ioref)) {
1169 			struct printbuf buf = PRINTBUF;
1170 			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
1171 			prt_printf(&buf, "no device to read from:\n  ");
1172 			bch2_bkey_val_to_text(&buf, c, k);
1173 
1174 			bch_err_ratelimited(c, "%s", buf.buf);
1175 			printbuf_exit(&buf);
1176 
1177 			bch2_rbio_error(rbio,
1178 					-BCH_ERR_data_read_retry_device_offline,
1179 					BLK_STS_IOERR);
1180 			goto out;
1181 		}
1182 
1183 		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
1184 			     bio_sectors(&rbio->bio));
1185 		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
1186 
1187 		if (unlikely(c->opts.no_data_io)) {
1188 			if (likely(!(flags & BCH_READ_in_retry)))
1189 				bio_endio(&rbio->bio);
1190 		} else {
1191 			if (likely(!(flags & BCH_READ_in_retry)))
1192 				submit_bio(&rbio->bio);
1193 			else
1194 				submit_bio_wait(&rbio->bio);
1195 		}
1196 
1197 		/*
1198 		 * We just submitted IO which may block, we expect relock fail
1199 		 * events and shouldn't count them:
1200 		 */
1201 		trans->notrace_relock_fail = true;
1202 	} else {
1203 		/* Attempting reconstruct read: */
1204 		if (bch2_ec_read_extent(trans, rbio, k)) {
1205 			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
1206 					BLK_STS_IOERR);
1207 			goto out;
1208 		}
1209 
1210 		if (likely(!(flags & BCH_READ_in_retry)))
1211 			bio_endio(&rbio->bio);
1212 	}
1213 out:
1214 	if (likely(!(flags & BCH_READ_in_retry))) {
1215 		return 0;
1216 	} else {
1217 		bch2_trans_unlock(trans);
1218 
1219 		int ret;
1220 
1221 		rbio->context = RBIO_CONTEXT_UNBOUND;
1222 		bch2_read_endio(&rbio->bio);
1223 
1224 		ret = rbio->ret;
1225 		rbio = bch2_rbio_free(rbio);
1226 
1227 		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
1228 			bch2_mark_io_failure(failed, &pick,
1229 					ret == -BCH_ERR_data_read_retry_csum_err);
1230 
1231 		return ret;
1232 	}
1233 
1234 err:
1235 	if (flags & BCH_READ_in_retry)
1236 		return ret;
1237 
1238 	orig->bio.bi_status	= BLK_STS_IOERR;
1239 	orig->ret		= ret;
1240 	goto out_read_done;
1241 
1242 hole:
1243 	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
1244 		     bvec_iter_sectors(iter));
1245 	/*
1246 	 * won't normally happen in the data update (bch2_move_extent()) path,
1247 	 * but if we retry and the extent we wanted to read no longer exists we
1248 	 * have to signal that:
1249 	 */
1250 	if (u)
1251 		orig->ret = -BCH_ERR_data_read_key_overwritten;
1252 
1253 	zero_fill_bio_iter(&orig->bio, iter);
1254 out_read_done:
1255 	if ((flags & BCH_READ_last_fragment) &&
1256 	    !(flags & BCH_READ_in_retry))
1257 		bch2_rbio_done(orig);
1258 	return 0;
1259 }
1260 
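/*
 * Top of the read path: walk the extents btree over the requested range,
 * resolving indirect (reflink) extents, and issue a read for each fragment;
 * the final fragment is tagged BCH_READ_last_fragment so the request is
 * completed once the last fragment finishes.
 */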
1261 int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
1262 		struct bvec_iter bvec_iter, subvol_inum inum,
1263 		struct bch_io_failures *failed, unsigned flags)
1264 {
1265 	struct bch_fs *c = trans->c;
1266 	struct btree_iter iter;
1267 	struct bkey_buf sk;
1268 	struct bkey_s_c k;
1269 	int ret;
1270 
1271 	EBUG_ON(rbio->data_update);
1272 
1273 	bch2_bkey_buf_init(&sk);
1274 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
1275 			     POS(inum.inum, bvec_iter.bi_sector),
1276 			     BTREE_ITER_slots);
1277 
1278 	while (1) {
1279 		enum btree_id data_btree = BTREE_ID_extents;
1280 
1281 		bch2_trans_begin(trans);
1282 
1283 		u32 snapshot;
1284 		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
1285 		if (ret)
1286 			goto err;
1287 
1288 		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
1289 
1290 		bch2_btree_iter_set_pos(trans, &iter,
1291 				POS(inum.inum, bvec_iter.bi_sector));
1292 
1293 		k = bch2_btree_iter_peek_slot(trans, &iter);
1294 		ret = bkey_err(k);
1295 		if (ret)
1296 			goto err;
1297 
1298 		s64 offset_into_extent = iter.pos.offset -
1299 			bkey_start_offset(k.k);
1300 		unsigned sectors = k.k->size - offset_into_extent;
1301 
1302 		bch2_bkey_buf_reassemble(&sk, c, k);
1303 
1304 		ret = bch2_read_indirect_extent(trans, &data_btree,
1305 					&offset_into_extent, &sk);
1306 		if (ret)
1307 			goto err;
1308 
1309 		k = bkey_i_to_s_c(sk.k);
1310 
1311 		/*
1312 		 * With indirect extents, the amount of data to read is the min
1313 		 * of the original extent and the indirect extent:
1314 		 */
1315 		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
1316 
1317 		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
1318 		swap(bvec_iter.bi_size, bytes);
1319 
1320 		if (bvec_iter.bi_size == bytes)
1321 			flags |= BCH_READ_last_fragment;
1322 
1323 		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
1324 					 data_btree, k,
1325 					 offset_into_extent, failed, flags, -1);
1326 		swap(bvec_iter.bi_size, bytes);
1327 
1328 		if (ret)
1329 			goto err;
1330 
1331 		if (flags & BCH_READ_last_fragment)
1332 			break;
1333 
1334 		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1335 err:
1336 		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
1337 			flags |= BCH_READ_must_bounce;
1338 
1339 		if (ret &&
1340 		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
1341 		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
1342 			break;
1343 	}
1344 
1345 	bch2_trans_iter_exit(trans, &iter);
1346 
1347 	if (ret) {
1348 		struct printbuf buf = PRINTBUF;
1349 		lockrestart_do(trans,
1350 			bch2_inum_offset_err_msg_trans(trans, &buf, inum,
1351 						       bvec_iter.bi_sector << 9));
1352 		prt_printf(&buf, "read error: %s", bch2_err_str(ret));
1353 		bch_err_ratelimited(c, "%s", buf.buf);
1354 		printbuf_exit(&buf);
1355 
1356 		rbio->bio.bi_status	= BLK_STS_IOERR;
1357 		rbio->ret		= ret;
1358 
1359 		if (!(flags & BCH_READ_in_retry))
1360 			bch2_rbio_done(rbio);
1361 	}
1362 
1363 	bch2_bkey_buf_exit(&sk, c);
1364 	return ret;
1365 }
1366 
1367 void bch2_fs_io_read_exit(struct bch_fs *c)
1368 {
1369 	if (c->promote_table.tbl)
1370 		rhashtable_destroy(&c->promote_table);
1371 	bioset_exit(&c->bio_read_split);
1372 	bioset_exit(&c->bio_read);
1373 }
1374 
1375 int bch2_fs_io_read_init(struct bch_fs *c)
1376 {
1377 	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
1378 			BIOSET_NEED_BVECS))
1379 		return -BCH_ERR_ENOMEM_bio_read_init;
1380 
1381 	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
1382 			BIOSET_NEED_BVECS))
1383 		return -BCH_ERR_ENOMEM_bio_read_split_init;
1384 
1385 	if (rhashtable_init(&c->promote_table, &bch_promote_params))
1386 		return -BCH_ERR_ENOMEM_promote_table_init;
1387 
1388 	return 0;
1389 }
1390