xref: /linux/fs/bcachefs/io_read.c (revision f694f30e81c4ade358eb8c75273bac1a48f0cb8f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Some low level IO code, and hacks for various block layer limitations
4  *
5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6  * Copyright 2012 Google, Inc.
7  */
8 
9 #include "bcachefs.h"
10 #include "alloc_background.h"
11 #include "alloc_foreground.h"
12 #include "btree_update.h"
13 #include "buckets.h"
14 #include "checksum.h"
15 #include "clock.h"
16 #include "compress.h"
17 #include "data_update.h"
18 #include "disk_groups.h"
19 #include "ec.h"
20 #include "error.h"
21 #include "io_read.h"
22 #include "io_misc.h"
23 #include "io_write.h"
24 #include "reflink.h"
25 #include "subvolume.h"
26 #include "trace.h"
27 
28 #include <linux/random.h>
29 #include <linux/sched/mm.h>
30 
31 #ifdef CONFIG_BCACHEFS_DEBUG
32 static unsigned bch2_read_corrupt_ratio;
33 module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
34 MODULE_PARM_DESC(read_corrupt_ratio, "Simulate corrupt reads for testing (see bch2_maybe_corrupt_bio())");
35 #endif
36 
37 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
38 
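/*
 * Probabilistically back off reads to a congested target: each device keeps a
 * congestion score (ca->congested) that decays with the time elapsed since
 * ca->congested_last; return true with probability total_score / (nr_devs *
 * CONGESTED_MAX).
 */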
39 static bool bch2_target_congested(struct bch_fs *c, u16 target)
40 {
41 	const struct bch_devs_mask *devs;
42 	unsigned d, nr = 0, total = 0;
43 	u64 now = local_clock(), last;
44 	s64 congested;
45 	struct bch_dev *ca;
46 
47 	if (!target)
48 		return false;
49 
50 	rcu_read_lock();
51 	devs = bch2_target_to_mask(c, target) ?:
52 		&c->rw_devs[BCH_DATA_user];
53 
54 	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
55 		ca = rcu_dereference(c->devs[d]);
56 		if (!ca)
57 			continue;
58 
59 		congested = atomic_read(&ca->congested);
60 		last = READ_ONCE(ca->congested_last);
61 		if (time_after64(now, last))
62 			congested -= (now - last) >> 12;
63 
64 		total += max(congested, 0LL);
65 		nr++;
66 	}
67 	rcu_read_unlock();
68 
69 	return get_random_u32_below(nr * CONGESTED_MAX) < total;
70 }
71 
72 #else
73 
74 static bool bch2_target_congested(struct bch_fs *c, u16 target)
75 {
76 	return false;
77 }
78 
79 #endif
80 
81 /* Cache promotion on read */
82 
83 struct promote_op {
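/*
 * An in-flight promote: a data_update that writes out a copy of the data we
 * just read - normally a cached copy on the promote target, or, when
 * recovering from a read error, a rewrite of the failed replicas. Keyed by
 * extent position in c->promote_table so we don't start duplicate promotes
 * for the same extent.
 */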
84 	struct rcu_head		rcu;
85 	u64			start_time;
86 
87 	struct rhash_head	hash;
88 	struct bpos		pos;
89 
90 	struct work_struct	work;
91 	struct data_update	write;
92 	struct bio_vec		bi_inline_vecs[]; /* must be last */
93 };
94 
95 static const struct rhashtable_params bch_promote_params = {
96 	.head_offset		= offsetof(struct promote_op, hash),
97 	.key_offset		= offsetof(struct promote_op, pos),
98 	.key_len		= sizeof(struct bpos),
99 	.automatic_shrinking	= true,
100 };
101 
102 static inline bool have_io_error(struct bch_io_failures *failed)
103 {
104 	return failed && failed->nr;
105 }
106 
107 static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
108 {
109 	EBUG_ON(rbio->split);
110 
111 	return rbio->data_update
112 		? container_of(rbio, struct data_update, rbio)
113 		: NULL;
114 }
115 
116 static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
117 {
118 	struct data_update *u = rbio_data_update(orig);
119 	if (!u)
120 		return false;
121 
122 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
123 	unsigned i = 0;
124 	bkey_for_each_ptr(ptrs, ptr) {
125 		if (ptr->dev == dev &&
126 		    u->data_opts.rewrite_ptrs & BIT(i))
127 			return true;
128 		i++;
129 	}
130 
131 	return false;
132 }
133 
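/*
 * Decide whether this read should kick off a promote; returns 0 or a
 * -BCH_ERR_nopromote_* reason. When called from the read error path
 * (have_io_error()) the promote target checks are skipped, since we're
 * rewriting bad replicas rather than caching.
 */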
134 static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
135 				  struct bpos pos,
136 				  struct bch_io_opts opts,
137 				  unsigned flags,
138 				  struct bch_io_failures *failed)
139 {
140 	if (!have_io_error(failed)) {
141 		BUG_ON(!opts.promote_target);
142 
143 		if (!(flags & BCH_READ_may_promote))
144 			return -BCH_ERR_nopromote_may_not;
145 
146 		if (bch2_bkey_has_target(c, k, opts.promote_target))
147 			return -BCH_ERR_nopromote_already_promoted;
148 
149 		if (bkey_extent_is_unwritten(k))
150 			return -BCH_ERR_nopromote_unwritten;
151 
152 		if (bch2_target_congested(c, opts.promote_target))
153 			return -BCH_ERR_nopromote_congested;
154 	}
155 
156 	if (rhashtable_lookup_fast(&c->promote_table, &pos,
157 				   bch_promote_params))
158 		return -BCH_ERR_nopromote_in_flight;
159 
160 	return 0;
161 }
162 
163 static noinline void promote_free(struct bch_read_bio *rbio)
164 {
165 	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
166 	struct bch_fs *c = rbio->c;
167 
168 	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
169 					 bch_promote_params);
170 	BUG_ON(ret);
171 
172 	bch2_data_update_exit(&op->write);
173 
174 	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
175 	kfree_rcu(op, rcu);
176 }
177 
178 static void promote_done(struct bch_write_op *wop)
179 {
180 	struct promote_op *op = container_of(wop, struct promote_op, write.op);
181 	struct bch_fs *c = op->write.rbio.c;
182 
183 	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
184 	promote_free(&op->write.rbio);
185 }
186 
187 static void promote_start_work(struct work_struct *work)
188 {
189 	struct promote_op *op = container_of(work, struct promote_op, work);
190 
191 	bch2_data_update_read_done(&op->write);
192 }
193 
194 static noinline void promote_start(struct bch_read_bio *rbio)
195 {
196 	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
197 
198 	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
199 
200 	INIT_WORK(&op->work, promote_start_work);
201 	queue_work(rbio->c->write_ref_wq, &op->work);
202 }
203 
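/*
 * Set up a promote: take a write ref, allocate the promote_op, insert it into
 * c->promote_table, and initialize the data_update that will either add a
 * cached copy of the extent (normal promote) or rewrite the pointers that
 * failed (read error path) once the read completes.
 */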
204 static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
205 					    enum btree_id btree_id,
206 					    struct bkey_s_c k,
207 					    struct bpos pos,
208 					    struct extent_ptr_decoded *pick,
209 					    unsigned sectors,
210 					    struct bch_read_bio *orig,
211 					    struct bch_io_failures *failed)
212 {
213 	struct bch_fs *c = trans->c;
214 	int ret;
215 
216 	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
217 
218 	if (!have_io_error(failed)) {
219 		update_opts.target = orig->opts.promote_target;
220 		update_opts.extra_replicas = 1;
221 		update_opts.write_flags |= BCH_WRITE_cached;
222 		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
223 	} else {
224 		update_opts.target = orig->opts.foreground_target;
225 
226 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
227 		unsigned ptr_bit = 1;
228 		bkey_for_each_ptr(ptrs, ptr) {
229 			if (bch2_dev_io_failures(failed, ptr->dev) &&
230 			    !ptr_being_rewritten(orig, ptr->dev))
231 				update_opts.rewrite_ptrs |= ptr_bit;
232 			ptr_bit <<= 1;
233 		}
234 
235 		if (!update_opts.rewrite_ptrs)
236 			return NULL;
237 	}
238 
239 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
240 		return ERR_PTR(-BCH_ERR_nopromote_no_writes);
241 
242 	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
243 	if (!op) {
244 		ret = -BCH_ERR_nopromote_enomem;
245 		goto err_put;
246 	}
247 
248 	op->start_time = local_clock();
249 	op->pos = pos;
250 
251 	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
252 					  bch_promote_params)) {
253 		ret = -BCH_ERR_nopromote_in_flight;
254 		goto err;
255 	}
256 
257 	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
258 			writepoint_hashed((unsigned long) current),
259 			&orig->opts,
260 			update_opts,
261 			btree_id, k);
262 	op->write.type = BCH_DATA_UPDATE_promote;
263 	/*
264 	 * possible errors: -BCH_ERR_nocow_lock_blocked,
265 	 * -BCH_ERR_ENOSPC_disk_reservation:
266 	 */
267 	if (ret)
268 		goto err_remove_hash;
269 
270 	rbio_init_fragment(&op->write.rbio.bio, orig);
271 	op->write.rbio.bounce	= true;
272 	op->write.rbio.promote	= true;
273 	op->write.op.end_io = promote_done;
274 
275 	return &op->write.rbio;
276 err_remove_hash:
277 	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
278 				      bch_promote_params));
279 err:
280 	bio_free_pages(&op->write.op.wbio.bio);
281 	/* We may have added to the rhashtable and thus need rcu freeing: */
282 	kfree_rcu(op, rcu);
283 err_put:
284 	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
285 	return ERR_PTR(ret);
286 }
287 
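/*
 * Called from the extent read path when a read might want to trigger a
 * promote (or a rewrite of bad replicas): decides whether to promote, whether
 * the full extent needs to be read/bounced, and allocates the rbio the read
 * will bounce into.
 */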
288 noinline
289 static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
290 					struct bvec_iter iter,
291 					struct bkey_s_c k,
292 					struct extent_ptr_decoded *pick,
293 					unsigned flags,
294 					struct bch_read_bio *orig,
295 					bool *bounce,
296 					bool *read_full,
297 					struct bch_io_failures *failed)
298 {
299 	struct bch_fs *c = trans->c;
300 	/*
301 	 * if failed != NULL we're not actually doing a promote, we're
302 	 * recovering from an io/checksum error
303 	 */
304 	bool promote_full = (have_io_error(failed) ||
305 			     *read_full ||
306 			     READ_ONCE(c->opts.promote_whole_extents));
307 	/* data might have to be decompressed in the write path: */
308 	unsigned sectors = promote_full
309 		? max(pick->crc.compressed_size, pick->crc.live_size)
310 		: bvec_iter_sectors(iter);
311 	struct bpos pos = promote_full
312 		? bkey_start_pos(k.k)
313 		: POS(k.k->p.inode, iter.bi_sector);
314 	int ret;
315 
316 	ret = should_promote(c, k, pos, orig->opts, flags, failed);
317 	if (ret)
318 		goto nopromote;
319 
320 	struct bch_read_bio *promote =
321 		__promote_alloc(trans,
322 				k.k->type == KEY_TYPE_reflink_v
323 				? BTREE_ID_reflink
324 				: BTREE_ID_extents,
325 				k, pos, pick, sectors, orig, failed);
326 	if (!promote)
327 		return NULL;
328 
329 	ret = PTR_ERR_OR_ZERO(promote);
330 	if (ret)
331 		goto nopromote;
332 
333 	*bounce		= true;
334 	*read_full	= promote_full;
335 	return promote;
336 nopromote:
337 	trace_io_read_nopromote(c, ret);
338 	return NULL;
339 }
340 
341 /* Read */
342 
343 static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
344 				   struct bch_read_bio *rbio, struct bpos read_pos)
345 {
346 	int ret = lockrestart_do(trans,
347 		bch2_inum_offset_err_msg_trans(trans, out,
348 				(subvol_inum) { rbio->subvol, read_pos.inode },
349 				read_pos.offset << 9));
350 	if (ret)
351 		return ret;
352 
353 	if (rbio->data_update)
354 		prt_str(out, "(internal move) ");
355 
356 	return 0;
357 }
358 
359 static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
360 			      struct bch_read_bio *rbio, struct bpos read_pos)
361 {
362 	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
363 }
364 
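/* In order of increasing permissiveness; see bch2_rbio_punt() */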
365 enum rbio_context {
366 	RBIO_CONTEXT_NULL,
367 	RBIO_CONTEXT_HIGHPRI,
368 	RBIO_CONTEXT_UNBOUND,
369 };
370 
371 static inline struct bch_read_bio *
372 bch2_rbio_parent(struct bch_read_bio *rbio)
373 {
374 	return rbio->split ? rbio->parent : rbio;
375 }
376 
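/*
 * Run @fn immediately if we're already executing in a context at least as
 * permissive as @context; otherwise punt it to @wq and record the new
 * context.
 */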
377 __always_inline
378 static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
379 			   enum rbio_context context,
380 			   struct workqueue_struct *wq)
381 {
382 	if (context <= rbio->context) {
383 		fn(&rbio->work);
384 	} else {
385 		rbio->work.func		= fn;
386 		rbio->context		= context;
387 		queue_work(wq, &rbio->work);
388 	}
389 }
390 
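/*
 * Release a completed rbio's resources: drop the device ioref and, if this
 * was a split, hand bounced data to the promote machinery (or free it), free
 * the split bio, and return the parent so the caller can complete the
 * original request.
 */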
391 static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
392 {
393 	BUG_ON(rbio->bounce && !rbio->split);
394 
395 	if (rbio->have_ioref) {
396 		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
397 		percpu_ref_put(&ca->io_ref[READ]);
398 	}
399 
400 	if (rbio->split) {
401 		struct bch_read_bio *parent = rbio->parent;
402 
403 		if (unlikely(rbio->promote)) {
404 			if (!rbio->bio.bi_status)
405 				promote_start(rbio);
406 			else
407 				promote_free(rbio);
408 		} else {
409 			if (rbio->bounce)
410 				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
411 
412 			bio_put(&rbio->bio);
413 		}
414 
415 		rbio = parent;
416 	}
417 
418 	return rbio;
419 }
420 
421 /*
422  * Only called on a top level bch_read_bio to complete an entire read request,
423  * not a split:
424  */
425 static void bch2_rbio_done(struct bch_read_bio *rbio)
426 {
427 	if (rbio->start_time)
428 		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
429 				       rbio->start_time);
430 	bio_endio(&rbio->bio);
431 }
432 
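/*
 * Retry path for reads issued on behalf of a data update (the rbio is
 * embedded in a struct data_update): re-look up the key and reread the
 * extent if it still exists unchanged, otherwise fail with
 * -BCH_ERR_data_read_key_overwritten.
 */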
433 static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
434 					struct bch_read_bio *rbio,
435 					struct bvec_iter bvec_iter,
436 					struct bch_io_failures *failed,
437 					unsigned flags)
438 {
439 	struct data_update *u = container_of(rbio, struct data_update, rbio);
440 retry:
441 	bch2_trans_begin(trans);
442 
443 	struct btree_iter iter;
444 	struct bkey_s_c k;
445 	int ret = lockrestart_do(trans,
446 		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
447 				u->btree_id, bkey_start_pos(&u->k.k->k),
448 				0)));
449 	if (ret)
450 		goto err;
451 
452 	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
453 		/* extent we wanted to read no longer exists: */
454 		rbio->ret = -BCH_ERR_data_read_key_overwritten;
455 		goto err;
456 	}
457 
458 	ret = __bch2_read_extent(trans, rbio, bvec_iter,
459 				 bkey_start_pos(&u->k.k->k),
460 				 u->btree_id,
461 				 bkey_i_to_s_c(u->k.k),
462 				 0, failed, flags, -1);
463 err:
464 	bch2_trans_iter_exit(trans, &iter);
465 
466 	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
467 		goto retry;
468 
469 	if (ret) {
470 		rbio->bio.bi_status	= BLK_STS_IOERR;
471 		rbio->ret		= ret;
472 	}
473 
474 	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
475 	return ret;
476 }
477 
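/*
 * Retry worker: reread the extent, avoiding any devices recorded in @failed,
 * using the nodecode path for internal moves or __bch2_read() otherwise, then
 * complete the original request.
 */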
478 static void bch2_rbio_retry(struct work_struct *work)
479 {
480 	struct bch_read_bio *rbio =
481 		container_of(work, struct bch_read_bio, work);
482 	struct bch_fs *c	= rbio->c;
483 	struct bvec_iter iter	= rbio->bvec_iter;
484 	unsigned flags		= rbio->flags;
485 	subvol_inum inum = {
486 		.subvol = rbio->subvol,
487 		.inum	= rbio->read_pos.inode,
488 	};
489 	struct bch_io_failures failed = { .nr = 0 };
490 	struct btree_trans *trans = bch2_trans_get(c);
491 
492 	trace_io_read_retry(&rbio->bio);
493 	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
494 		     bvec_iter_sectors(rbio->bvec_iter));
495 
496 	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
497 		bch2_mark_io_failure(&failed, &rbio->pick,
498 				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);
499 
500 	if (!rbio->split) {
501 		rbio->bio.bi_status	= 0;
502 		rbio->ret		= 0;
503 	}
504 
505 	unsigned subvol		= rbio->subvol;
506 	struct bpos read_pos	= rbio->read_pos;
507 
508 	rbio = bch2_rbio_free(rbio);
509 
510 	flags |= BCH_READ_in_retry;
511 	flags &= ~BCH_READ_may_promote;
512 	flags &= ~BCH_READ_last_fragment;
513 	flags |= BCH_READ_must_clone;
514 
515 	int ret = rbio->data_update
516 		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
517 		: __bch2_read(trans, rbio, iter, inum, &failed, flags);
518 
519 	if (ret) {
520 		rbio->ret = ret;
521 		rbio->bio.bi_status = BLK_STS_IOERR;
522 	} else {
523 		struct printbuf buf = PRINTBUF;
524 
525 		lockrestart_do(trans,
526 			bch2_inum_offset_err_msg_trans(trans, &buf,
527 					(subvol_inum) { subvol, read_pos.inode },
528 					read_pos.offset << 9));
529 		if (rbio->data_update)
530 			prt_str(&buf, "(internal move) ");
531 		prt_str(&buf, "successful retry");
532 
533 		bch_err_ratelimited(c, "%s", buf.buf);
534 		printbuf_exit(&buf);
535 	}
536 
537 	bch2_rbio_done(rbio);
538 	bch2_trans_put(trans);
539 }
540 
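/*
 * Record a read error: in the synchronous retry path just note it and return
 * to the caller; otherwise punt retryable errors to the retry worker, or
 * complete the request with the error.
 */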
541 static void bch2_rbio_error(struct bch_read_bio *rbio,
542 			    int ret, blk_status_t blk_error)
543 {
544 	BUG_ON(ret >= 0);
545 
546 	rbio->ret		= ret;
547 	rbio->bio.bi_status	= blk_error;
548 
549 	bch2_rbio_parent(rbio)->saw_error = true;
550 
551 	if (rbio->flags & BCH_READ_in_retry)
552 		return;
553 
554 	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
555 		bch2_rbio_punt(rbio, bch2_rbio_retry,
556 			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
557 	} else {
558 		rbio = bch2_rbio_free(rbio);
559 
560 		rbio->ret		= ret;
561 		rbio->bio.bi_status	= blk_error;
562 
563 		bch2_rbio_done(rbio);
564 	}
565 }
566 
567 static void bch2_read_io_err(struct work_struct *work)
568 {
569 	struct bch_read_bio *rbio =
570 		container_of(work, struct bch_read_bio, work);
571 	struct bio *bio = &rbio->bio;
572 	struct bch_fs *c	= rbio->c;
573 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
574 	struct printbuf buf = PRINTBUF;
575 
576 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
577 	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
578 
579 	if (ca)
580 		bch_err_ratelimited(ca, "%s", buf.buf);
581 	else
582 		bch_err_ratelimited(c, "%s", buf.buf);
583 
584 	printbuf_exit(&buf);
585 	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
586 }
587 
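/*
 * Narrow checksums: if the extent's checksum covers more data than the key
 * currently references and we've just read and verified the whole checksummed
 * region, compute a new checksum over just the referenced part and update the
 * key, so future reads don't need to read the extra data.
 */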
588 static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
589 				   struct bch_read_bio *rbio)
590 {
591 	struct bch_fs *c = rbio->c;
592 	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
593 	struct bch_extent_crc_unpacked new_crc;
594 	struct btree_iter iter;
595 	struct bkey_i *new;
596 	struct bkey_s_c k;
597 	int ret = 0;
598 
599 	if (crc_is_compressed(rbio->pick.crc))
600 		return 0;
601 
602 	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
603 			       BTREE_ITER_slots|BTREE_ITER_intent);
604 	if ((ret = bkey_err(k)))
605 		goto out;
606 
607 	if (bversion_cmp(k.k->bversion, rbio->version) ||
608 	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
609 		goto out;
610 
611 	/* Extent was merged? */
612 	if (bkey_start_offset(k.k) < data_offset ||
613 	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
614 		goto out;
615 
616 	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
617 			rbio->pick.crc, NULL, &new_crc,
618 			bkey_start_offset(k.k) - data_offset, k.k->size,
619 			rbio->pick.crc.csum_type)) {
620 		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
621 		ret = 0;
622 		goto out;
623 	}
624 
625 	/*
626 	 * going to be temporarily appending another checksum entry:
627 	 */
628 	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
629 				 sizeof(struct bch_extent_crc128));
630 	if ((ret = PTR_ERR_OR_ZERO(new)))
631 		goto out;
632 
633 	bkey_reassemble(new, k);
634 
635 	if (!bch2_bkey_narrow_crcs(new, new_crc))
636 		goto out;
637 
638 	ret = bch2_trans_update(trans, &iter, new,
639 				BTREE_UPDATE_internal_snapshot_node);
640 out:
641 	bch2_trans_iter_exit(trans, &iter);
642 	return ret;
643 }
644 
645 static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
646 {
647 	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
648 			     __bch2_rbio_narrow_crcs(trans, rbio));
649 }
650 
651 static void bch2_read_csum_err(struct work_struct *work)
652 {
653 	struct bch_read_bio *rbio =
654 		container_of(work, struct bch_read_bio, work);
655 	struct bch_fs *c	= rbio->c;
656 	struct bio *src		= &rbio->bio;
657 	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
658 	struct nonce nonce = extent_nonce(rbio->version, crc);
659 	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
660 	struct printbuf buf = PRINTBUF;
661 
662 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
663 	prt_str(&buf, "data ");
664 	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
665 
666 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
667 	if (ca)
668 		bch_err_ratelimited(ca, "%s", buf.buf);
669 	else
670 		bch_err_ratelimited(c, "%s", buf.buf);
671 
672 	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
673 	printbuf_exit(&buf);
674 }
675 
676 static void bch2_read_decompress_err(struct work_struct *work)
677 {
678 	struct bch_read_bio *rbio =
679 		container_of(work, struct bch_read_bio, work);
680 	struct bch_fs *c	= rbio->c;
681 	struct printbuf buf = PRINTBUF;
682 
683 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
684 	prt_str(&buf, "decompression error");
685 
686 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
687 	if (ca)
688 		bch_err_ratelimited(ca, "%s", buf.buf);
689 	else
690 		bch_err_ratelimited(c, "%s", buf.buf);
691 
692 	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
693 	printbuf_exit(&buf);
694 }
695 
696 static void bch2_read_decrypt_err(struct work_struct *work)
697 {
698 	struct bch_read_bio *rbio =
699 		container_of(work, struct bch_read_bio, work);
700 	struct bch_fs *c	= rbio->c;
701 	struct printbuf buf = PRINTBUF;
702 
703 	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
704 	prt_str(&buf, "decrypt error");
705 
706 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
707 	if (ca)
708 		bch_err_ratelimited(ca, "%s", buf.buf);
709 	else
710 		bch_err_ratelimited(c, "%s", buf.buf);
711 
712 	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
713 	printbuf_exit(&buf);
714 }
715 
716 /* Inner part that may run in process context */
717 static void __bch2_read_endio(struct work_struct *work)
718 {
719 	struct bch_read_bio *rbio =
720 		container_of(work, struct bch_read_bio, work);
721 	struct bch_fs *c	= rbio->c;
722 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
723 	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
724 	struct bio *src			= &rbio->bio;
725 	struct bio *dst			= &parent->bio;
726 	struct bvec_iter dst_iter	= rbio->bvec_iter;
727 	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
728 	struct nonce nonce = extent_nonce(rbio->version, crc);
729 	unsigned nofs_flags;
730 	struct bch_csum csum;
731 	int ret;
732 
733 	nofs_flags = memalloc_nofs_save();
734 
735 	/* Reset iterator for checksumming and copying bounced data: */
736 	if (rbio->bounce) {
737 		src->bi_iter.bi_size		= crc.compressed_size << 9;
738 		src->bi_iter.bi_idx		= 0;
739 		src->bi_iter.bi_bvec_done	= 0;
740 	} else {
741 		src->bi_iter			= rbio->bvec_iter;
742 	}
743 
744 	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
745 
746 	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
747 	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
748 
749 	/*
750 	 * Checksum error: if the bio wasn't bounced, we may have been
751 	 * reading into buffers owned by userspace (that userspace can
752 	 * scribble over) - retry the read, bouncing it this time:
753 	 */
754 	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
755 		rbio->flags |= BCH_READ_must_bounce;
756 		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
757 				BLK_STS_IOERR);
758 		goto out;
759 	}
760 
761 	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
762 
763 	if (!csum_good)
764 		goto csum_err;
765 
766 	/*
767 	 * XXX
768 	 * We need to rework the narrow_crcs path to deliver the read completion
769 	 * first, and then punt to a different workqueue, otherwise we're
770 	 * holding up reads while doing btree updates which is bad for memory
771 	 * reclaim.
772 	 */
773 	if (unlikely(rbio->narrow_crcs))
774 		bch2_rbio_narrow_crcs(rbio);
775 
776 	if (likely(!parent->data_update)) {
777 		/* Adjust crc to point to subset of data we want: */
778 		crc.offset     += rbio->offset_into_extent;
779 		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);
780 
781 		if (crc_is_compressed(crc)) {
782 			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
783 			if (ret)
784 				goto decrypt_err;
785 
786 			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
787 			    !c->opts.no_data_io)
788 				goto decompression_err;
789 		} else {
790 			/* don't need to decrypt the entire bio: */
791 			nonce = nonce_add(nonce, crc.offset << 9);
792 			bio_advance(src, crc.offset << 9);
793 
794 			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
795 			src->bi_iter.bi_size = dst_iter.bi_size;
796 
797 			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
798 			if (ret)
799 				goto decrypt_err;
800 
801 			if (rbio->bounce) {
802 				struct bvec_iter src_iter = src->bi_iter;
803 
804 				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
805 			}
806 		}
807 	} else {
808 		if (rbio->split)
809 			rbio->parent->pick = rbio->pick;
810 
811 		if (rbio->bounce) {
812 			struct bvec_iter src_iter = src->bi_iter;
813 
814 			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
815 		}
816 	}
817 
818 	if (rbio->promote) {
819 		/*
820 		 * Re encrypt data we decrypted, so it's consistent with
821 		 * Re-encrypt data we decrypted, so it's consistent with
822 		 */
823 		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
824 		if (ret)
825 			goto decrypt_err;
826 	}
827 
828 	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
829 		rbio = bch2_rbio_free(rbio);
830 		bch2_rbio_done(rbio);
831 	}
832 out:
833 	memalloc_nofs_restore(nofs_flags);
834 	return;
835 csum_err:
836 	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
837 	goto out;
838 decompression_err:
839 	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
840 	goto out;
841 decrypt_err:
842 	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
843 	goto out;
844 }
845 
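/*
 * Completion for the bio we submitted: account the IO, punt IO errors to the
 * error worker, catch races with bucket reuse (stale pointers), then run
 * __bch2_read_endio() inline or punt it to a workqueue depending on how much
 * work (checksum, decrypt, decompress, narrow_crcs, promote) is left to do.
 */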
846 static void bch2_read_endio(struct bio *bio)
847 {
848 	struct bch_read_bio *rbio =
849 		container_of(bio, struct bch_read_bio, bio);
850 	struct bch_fs *c	= rbio->c;
851 	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
852 	struct workqueue_struct *wq = NULL;
853 	enum rbio_context context = RBIO_CONTEXT_NULL;
854 
855 	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
856 				   rbio->submit_time, !bio->bi_status);
857 
858 	if (!rbio->split)
859 		rbio->bio.bi_end_io = rbio->end_io;
860 
861 	if (unlikely(bio->bi_status)) {
862 		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
863 		return;
864 	}
865 
866 	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
867 	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
868 		trace_and_count(c, io_read_reuse_race, &rbio->bio);
869 
870 		if (rbio->flags & BCH_READ_retry_if_stale)
871 			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
872 		else
873 			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
874 		return;
875 	}
876 
877 	if (rbio->narrow_crcs ||
878 	    rbio->promote ||
879 	    crc_is_compressed(rbio->pick.crc) ||
880 	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
881 		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
882 	else if (rbio->pick.crc.csum_type)
883 		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;
884 
885 	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
886 }
887 
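/*
 * We picked a non-cached pointer whose bucket generation says it's stale;
 * that should never happen for dirty data, so dump what we know about the key
 * and the bucket and mark the filesystem inconsistent.
 */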
888 static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
889 						   struct bch_dev *ca,
890 						   struct bkey_s_c k,
891 						   struct bch_extent_ptr ptr)
892 {
893 	struct bch_fs *c = trans->c;
894 	struct btree_iter iter;
895 	struct printbuf buf = PRINTBUF;
896 	int ret;
897 
898 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
899 			     PTR_BUCKET_POS(ca, &ptr),
900 			     BTREE_ITER_cached);
901 
902 	int gen = bucket_gen_get(ca, iter.pos.offset);
903 	if (gen >= 0) {
904 		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
905 		printbuf_indent_add(&buf, 2);
906 
907 		bch2_bkey_val_to_text(&buf, c, k);
908 		prt_newline(&buf);
909 
910 		prt_printf(&buf, "memory gen: %u", gen);
911 
912 		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
913 		if (!ret) {
914 			prt_newline(&buf);
915 			bch2_bkey_val_to_text(&buf, c, k);
916 		}
917 	} else {
918 		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
919 			   iter.pos.inode, iter.pos.offset);
920 		printbuf_indent_add(&buf, 2);
921 
922 		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
923 			   ca->mi.first_bucket, ca->mi.nbuckets);
924 
925 		bch2_bkey_val_to_text(&buf, c, k);
926 		prt_newline(&buf);
927 	}
928 
929 	bch2_fs_inconsistent(c, "%s", buf.buf);
930 
931 	bch2_trans_iter_exit(trans, &iter);
932 	printbuf_exit(&buf);
933 }
934 
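/*
 * Read a single extent (or the portion covered by @iter): handle inline data,
 * pick a replica, decide whether to bounce and/or read the full extent,
 * optionally set up a promote, then submit the IO - asynchronously in the
 * normal path, synchronously when called from the retry path.
 */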
935 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
936 		       struct bvec_iter iter, struct bpos read_pos,
937 		       enum btree_id data_btree, struct bkey_s_c k,
938 		       unsigned offset_into_extent,
939 		       struct bch_io_failures *failed, unsigned flags, int dev)
940 {
941 	struct bch_fs *c = trans->c;
942 	struct extent_ptr_decoded pick;
943 	struct bch_read_bio *rbio = NULL;
944 	bool bounce = false, read_full = false, narrow_crcs = false;
945 	struct bpos data_pos = bkey_start_pos(k.k);
946 	struct data_update *u = rbio_data_update(orig);
947 	int ret = 0;
948 
949 	if (bkey_extent_is_inline_data(k.k)) {
950 		unsigned bytes = min_t(unsigned, iter.bi_size,
951 				       bkey_inline_data_bytes(k.k));
952 
953 		swap(iter.bi_size, bytes);
954 		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
955 		swap(iter.bi_size, bytes);
956 		bio_advance_iter(&orig->bio, &iter, bytes);
957 		zero_fill_bio_iter(&orig->bio, iter);
958 		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
959 			     bvec_iter_sectors(iter));
960 		goto out_read_done;
961 	}
962 retry_pick:
963 	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
964 
965 	/* hole or reservation - just zero fill: */
966 	if (!ret)
967 		goto hole;
968 
969 	if (unlikely(ret < 0)) {
970 		struct printbuf buf = PRINTBUF;
971 		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
972 		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
973 		bch2_bkey_val_to_text(&buf, c, k);
974 
975 		bch_err_ratelimited(c, "%s", buf.buf);
976 		printbuf_exit(&buf);
977 		goto err;
978 	}
979 
980 	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
981 	    !c->chacha20_key_set) {
982 		struct printbuf buf = PRINTBUF;
983 		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
984 		prt_printf(&buf, "attempting to read encrypted data without encryption key\n  ");
985 		bch2_bkey_val_to_text(&buf, c, k);
986 
987 		bch_err_ratelimited(c, "%s", buf.buf);
988 		printbuf_exit(&buf);
989 		ret = -BCH_ERR_data_read_no_encryption_key;
990 		goto err;
991 	}
992 
993 	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
994 
995 	/*
996 	 * Stale dirty pointers are treated as IO errors, but @failed isn't
997 	 * allocated unless we're in the retry path - so if we're not in the
998 	 * retry path, don't check here, it'll be caught in bch2_read_endio()
999 	 * and we'll end up in the retry path:
1000 	 */
1001 	if ((flags & BCH_READ_in_retry) &&
1002 	    !pick.ptr.cached &&
1003 	    ca &&
1004 	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
1005 		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
1006 		bch2_mark_io_failure(failed, &pick, false);
1007 		percpu_ref_put(&ca->io_ref[READ]);
1008 		goto retry_pick;
1009 	}
1010 
1011 	if (likely(!u)) {
1012 		if (!(flags & BCH_READ_last_fragment) ||
1013 		    bio_flagged(&orig->bio, BIO_CHAIN))
1014 			flags |= BCH_READ_must_clone;
1015 
1016 		narrow_crcs = !(flags & BCH_READ_in_retry) &&
1017 			bch2_can_narrow_extent_crcs(k, pick.crc);
1018 
1019 		if (narrow_crcs && (flags & BCH_READ_user_mapped))
1020 			flags |= BCH_READ_must_bounce;
1021 
1022 		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
1023 
1024 		if (crc_is_compressed(pick.crc) ||
1025 		    (pick.crc.csum_type != BCH_CSUM_none &&
1026 		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1027 		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
1028 		       (flags & BCH_READ_user_mapped)) ||
1029 		      (flags & BCH_READ_must_bounce)))) {
1030 			read_full = true;
1031 			bounce = true;
1032 		}
1033 	} else {
1034 		/*
1035 		 * can happen if we retry, and the extent we were going to read
1036 		 * has been merged in the meantime:
1037 		 */
1038 		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
1039 			if (ca)
1040 				percpu_ref_put(&ca->io_ref[READ]);
1041 			orig->ret = -BCH_ERR_data_read_buffer_too_small;
1042 			goto out_read_done;
1043 		}
1044 
1045 		iter.bi_size	= pick.crc.compressed_size << 9;
1046 		read_full = true;
1047 	}
1048 
1049 	if (orig->opts.promote_target || have_io_error(failed))
1050 		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
1051 				     &bounce, &read_full, failed);
1052 
1053 	if (!read_full) {
1054 		EBUG_ON(crc_is_compressed(pick.crc));
1055 		EBUG_ON(pick.crc.csum_type &&
1056 			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
1057 			 bvec_iter_sectors(iter) != pick.crc.live_size ||
1058 			 pick.crc.offset ||
1059 			 offset_into_extent));
1060 
1061 		data_pos.offset += offset_into_extent;
1062 		pick.ptr.offset += pick.crc.offset +
1063 			offset_into_extent;
1064 		offset_into_extent		= 0;
1065 		pick.crc.compressed_size	= bvec_iter_sectors(iter);
1066 		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
1067 		pick.crc.offset			= 0;
1068 		pick.crc.live_size		= bvec_iter_sectors(iter);
1069 	}
1070 
1071 	if (rbio) {
1072 		/*
1073 		 * promote already allocated bounce rbio:
1074 		 * promote needs to allocate a bio big enough for uncompressing
1075 		 * data in the write path, but we're not going to use it all
1076 		 * here:
1077 		 */
1078 		EBUG_ON(rbio->bio.bi_iter.bi_size <
1079 		       pick.crc.compressed_size << 9);
1080 		rbio->bio.bi_iter.bi_size =
1081 			pick.crc.compressed_size << 9;
1082 	} else if (bounce) {
1083 		unsigned sectors = pick.crc.compressed_size;
1084 
1085 		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
1086 						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
1087 						  0,
1088 						  GFP_NOFS,
1089 						  &c->bio_read_split),
1090 				 orig);
1091 
1092 		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
1093 		rbio->bounce	= true;
1094 	} else if (flags & BCH_READ_must_clone) {
1095 		/*
1096 		 * Have to clone if there were any splits, due to error
1097 		 * reporting issues: if a split errored and retrying didn't
1098 		 * work, then when it reports the error to its parent (us) we
1099 		 * can't tell whether the error came from our bio (in which
1100 		 * case we should retry) or from the whole bio (in which case
1101 		 * we don't want to retry and lose the error).
1102 		 */
1103 		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
1104 						 &c->bio_read_split),
1105 				 orig);
1106 		rbio->bio.bi_iter = iter;
1107 	} else {
1108 		rbio = orig;
1109 		rbio->bio.bi_iter = iter;
1110 		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
1111 	}
1112 
1113 	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
1114 
1115 	rbio->submit_time	= local_clock();
1116 	if (!rbio->split)
1117 		rbio->end_io	= orig->bio.bi_end_io;
1118 	rbio->bvec_iter		= iter;
1119 	rbio->offset_into_extent= offset_into_extent;
1120 	rbio->flags		= flags;
1121 	rbio->have_ioref	= ca != NULL;
1122 	rbio->narrow_crcs	= narrow_crcs;
1123 	rbio->ret		= 0;
1124 	rbio->context		= 0;
1125 	rbio->pick		= pick;
1126 	rbio->subvol		= orig->subvol;
1127 	rbio->read_pos		= read_pos;
1128 	rbio->data_btree	= data_btree;
1129 	rbio->data_pos		= data_pos;
1130 	rbio->version		= k.k->bversion;
1131 	INIT_WORK(&rbio->work, NULL);
1132 
1133 	rbio->bio.bi_opf	= orig->bio.bi_opf;
1134 	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
1135 	rbio->bio.bi_end_io	= bch2_read_endio;
1136 
1137 	if (rbio->bounce)
1138 		trace_and_count(c, io_read_bounce, &rbio->bio);
1139 
1140 	if (!u)
1141 		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
1142 	else
1143 		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
1144 	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
1145 
1146 	/*
1147 	 * If it's being moved internally, we don't want to flag it as a cache
1148 	 * hit:
1149 	 */
1150 	if (ca && pick.ptr.cached && !u)
1151 		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
1152 			PTR_BUCKET_NR(ca, &pick.ptr), READ);
1153 
1154 	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
1155 		bio_inc_remaining(&orig->bio);
1156 		trace_and_count(c, io_read_split, &orig->bio);
1157 	}
1158 
1159 	/*
1160 	 * Unlock the iterator while the btree node's lock is still in
1161 	 * cache, before doing the IO:
1162 	 */
1163 	if (!(flags & BCH_READ_in_retry))
1164 		bch2_trans_unlock(trans);
1165 	else
1166 		bch2_trans_unlock_long(trans);
1167 
1168 	if (likely(!rbio->pick.do_ec_reconstruct)) {
1169 		if (unlikely(!rbio->have_ioref)) {
1170 			struct printbuf buf = PRINTBUF;
1171 			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
1172 			prt_printf(&buf, "no device to read from:\n  ");
1173 			bch2_bkey_val_to_text(&buf, c, k);
1174 
1175 			bch_err_ratelimited(c, "%s", buf.buf);
1176 			printbuf_exit(&buf);
1177 
1178 			bch2_rbio_error(rbio,
1179 					-BCH_ERR_data_read_retry_device_offline,
1180 					BLK_STS_IOERR);
1181 			goto out;
1182 		}
1183 
1184 		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
1185 			     bio_sectors(&rbio->bio));
1186 		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
1187 
1188 		if (unlikely(c->opts.no_data_io)) {
1189 			if (likely(!(flags & BCH_READ_in_retry)))
1190 				bio_endio(&rbio->bio);
1191 		} else {
1192 			if (likely(!(flags & BCH_READ_in_retry)))
1193 				submit_bio(&rbio->bio);
1194 			else
1195 				submit_bio_wait(&rbio->bio);
1196 		}
1197 
1198 		/*
1199 		 * We just submitted IO which may block, we expect relock fail
1200 		 * events and shouldn't count them:
1201 		 */
1202 		trans->notrace_relock_fail = true;
1203 	} else {
1204 		/* Attempting reconstruct read: */
1205 		if (bch2_ec_read_extent(trans, rbio, k)) {
1206 			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
1207 					BLK_STS_IOERR);
1208 			goto out;
1209 		}
1210 
1211 		if (likely(!(flags & BCH_READ_in_retry)))
1212 			bio_endio(&rbio->bio);
1213 	}
1214 out:
1215 	if (likely(!(flags & BCH_READ_in_retry))) {
1216 		return 0;
1217 	} else {
1218 		bch2_trans_unlock(trans);
1219 
1220 		int ret;
1221 
1222 		rbio->context = RBIO_CONTEXT_UNBOUND;
1223 		bch2_read_endio(&rbio->bio);
1224 
1225 		ret = rbio->ret;
1226 		rbio = bch2_rbio_free(rbio);
1227 
1228 		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
1229 			bch2_mark_io_failure(failed, &pick,
1230 					ret == -BCH_ERR_data_read_retry_csum_err);
1231 
1232 		return ret;
1233 	}
1234 
1235 err:
1236 	if (flags & BCH_READ_in_retry)
1237 		return ret;
1238 
1239 	orig->bio.bi_status	= BLK_STS_IOERR;
1240 	orig->ret		= ret;
1241 	goto out_read_done;
1242 
1243 hole:
1244 	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
1245 		     bvec_iter_sectors(iter));
1246 	/*
1247 	 * won't normally happen in the data update (bch2_move_extent()) path,
1248 	 * but if we retry and the extent we wanted to read no longer exists we
1249 	 * have to signal that:
1250 	 */
1251 	if (u)
1252 		orig->ret = -BCH_ERR_data_read_key_overwritten;
1253 
1254 	zero_fill_bio_iter(&orig->bio, iter);
1255 out_read_done:
1256 	if ((flags & BCH_READ_last_fragment) &&
1257 	    !(flags & BCH_READ_in_retry))
1258 		bch2_rbio_done(orig);
1259 	return 0;
1260 }
1261 
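/*
 * Top level read path: walk the extents btree in the given subvolume's
 * snapshot, resolve reflink pointers to the indirect extents they point to,
 * and issue a read for each extent overlapping the requested range.
 */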
1262 int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
1263 		struct bvec_iter bvec_iter, subvol_inum inum,
1264 		struct bch_io_failures *failed, unsigned flags)
1265 {
1266 	struct bch_fs *c = trans->c;
1267 	struct btree_iter iter;
1268 	struct bkey_buf sk;
1269 	struct bkey_s_c k;
1270 	int ret;
1271 
1272 	EBUG_ON(rbio->data_update);
1273 
1274 	bch2_bkey_buf_init(&sk);
1275 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
1276 			     POS(inum.inum, bvec_iter.bi_sector),
1277 			     BTREE_ITER_slots);
1278 
1279 	while (1) {
1280 		enum btree_id data_btree = BTREE_ID_extents;
1281 
1282 		bch2_trans_begin(trans);
1283 
1284 		u32 snapshot;
1285 		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
1286 		if (ret)
1287 			goto err;
1288 
1289 		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
1290 
1291 		bch2_btree_iter_set_pos(trans, &iter,
1292 				POS(inum.inum, bvec_iter.bi_sector));
1293 
1294 		k = bch2_btree_iter_peek_slot(trans, &iter);
1295 		ret = bkey_err(k);
1296 		if (ret)
1297 			goto err;
1298 
1299 		s64 offset_into_extent = iter.pos.offset -
1300 			bkey_start_offset(k.k);
1301 		unsigned sectors = k.k->size - offset_into_extent;
1302 
1303 		bch2_bkey_buf_reassemble(&sk, c, k);
1304 
1305 		ret = bch2_read_indirect_extent(trans, &data_btree,
1306 					&offset_into_extent, &sk);
1307 		if (ret)
1308 			goto err;
1309 
1310 		k = bkey_i_to_s_c(sk.k);
1311 
1312 		/*
1313 		 * With indirect extents, the amount of data to read is the min
1314 		 * of the original extent and the indirect extent:
1315 		 */
1316 		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
1317 
1318 		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
1319 		swap(bvec_iter.bi_size, bytes);
1320 
1321 		if (bvec_iter.bi_size == bytes)
1322 			flags |= BCH_READ_last_fragment;
1323 
1324 		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
1325 					 data_btree, k,
1326 					 offset_into_extent, failed, flags, -1);
1327 		swap(bvec_iter.bi_size, bytes);
1328 
1329 		if (ret)
1330 			goto err;
1331 
1332 		if (flags & BCH_READ_last_fragment)
1333 			break;
1334 
1335 		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
1336 err:
1337 		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
1338 			flags |= BCH_READ_must_bounce;
1339 
1340 		if (ret &&
1341 		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
1342 		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
1343 			break;
1344 	}
1345 
1346 	bch2_trans_iter_exit(trans, &iter);
1347 
1348 	if (ret) {
1349 		struct printbuf buf = PRINTBUF;
1350 		lockrestart_do(trans,
1351 			bch2_inum_offset_err_msg_trans(trans, &buf, inum,
1352 						       bvec_iter.bi_sector << 9));
1353 		prt_printf(&buf, "read error: %s", bch2_err_str(ret));
1354 		bch_err_ratelimited(c, "%s", buf.buf);
1355 		printbuf_exit(&buf);
1356 
1357 		rbio->bio.bi_status	= BLK_STS_IOERR;
1358 		rbio->ret		= ret;
1359 
1360 		if (!(flags & BCH_READ_in_retry))
1361 			bch2_rbio_done(rbio);
1362 	}
1363 
1364 	bch2_bkey_buf_exit(&sk, c);
1365 	return ret;
1366 }
1367 
1368 void bch2_fs_io_read_exit(struct bch_fs *c)
1369 {
1370 	if (c->promote_table.tbl)
1371 		rhashtable_destroy(&c->promote_table);
1372 	bioset_exit(&c->bio_read_split);
1373 	bioset_exit(&c->bio_read);
1374 }
1375 
1376 int bch2_fs_io_read_init(struct bch_fs *c)
1377 {
1378 	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
1379 			BIOSET_NEED_BVECS))
1380 		return -BCH_ERR_ENOMEM_bio_read_init;
1381 
1382 	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
1383 			BIOSET_NEED_BVECS))
1384 		return -BCH_ERR_ENOMEM_bio_read_split_init;
1385 
1386 	if (rhashtable_init(&c->promote_table, &bch_promote_params))
1387 		return -BCH_ERR_ENOMEM_promote_table_init;
1388 
1389 	return 0;
1390 }
1391