// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

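/*
 * Decide whether reads should avoid @target due to congestion: each device's
 * congestion counter is decayed according to the time since it was last
 * bumped, then we return true with probability total/(nr * CONGESTED_MAX),
 * i.e. proportionally to how congested the target's devices currently are.
 */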
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

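/*
 * A promote_op tracks an in-flight promotion: reads that qualify are bounced
 * and then rewritten to the promote target via the data update path.
 * In-flight promotes are tracked in c->promote_table, keyed by extent
 * position, so the same extent is never promoted twice concurrently. The same
 * machinery is also used to rewrite bad replicas when recovering from read
 * errors (see the have_io_error() checks below).
 */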
struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct work_struct	work;
	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				  struct bpos pos,
				  struct bch_io_opts opts,
				  unsigned flags,
				  struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	bch2_data_update_exit(&op->write);

	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

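/*
 * Set up the data update for a promote (or, when @failed is populated, for
 * rewriting failed replicas). Returns the bounce rbio the read should be
 * submitted into, NULL if there's nothing to rewrite, or an ERR_PTR() on
 * failure.
 */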
static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_hash;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce	= true;
	op->write.rbio.promote	= true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					unsigned flags,
					struct bch_read_bio *orig,
					bool *bounce,
					bool *read_full,
					struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

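/*
 * Execution contexts for read completion work, in increasing order of what
 * they're allowed to do: completions start out in RBIO_CONTEXT_NULL (whatever
 * context the bio completion arrived in), and bch2_rbio_punt() runs @fn
 * immediately if the rbio is already in a context at least as permissive as
 * @context, otherwise it punts the work to @wq and records the new context.
 */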
enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

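/*
 * Tear down a split rbio and return its parent (or the rbio itself if it
 * wasn't a split). For promote fragments, successfully read data is handed off
 * to the promote write here; on error the promote op is freed instead.
 */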
static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		percpu_ref_put(&ca->io_ref);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

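/*
 * Retry path for reads done on behalf of a data update (move/rebalance): the
 * extent must still match the key being rewritten, so we re-look it up and
 * bail out with -BCH_ERR_data_read_key_overwritten if it has changed.
 */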
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					struct bch_read_bio *rbio,
					struct bvec_iter bvec_iter,
					struct bch_io_failures *failed,
					unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = -BCH_ERR_data_read_key_overwritten;
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

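/*
 * Main retry path, run from an unbound workqueue: record the device that
 * failed (for errors where avoiding it helps), free any split rbio, and retry
 * against the parent with BCH_READ_in_retry set, which makes the re-read run
 * synchronously.
 */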
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };
	struct btree_trans *trans = bch2_trans_get(c);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status	= 0;
		rbio->ret		= 0;
	}

	unsigned subvol		= rbio->subvol;
	struct bpos read_pos	= rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	} else {
		struct printbuf buf = PRINTBUF;

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");
		prt_str(&buf, "successful retry");

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret		= ret;
	rbio->bio.bi_status	= blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret		= ret;
		rbio->bio.bi_status	= blk_error;

		bch2_rbio_done(rbio);
	}
}

static void bch2_read_io_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bio *bio = &rbio->bio;
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));

	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	printbuf_exit(&buf);
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}

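/*
 * After reading and verifying a checksum that covers a wider range than we
 * needed, rewrite the extent's crc entry so it only covers the live range:
 * future reads of this extent then no longer need to read and checksum the
 * full width. The update is best effort - if the key changed underneath us we
 * simply skip it.
 */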
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_csum_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bio *src		= &rbio->bio;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
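/*
 * Verify the checksum, optionally narrow the extent's crc, then decrypt and/or
 * decompress the data into the parent bio before completing it (or handing it
 * off to the promote write). Checksum, decompression and decryption failures
 * are punted to the unbound workqueue error handlers above.
 */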
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
	struct bio *src			= &rbio->bio;
	struct bio *dst			= &parent->bio;
	struct bvec_iter dst_iter	= rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset     += rbio->offset_into_extent;
		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re-encrypt the data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

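/*
 * bio completion handler: account the IO, catch stale-pointer races, and then
 * punt the rest of completion processing to a workqueue context appropriate
 * for however much work (checksum, decrypt, decompress, promote) is needed.
 */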
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

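/*
 * A non-cached pointer into a bucket that the in-memory bucket gen says has
 * been reused indicates a metadata inconsistency (either the extent or the
 * allocation info is wrong): dump both and mark the filesystem inconsistent.
 */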
static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

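/*
 * Read the portion of @k described by @iter into @orig, picking a device and
 * allocating a bounce/clone fragment and a promote op as needed. Returns 0 in
 * the normal (asynchronous) path; with BCH_READ_in_retry set the IO is
 * performed synchronously and the result is returned so the caller can drive
 * further retries.
 */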
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n  ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = -BCH_ERR_data_read_no_encryption_key;
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		percpu_ref_put(&ca->io_ref);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				percpu_ref_put(&ca->io_ref);
			orig->ret = -BCH_ERR_data_read_buffer_too_small;
			goto out_read_done;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
		       pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time	= local_clock();
	if (!rbio->split)
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->ret		= 0;
	rbio->context		= 0;
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			struct printbuf buf = PRINTBUF;
			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
			prt_printf(&buf, "no device to read from:\n  ");
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);

			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status	= BLK_STS_IOERR;
	orig->ret		= ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = -BCH_ERR_data_read_key_overwritten;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

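/*
 * Top level read path: walk the extents (following reflink indirection) that
 * cover the requested range and issue a read fragment for each one, retrying
 * on transaction restarts and retryable read errors.
 */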
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		struct printbuf buf = PRINTBUF;
		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf, inum,
						       bvec_iter.bi_sector << 9));
		prt_printf(&buf, "read error: %s", bch2_err_str(ret));
		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);

		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}