// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

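/*
 * State for one read-repair session: the failed bbio whose blocks are being
 * re-read, the number of copies available to try, and a count of the repair
 * bios still outstanding.
 */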
struct btrfs_failed_bio {
	struct btrfs_bio *bbio;
	int num_copies;
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(const struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(bbio->inode);
}

static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
		    btrfs_bio_end_io_t end_io, void *private)
{
	/* @inode parameter is mandatory. */
	ASSERT(inode);

	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->inode = inode;
	bbio->end_io = end_io;
	bbio->private = private;
	bbio->file_offset = file_offset;
	atomic_set(&bbio->pending_ios, 1);
	WRITE_ONCE(bbio->status, BLK_STS_OK);
}

/*
 * Allocate a btrfs_bio structure.  The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_inode *inode, u64 file_offset,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, inode, file_offset, end_io, private);
	return bbio;
}
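/*
 * A minimal caller sketch (hypothetical names; a single-block read at
 * @disk_bytenr, assuming block size <= page size):
 *
 *	bbio = btrfs_bio_alloc(1, REQ_OP_READ, inode, file_offset, end_io, NULL);
 *	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 *	bio_add_page(&bbio->bio, page, fs_info->sectorsize, 0);
 *	btrfs_submit_bbio(bbio, mirror_num);
 */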
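/*
 * Split off the first @map_length bytes of @orig_bbio into a new bbio that is
 * submitted first.  The split part inherits the ordered extent reference and
 * the bookkeeping flags, and @orig_bbio is advanced past the split range.
 */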
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
			&btrfs_clone_bioset);
	if (IS_ERR(bio))
		return ERR_CAST(bio);

	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
		bbio->orig_logical = orig_bbio->orig_logical;
		orig_bbio->orig_logical += map_length;
	}

	bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
	bbio->can_use_append = orig_bbio->can_use_append;
	bbio->is_scrub = orig_bbio->is_scrub;
	bbio->is_remap = orig_bbio->is_remap;
	bbio->async_csum = orig_bbio->async_csum;

	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}

void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	/* Make sure we're already in task context. */
	ASSERT(in_task());

	if (bbio->async_csum)
		wait_for_completion(&bbio->csum_done);

	bbio->bio.bi_status = status;
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		/* Free a bio that was never submitted to the underlying device. */
		if (bbio_has_ordered_extent(bbio))
			btrfs_put_ordered_extent(bbio->ordered);
		bio_put(&bbio->bio);

		bbio = orig_bbio;
	}

	/*
	 * At this point, bbio always points to the original btrfs_bio. Save
	 * the first error in it.
	 */
	if (status != BLK_STS_OK)
		cmpxchg(&bbio->status, BLK_STS_OK, status);

	if (atomic_dec_and_test(&bbio->pending_ios)) {
		/* Load the split bio's error, which might have been set above. */
		if (status == BLK_STS_OK)
			bbio->bio.bi_status = READ_ONCE(bbio->status);

		if (bbio_has_ordered_extent(bbio)) {
			struct btrfs_ordered_extent *ordered = bbio->ordered;

			bbio->end_io(bbio);
			btrfs_put_ordered_extent(ordered);
		} else {
			bbio->end_io(bbio);
		}
	}
}

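/*
 * Mirror numbers are 1-based.  These helpers walk forward/backward through
 * the available copies, wrapping around past the last (or first) copy.
 */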
static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}

static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}

static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	/*
	 * We must not advance the saved_iter itself, as it will be used by
	 * repair_bbio again later.
	 */
	struct bvec_iter saved_iter = repair_bbio->saved_iter;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
	const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
	int mirror = repair_bbio->mirror_num;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	unsigned int slot = 0;

	/* The repair bbio should cover exactly one block. */
	ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);

	btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
		ASSERT(slot < nr_steps);
		paddrs[slot] = paddr;
		slot++;
	}

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bbio(repair_bbio, mirror);
		return;
	}

	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				  repair_bbio->file_offset, fs_info->sectorsize,
				  logical, paddrs, step, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but also tries to write the good data back to the bad mirror(s) when a
 * read succeeded, to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  phys_addr_t paddrs[],
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	/*
	 * For block size > page size cases, the saved_iter can be partially
	 * advanced.  In that case round it down to the block boundary.
	 */
	const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				       sectorsize);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	for (int i = 0; i < nr_steps; i++) {
		int ret;

		ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE);

		ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step,
				   offset_in_page(paddrs[i]));
		ASSERT(ret == step);
	}

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset,
		       NULL, fbio);

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bbio(repair_bbio, mirror);
	return fbio;
}

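/*
 * Verify a completed data read block by block and kick off a repair read for
 * every block whose checksum does not match.  @paddrs collects the physical
 * addresses of the pages making up the current block, which is more than one
 * page when the block size is larger than the page size.
 */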
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u32 step = min(sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
		paddrs[(offset / step) % nr_steps] = paddr;
		offset += step;

		if (IS_ALIGNED(offset, sectorsize)) {
			if (status ||
			    !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
				fbio = repair_one_sector(bbio, offset - sectorsize,
							 paddrs, fbio);
		}
	}
	if (bbio->csum != bbio->csum_inline)
		kvfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
						const struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

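/*
 * Completion work for simple (single device, non-RAID56) I/Os; runs in task
 * context so that read verification and repair are allowed to sleep.
 */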
static void simple_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;

	if (bio_op(bio) == REQ_OP_READ) {
		/* Metadata reads are checked and repaired by the submitter. */
		if (is_data_bbio(bbio))
			return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
		return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	}
	if (bio_is_zone_append(bio) && !bio->bi_status)
		btrfs_record_physical_zoned(bbio);
	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	INIT_WORK(&bbio->end_io_work, simple_end_io_work);
	queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	/* RAID56 endio is always handled in a workqueue. */
	ASSERT(in_task());

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);

	btrfs_put_bioc(bioc);
}

static void orig_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	if (bio_is_zone_append(bio) && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);

	INIT_WORK(&bbio->end_io_work, orig_write_end_io_work);
	queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
}

static void clone_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_is_zone_append(bio)) {
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass control back to the original bio this one was cloned from. */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);

	INIT_WORK(&bbio->end_io_work, clone_write_end_io_work);
	queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
}

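/*
 * Map @bio onto a concrete device and hand it to the block layer, converting
 * writes to zone append when the target zone requires sequential writes.
 */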
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writes, bi_sector must point to the beginning of the
	 * zone.
	 */
	if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) {
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
		bio->bi_opf &= ~REQ_OP_WRITE;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}
	btrfs_debug(dev->fs_info,
	"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	/*
	 * Track reads if tracking is enabled; ignore I/O operations before the
	 * filesystem is fully initialized.
	 */
	if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
		percpu_counter_add(&dev->fs_info->stats_read_blocks,
				   bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;
	struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror. */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		/* We need to use endio_work to run end_io in task context. */
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
		bio_inc_remaining(orig_bio);
		btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
			       orig_bbio->file_offset, NULL, NULL);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			     struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
#ifdef CONFIG_BTRFS_EXPERIMENTAL
	return btrfs_csum_one_bio(bbio, true);
#else
	return btrfs_csum_one_bio(bbio, false);
#endif
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.  All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	int ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = errno_to_blk_status(ret);
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.  All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		kfree(container_of(work, struct async_submit_bio, work));
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_bio_end_io(async->bbio, bio->bi_status);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context.  This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

static bool should_async_write(struct btrfs_bio *bbio)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	bool auto_csum_mode = true;

#ifdef CONFIG_BTRFS_EXPERIMENTAL
	/*
	 * Write bios calculate the checksum and submit the bio at the same
	 * time.  Unless explicitly required, don't offload the serial csum
	 * calculation and bio submission to a workqueue.
	 */
	return false;
#endif

	/* Submit synchronously if the checksum implementation is fast. */
	if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info))
		return false;

	return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}

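/*
 * Limit @map_length of a zone append bio to what the device can take in a
 * single zone append command, and make sure the result stays aligned to the
 * filesystem block size.
 */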
static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	unsigned int nr_segs;
	int sector_offset;

	map_length = min(map_length, fs_info->max_zone_append_size);
	sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits,
					&nr_segs, map_length);
	if (sector_offset) {
		/*
		 * bio_split_rw_at() could split at a size smaller than our
		 * sectorsize and thus cause unaligned I/Os.  Fix that by
		 * always rounding down to the nearest boundary.
		 */
		return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize);
	}
	return map_length;
}

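/*
 * Map and submit the part of @bbio that fits into a single chunk stripe.
 *
 * Returns true if the whole bio was consumed (or ended on error), false if
 * only part of it was submitted and the caller must resubmit the remainder.
 */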
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t status;
	int ret;

	if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root))
		smap.rst_search_commit_root = true;
	else
		smap.rst_search_commit_root = false;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
			      &bioc, &smap, &mirror_num);
	if (ret) {
		status = errno_to_blk_status(ret);
		btrfs_bio_counter_dec(fs_info);
		goto end_bbio;
	}

	/*
	 * For fscrypt writes we will get the encrypted bio after we've remapped
	 * our bio to the physical disk location, so we need to save the
	 * original bytenr so we know what we're checksumming.
	 */
	if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
		bbio->orig_logical = logical;

	bbio->can_use_append = btrfs_use_zone_append(bbio);

	map_length = min(map_length, length);
	if (bbio->can_use_append)
		map_length = btrfs_append_map_length(bbio, map_length);

	if (map_length < length) {
		struct btrfs_bio *split;

		split = btrfs_split_bio(fs_info, bbio, map_length);
		if (IS_ERR(split)) {
			status = errno_to_blk_status(PTR_ERR(split));
			btrfs_bio_counter_dec(fs_info);
			goto end_bbio;
		}
		bbio = split;
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		status = errno_to_blk_status(ret);
		if (status)
			goto fail;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		} else if (bbio->can_use_append ||
			   (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) {
			ret = btrfs_alloc_dummy_sum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		}
	}

	btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail:
	btrfs_bio_counter_dec(fs_info);
	/*
	 * We have split the original bbio, now we have to end both the current
	 * @bbio and the remaining one, as the remaining one will never be
	 * submitted.
	 */
	if (map_length < length) {
		struct btrfs_bio *remaining = bbio->private;

		ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
		ASSERT(remaining);

		btrfs_bio_end_io(remaining, status);
	}
end_bbio:
	btrfs_bio_end_io(bbio, status);
	/* Do not submit another chunk. */
	return true;
}

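/*
 * Sanity check that the bbio covers whole, block-aligned blocks and that
 * every bvec is aligned to the block size (capped at the page size for
 * block size > page size setups).  Compiled out without CONFIG_BTRFS_ASSERT.
 */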
static void assert_bbio_alignment(struct btrfs_bio *bbio)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct bio_vec bvec;
	struct bvec_iter iter;
	const u32 blocksize = fs_info->sectorsize;
	const u32 alignment = min(blocksize, PAGE_SIZE);
	const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	const u32 length = bbio->bio.bi_iter.bi_size;

	/* The logical and length should still be aligned to blocksize. */
	ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) &&
	       length != 0, "root=%llu inode=%llu logical=%llu length=%u",
	       btrfs_root_id(bbio->inode->root),
	       btrfs_ino(bbio->inode), logical, length);

	bio_for_each_bvec(bvec, &bbio->bio, iter)
		ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) &&
		       IS_ALIGNED(bvec.bv_len, alignment),
		"root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
		btrfs_root_id(bbio->inode->root),
		btrfs_ino(bbio->inode), logical, length, iter.bi_idx,
		bvec.bv_offset, bvec.bv_len);
#endif
}

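/*
 * Submit a bbio, transparently splitting it into per-chunk bios until the
 * whole range has been mapped and sent down.
 */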
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
	/* If bbio->inode is not populated, its file_offset must be 0. */
	ASSERT(bbio->inode || bbio->file_offset == 0);

	assert_bbio_alignment(bbio);

	while (!btrfs_submit_chunk(bbio, mirror_num))
		;
}

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
 * RAID setup.  Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 *
 * @ino:	Offending inode number
 * @fileoff:	File offset inside the inode
 * @length:	Length of the repair write
 * @logical:	Logical address of the range
 * @paddrs:	Physical address array of the content
 * @step:	Length covered by each paddrs entry
 * @mirror_num: Mirror number to write to. Must not be zero
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
			    u32 length, u64 logical, const phys_addr_t paddrs[],
			    unsigned int step, int mirror_num)
{
	const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
	struct btrfs_io_stripe smap = { 0 };
	struct bio *bio = NULL;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	/* Basic alignment checks. */
	ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
	/* Either it's a single data or metadata block. */
	ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
	ASSERT(step <= length);
	ASSERT(is_power_of_2(step));

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	if (unlikely(!smap.dev->bdev ||
		     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
	bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	for (int i = 0; i < nr_steps; i++) {
		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
		/* We should have allocated enough slots to contain all the different pages. */
		ASSERT(ret == step);
	}
	ret = submit_bio_wait(bio);
	bio_put(bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_counter_dec;
	}

	btrfs_info_rl(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
			     ino, fileoff, btrfs_dev_name(smap.dev),
			     smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}

/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write is submitted to the dev-replace target
 * device.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!is_data_inode(bbio->inode));
	ASSERT(bbio->is_scrub);

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto fail;

	if (dev_replace) {
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio), 0))
		goto out;
	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto out;
	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto out;
	return 0;

out:
	btrfs_bioset_exit();
	return -ENOMEM;
}

void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}
1059