xref: /linux/fs/btrfs/bio.c (revision 7696286034ac72cf9b46499be1715ac62fd302c3)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  * Copyright (C) 2022 Christoph Hellwig.
5  */
6 
7 #include <linux/bio.h>
8 #include "bio.h"
9 #include "ctree.h"
10 #include "volumes.h"
11 #include "raid56.h"
12 #include "async-thread.h"
13 #include "dev-replace.h"
14 #include "zoned.h"
15 #include "file-item.h"
16 #include "raid-stripe-tree.h"
17 
18 static struct bio_set btrfs_bioset;
19 static struct bio_set btrfs_clone_bioset;
20 static struct bio_set btrfs_repair_bioset;
21 static mempool_t btrfs_failed_bio_pool;
22 
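/*
 * State for a read-repair attempt: the original failed bbio, the number of
 * copies available for the range, and a count of repair reads still in
 * flight.
 */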
23 struct btrfs_failed_bio {
24 	struct btrfs_bio *bbio;
25 	int num_copies;
26 	atomic_t repair_count;
27 };
28 
29 /* Is this a data path I/O that needs storage layer checksum and repair? */
30 static inline bool is_data_bbio(const struct btrfs_bio *bbio)
31 {
32 	return bbio->inode && is_data_inode(bbio->inode);
33 }
34 
35 static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
36 {
37 	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
38 }
39 
40 /*
41  * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
42  * is already initialized by the block layer.
43  */
44 void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
45 		    btrfs_bio_end_io_t end_io, void *private)
46 {
47 	/* @inode parameter is mandatory. */
48 	ASSERT(inode);
49 
50 	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
51 	bbio->inode = inode;
52 	bbio->end_io = end_io;
53 	bbio->private = private;
54 	bbio->file_offset = file_offset;
55 	atomic_set(&bbio->pending_ios, 1);
56 	WRITE_ONCE(bbio->status, BLK_STS_OK);
57 }
58 
59 /*
60  * Allocate a btrfs_bio structure.  The btrfs_bio is the main I/O container for
61  * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
62  *
63  * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
64  * a mempool.
65  */
66 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
67 				  struct btrfs_inode *inode, u64 file_offset,
68 				  btrfs_bio_end_io_t end_io, void *private)
69 {
70 	struct btrfs_bio *bbio;
71 	struct bio *bio;
72 
73 	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
74 	bbio = btrfs_bio(bio);
75 	btrfs_bio_init(bbio, inode, file_offset, end_io, private);
76 	return bbio;
77 }
78 
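/*
 * Split off the first @map_length bytes of @orig_bbio into a new bbio that
 * can be submitted on its own.  The split bbio points back to the original
 * through ->private, shares its ordered extent for data writes, and bumps
 * the original's pending_ios count.
 */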
79 static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
80 					 struct btrfs_bio *orig_bbio,
81 					 u64 map_length)
82 {
83 	struct btrfs_bio *bbio;
84 	struct bio *bio;
85 
86 	bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
87 			&btrfs_clone_bioset);
88 	if (IS_ERR(bio))
89 		return ERR_CAST(bio);
90 
91 	bbio = btrfs_bio(bio);
92 	btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
93 	orig_bbio->file_offset += map_length;
94 	if (bbio_has_ordered_extent(bbio)) {
95 		refcount_inc(&orig_bbio->ordered->refs);
96 		bbio->ordered = orig_bbio->ordered;
97 		bbio->orig_logical = orig_bbio->orig_logical;
98 		orig_bbio->orig_logical += map_length;
99 	}
100 	bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
101 	atomic_inc(&orig_bbio->pending_ios);
102 	return bbio;
103 }
104 
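/*
 * Complete a bbio.  For a split bbio the status is propagated to the original
 * bbio and the split bio is freed; the end_io callback only runs once all
 * pending I/Os of the original bbio have finished.
 */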
105 void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
106 {
107 	/* Make sure we're already in task context. */
108 	ASSERT(in_task());
109 
110 	if (bbio->async_csum)
111 		wait_for_completion(&bbio->csum_done);
112 
113 	bbio->bio.bi_status = status;
114 	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
115 		struct btrfs_bio *orig_bbio = bbio->private;
116 
117 		/* Free bio that was never submitted to the underlying device. */
118 		if (bbio_has_ordered_extent(bbio))
119 			btrfs_put_ordered_extent(bbio->ordered);
120 		bio_put(&bbio->bio);
121 
122 		bbio = orig_bbio;
123 	}
124 
125 	/*
126 	 * At this point, bbio always points to the original btrfs_bio. Save
127 	 * the first error in it.
128 	 */
129 	if (status != BLK_STS_OK)
130 		cmpxchg(&bbio->status, BLK_STS_OK, status);
131 
132 	if (atomic_dec_and_test(&bbio->pending_ios)) {
133 		/* Load split bio's error which might be set above. */
134 		if (status == BLK_STS_OK)
135 			bbio->bio.bi_status = READ_ONCE(bbio->status);
136 
137 		if (bbio_has_ordered_extent(bbio)) {
138 			struct btrfs_ordered_extent *ordered = bbio->ordered;
139 
140 			bbio->end_io(bbio);
141 			btrfs_put_ordered_extent(ordered);
142 		} else {
143 			bbio->end_io(bbio);
144 		}
145 	}
146 }
147 
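/*
 * Return the next mirror to try, wrapping around after the last copy,
 * e.g. with num_copies == 3 the rotation is 1 -> 2 -> 3 -> 1.
 */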
148 static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
149 {
150 	if (cur_mirror == fbio->num_copies)
151 		return cur_mirror + 1 - fbio->num_copies;
152 	return cur_mirror + 1;
153 }
154 
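/* Return the previous mirror in the rotation, wrapping from 1 back to the last copy. */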
155 static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
156 {
157 	if (cur_mirror == 1)
158 		return fbio->num_copies;
159 	return cur_mirror - 1;
160 }
161 
162 static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
163 {
164 	if (atomic_dec_and_test(&fbio->repair_count)) {
165 		btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
166 		mempool_free(fbio, &btrfs_failed_bio_pool);
167 	}
168 }
169 
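/*
 * Completion handler for a repair read: re-verify the checksum, move on to
 * the next mirror on failure, and on success write the good data back to the
 * previously tried mirrors to restore redundancy.
 */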
170 static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
171 				 struct btrfs_device *dev)
172 {
173 	struct btrfs_failed_bio *fbio = repair_bbio->private;
174 	struct btrfs_inode *inode = repair_bbio->inode;
175 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
176 	/*
177 	 * We must not advance the stored saved_iter, as repair_bbio will use
178 	 * it again later, so iterate over a local copy instead.
179 	 */
180 	struct bvec_iter saved_iter = repair_bbio->saved_iter;
181 	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
182 	const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
183 	const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
184 	int mirror = repair_bbio->mirror_num;
185 	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
186 	phys_addr_t paddr;
187 	unsigned int slot = 0;
188 
189 	/* The repair bbio should be exactly one block in size. */
190 	ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);
191 
192 	btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
193 		ASSERT(slot < nr_steps);
194 		paddrs[slot] = paddr;
195 		slot++;
196 	}
197 
198 	if (repair_bbio->bio.bi_status ||
199 	    !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
200 		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
201 		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
202 
203 		mirror = next_repair_mirror(fbio, mirror);
204 		if (mirror == fbio->bbio->mirror_num) {
205 			btrfs_debug(fs_info, "no mirror left");
206 			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
207 			goto done;
208 		}
209 
210 		btrfs_submit_bbio(repair_bbio, mirror);
211 		return;
212 	}
213 
214 	do {
215 		mirror = prev_repair_mirror(fbio, mirror);
216 		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
217 				  repair_bbio->file_offset, fs_info->sectorsize,
218 				  logical, paddrs, step, mirror);
219 	} while (mirror != fbio->bbio->mirror_num);
220 
221 done:
222 	btrfs_repair_done(fbio);
223 	bio_put(&repair_bbio->bio);
224 }
225 
226 /*
227  * Try to kick off a repair read to the next available mirror for a bad sector.
228  *
229  * This primarily tries to recover good data to serve the actual read request,
230  * but it also writes the good data back to the bad mirror(s) once a repair
231  * read succeeds, to restore the redundancy.
232  */
233 static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
234 						  u32 bio_offset,
235 						  phys_addr_t paddrs[],
236 						  struct btrfs_failed_bio *fbio)
237 {
238 	struct btrfs_inode *inode = failed_bbio->inode;
239 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
240 	const u32 sectorsize = fs_info->sectorsize;
241 	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
242 	const u32 nr_steps = sectorsize / step;
243 	/*
244 	 * For bs > ps cases, the saved_iter can be partially moved forward.
245 	 * In that case we should round it down to the block boundary.
246 	 */
247 	const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
248 				       sectorsize);
249 	struct btrfs_bio *repair_bbio;
250 	struct bio *repair_bio;
251 	int num_copies;
252 	int mirror;
253 
254 	btrfs_debug(fs_info, "repair read error: read error at %llu",
255 		    failed_bbio->file_offset + bio_offset);
256 
257 	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
258 	if (num_copies == 1) {
259 		btrfs_debug(fs_info, "no copy to repair from");
260 		failed_bbio->bio.bi_status = BLK_STS_IOERR;
261 		return fbio;
262 	}
263 
264 	if (!fbio) {
265 		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
266 		fbio->bbio = failed_bbio;
267 		fbio->num_copies = num_copies;
268 		atomic_set(&fbio->repair_count, 1);
269 	}
270 
271 	atomic_inc(&fbio->repair_count);
272 
273 	repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS,
274 				      &btrfs_repair_bioset);
275 	repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
276 	for (int i = 0; i < nr_steps; i++) {
277 		int ret;
278 
279 		ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE);
280 
281 		ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step,
282 				   offset_in_page(paddrs[i]));
283 		ASSERT(ret == step);
284 	}
285 
286 	repair_bbio = btrfs_bio(repair_bio);
287 	btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset,
288 		       NULL, fbio);
289 
290 	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
291 	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
292 	btrfs_submit_bbio(repair_bbio, mirror);
293 	return fbio;
294 }
295 
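/*
 * Verify a completed data read block by block and kick off repair reads for
 * every block whose I/O failed or whose checksum does not match, then
 * complete the bbio.
 */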
296 static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
297 {
298 	struct btrfs_inode *inode = bbio->inode;
299 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
300 	const u32 sectorsize = fs_info->sectorsize;
301 	const u32 step = min(sectorsize, PAGE_SIZE);
302 	const u32 nr_steps = sectorsize / step;
303 	struct bvec_iter *iter = &bbio->saved_iter;
304 	blk_status_t status = bbio->bio.bi_status;
305 	struct btrfs_failed_bio *fbio = NULL;
306 	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
307 	phys_addr_t paddr;
308 	u32 offset = 0;
309 
310 	/* Read-repair requires the inode field to be set by the submitter. */
311 	ASSERT(inode);
312 
313 	/*
314 	 * Hand off repair bios to the repair code as there is no upper level
315 	 * submitter for them.
316 	 */
317 	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
318 		btrfs_end_repair_bio(bbio, dev);
319 		return;
320 	}
321 
322 	/* Clear the I/O error. A failed repair will reset it. */
323 	bbio->bio.bi_status = BLK_STS_OK;
324 
325 	btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
326 		paddrs[(offset / step) % nr_steps] = paddr;
327 		offset += step;
328 
329 		if (IS_ALIGNED(offset, sectorsize)) {
330 			if (status ||
331 			    !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
332 				fbio = repair_one_sector(bbio, offset - sectorsize,
333 							 paddrs, fbio);
334 		}
335 	}
336 	if (bbio->csum != bbio->csum_inline)
337 		kvfree(bbio->csum);
338 
339 	if (fbio)
340 		btrfs_repair_done(fbio);
341 	else
342 		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
343 }
344 
345 static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
346 {
347 	if (!dev || !dev->bdev)
348 		return;
349 	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
350 		return;
351 
352 	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
353 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
354 	else if (!(bio->bi_opf & REQ_RAHEAD))
355 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
356 	if (bio->bi_opf & REQ_PREFLUSH)
357 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
358 }
359 
360 static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
361 						const struct bio *bio)
362 {
363 	if (bio->bi_opf & REQ_META)
364 		return fs_info->endio_meta_workers;
365 	return fs_info->endio_workers;
366 }
367 
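/*
 * Work item finishing a single-device bio in task context: data reads go
 * through checksum verification, zone append writes record their physical
 * location, everything else completes directly.
 */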
368 static void simple_end_io_work(struct work_struct *work)
369 {
370 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
371 	struct bio *bio = &bbio->bio;
372 
373 	if (bio_op(bio) == REQ_OP_READ) {
374 		/* Metadata reads are checked and repaired by the submitter. */
375 		if (is_data_bbio(bbio))
376 			return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
377 		return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
378 	}
379 	if (bio_is_zone_append(bio) && !bio->bi_status)
380 		btrfs_record_physical_zoned(bbio);
381 	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
382 }
383 
384 static void btrfs_simple_end_io(struct bio *bio)
385 {
386 	struct btrfs_bio *bbio = btrfs_bio(bio);
387 	struct btrfs_device *dev = bio->bi_private;
388 	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
389 
390 	btrfs_bio_counter_dec(fs_info);
391 
392 	if (bio->bi_status)
393 		btrfs_log_dev_io_error(bio, dev);
394 
395 	INIT_WORK(&bbio->end_io_work, simple_end_io_work);
396 	queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
397 }
398 
399 static void btrfs_raid56_end_io(struct bio *bio)
400 {
401 	struct btrfs_io_context *bioc = bio->bi_private;
402 	struct btrfs_bio *bbio = btrfs_bio(bio);
403 
404 	/* RAID56 endio is always handled in a workqueue. */
405 	ASSERT(in_task());
406 
407 	btrfs_bio_counter_dec(bioc->fs_info);
408 	bbio->mirror_num = bioc->mirror_num;
409 	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
410 		btrfs_check_read_bio(bbio, NULL);
411 	else
412 		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
413 
414 	btrfs_put_bioc(bioc);
415 }
416 
417 static void orig_write_end_io_work(struct work_struct *work)
418 {
419 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
420 	struct bio *bio = &bbio->bio;
421 	struct btrfs_io_stripe *stripe = bio->bi_private;
422 	struct btrfs_io_context *bioc = stripe->bioc;
423 
424 	btrfs_bio_counter_dec(bioc->fs_info);
425 
426 	if (bio->bi_status) {
427 		atomic_inc(&bioc->error);
428 		btrfs_log_dev_io_error(bio, stripe->dev);
429 	}
430 
431 	/*
432 	 * Only send an error to the higher layers if it is beyond the tolerance
433 	 * threshold.
434 	 */
435 	if (atomic_read(&bioc->error) > bioc->max_errors)
436 		bio->bi_status = BLK_STS_IOERR;
437 	else
438 		bio->bi_status = BLK_STS_OK;
439 
440 	if (bio_is_zone_append(bio) && !bio->bi_status)
441 		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
442 
443 	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
444 	btrfs_put_bioc(bioc);
445 }
446 
447 static void btrfs_orig_write_end_io(struct bio *bio)
448 {
449 	struct btrfs_bio *bbio = btrfs_bio(bio);
450 
451 	INIT_WORK(&bbio->end_io_work, orig_write_end_io_work);
452 	queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
453 }
454 
455 static void clone_write_end_io_work(struct work_struct *work)
456 {
457 	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
458 	struct bio *bio = &bbio->bio;
459 	struct btrfs_io_stripe *stripe = bio->bi_private;
460 
461 	if (bio->bi_status) {
462 		atomic_inc(&stripe->bioc->error);
463 		btrfs_log_dev_io_error(bio, stripe->dev);
464 	} else if (bio_is_zone_append(bio)) {
465 		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
466 	}
467 
468 	/* Pass on control to the original bio this one was cloned from */
469 	bio_endio(stripe->bioc->orig_bio);
470 	bio_put(bio);
471 }
472 
473 static void btrfs_clone_write_end_io(struct bio *bio)
474 {
475 	struct btrfs_bio *bbio = btrfs_bio(bio);
476 
477 	INIT_WORK(&bbio->end_io_work, clone_write_end_io_work);
478 	queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
479 }
480 
481 static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
482 {
483 	if (!dev || !dev->bdev ||
484 	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
485 	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
486 	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
487 		bio_io_error(bio);
488 		return;
489 	}
490 
491 	bio_set_dev(bio, dev->bdev);
492 
493 	/*
494 	 * For zone append writes, bi_sector must point to the beginning of the
495 	 * zone.
496 	 */
497 	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
498 		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
499 		u64 zone_start = round_down(physical, dev->fs_info->zone_size);
500 
501 		ASSERT(btrfs_dev_is_sequential(dev, physical));
502 		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
503 	}
504 	btrfs_debug(dev->fs_info,
505 	"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
506 		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
507 		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
508 		dev->devid, bio->bi_iter.bi_size);
509 
510 	/*
511 	 * Track reads if tracking is enabled; ignore I/O operations before the
512 	 * filesystem is fully initialized.
513 	 */
514 	if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
515 		percpu_counter_add(&dev->fs_info->stats_read_blocks,
516 				   bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
517 
518 	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
519 		blkcg_punt_bio_submit(bio);
520 	else
521 		submit_bio(bio);
522 }
523 
524 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
525 {
526 	struct bio *orig_bio = bioc->orig_bio, *bio;
527 	struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);
528 
529 	ASSERT(bio_op(orig_bio) != REQ_OP_READ);
530 
531 	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
532 	if (dev_nr == bioc->num_stripes - 1) {
533 		bio = orig_bio;
534 		bio->bi_end_io = btrfs_orig_write_end_io;
535 	} else {
536 		/* Use end_io_work so the completion runs in task context. */
537 		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
538 		bio_inc_remaining(orig_bio);
539 		btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
540 			       orig_bbio->file_offset, NULL, NULL);
541 		bio->bi_end_io = btrfs_clone_write_end_io;
542 	}
543 
544 	bio->bi_private = &bioc->stripes[dev_nr];
545 	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
546 	bioc->stripes[dev_nr].bioc = bioc;
547 	bioc->size = bio->bi_iter.bi_size;
548 	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
549 }
550 
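/*
 * Dispatch a mapped bio: the single-device fast path, parity RAID write or
 * read recovery, or a write fanned out to all mirrors.
 */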
551 static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
552 			     struct btrfs_io_stripe *smap, int mirror_num)
553 {
554 	if (!bioc) {
555 		/* Single mirror read/write fast path. */
556 		btrfs_bio(bio)->mirror_num = mirror_num;
557 		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
558 		if (bio_op(bio) != REQ_OP_READ)
559 			btrfs_bio(bio)->orig_physical = smap->physical;
560 		bio->bi_private = smap->dev;
561 		bio->bi_end_io = btrfs_simple_end_io;
562 		btrfs_submit_dev_bio(smap->dev, bio);
563 	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
564 		/* Parity RAID write or read recovery. */
565 		bio->bi_private = bioc;
566 		bio->bi_end_io = btrfs_raid56_end_io;
567 		if (bio_op(bio) == REQ_OP_READ)
568 			raid56_parity_recover(bio, bioc, mirror_num);
569 		else
570 			raid56_parity_write(bio, bioc);
571 	} else {
572 		/* Write to multiple mirrors. */
573 		int total_devs = bioc->num_stripes;
574 
575 		bioc->orig_bio = bio;
576 		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
577 			btrfs_submit_mirrored_bio(bioc, dev_nr);
578 	}
579 }
580 
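/* Compute checksums for a write: tree block csums for metadata, data csums otherwise. */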
581 static int btrfs_bio_csum(struct btrfs_bio *bbio)
582 {
583 	if (bbio->bio.bi_opf & REQ_META)
584 		return btree_csum_one_bio(bbio);
585 #ifdef CONFIG_BTRFS_EXPERIMENTAL
586 	return btrfs_csum_one_bio(bbio, true);
587 #else
588 	return btrfs_csum_one_bio(bbio, false);
589 #endif
590 }
591 
592 /*
593  * Async submit bios are used to offload expensive checksumming onto the worker
594  * threads.
595  */
596 struct async_submit_bio {
597 	struct btrfs_bio *bbio;
598 	struct btrfs_io_context *bioc;
599 	struct btrfs_io_stripe smap;
600 	int mirror_num;
601 	struct btrfs_work work;
602 };
603 
604 /*
605  * In order to insert checksums into the metadata in large chunks, we wait
606  * until bio submission time.   All the pages in the bio are checksummed and
607  * sums are attached onto the ordered extent record.
608  *
609  * At IO completion time the csums attached on the ordered extent record are
610  * inserted into the btree.
611  */
612 static void run_one_async_start(struct btrfs_work *work)
613 {
614 	struct async_submit_bio *async =
615 		container_of(work, struct async_submit_bio, work);
616 	int ret;
617 
618 	ret = btrfs_bio_csum(async->bbio);
619 	if (ret)
620 		async->bbio->bio.bi_status = errno_to_blk_status(ret);
621 }
622 
623 /*
624  * In order to insert checksums into the metadata in large chunks, we wait
625  * until bio submission time.   All the pages in the bio are checksummed and
626  * sums are attached onto the ordered extent record.
627  *
628  * At IO completion time the csums attached on the ordered extent record are
629  * inserted into the tree.
630  *
631  * If called with @do_free == true, then it will free the work struct.
632  */
633 static void run_one_async_done(struct btrfs_work *work, bool do_free)
634 {
635 	struct async_submit_bio *async =
636 		container_of(work, struct async_submit_bio, work);
637 	struct bio *bio = &async->bbio->bio;
638 
639 	if (do_free) {
640 		kfree(container_of(work, struct async_submit_bio, work));
641 		return;
642 	}
643 
644 	/* If an error occurred we just want to clean up the bio and move on. */
645 	if (bio->bi_status) {
646 		btrfs_bio_end_io(async->bbio, bio->bi_status);
647 		return;
648 	}
649 
650 	/*
651 	 * All of the bios that pass through here are from async helpers.
652 	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
653 	 * context.  This changes nothing when cgroups aren't in use.
654 	 */
655 	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
656 	btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
657 }
658 
659 static bool should_async_write(struct btrfs_bio *bbio)
660 {
661 	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
662 	bool auto_csum_mode = true;
663 
664 #ifdef CONFIG_BTRFS_EXPERIMENTAL
665 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
666 	enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
667 
668 	if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON)
669 		return true;
670 	/*
671 	 * Write bios calculate the checksum and submit the bio in the same
672 	 * context.  Unless explicitly requested, don't offload the serial csum
673 	 * calculation and bio submission to a workqueue.
674 	 */
675 	return false;
676 #endif
677 
678 	/* Submit synchronously if the checksum implementation is fast. */
679 	if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
680 		return false;
681 
682 	/*
683 	 * Try to defer the submission to a workqueue to parallelize the
684 	 * checksum calculation unless the I/O is issued synchronously.
685 	 */
686 	if (op_is_sync(bbio->bio.bi_opf))
687 		return false;
688 
689 	/* Zoned devices require I/O to be submitted in order. */
690 	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info))
691 		return false;
692 
693 	return true;
694 }
695 
696 /*
697  * Submit bio to an async queue.
698  *
699  * Return true if the work has been successfully submitted, else false.
700  */
701 static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
702 				struct btrfs_io_context *bioc,
703 				struct btrfs_io_stripe *smap, int mirror_num)
704 {
705 	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
706 	struct async_submit_bio *async;
707 
708 	async = kmalloc(sizeof(*async), GFP_NOFS);
709 	if (!async)
710 		return false;
711 
712 	async->bbio = bbio;
713 	async->bioc = bioc;
714 	async->smap = *smap;
715 	async->mirror_num = mirror_num;
716 
717 	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
718 	btrfs_queue_work(fs_info->workers, &async->work);
719 	return true;
720 }
721 
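/*
 * Limit a zone append write to the maximum zone append size and to what
 * bio_split_rw_at() allows, rounded down to a sectorsize boundary.
 */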
722 static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
723 {
724 	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
725 	unsigned int nr_segs;
726 	int sector_offset;
727 
728 	map_length = min(map_length, fs_info->max_zone_append_size);
729 	sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits,
730 					&nr_segs, map_length);
731 	if (sector_offset) {
732 		/*
733 		 * bio_split_rw_at() could split at a size smaller than our
734 		 * sectorsize and thus cause unaligned I/Os.  Fix that by
735 		 * always rounding down to the nearest boundary.
736 		 */
737 		return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize);
738 	}
739 	return map_length;
740 }
741 
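/*
 * Map and submit one chunk of the bbio.  Returns true when nothing is left to
 * submit (including the error paths), false when the bbio was split and the
 * caller has to submit the remainder.
 */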
742 static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
743 {
744 	struct btrfs_inode *inode = bbio->inode;
745 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
746 	struct bio *bio = &bbio->bio;
747 	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
748 	u64 length = bio->bi_iter.bi_size;
749 	u64 map_length = length;
750 	bool use_append = btrfs_use_zone_append(bbio);
751 	struct btrfs_io_context *bioc = NULL;
752 	struct btrfs_io_stripe smap;
753 	blk_status_t status;
754 	int ret;
755 
756 	if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root))
757 		smap.rst_search_commit_root = true;
758 	else
759 		smap.rst_search_commit_root = false;
760 
761 	btrfs_bio_counter_inc_blocked(fs_info);
762 	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
763 			      &bioc, &smap, &mirror_num);
764 	if (ret) {
765 		status = errno_to_blk_status(ret);
766 		btrfs_bio_counter_dec(fs_info);
767 		goto end_bbio;
768 	}
769 
770 	/*
771 	 * For fscrypt writes we will get the encrypted bio after we've remapped
772 	 * our bio to the physical disk location, so we need to save the
773 	 * original bytenr so we know what we're checksumming.
774 	 */
775 	if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
776 		bbio->orig_logical = logical;
777 
778 	map_length = min(map_length, length);
779 	if (use_append)
780 		map_length = btrfs_append_map_length(bbio, map_length);
781 
782 	if (map_length < length) {
783 		struct btrfs_bio *split;
784 
785 		split = btrfs_split_bio(fs_info, bbio, map_length);
786 		if (IS_ERR(split)) {
787 			status = errno_to_blk_status(PTR_ERR(split));
788 			btrfs_bio_counter_dec(fs_info);
789 			goto end_bbio;
790 		}
791 		bbio = split;
792 		bio = &bbio->bio;
793 	}
794 
795 	/*
796 	 * Save the iter for the end_io handler and preload the checksums for
797 	 * data reads.
798 	 */
799 	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
800 		bbio->saved_iter = bio->bi_iter;
801 		ret = btrfs_lookup_bio_sums(bbio);
802 		status = errno_to_blk_status(ret);
803 		if (status)
804 			goto fail;
805 	}
806 
807 	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
808 		if (use_append) {
809 			bio->bi_opf &= ~REQ_OP_WRITE;
810 			bio->bi_opf |= REQ_OP_ZONE_APPEND;
811 		}
812 
813 		if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
814 			/*
815 			 * No locking for the list update, as we only add to
816 			 * the list in the I/O submission path, and list
817 			 * iteration only happens in the completion path, which
818 			 * can't happen until after the last submission.
819 			 */
820 			btrfs_get_bioc(bioc);
821 			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
822 		}
823 
824 		/*
825 		 * Csum items for reloc roots have already been cloned at this
826 		 * point, so they are handled as part of the no-checksum case.
827 		 */
828 		if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
829 		    !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
830 		    !btrfs_is_data_reloc_root(inode->root)) {
831 			if (should_async_write(bbio) &&
832 			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
833 				goto done;
834 
835 			ret = btrfs_bio_csum(bbio);
836 			status = errno_to_blk_status(ret);
837 			if (status)
838 				goto fail;
839 		} else if (use_append ||
840 			   (btrfs_is_zoned(fs_info) && inode &&
841 			    inode->flags & BTRFS_INODE_NODATASUM)) {
842 			ret = btrfs_alloc_dummy_sum(bbio);
843 			status = errno_to_blk_status(ret);
844 			if (status)
845 				goto fail;
846 		}
847 	}
848 
849 	btrfs_submit_bio(bio, bioc, &smap, mirror_num);
850 done:
851 	return map_length == length;
852 
853 fail:
854 	btrfs_bio_counter_dec(fs_info);
855 	/*
856 	 * If we have split the original bbio, we must end both the current @bbio
857 	 * and the remaining one, as the remaining one will never be submitted.
858 	 */
859 	if (map_length < length) {
860 		struct btrfs_bio *remaining = bbio->private;
861 
862 		ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
863 		ASSERT(remaining);
864 
865 		btrfs_bio_end_io(remaining, status);
866 	}
867 end_bbio:
868 	btrfs_bio_end_io(bbio, status);
869 	/* Do not submit another chunk */
870 	return true;
871 }
872 
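/*
 * With CONFIG_BTRFS_ASSERT enabled, verify that the bbio's logical range and
 * every bvec in it are properly block aligned.
 */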
873 static void assert_bbio_alignment(struct btrfs_bio *bbio)
874 {
875 #ifdef CONFIG_BTRFS_ASSERT
876 	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
877 	struct bio_vec bvec;
878 	struct bvec_iter iter;
879 	const u32 blocksize = fs_info->sectorsize;
880 	const u32 alignment = min(blocksize, PAGE_SIZE);
881 	const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
882 	const u32 length = bbio->bio.bi_iter.bi_size;
883 
884 	/* The logical and length should still be aligned to blocksize. */
885 	ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) &&
886 	       length != 0, "root=%llu inode=%llu logical=%llu length=%u",
887 	       btrfs_root_id(bbio->inode->root),
888 	       btrfs_ino(bbio->inode), logical, length);
889 
890 	bio_for_each_bvec(bvec, &bbio->bio, iter)
891 		ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) &&
892 		       IS_ALIGNED(bvec.bv_len, alignment),
893 		"root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
894 		btrfs_root_id(bbio->inode->root),
895 		btrfs_ino(bbio->inode), logical, length, iter.bi_idx,
896 		bvec.bv_offset, bvec.bv_len);
897 #endif
898 }
899 
900 void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
901 {
902 	/* If bbio->inode is not populated, its file_offset must be 0. */
903 	ASSERT(bbio->inode || bbio->file_offset == 0);
904 
905 	assert_bbio_alignment(bbio);
906 
907 	while (!btrfs_submit_chunk(bbio, mirror_num))
908 		;
909 }
910 
911 /*
912  * Submit a repair write.
913  *
914  * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
915  * RAID setup.  Here we only want to write the one bad copy, so we do the
916  * mapping ourselves and submit the bio directly.
917  *
918  * The I/O is issued synchronously to block the repair read completion from
919  * freeing the bio.
920  *
921  * @ino:	Offending inode number
922  * @fileoff:	File offset inside the inode
923  * @length:	Length of the repair write
924  * @logical:	Logical address of the range
925  * @paddrs:	Physical address array of the content
926  * @step:	Length covered by each entry in @paddrs
927  * @mirror_num: Mirror number to write to. Must not be zero
928  */
929 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
930 			    u32 length, u64 logical, const phys_addr_t paddrs[],
931 			    unsigned int step, int mirror_num)
932 {
933 	const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
934 	struct btrfs_io_stripe smap = { 0 };
935 	struct bio *bio = NULL;
936 	int ret = 0;
937 
938 	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
939 	BUG_ON(!mirror_num);
940 
941 	/* Basic alignment checks. */
942 	ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
943 	ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
944 	ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
945 	/* Either it's a single data or metadata block. */
946 	ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
947 	ASSERT(step <= length);
948 	ASSERT(is_power_of_2(step));
949 
950 	if (btrfs_repair_one_zone(fs_info, logical))
951 		return 0;
952 
953 	/*
954 	 * Avoid races with device replace and make sure our bioc has devices
955 	 * associated to its stripes that don't go away while we are doing the
956 	 * read repair operation.
957 	 */
958 	btrfs_bio_counter_inc_blocked(fs_info);
959 	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
960 	if (ret < 0)
961 		goto out_counter_dec;
962 
963 	if (unlikely(!smap.dev->bdev ||
964 		     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
965 		ret = -EIO;
966 		goto out_counter_dec;
967 	}
968 
969 	bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
970 	bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
971 	for (int i = 0; i < nr_steps; i++) {
972 		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
973 		/* We should have allocated enough slots to contain all the different pages. */
974 		ASSERT(ret == step);
975 	}
976 	ret = submit_bio_wait(bio);
977 	bio_put(bio);
978 	if (ret) {
979 		/* try to remap that extent elsewhere? */
980 		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
981 		goto out_counter_dec;
982 	}
983 
984 	btrfs_info_rl(fs_info,
985 		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
986 			     ino, fileoff, btrfs_dev_name(smap.dev),
987 			     smap.physical >> SECTOR_SHIFT);
988 	ret = 0;
989 
990 out_counter_dec:
991 	btrfs_bio_counter_dec(fs_info);
992 	return ret;
993 }
994 
995 /*
996  * Submit a btrfs_bio based repair write.
997  *
998  * If @dev_replace is true, the write is submitted to the dev-replace target.
999  */
1000 void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
1001 {
1002 	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
1003 	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
1004 	u64 length = bbio->bio.bi_iter.bi_size;
1005 	struct btrfs_io_stripe smap = { 0 };
1006 	int ret;
1007 
1008 	ASSERT(mirror_num > 0);
1009 	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
1010 	ASSERT(!is_data_inode(bbio->inode));
1011 	ASSERT(bbio->is_scrub);
1012 
1013 	btrfs_bio_counter_inc_blocked(fs_info);
1014 	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
1015 	if (ret < 0)
1016 		goto fail;
1017 
1018 	if (dev_replace) {
1019 		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
1020 		smap.dev = fs_info->dev_replace.tgtdev;
1021 	}
1022 	btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
1023 	return;
1024 
1025 fail:
1026 	btrfs_bio_counter_dec(fs_info);
1027 	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
1028 }
1029 
1030 int __init btrfs_bioset_init(void)
1031 {
1032 	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
1033 			offsetof(struct btrfs_bio, bio),
1034 			BIOSET_NEED_BVECS))
1035 		return -ENOMEM;
1036 	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
1037 			offsetof(struct btrfs_bio, bio), 0))
1038 		goto out;
1039 	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
1040 			offsetof(struct btrfs_bio, bio),
1041 			BIOSET_NEED_BVECS))
1042 		goto out;
1043 	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
1044 				      sizeof(struct btrfs_failed_bio)))
1045 		goto out;
1046 	return 0;
1047 
1048 out:
1049 	btrfs_bioset_exit();
1050 	return -ENOMEM;
1051 }
1052 
1053 void __cold btrfs_bioset_exit(void)
1054 {
1055 	mempool_exit(&btrfs_failed_bio_pool);
1056 	bioset_exit(&btrfs_repair_bioset);
1057 	bioset_exit(&btrfs_clone_bioset);
1058 	bioset_exit(&btrfs_bioset);
1059 }
1060