1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 * Copyright (C) 2022 Christoph Hellwig.
5 */
6
7 #include <linux/bio.h>
8 #include "bio.h"
9 #include "ctree.h"
10 #include "volumes.h"
11 #include "raid56.h"
12 #include "async-thread.h"
13 #include "dev-replace.h"
14 #include "zoned.h"
15 #include "file-item.h"
16 #include "raid-stripe-tree.h"
17
18 static struct bio_set btrfs_bioset;
19 static struct bio_set btrfs_clone_bioset;
20 static struct bio_set btrfs_repair_bioset;
21 static mempool_t btrfs_failed_bio_pool;
22
/*
 * State shared by all repair attempts for one failed read.
 *
 * Allocated from btrfs_failed_bio_pool in repair_one_sector() and freed by
 * btrfs_repair_done() once the last in-flight repair bio completes.
 */
struct btrfs_failed_bio {
	/* The original read bio whose data failed verification. */
	struct btrfs_bio *bbio;
	/* Number of mirrors available for this logical range. */
	int num_copies;
	/*
	 * In-flight repair bios plus one reference held by the submitter
	 * (dropped in btrfs_check_read_bio()).
	 */
	atomic_t repair_count;
};
28
29 /* Is this a data path I/O that needs storage layer checksum and repair? */
is_data_bbio(const struct btrfs_bio * bbio)30 static inline bool is_data_bbio(const struct btrfs_bio *bbio)
31 {
32 return bbio->inode && is_data_inode(bbio->inode);
33 }
34
bbio_has_ordered_extent(const struct btrfs_bio * bbio)35 static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
36 {
37 return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
38 }
39
40 /*
41 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
42 * is already initialized by the block layer.
43 */
btrfs_bio_init(struct btrfs_bio * bbio,struct btrfs_inode * inode,u64 file_offset,btrfs_bio_end_io_t end_io,void * private)44 void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
45 btrfs_bio_end_io_t end_io, void *private)
46 {
47 /* @inode parameter is mandatory. */
48 ASSERT(inode);
49
50 memset(bbio, 0, offsetof(struct btrfs_bio, bio));
51 bbio->inode = inode;
52 bbio->end_io = end_io;
53 bbio->private = private;
54 bbio->file_offset = file_offset;
55 atomic_set(&bbio->pending_ios, 1);
56 WRITE_ONCE(bbio->status, BLK_STS_OK);
57 }
58
59 /*
60 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
61 * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
62 *
63 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
64 * a mempool.
65 */
btrfs_bio_alloc(unsigned int nr_vecs,blk_opf_t opf,struct btrfs_inode * inode,u64 file_offset,btrfs_bio_end_io_t end_io,void * private)66 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
67 struct btrfs_inode *inode, u64 file_offset,
68 btrfs_bio_end_io_t end_io, void *private)
69 {
70 struct btrfs_bio *bbio;
71 struct bio *bio;
72
73 bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
74 bbio = btrfs_bio(bio);
75 btrfs_bio_init(bbio, inode, file_offset, end_io, private);
76 return bbio;
77 }
78
/*
 * Split the front @map_length bytes off @orig_bbio into a new btrfs_bio.
 *
 * The split bio is chained back to the original through bbio->private (its
 * end_io is NULL), and completion is funneled through btrfs_bio_end_io().
 * @orig_bbio is advanced past the split range.
 *
 * Returns the new btrfs_bio or an ERR_PTR() if bio_split() fails.
 */
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
			&btrfs_clone_bioset);
	if (IS_ERR(bio))
		return ERR_CAST(bio);

	bbio = btrfs_bio(bio);
	/* No own end_io; ->private links back to the original bio. */
	btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		/* The split holds its own reference on the ordered extent. */
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
		bbio->orig_logical = orig_bbio->orig_logical;
		orig_bbio->orig_logical += map_length;
	}

	/* Propagate submission state to the split bio. */
	bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
	bbio->can_use_append = orig_bbio->can_use_append;
	bbio->is_scrub = orig_bbio->is_scrub;
	bbio->is_remap = orig_bbio->is_remap;
	bbio->async_csum = orig_bbio->async_csum;

	/* The original now also waits for this split's completion. */
	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}
110
/*
 * Complete a btrfs_bio, propagating @status to the original bio if this is a
 * split.  The original bio's end_io callback only runs once all of its splits
 * have completed (tracked by pending_ios); the first error seen wins.
 */
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	/* Make sure we're already in task context. */
	ASSERT(in_task());

	/* Async checksumming must have finished before the bio can complete. */
	if (bbio->async_csum)
		wait_for_completion(&bbio->csum_done);

	bbio->bio.bi_status = status;
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		/* A split bio: its ->private points at the original. */
		struct btrfs_bio *orig_bbio = bbio->private;

		/* Free bio that was never submitted to the underlying device. */
		if (bbio_has_ordered_extent(bbio))
			btrfs_put_ordered_extent(bbio->ordered);
		bio_put(&bbio->bio);

		bbio = orig_bbio;
	}

	/*
	 * At this point, bbio always points to the original btrfs_bio. Save
	 * the first error in it.
	 */
	if (status != BLK_STS_OK)
		cmpxchg(&bbio->status, BLK_STS_OK, status);

	if (atomic_dec_and_test(&bbio->pending_ios)) {
		/* Load split bio's error which might be set above. */
		if (status == BLK_STS_OK)
			bbio->bio.bi_status = READ_ONCE(bbio->status);

		if (bbio_has_ordered_extent(bbio)) {
			struct btrfs_ordered_extent *ordered = bbio->ordered;

			/*
			 * Grab the ordered extent before end_io, as end_io may
			 * free the bbio that holds our reference to it.
			 */
			bbio->end_io(bbio);
			btrfs_put_ordered_extent(ordered);
		} else {
			bbio->end_io(bbio);
		}
	}
}
153
next_repair_mirror(const struct btrfs_failed_bio * fbio,int cur_mirror)154 static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
155 {
156 if (cur_mirror == fbio->num_copies)
157 return cur_mirror + 1 - fbio->num_copies;
158 return cur_mirror + 1;
159 }
160
prev_repair_mirror(const struct btrfs_failed_bio * fbio,int cur_mirror)161 static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
162 {
163 if (cur_mirror == 1)
164 return fbio->num_copies;
165 return cur_mirror - 1;
166 }
167
btrfs_repair_done(struct btrfs_failed_bio * fbio)168 static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
169 {
170 if (atomic_dec_and_test(&fbio->repair_count)) {
171 btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
172 mempool_free(fbio, &btrfs_failed_bio_pool);
173 }
174 }
175
/*
 * Completion handler for one repair read.
 *
 * If the read from this mirror also failed verification, try the next mirror.
 * If it succeeded, write the good data back to every previously failed mirror
 * to restore redundancy, then drop the repair reference.
 */
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	/*
	 * We can not move forward the saved_iter, as it will be later
	 * utilized by repair_bbio again.
	 */
	struct bvec_iter saved_iter = repair_bbio->saved_iter;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
	const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
	int mirror = repair_bbio->mirror_num;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	unsigned int slot = 0;

	/* Repair bbio should be exactly one block sized. */
	ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);

	/* Collect the physical address of each step for csum verification. */
	btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
		ASSERT(slot < nr_steps);
		paddrs[slot] = paddr;
		slot++;
	}

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
		/* This mirror is bad too; reuse the bio for the next one. */
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			/* Wrapped around to the original mirror: give up. */
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bbio(repair_bbio, mirror);
		return;
	}

	/* Good data: write it back to every mirror tried before this one. */
	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				  repair_bbio->file_offset, fs_info->sectorsize,
				  logical, paddrs, step, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}
231
232 /*
233 * Try to kick off a repair read to the next available mirror for a bad sector.
234 *
235 * This primarily tries to recover good data to serve the actual read request,
236 * but also tries to write the good data back to the bad mirror(s) when a
237 * read succeeded to restore the redundancy.
238 */
/*
 * Kick off a repair read of one fs block from the next mirror.
 *
 * @failed_bbio: the read bio a block of which failed verification
 * @bio_offset:  byte offset of the bad block within @failed_bbio
 * @paddrs:      physical addresses covering the bad block (one per step)
 * @fbio:        repair state, NULL on the first failure of this bio
 *
 * Returns the (possibly newly allocated) repair state, so the caller can
 * pass it to subsequent failures of the same bio and finally drop it via
 * btrfs_repair_done().
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  phys_addr_t paddrs[],
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	/*
	 * For bs > ps cases, the saved_iter can be partially moved forward.
	 * In that case we should round it down to the block boundary.
	 */
	const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				       sectorsize);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		/* Single copy: nothing to repair from, report the error. */
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	/* First failure in this bio: set up the shared repair state. */
	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	/* Point the repair bio at the same pages as the failed block. */
	for (int i = 0; i < nr_steps; i++) {
		int ret;

		ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE);

		ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step,
				   offset_in_page(paddrs[i]));
		ASSERT(ret == step);
	}

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset,
		       NULL, fbio);

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bbio(repair_bbio, mirror);
	return fbio;
}
301
/*
 * Verify the checksum of every block in a completed data read bio and kick
 * off repair for any block that fails, then complete the bio.
 */
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u32 step = min(sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	/* Gather the paddrs of each block, verifying once a block is full. */
	btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
		paddrs[(offset / step) % nr_steps] = paddr;
		offset += step;

		if (IS_ALIGNED(offset, sectorsize)) {
			if (status ||
			    !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
				fbio = repair_one_sector(bbio, offset - sectorsize,
							 paddrs, fbio);
		}
	}
	/* Free the csum buffer if it was allocated separately. */
	if (bbio->csum != bbio->csum_inline)
		kvfree(bbio->csum);

	/* With repairs pending, completion is deferred to the last repair. */
	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
350
btrfs_log_dev_io_error(const struct bio * bio,struct btrfs_device * dev)351 static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
352 {
353 if (!dev || !dev->bdev)
354 return;
355 if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
356 return;
357
358 if (btrfs_op(bio) == BTRFS_MAP_WRITE)
359 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
360 else if (!(bio->bi_opf & REQ_RAHEAD))
361 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
362 if (bio->bi_opf & REQ_PREFLUSH)
363 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
364 }
365
btrfs_end_io_wq(const struct btrfs_fs_info * fs_info,const struct bio * bio)366 static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
367 const struct bio *bio)
368 {
369 if (bio->bi_opf & REQ_META)
370 return fs_info->endio_meta_workers;
371 return fs_info->endio_workers;
372 }
373
simple_end_io_work(struct work_struct * work)374 static void simple_end_io_work(struct work_struct *work)
375 {
376 struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
377 struct bio *bio = &bbio->bio;
378
379 if (bio_op(bio) == REQ_OP_READ) {
380 /* Metadata reads are checked and repaired by the submitter. */
381 if (is_data_bbio(bbio))
382 return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
383 return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
384 }
385 if (bio_is_zone_append(bio) && !bio->bi_status)
386 btrfs_record_physical_zoned(bbio);
387 btrfs_bio_end_io(bbio, bbio->bio.bi_status);
388 }
389
/*
 * bi_end_io for the single-mirror fast path.  May run in interrupt context,
 * so defer all real completion work to a workqueue.
 */
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	/* bi_private was set to the target device in btrfs_submit_bio(). */
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	INIT_WORK(&bbio->end_io_work, simple_end_io_work);
	queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
}
404
/* bi_end_io for parity RAID bios (write, or read recovery). */
static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	/* RAID56 endio is always handled in workqueue. */
	ASSERT(in_task());

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	/* Data reads still need checksum verification and possible repair. */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);

	btrfs_put_bioc(bioc);
}
422
/*
 * Workqueue handler completing the original bio of a mirrored write (the one
 * submitted to the last stripe).  Runs after all clones have completed, so
 * bioc->error holds the final error count.
 */
static void orig_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	/* Successful zone appends report the physical location written. */
	if (bio_is_zone_append(bio) && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	btrfs_put_bioc(bioc);
}
452
btrfs_orig_write_end_io(struct bio * bio)453 static void btrfs_orig_write_end_io(struct bio *bio)
454 {
455 struct btrfs_bio *bbio = btrfs_bio(bio);
456
457 INIT_WORK(&bbio->end_io_work, orig_write_end_io_work);
458 queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
459 }
460
/*
 * Workqueue handler completing a cloned bio of a mirrored write.  Records
 * errors against the shared io_context and hands control back to the
 * original bio.
 */
static void clone_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_is_zone_append(bio)) {
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}
478
btrfs_clone_write_end_io(struct bio * bio)479 static void btrfs_clone_write_end_io(struct bio *bio)
480 {
481 struct btrfs_bio *bbio = btrfs_bio(bio);
482
483 INIT_WORK(&bbio->end_io_work, clone_write_end_io_work);
484 queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
485 }
486
/*
 * Submit an already-mapped bio to a specific device, failing it up front if
 * the device is missing or not writeable, and converting writes to zone
 * append where the target zone requires it.
 */
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point the beginning of the
	 * zone
	 */
	if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) {
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
		/* Switch the operation from WRITE to ZONE_APPEND, keep flags. */
		bio->bi_opf &= ~REQ_OP_WRITE;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}
	btrfs_debug(dev->fs_info,
	"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	/*
	 * Track reads if tracking is enabled; ignore I/O operations before the
	 * filesystem is fully initialized.
	 */
	if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
		percpu_counter_add(&dev->fs_info->stats_read_blocks,
				   bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);

	/* Writeback from async helpers is punted to the owning cgroup. */
	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}
532
/*
 * Submit one copy of a mirrored write to stripe @dev_nr.  All stripes but
 * the last get a clone of the original bio; the last reuses the original.
 */
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;
	struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);

	/* Mirroring only applies to writes. */
	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		/* We need to use endio_work to run end_io in task context. */
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
		/* The original completes only after all clones did. */
		bio_inc_remaining(orig_bio);
		btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
			       orig_bbio->file_offset, NULL, NULL);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
559
/*
 * Dispatch a mapped bio to the right low-level submission path:
 * single-mirror fast path, parity RAID, or mirrored write fan-out.
 */
static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			     struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}
589
btrfs_bio_csum(struct btrfs_bio * bbio)590 static int btrfs_bio_csum(struct btrfs_bio *bbio)
591 {
592 if (bbio->bio.bi_opf & REQ_META)
593 return btree_csum_one_bio(bbio);
594 #ifdef CONFIG_BTRFS_EXPERIMENTAL
595 return btrfs_csum_one_bio(bbio, true);
596 #else
597 return btrfs_csum_one_bio(bbio, false);
598 #endif
599 }
600
/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	/* The bio to checksum and then submit. */
	struct btrfs_bio *bbio;
	/* Mapping result; NULL selects the single-mirror fast path. */
	struct btrfs_io_context *bioc;
	/* Copy of the single-mirror stripe mapping. */
	struct btrfs_io_stripe smap;
	/* Mirror number to submit to. */
	int mirror_num;
	/* Work item driving run_one_async_start()/run_one_async_done(). */
	struct btrfs_work work;
};
612
613 /*
614 * In order to insert checksums into the metadata in large chunks, we wait
615 * until bio submission time. All the pages in the bio are checksummed and
616 * sums are attached onto the ordered extent record.
617 *
618 * At IO completion time the csums attached on the ordered extent record are
619 * inserted into the btree.
620 */
run_one_async_start(struct btrfs_work * work)621 static void run_one_async_start(struct btrfs_work *work)
622 {
623 struct async_submit_bio *async =
624 container_of(work, struct async_submit_bio, work);
625 int ret;
626
627 ret = btrfs_bio_csum(async->bbio);
628 if (ret)
629 async->bbio->bio.bi_status = errno_to_blk_status(ret);
630 }
631
632 /*
633 * In order to insert checksums into the metadata in large chunks, we wait
634 * until bio submission time. All the pages in the bio are checksummed and
635 * sums are attached onto the ordered extent record.
636 *
637 * At IO completion time the csums attached on the ordered extent record are
638 * inserted into the tree.
639 *
640 * If called with @do_free == true, then it will free the work struct.
641 */
/*
 * Second half of the async work: submit the checksummed bio, or complete it
 * with the error recorded by run_one_async_start().
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		/* @async already is the containing object, free it directly. */
		kfree(async);
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_bio_end_io(async->bbio, bio->bi_status);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
667
/*
 * Decide whether checksum calculation and submission of this write should be
 * offloaded to a workqueue (via btrfs_wq_submit_bio()) rather than done
 * synchronously in the caller's context.
 */
static bool should_async_write(struct btrfs_bio *bbio)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	bool auto_csum_mode = true;

#ifdef CONFIG_BTRFS_EXPERIMENTAL
	/*
	 * Write bios will calculate checksum and submit bio at the same time.
	 * Unless explicitly required don't offload serial csum calculate and bio
	 * submit into a workqueue.
	 */
	return false;
#endif

	/* Submit synchronously if the checksum implementation is fast. */
	if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info))
		return false;

	return true;
}
699
700 /*
701 * Submit bio to an async queue.
702 *
703 * Return true if the work has been successfully submitted, else false.
704 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct async_submit_bio *async;

	/* On allocation failure the caller falls back to sync submission. */
	async = kmalloc_obj(*async, GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	/* Copy the stripe map: the caller's copy lives on its stack. */
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}
725
/*
 * Limit @map_length so the bio fits a single zone append command, honoring
 * both the device's max zone append size and the queue's segment limits.
 * The result is aligned down to the fs sector size.
 */
static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	unsigned int nr_segs;
	int sector_offset;

	map_length = min(map_length, fs_info->max_zone_append_size);
	/* Non-zero means the queue limits force an earlier split point. */
	sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits,
					&nr_segs, map_length);
	if (sector_offset) {
		/*
		 * bio_split_rw_at() could split at a size smaller than our
		 * sectorsize and thus cause unaligned I/Os. Fix that by
		 * always rounding down to the nearest boundary.
		 */
		return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize);
	}
	return map_length;
}
745
/*
 * Map and submit as much of @bbio as fits into a single chunk mapping,
 * splitting off the remainder for a subsequent call.
 *
 * Returns true when the whole bio has been handled (submitted or failed),
 * false when a split remains and the caller must call again (see
 * btrfs_submit_bbio()).
 */
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t status;
	int ret;

	/* Scrub and relocation look up raid-stripe-tree in the commit root. */
	if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root))
		smap.rst_search_commit_root = true;
	else
		smap.rst_search_commit_root = false;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
			      &bioc, &smap, &mirror_num);
	if (ret) {
		status = errno_to_blk_status(ret);
		btrfs_bio_counter_dec(fs_info);
		goto end_bbio;
	}

	/*
	 * For fscrypt writes we will get the encrypted bio after we've remapped
	 * our bio to the physical disk location, so we need to save the
	 * original bytenr so we know what we're checksumming.
	 */
	if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
		bbio->orig_logical = logical;

	bbio->can_use_append = btrfs_use_zone_append(bbio);

	map_length = min(map_length, length);
	if (bbio->can_use_append)
		map_length = btrfs_append_map_length(bbio, map_length);

	/* Doesn't all fit in this chunk: split and submit only the front. */
	if (map_length < length) {
		struct btrfs_bio *split;

		split = btrfs_split_bio(fs_info, bbio, map_length);
		if (IS_ERR(split)) {
			status = errno_to_blk_status(PTR_ERR(split));
			btrfs_bio_counter_dec(fs_info);
			goto end_bbio;
		}
		bbio = split;
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		status = errno_to_blk_status(ret);
		if (status)
			goto fail;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) {
			/* Offload the csum work if it pays off, else do it here. */
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		} else if (bbio->can_use_append ||
			   (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) {
			/* Zoned writes need a placeholder sum entry. */
			ret = btrfs_alloc_dummy_sum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		}
	}

	btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail:
	btrfs_bio_counter_dec(fs_info);
	/*
	 * We have split the original bbio, now we have to end both the current
	 * @bbio and remaining one, as the remaining one will never be submitted.
	 */
	if (map_length < length) {
		struct btrfs_bio *remaining = bbio->private;

		ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
		ASSERT(remaining);

		btrfs_bio_end_io(remaining, status);
	}
end_bbio:
	btrfs_bio_end_io(bbio, status);
	/* Do not submit another chunk */
	return true;
}
871
/*
 * Debug-only sanity check: the bio's logical start, total length, and every
 * bio_vec must be aligned to the fs block size (capped at page size).
 * Compiles to nothing without CONFIG_BTRFS_ASSERT.
 */
static void assert_bbio_alignment(struct btrfs_bio *bbio)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct bio_vec bvec;
	struct bvec_iter iter;
	const u32 blocksize = fs_info->sectorsize;
	const u32 alignment = min(blocksize, PAGE_SIZE);
	const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	const u32 length = bbio->bio.bi_iter.bi_size;

	/* The logical and length should still be aligned to blocksize. */
	ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) &&
	       length != 0, "root=%llu inode=%llu logical=%llu length=%u",
	       btrfs_root_id(bbio->inode->root),
	       btrfs_ino(bbio->inode), logical, length);

	/* Every vector must start and end on an alignment boundary. */
	bio_for_each_bvec(bvec, &bbio->bio, iter)
		ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) &&
		       IS_ALIGNED(bvec.bv_len, alignment),
		       "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
		       btrfs_root_id(bbio->inode->root),
		       btrfs_ino(bbio->inode), logical, length, iter.bi_idx,
		       bvec.bv_offset, bvec.bv_len);
#endif
}
898
btrfs_submit_bbio(struct btrfs_bio * bbio,int mirror_num)899 void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
900 {
901 /* If bbio->inode is not populated, its file_offset must be 0. */
902 ASSERT(bbio->inode || bbio->file_offset == 0);
903
904 assert_bbio_alignment(bbio);
905
906 while (!btrfs_submit_chunk(bbio, mirror_num))
907 ;
908 }
909
/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 *
 * @ino:        Offending inode number
 * @fileoff:    File offset inside the inode
 * @length:     Length of the repair write
 * @logical:    Logical address of the range
 * @paddrs:     Physical address array of the content
 * @step:       Length covered by each entry of @paddrs (must be a power of
 *              two, no larger than @length)
 * @mirror_num: Mirror number to write to. Must not be zero
 *
 * Return: 0 on success or when the repair is skipped (read-only fs, zoned
 * repair path), negative errno on failure.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
			    u32 length, u64 logical, const phys_addr_t paddrs[],
			    unsigned int step, int mirror_num)
{
	const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
	struct btrfs_io_stripe smap = { 0 };
	struct bio *bio = NULL;
	int ret = 0;

	BUG_ON(!mirror_num);

	/* Basic alignment checks. */
	ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
	/* Either it's a single data or metadata block. */
	ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
	ASSERT(step <= length);
	/* DIV_ROUND_UP_POW2() above is only valid for power-of-two steps. */
	ASSERT(is_power_of_2(step));

	/*
	 * The fs either mounted RO or hit critical errors, no need
	 * to continue repairing.
	 */
	if (unlikely(sb_rdonly(fs_info->sb)))
		return 0;

	/* If the zoned repair path handled this logical address, we are done. */
	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	/* The mapped target device must exist and be writeable. */
	if (unlikely(!smap.dev->bdev ||
		     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
		ret = -EIO;
		goto out_counter_dec;
	}

	/* One bvec slot per @step-sized chunk; submitted synchronously. */
	bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
	bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	for (int i = 0; i < nr_steps; i++) {
		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
		/* We should have allocated enough slots to contain all the different pages. */
		ASSERT(ret == step);
	}
	ret = submit_bio_wait(bio);
	bio_put(bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_counter_dec;
	}

	btrfs_info_rl(fs_info,
		      "read error corrected: ino %llu off %llu (dev %s sector %llu)",
		      ino, fileoff, btrfs_dev_name(smap.dev),
		      smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}
999
1000 /*
1001 * Submit a btrfs_bio based repair write.
1002 *
1003 * If @dev_replace is true, the write would be submitted to dev-replace target.
1004 */
btrfs_submit_repair_write(struct btrfs_bio * bbio,int mirror_num,bool dev_replace)1005 void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
1006 {
1007 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
1008 u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
1009 u64 length = bbio->bio.bi_iter.bi_size;
1010 struct btrfs_io_stripe smap = { 0 };
1011 int ret;
1012
1013 ASSERT(mirror_num > 0);
1014 ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
1015 ASSERT(!is_data_inode(bbio->inode));
1016 ASSERT(bbio->is_scrub);
1017
1018 btrfs_bio_counter_inc_blocked(fs_info);
1019 ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
1020 if (ret < 0)
1021 goto fail;
1022
1023 if (dev_replace) {
1024 ASSERT(smap.dev == fs_info->dev_replace.srcdev);
1025 smap.dev = fs_info->dev_replace.tgtdev;
1026 }
1027 btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
1028 return;
1029
1030 fail:
1031 btrfs_bio_counter_dec(fs_info);
1032 btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
1033 }
1034
btrfs_bioset_init(void)1035 int __init btrfs_bioset_init(void)
1036 {
1037 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
1038 offsetof(struct btrfs_bio, bio),
1039 BIOSET_NEED_BVECS))
1040 return -ENOMEM;
1041 if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
1042 offsetof(struct btrfs_bio, bio), 0))
1043 goto out;
1044 if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
1045 offsetof(struct btrfs_bio, bio),
1046 BIOSET_NEED_BVECS))
1047 goto out;
1048 if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
1049 sizeof(struct btrfs_failed_bio)))
1050 goto out;
1051 return 0;
1052
1053 out:
1054 btrfs_bioset_exit();
1055 return -ENOMEM;
1056 }
1057
/*
 * Tear down everything set up by btrfs_bioset_init().
 *
 * Pools are released in the reverse order of their initialization.  This is
 * also the error path of btrfs_bioset_init(), where only a subset of the
 * pools may have been initialized.
 */
void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}
1065