1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
4 */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include "ctree.h"
10 #include "discard.h"
11 #include "volumes.h"
12 #include "disk-io.h"
13 #include "ordered-data.h"
14 #include "transaction.h"
15 #include "backref.h"
16 #include "extent_io.h"
17 #include "dev-replace.h"
18 #include "raid56.h"
19 #include "block-group.h"
20 #include "zoned.h"
21 #include "fs.h"
22 #include "accessors.h"
23 #include "file-item.h"
24 #include "scrub.h"
25 #include "raid-stripe-tree.h"
26
27 /*
28 * This is only the first step towards a full-features scrub. It reads all
29 * extent and super block and verifies the checksums. In case a bad checksum
30 * is found or the extent cannot be read, good data will be written back if
31 * any can be found.
32 *
33 * Future enhancements:
34 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them
36 * - track and record media errors, throw out bad devices
37 * - add a mode to also read unallocated space
38 */
39
40 struct scrub_ctx;
41
42 /*
43 * The following value only influences the performance.
44 *
45 * This determines how many stripes would be submitted in one go,
46 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
47 */
48 #define SCRUB_STRIPES_PER_GROUP 8
49
50 /*
51 * How many groups we have for each sctx.
52 *
53 * This would be 8M per device, the same value as the old scrub in-flight bios
54 * size limit.
55 */
56 #define SCRUB_GROUPS_PER_SCTX 16
57
58 #define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)
59
60 /*
61 * The following value times PAGE_SIZE needs to be large enough to match the
62 * largest node/leaf/sector size that shall be supported.
63 */
64 #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
65
/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	/*
	 * Which member is valid depends on whether the sector belongs to a
	 * metadata tree block or a data extent (see the is_metadata
	 * sub-bitmap of the owning scrub_stripe).
	 */
	union {
		/*
		 * Csum pointer for data csum verification.  Should point to a
		 * sector csum inside scrub_stripe::csums.
		 *
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;

		/*
		 * Extra info for metadata verification.  All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};
84
/* Bit numbers for scrub_stripe::state. */
enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * Set for data stripes if it's triggered from P/Q stripe.
	 * During such scrub, we should not report errors in data stripes, nor
	 * update the accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};
99
/*
 * We have multiple bitmaps for one scrub_stripe.
 * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
 * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64).
 *
 * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
 * into a larger one.
 *
 * These enum records where the sub-bitmap are inside the larger one.
 * Each subbitmap starts at scrub_bitmap_nr_##name * nr_sectors bit.
 *
 * Accessors for each sub-bitmap are generated by
 * IMPLEMENT_SCRUB_BITMAP_OPS() below.
 */
enum {
	/* Which blocks are covered by extent items. */
	scrub_bitmap_nr_has_extent = 0,

	/* Which blocks are metadata. */
	scrub_bitmap_nr_is_metadata,

	/*
	 * Which blocks have errors, including IO, csum, and metadata
	 * errors.
	 * This sub-bitmap is the OR results of the next few error related
	 * sub-bitmaps.
	 */
	scrub_bitmap_nr_error,
	scrub_bitmap_nr_io_error,
	scrub_bitmap_nr_csum_error,
	scrub_bitmap_nr_meta_error,
	scrub_bitmap_nr_meta_gen_error,

	/* Number of sub-bitmaps, used to size scrub_stripe::bitmaps. */
	scrub_bitmap_nr_last,
};
131
132 #define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE)
133
/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
	/* Back pointer to the owning scrub context. */
	struct scrub_ctx *sctx;

	/* Block group this stripe is inside of. */
	struct btrfs_block_group *bg;

	/* Folios backing the BTRFS_STRIPE_LEN bytes of this stripe. */
	struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];

	/* One verification info per sector, allocated at init time. */
	struct scrub_sector_verification *sectors;

	/*
	 * Location of the stripe.  Only valid after
	 * SCRUB_STRIPE_FLAG_INITIALIZED is set in @state.
	 */
	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

	/*
	 * How many data/meta extents are in this stripe.  Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

	/* In-flight bio count; waiters sleep on @io_wait until it hits 0. */
	atomic_t pending_io;
	wait_queue_head_t io_wait;
	wait_queue_head_t repair_wait;

	/*
	 * Indicate the states of the stripe.  Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* The large bitmap contains all the sub-bitmaps. */
	unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
					    (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];

	/*
	 * For writeback (repair or replace) error reporting.
	 * This one is protected by a spinlock, thus can not be packed into
	 * the larger bitmap.
	 */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;

	struct work_struct work;
};
192
/* State for one running scrub (or dev-replace) operation. */
struct scrub_ctx {
	struct scrub_stripe	stripes[SCRUB_TOTAL_STRIPES];
	/* Extra stripes used for RAID56 P/Q scrubbing; allocated on demand. */
	struct scrub_stripe	*raid56_data_stripes;
	struct btrfs_fs_info	*fs_info;
	/* Cached search paths, set up as commit-root/no-lock in scrub_setup_ctx(). */
	struct btrfs_path	extent_path;
	struct btrfs_path	csum_path;
	/* Index of the first unused slot in stripes[]. */
	int			first_free;
	int			cur_stripe;
	/* Non-zero when a cancel has been requested.  */
	atomic_t		cancel_req;
	/* NOTE(review): presumably non-zero for a read-only scrub — confirm at setter. */
	int			readonly;

	/* State of IO submission throttling affecting the associated device */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

	bool			is_dev_replace;
	/* Zoned device write pointer, see fill_writer_pointer_gap(). */
	u64			write_pointer;

	/* Protects writes to the dev-replace target device. */
	struct mutex            wr_lock;
	struct btrfs_device     *wr_tgtdev;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t              refs;
};
229
/*
 * Calculate the bit offset of block @block_nr inside sub-bitmap @name of
 * scrub_stripe::bitmaps.
 *
 * Each sub-bitmap occupies stripe->nr_sectors bits and the sub-bitmaps are
 * packed back-to-back in scrub_bitmap_nr_* enum order.
 */
#define scrub_calc_start_bit(stripe, name, block_nr)			\
({									\
	unsigned int __start_bit;					\
									\
	ASSERT(block_nr < stripe->nr_sectors,				\
	       "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
	__start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
	__start_bit;							\
})
239
/*
 * Generate the accessors for one packed sub-bitmap:
 *
 * - scrub_bitmap_set_<name>() / scrub_bitmap_clear_<name>()
 *	Set/clear a range of blocks.
 * - scrub_bitmap_test_bit_<name>() / scrub_bitmap_set_bit_<name>() /
 *   scrub_bitmap_clear_bit_<name>()
 *	Single block operations (the set/clear variants are atomic
 *	set_bit()/clear_bit()).
 * - scrub_bitmap_read_<name>()
 *	Return the whole sub-bitmap as one unsigned long.  Relies on the
 *	sub-bitmap fitting into a single long, which is asserted.
 * - scrub_bitmap_empty_<name>() / scrub_bitmap_weight_<name>()
 *	Emptiness check / number of set bits, built on the read helper.
 */
#define IMPLEMENT_SCRUB_BITMAP_OPS(name)				\
static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe,	\
					   unsigned int block_nr,	\
					   unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe,	\
							    name, block_nr); \
									\
	bitmap_set(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
					     unsigned int block_nr,	\
					     unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	bitmap_clear(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
						unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	return test_bit(start_bit, stripe->bitmaps);			\
}									\
static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
					       unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	set_bit(start_bit, stripe->bitmaps);				\
}									\
static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
						 unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	clear_bit(start_bit, stripe->bitmaps);				\
}									\
static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
{									\
	const unsigned int nr_blocks = stripe->nr_sectors;		\
									\
	ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG,		\
	       "nr_blocks=%u BITS_PER_LONG=%u",				\
	       nr_blocks, BITS_PER_LONG);				\
									\
	return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
			   stripe->nr_sectors);				\
}									\
static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_empty(&bitmap, stripe->nr_sectors);		\
}									\
static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_weight(&bitmap, stripe->nr_sectors);		\
}
IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
IMPLEMENT_SCRUB_BITMAP_OPS(error);
IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);
313
/* Context passed around while printing error-location warnings. */
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};
322
/* Per-stripe error summary, split out by error type for reporting. */
struct scrub_error_records {
	/*
	 * Bitmap recording which blocks hit errors (IO/csum/...) during the
	 * initial read.
	 */
	unsigned long init_error_bitmap;

	unsigned int nr_io_errors;
	unsigned int nr_csum_errors;
	unsigned int nr_meta_errors;
	unsigned int nr_meta_gen_errors;
};
335
release_scrub_stripe(struct scrub_stripe * stripe)336 static void release_scrub_stripe(struct scrub_stripe *stripe)
337 {
338 if (!stripe)
339 return;
340
341 for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) {
342 if (stripe->folios[i])
343 folio_put(stripe->folios[i]);
344 stripe->folios[i] = NULL;
345 }
346 kfree(stripe->sectors);
347 kfree(stripe->csums);
348 stripe->sectors = NULL;
349 stripe->csums = NULL;
350 stripe->sctx = NULL;
351 stripe->state = 0;
352 }
353
/*
 * Initialize @stripe so it can hold one BTRFS_STRIPE_LEN sized range.
 *
 * Allocates the backing folios, the per-sector verification array and the
 * csum buffer.  On any allocation failure everything already allocated is
 * released and -ENOMEM is returned.
 */
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
			     struct scrub_stripe *stripe)
{
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	int ret;

	memset(stripe, 0, sizeof(*stripe));

	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	stripe->state = 0;

	init_waitqueue_head(&stripe->io_wait);
	init_waitqueue_head(&stripe->repair_wait);
	atomic_set(&stripe->pending_io, 0);
	spin_lock_init(&stripe->write_error_lock);

	ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
	ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
				      fs_info->block_min_order, stripe->folios);
	if (ret < 0)
		goto error;

	/* One verification info per sector. */
	stripe->sectors = kzalloc_objs(struct scrub_sector_verification,
				       stripe->nr_sectors);
	if (!stripe->sectors)
		goto error;

	/* Room for one csum per sector in the stripe. */
	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
				fs_info->csum_size, GFP_KERNEL);
	if (!stripe->csums)
		goto error;
	return 0;
error:
	release_scrub_stripe(stripe);
	return -ENOMEM;
}
390
/* Wait until all in-flight bios of @stripe have completed. */
static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}
395
396 static void scrub_put_ctx(struct scrub_ctx *sctx);
397
/*
 * Block while a scrub pause is requested.
 *
 * Must be called with fs_info->scrub_lock held; the lock is dropped while
 * sleeping and re-taken before re-checking, so the pause request can be
 * satisfied and cleared by the pausing task.
 */
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}
407
/*
 * Mark this scrub as paused and wake anyone waiting for all scrubs to
 * reach the paused state.
 */
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}
413
/*
 * Leave the paused state.
 *
 * First block until any outstanding pause request is gone (under
 * scrub_lock), then drop our paused count and wake up potential waiters.
 */
static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}
423
/* Pause point: briefly enter and leave the paused state if requested. */
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}
429
scrub_free_ctx(struct scrub_ctx * sctx)430 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
431 {
432 int i;
433
434 if (!sctx)
435 return;
436
437 for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
438 release_scrub_stripe(&sctx->stripes[i]);
439
440 kvfree(sctx);
441 }
442
/* Drop one reference on @sctx, freeing it when the last one goes away. */
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}
448
scrub_setup_ctx(struct btrfs_fs_info * fs_info,bool is_dev_replace)449 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
450 struct btrfs_fs_info *fs_info, bool is_dev_replace)
451 {
452 struct scrub_ctx *sctx;
453 int i;
454
455 /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use
456 * kvzalloc().
457 */
458 sctx = kvzalloc_obj(*sctx);
459 if (!sctx)
460 goto nomem;
461 refcount_set(&sctx->refs, 1);
462 sctx->is_dev_replace = is_dev_replace;
463 sctx->fs_info = fs_info;
464 sctx->extent_path.search_commit_root = true;
465 sctx->extent_path.skip_locking = true;
466 sctx->csum_path.search_commit_root = true;
467 sctx->csum_path.skip_locking = true;
468 for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
469 int ret;
470
471 ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
472 if (ret < 0)
473 goto nomem;
474 sctx->stripes[i].sctx = sctx;
475 }
476 sctx->first_free = 0;
477 atomic_set(&sctx->cancel_req, 0);
478
479 spin_lock_init(&sctx->stat_lock);
480 sctx->throttle_deadline = 0;
481
482 mutex_init(&sctx->wr_lock);
483 if (is_dev_replace) {
484 WARN_ON(!fs_info->dev_replace.tgtdev);
485 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
486 }
487
488 return sctx;
489
490 nomem:
491 scrub_free_ctx(sctx);
492 return ERR_PTR(-ENOMEM);
493 }
494
/*
 * Backref-walk callback: print one warning line for every path of inode
 * @inum in root @root that covers the bad block.
 *
 * @warn_ctx is a struct scrub_warning carrying the error string and the
 * logical/physical location.  Always returns 0 so the iteration over the
 * remaining inodes continues even if path resolution fails for this one.
 *
 * Fix over the previous version: the paths_from_inode() failure path
 * jumped to @err without dropping the reference taken by
 * btrfs_get_fs_root(), leaking a root ref.  All other failure paths after
 * a successful btrfs_get_fs_root() already put the root.
 */
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
				     u64 root, void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * This makes the path point to (inum INODE_ITEM ioff).
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);
	if (ret < 0) {
		/* Previously leaked here: drop the root ref before bailing. */
		btrfs_put_root(local_root);
		goto err;
	}

	/*
	 * We deliberately ignore the bit ipath might have been too small to
	 * hold all of the paths here.
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
			   swarn->errstr, swarn->logical,
			   btrfs_dev_name(swarn->dev),
			   swarn->physical,
			   root, inum, offset,
			   fs_info->sectorsize, nlink,
			   (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	return 0;

err:
	btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
		   swarn->errstr, swarn->logical,
		   btrfs_dev_name(swarn->dev),
		   swarn->physical,
		   root, inum, offset, ret);

	return 0;
}
582
/*
 * Print user-friendly warnings about a corruption found at @logical /
 * @physical on device @dev.
 *
 * For super block errors only the physical location is printed.  For tree
 * blocks each backref (owner tree and level) is printed; for data extents
 * every file path referencing the extent is resolved and printed via
 * scrub_print_warning_inode().
 */
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
				       bool is_super, u64 logical, u64 physical)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	u64 flags = 0;
	u32 item_size;
	int ret;

	/* Super block error, no need to search extent tree. */
	if (is_super) {
		btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu",
			   errstr, btrfs_dev_name(dev), physical);
		return;
	}
	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = physical;
	swarn.logical = logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	/* Find the extent item covering @logical and whether it's metadata. */
	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		return;

	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		unsigned long ptr = 0;
		u8 ref_level;
		u64 ref_root;

		/* Walk all tree backrefs of the block; @ptr is the cursor. */
		while (true) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret < 0) {
				btrfs_warn(fs_info,
			"scrub: failed to resolve tree backref for logical %llu: %d",
					   swarn.logical, ret);
				break;
			}
			if (ret > 0)
				break;
			btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				   errstr, swarn.logical, btrfs_dev_name(dev),
				   swarn.physical, (ref_level ? "node" : "leaf"),
				   ref_level, ref_root);
		}
		btrfs_release_path(path);
	} else {
		struct btrfs_backref_walk_ctx ctx = { 0 };

		/* Release the path so scrub_print_warning_inode() can reuse it. */
		btrfs_release_path(path);

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = swarn.logical - found_key.objectid;
		ctx.fs_info = fs_info;

		swarn.path = path;
		swarn.dev = dev;

		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
	}
}
661
/*
 * On zoned devices writes must be sequential.  If the next write target
 * @physical is ahead of the current write pointer, zero-fill the gap so a
 * following write can land exactly at @physical.
 *
 * Returns 0 on success (or when nothing needed to be done), otherwise the
 * error from btrfs_zoned_issue_zeroout().
 */
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
	u64 gap;
	int ret;

	if (!btrfs_is_zoned(sctx->fs_info))
		return 0;

	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
		return 0;

	if (sctx->write_pointer >= physical)
		return 0;

	gap = physical - sctx->write_pointer;
	ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
					sctx->write_pointer, gap);
	if (!ret)
		sctx->write_pointer = physical;
	return ret;
}
683
scrub_stripe_get_kaddr(struct scrub_stripe * stripe,int sector_nr)684 static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
685 {
686 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
687 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
688 u32 offset = (sector_nr << fs_info->sectorsize_bits);
689 const struct folio *folio = stripe->folios[offset >> min_folio_shift];
690
691 /* stripe->folios[] is allocated by us and no highmem is allowed. */
692 ASSERT(folio);
693 ASSERT(!folio_test_highmem(folio));
694 return folio_address(folio) + offset_in_folio(folio, offset);
695 }
696
/*
 * Return the physical address of sector @sector_nr inside @stripe.
 *
 * Like scrub_stripe_get_kaddr(), but additionally verifies the whole
 * sector fits inside a single folio, so the physical range is contiguous.
 */
static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	u32 offset = (sector_nr << fs_info->sectorsize_bits);
	const struct folio *folio = stripe->folios[offset >> min_folio_shift];

	/* stripe->folios[] is allocated by us and no highmem is allowed. */
	ASSERT(folio);
	ASSERT(!folio_test_highmem(folio));
	/* And the range must be contained inside the folio. */
	ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
	return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
}
711
scrub_verify_one_metadata(struct scrub_stripe * stripe,int sector_nr)712 static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
713 {
714 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
715 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
716 const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
717 void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
718 struct btrfs_header *header = first_kaddr;
719 struct btrfs_csum_ctx csum;
720 u8 on_disk_csum[BTRFS_CSUM_SIZE];
721 u8 calculated_csum[BTRFS_CSUM_SIZE];
722
723 /*
724 * Here we don't have a good way to attach the pages (and subpages)
725 * to a dummy extent buffer, thus we have to directly grab the members
726 * from pages.
727 */
728 memcpy(on_disk_csum, header->csum, fs_info->csum_size);
729
730 if (logical != btrfs_stack_header_bytenr(header)) {
731 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
732 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
733 btrfs_warn_rl(fs_info,
734 "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
735 logical, stripe->mirror_num,
736 btrfs_stack_header_bytenr(header), logical);
737 return;
738 }
739 if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
740 BTRFS_FSID_SIZE) != 0) {
741 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
742 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
743 btrfs_warn_rl(fs_info,
744 "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
745 logical, stripe->mirror_num,
746 header->fsid, fs_info->fs_devices->fsid);
747 return;
748 }
749 if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
750 BTRFS_UUID_SIZE) != 0) {
751 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
752 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
753 btrfs_warn_rl(fs_info,
754 "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
755 logical, stripe->mirror_num,
756 header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
757 return;
758 }
759
760 /* Now check tree block csum. */
761 btrfs_csum_init(&csum, fs_info->csum_type);
762 btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE,
763 fs_info->sectorsize - BTRFS_CSUM_SIZE);
764
765 for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
766 btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i),
767 fs_info->sectorsize);
768 }
769
770 btrfs_csum_final(&csum, calculated_csum);
771 if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
772 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
773 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
774 btrfs_warn_rl(fs_info,
775 "scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT,
776 logical, stripe->mirror_num,
777 BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
778 BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
779 return;
780 }
781 if (stripe->sectors[sector_nr].generation !=
782 btrfs_stack_header_generation(header)) {
783 scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
784 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
785 btrfs_warn_rl(fs_info,
786 "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
787 logical, stripe->mirror_num,
788 btrfs_stack_header_generation(header),
789 stripe->sectors[sector_nr].generation);
790 return;
791 }
792 scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
793 scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
794 scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
795 scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
796 }
797
/*
 * Verify one sector of @stripe and update the error sub-bitmaps
 * accordingly.
 *
 * - Sectors not covered by any extent, or which already hit an IO error,
 *   are skipped.
 * - Metadata sectors trigger verification of the whole tree block.
 * - Data sectors are verified against their csum; sectors without a csum
 *   are trusted as-is.
 */
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
		return;

	/* IO error, no need to check. */
	if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
		return;

	/* Metadata, verify the full tree block. */
	if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
		/*
		 * Check if the tree block crosses the stripe boundary.  If
		 * crossed the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"scrub: tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it).  For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
		return;
	}

	ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
	if (ret < 0) {
		scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_set_bit_error(stripe, sector_nr);
	} else {
		scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
	}
}
857
/* Verify specified sectors of a stripe. */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		/*
		 * A metadata sector covers a whole tree block, which was just
		 * verified in one go; jump past its remaining sectors (the
		 * loop increment adds the final +1).
		 */
		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
			sector_nr += sectors_per_tree - 1;
	}
}
871
calc_sector_number(struct scrub_stripe * stripe,struct bio_vec * first_bvec)872 static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
873 {
874 int i;
875
876 for (i = 0; i < stripe->nr_sectors; i++) {
877 if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
878 break;
879 }
880 ASSERT(i < stripe->nr_sectors);
881 return i;
882 }
883
/*
 * Repair read is different to the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	/* The bio covers contiguous sectors starting at @sector_nr. */
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		/* Whole-bio failure: mark every covered sector as IO error. */
		scrub_bitmap_set_io_error(stripe, sector_nr,
					  bio_size >> fs_info->sectorsize_bits);
		scrub_bitmap_set_error(stripe, sector_nr,
				       bio_size >> fs_info->sectorsize_bits);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr,
					    bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	/* Wake up the waiter once the last in-flight bio completes. */
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}
917
/*
 * Return the mirror number to try after @mirror, wrapping from
 * @num_copies back to mirror 1.
 */
static int calc_next_mirror(int mirror, int num_copies)
{
	int next = mirror + 1;

	ASSERT(mirror <= num_copies);
	return (next > num_copies) ? 1 : next;
}
923
/* Add one sector of @stripe to @bbio. */
static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
				 int sector_nr)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	int ret;

	ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize,
			   offset_in_page(kaddr));
	/*
	 * Caller should ensure the bbio has enough size.
	 * And we cannot use __bio_add_page(), which doesn't do any merge.
	 *
	 * Meanwhile for scrub_submit_initial_read() we fully rely on the merge
	 * to create the minimal amount of bio vectors, for fs block size < page
	 * size cases.
	 */
	ASSERT(ret == fs_info->sectorsize);
}
943
/*
 * Allocate a btrfs_bio for scrub usage, targeting @logical.
 *
 * Scrub bios are attached to the btree inode, marked is_scrub, and have
 * their starting sector set directly from @logical.
 */
static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info,
					  unsigned int nr_vecs, blk_opf_t opf,
					  u64 logical,
					  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode),
			       logical, end_io, private);
	bbio->is_scrub = true;
	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	return bbio;
}
957
/*
 * Submit repair reads for all currently erroneous sectors of @stripe from
 * mirror @mirror.
 *
 * Contiguous error sectors are merged into one bio, up to @blocksize
 * bytes per bio.  With @wait set, each bio is submitted and waited for
 * before building the next one (used when the read size must be limited
 * strictly, e.g. one block at a time).
 *
 * Completion is handled by scrub_repair_read_endio(), which updates the
 * io_error/error sub-bitmaps per bio.
 */
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
	ASSERT(atomic_read(&stripe->pending_io) == 0,
	       "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io));

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		/* The current sector cannot be merged, submit the bio. */
		if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
			     bbio->bio.bi_iter.bi_size >= blocksize)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bbio(bbio, mirror);
			if (wait)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}

		/* Start a new bio at the current sector's logical address. */
		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
					stripe->logical + (i << fs_info->sectorsize_bits),
					scrub_repair_read_endio, stripe);

		scrub_bio_add_sector(bbio, stripe, i);
	}
	/* Submit the trailing bio, if any. */
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
		if (wait)
			wait_scrub_stripe_io(stripe);
	}
}
997
/*
 * Report the final result of scrubbing one stripe.
 *
 * Prints rate-limited messages for each corrupted sector (repaired or not),
 * updates the per-device error statistics, and accumulates the counters into
 * sctx->stat.
 *
 * @errors holds the error state captured right after the initial read, while
 * the stripe's current error bitmap reflects the state after all repair
 * attempts; the difference between the two tells which sectors got repaired.
 */
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
				       struct scrub_stripe *stripe,
				       const struct scrub_error_records *errors)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_device *dev = NULL;
	const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
	const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
	u64 physical = 0;
	int nr_data_sectors = 0;
	int nr_meta_sectors = 0;
	int nr_nodatacsum_sectors = 0;
	int nr_repaired_sectors = 0;
	int sector_nr;

	/* Reporting is explicitly disabled for this stripe. */
	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
		return;

	/*
	 * Init needed infos for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
	 * thus no need for dev/physical, error reporting still needs dev and physical.
	 */
	if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
		u64 mapped_len = fs_info->sectorsize;
		struct btrfs_io_context *bioc = NULL;
		int stripe_index = stripe->mirror_num - 1;
		int ret;

		/* For scrub, our mirror_num should always start at 1. */
		ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				      stripe->logical, &mapped_len, &bioc,
				      NULL, NULL);
		/*
		 * If we failed, dev will be NULL, and later detailed reports
		 * will just be skipped.
		 */
		if (ret < 0)
			goto skip;
		physical = bioc->stripes[stripe_index].physical;
		dev = bioc->stripes[stripe_index].dev;
		btrfs_put_bioc(bioc);
	}

skip:
	/* Walk all sectors covered by an extent and classify/report them. */
	for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
		bool repaired = false;

		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
			nr_meta_sectors++;
		} else {
			nr_data_sectors++;
			if (!stripe->sectors[sector_nr].csum)
				nr_nodatacsum_sectors++;
		}

		/* Initially bad but no longer bad means it got repaired. */
		if (test_bit(sector_nr, &errors->init_error_bitmap) &&
		    !test_bit(sector_nr, &error_bitmap)) {
			nr_repaired_sectors++;
			repaired = true;
		}

		/* Good sector from the beginning, nothing need to be done. */
		if (!test_bit(sector_nr, &errors->init_error_bitmap))
			continue;

		/*
		 * Report error for the corrupted sectors.  If repaired, just
		 * output a message saying the sector has been fixed up.
		 */
		if (repaired) {
			if (dev) {
				btrfs_err_rl(fs_info,
			"scrub: fixed up error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
			} else {
				btrfs_err_rl(fs_info,
			"scrub: fixed up error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
			}
			continue;
		}

		/* The remaining are all for unrepaired. */
		if (dev) {
			btrfs_err_rl(fs_info,
	"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
				    stripe->logical, btrfs_dev_name(dev),
				    physical);
		} else {
			btrfs_err_rl(fs_info,
	"scrub: unable to fixup (regular) error at logical %llu on mirror %u",
				    stripe->logical, stripe->mirror_num);
		}

		/* Detailed warnings need @dev, and are rate-limited separately. */
		if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("i/o error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("checksum error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("header error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("generation error", dev, false,
						     stripe->logical, physical);
	}

	/* Update the device stats. */
	for (int i = 0; i < errors->nr_io_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
	for (int i = 0; i < errors->nr_csum_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
	/* Generation mismatch error is based on each metadata, not each block. */
	for (int i = 0; i < errors->nr_meta_gen_errors;
	     i += (fs_info->nodesize >> fs_info->sectorsize_bits))
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);

	spin_lock(&sctx->stat_lock);
	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
	sctx->stat.no_csum += nr_nodatacsum_sectors;
	sctx->stat.read_errors += errors->nr_io_errors;
	sctx->stat.csum_errors += errors->nr_csum_errors;
	sctx->stat.verify_errors += errors->nr_meta_errors +
				    errors->nr_meta_gen_errors;
	sctx->stat.uncorrectable_errors +=
		bitmap_weight(&error_bitmap, stripe->nr_sectors);
	sctx->stat.corrected_errors += nr_repaired_sectors;
	spin_unlock(&sctx->stat_lock);
}
1141
1142 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
1143 unsigned long write_bitmap, bool dev_replace);
1144
1145 /*
1146 * The main entrance for all read related scrub work, including:
1147 *
1148 * - Wait for the initial read to finish
1149 * - Verify and locate any bad sectors
1150 * - Go through the remaining mirrors and try to read as large blocksize as
1151 * possible
1152 * - Go through all mirrors (including the failed mirror) sector-by-sector
1153 * - Submit writeback for repaired sectors
1154 *
1155 * Writeback for dev-replace does not happen here, it needs extra
1156 * synchronization for zoned devices.
1157 */
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
	struct scrub_ctx *sctx = stripe->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_error_records errors = { 0 };
	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
					  stripe->bg->length);
	unsigned long repaired;
	unsigned long error;
	int mirror;
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);

	/* Wait for the initial read, then verify every sector with an extent. */
	wait_scrub_stripe_io(stripe);
	scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe));
	/* Save the initial failed bitmap for later repair and report usage. */
	errors.init_error_bitmap = scrub_bitmap_read_error(stripe);
	errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe);
	errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe);
	errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe);
	errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe);

	/* Everything verified fine, no repair needed. */
	if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors))
		goto out;

	/*
	 * Try all remaining mirrors.
	 *
	 * Here we still try to read as large block as possible, as this is
	 * faster and we have extra safety nets to rely on.
	 */
	for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
	     mirror != stripe->mirror_num;
	     mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);

		scrub_stripe_submit_repair_read(stripe, mirror,
						BTRFS_STRIPE_LEN, false);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (scrub_bitmap_empty_error(stripe))
			goto out;
	}

	/*
	 * Last safety net, try re-checking all mirrors, including the failed
	 * one, sector-by-sector.
	 *
	 * As if one sector failed the drive's internal csum, the whole read
	 * containing the offending sector would be marked as error.
	 * Thus here we do sector-by-sector read.
	 *
	 * This can be slow, thus we only try it as the last resort.
	 */

	for (i = 0, mirror = stripe->mirror_num;
	     i < num_copies;
	     i++, mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);

		scrub_stripe_submit_repair_read(stripe, mirror,
						fs_info->sectorsize, true);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (scrub_bitmap_empty_error(stripe))
			goto out;
	}
out:
	error = scrub_bitmap_read_error(stripe);
	/*
	 * Submit the repaired sectors.  For zoned case, we cannot do repair
	 * in-place, but queue the bg to be relocated.
	 */
	bitmap_andnot(&repaired, &errors.init_error_bitmap, &error,
		      stripe->nr_sectors);
	if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
		if (btrfs_is_zoned(fs_info)) {
			btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
		} else {
			scrub_write_sectors(sctx, stripe, repaired, false);
			wait_scrub_stripe_io(stripe);
		}
	}

	scrub_stripe_report_errors(sctx, stripe, &errors);
	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
	wake_up(&stripe->repair_wait);
}
1248
/*
 * Endio for the initial scrub read of a stripe (or part of it).
 *
 * On failure, marks all sectors covered by the bio as IO error and error;
 * on success, clears their IO error bits.  When the last pending bio of the
 * stripe completes, queues the read-repair worker for the stripe.
 */
static void scrub_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct bio_vec *bvec;
	/* Index of the first sector covered by this bio, inside the stripe. */
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	int num_sectors;
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);
	/* Sum up all bvec lengths to know how many sectors this bio covered. */
	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;
	num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;

	if (bbio->bio.bi_status) {
		scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors);
		scrub_bitmap_set_error(stripe, sector_nr, num_sectors);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors);
	}
	bio_put(&bbio->bio);
	/* The last in-flight bio finished, hand the stripe to the repair worker. */
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}
1276
/*
 * Endio for scrub repair / dev-replace writes.
 *
 * On failure, sets the covered sectors in the write error bitmap and bumps
 * the device write error stat once per failed sector.  Wakes up waiters once
 * the last pending bio of the stripe completes.
 */
static void scrub_write_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	/* Index of the first sector covered by this bio, inside the stripe. */
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		unsigned long flags;

		/* Irq-safe locking, as bio completions can run in interrupt context. */
		spin_lock_irqsave(&stripe->write_error_lock, flags);
		bitmap_set(&stripe->write_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
		for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
			btrfs_dev_stat_inc_and_print(stripe->dev,
						     BTRFS_DEV_STAT_WRITE_ERRS);
	}
	bio_put(&bbio->bio);

	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}
1305
/*
 * Submit one write bio for repair or dev-replace writeback.
 *
 * For zoned filesystems the write is also waited for (queue depth must be 1)
 * and, on success, sctx->write_pointer is advanced by the bio length.
 */
static void scrub_submit_write_bio(struct scrub_ctx *sctx,
				   struct scrub_stripe *stripe,
				   struct btrfs_bio *bbio, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u32 bio_len = bbio->bio.bi_iter.bi_size;
	/* Byte offset of this bio inside the stripe. */
	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
		      stripe->logical;

	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
	atomic_inc(&stripe->pending_io);
	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
	if (!btrfs_is_zoned(fs_info))
		return;
	/*
	 * For zoned writeback, queue depth must be 1, thus we must wait for
	 * the write to finish before the next write.
	 */
	wait_scrub_stripe_io(stripe);

	/*
	 * And also need to update the write pointer if write finished
	 * successfully.
	 */
	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
		      &stripe->write_error_bitmap))
		sctx->write_pointer += bio_len;
}
1334
1335 /*
1336 * Submit the write bio(s) for the sectors specified by @write_bitmap.
1337 *
1338 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
1339 *
1340 * - Only needs logical bytenr and mirror_num
1341 * Just like the scrub read path
1342 *
1343 * - Would only result in writes to the specified mirror
1344 * Unlike the regular writeback path, which would write back to all stripes
1345 *
1346 * - Handle dev-replace and read-repair writeback differently
1347 */
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	int sector_nr;

	/* Merge contiguous sectors of @write_bitmap into as few bios as possible. */
	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
		/* We should only writeback sectors covered by an extent. */
		ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr));

		/* Cannot merge with previous sector, submit the current one. */
		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
			scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
			bbio = NULL;
		}
		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE,
					stripe->logical + (sector_nr << fs_info->sectorsize_bits),
					scrub_write_endio, stripe);
		scrub_bio_add_sector(bbio, stripe, sector_nr);
	}
	/* Submit the last, possibly partial, bio. */
	if (bbio)
		scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
}
1373
1374 /*
1375 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
1376 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
1377 */
scrub_throttle_dev_io(struct scrub_ctx * sctx,struct btrfs_device * device,unsigned int bio_size)1378 static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
1379 unsigned int bio_size)
1380 {
1381 const int time_slice = 1000;
1382 s64 delta;
1383 ktime_t now;
1384 u32 div;
1385 u64 bwlimit;
1386
1387 bwlimit = READ_ONCE(device->scrub_speed_max);
1388 if (bwlimit == 0)
1389 return;
1390
1391 /*
1392 * Slice is divided into intervals when the IO is submitted, adjust by
1393 * bwlimit and maximum of 64 intervals.
1394 */
1395 div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);
1396
1397 /* Start new epoch, set deadline */
1398 now = ktime_get();
1399 if (sctx->throttle_deadline == 0) {
1400 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1401 sctx->throttle_sent = 0;
1402 }
1403
1404 /* Still in the time to send? */
1405 if (ktime_before(now, sctx->throttle_deadline)) {
1406 /* If current bio is within the limit, send it */
1407 sctx->throttle_sent += bio_size;
1408 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1409 return;
1410
1411 /* We're over the limit, sleep until the rest of the slice */
1412 delta = ktime_ms_delta(sctx->throttle_deadline, now);
1413 } else {
1414 /* New request after deadline, start new epoch */
1415 delta = 0;
1416 }
1417
1418 if (delta) {
1419 long timeout;
1420
1421 timeout = div_u64(delta * HZ, 1000);
1422 schedule_timeout_interruptible(timeout);
1423 }
1424
1425 /* Next call will start the deadline period */
1426 sctx->throttle_deadline = 0;
1427 }
1428
/*
 * Given a physical address, calculate its logical offset.
 * If this is a parity stripe, return the leftmost data stripe's
 * logical offset instead.
 *
 * Return 0 if it is a data stripe, 1 means parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct btrfs_chunk_map *map, u64 *offset,
				   u64 *stripe_start)
{
	int i;
	int j = 0;
	u64 last_offset;
	const int data_stripes = nr_data_stripes(map);

	/* Logical offset as if device @num always held a data stripe. */
	last_offset = (physical - map->stripes[num].physical) * data_stripes;
	if (stripe_start)
		*stripe_start = last_offset;

	*offset = last_offset;
	/* Check each data stripe of this stripe-set to see which one lands on @num. */
	for (i = 0; i < data_stripes; i++) {
		u32 stripe_nr;
		u32 stripe_index;
		u32 rot;

		*offset = last_offset + btrfs_stripe_nr_to_offset(i);

		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

		/* Work out the disk rotation on this stripe-set */
		rot = stripe_nr % map->num_stripes;
		/* calculate which stripe this data locates */
		rot += i;
		stripe_index = rot % map->num_stripes;
		/* Device @num holds data stripe @i, @*offset is its logical start. */
		if (stripe_index == num)
			return 0;
		if (stripe_index < num)
			j++;
	}
	/* Parity stripe: return the leftmost data stripe's logical offset. */
	*offset = last_offset + btrfs_stripe_nr_to_offset(j);
	return 1;
}
1472
/*
 * Return 0 if the extent item range covers any byte of the range.
 * Return <0 if the extent item is before @search_start.
 * Return >0 if the extent item is after @search_start + @search_len.
 */
static int compare_extent_item_range(struct btrfs_path *path,
				     u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
	struct btrfs_key key;
	u64 item_len;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
	       key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type);
	/* For metadata items the length is always nodesize. */
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		item_len = fs_info->nodesize;
	else
		item_len = key.offset;

	/* The extent item ends before the search range starts. */
	if (key.objectid + item_len <= search_start)
		return -1;
	/* The extent item starts beyond the end of the search range. */
	if (key.objectid >= search_start + search_len)
		return 1;
	/* The extent item overlaps the search range. */
	return 0;
}
1499
1500 /*
1501 * Locate one extent item which covers any byte in range
1502 * [@search_start, @search_start + @search_length)
1503 *
1504 * If the path is not initialized, we will initialize the search by doing
1505 * a btrfs_search_slot().
1506 * If the path is already initialized, we will use the path as the initial
1507 * slot, to avoid duplicated btrfs_search_slot() calls.
1508 *
1509 * NOTE: If an extent item starts before @search_start, we will still
1510 * return the extent item. This is for data extent crossing stripe boundary.
1511 *
1512 * Return 0 if we found such extent item, and @path will point to the extent item.
1513 * Return >0 if no such extent item can be found, and @path will be released.
1514 * Return <0 if hit fatal error, and @path will be released.
1515 */
static int find_first_extent_item(struct btrfs_root *extent_root,
				  struct btrfs_path *path,
				  u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	struct btrfs_key key;
	int ret;

	/* Continue using the existing path */
	if (path->nodes[0])
		goto search_forward;

	/* Search for the largest key at @search_start, then step back one item. */
	key.objectid = search_start;
	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (unlikely(ret == 0)) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		btrfs_release_path(path);
		return -EUCLEAN;
	}

	/*
	 * Here we intentionally pass 0 as @min_objectid, as there could be
	 * an extent item starting before @search_start.
	 */
	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret < 0)
		return ret;
	/*
	 * No matter whether we have found an extent item, the next loop will
	 * properly do every check on the key.
	 */
search_forward:
	while (true) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		/* Went past the end of the search range, nothing found. */
		if (key.objectid >= search_start + search_len)
			break;
		/* Not an extent/metadata item, skip to the next one. */
		if (key.type != BTRFS_METADATA_ITEM_KEY &&
		    key.type != BTRFS_EXTENT_ITEM_KEY)
			goto next;

		ret = compare_extent_item_range(path, search_start, search_len);
		if (ret == 0)
			return ret;
		if (ret > 0)
			break;
next:
		ret = btrfs_next_item(extent_root, path);
		if (ret) {
			/* Either no more items or a fatal error. */
			btrfs_release_path(path);
			return ret;
		}
	}
	btrfs_release_path(path);
	return 1;
}
1583
/*
 * Extract start, size, flags and generation of the extent item the path
 * currently points at.  The item must be an EXTENT_ITEM or METADATA_ITEM.
 */
static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
{
	struct extent_buffer *leaf = path->nodes[0];
	const int slot = path->slots[0];
	struct btrfs_extent_item *ei;
	struct btrfs_key key;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
	       key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type);
	*extent_start_ret = key.objectid;
	/* For metadata items the real length is always nodesize. */
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		*size_ret = leaf->fs_info->nodesize;
	else
		*size_ret = key.offset;
	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
	*flags_ret = btrfs_extent_flags(leaf, ei);
	*generation_ret = btrfs_extent_generation(leaf, ei);
}
1602
/*
 * For zoned dev-replace, bring the target device's zone write pointer in
 * sync after scrubbing a range.  If the recorded write pointer is behind
 * @physical_end, ask the zoned code to fill the gap on the target device.
 * No-op on non-zoned filesystems.
 *
 * Return 0 on success (or when not zoned), a negative errno otherwise.
 */
static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
					u64 physical, u64 physical_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int ret = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	mutex_lock(&sctx->wr_lock);
	if (sctx->write_pointer < physical_end) {
		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
						    physical,
						    sctx->write_pointer);
		if (ret)
			btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
	}
	mutex_unlock(&sctx->wr_lock);
	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

	return ret;
}
1625
/*
 * Mark the sectors of @stripe covered by the extent
 * [@extent_start, @extent_start + @extent_len) as having an extent.
 * For tree blocks, also mark the sectors as metadata and record the
 * expected generation for later verification.
 */
static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
				 struct scrub_stripe *stripe,
				 u64 extent_start, u64 extent_len,
				 u64 extent_flags, u64 extent_gen)
{
	/* Clamp the extent range to the part inside this stripe. */
	const u64 range_start = max(stripe->logical, extent_start);
	const u64 range_end = min(stripe->logical + BTRFS_STRIPE_LEN,
				  extent_start + extent_len);
	u64 cur_logical;

	for (cur_logical = range_start; cur_logical < range_end;
	     cur_logical += fs_info->sectorsize) {
		const int nr_sector = (cur_logical - stripe->logical) >>
				      fs_info->sectorsize_bits;

		scrub_bitmap_set_bit_has_extent(stripe, nr_sector);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			scrub_bitmap_set_bit_is_metadata(stripe, nr_sector);
			stripe->sectors[nr_sector].generation = extent_gen;
		}
	}
}
1647
/*
 * Clear all per-sector bitmaps of the stripe (scrub_bitmap_nr_last bitmaps,
 * each stripe->nr_sectors bits wide, stored back to back in stripe->bitmaps).
 */
static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
	ASSERT(stripe->nr_sectors);
	bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors);
}
1653
1654 /*
1655 * Locate one stripe which has at least one extent in its range.
1656 *
1657 * Return 0 if found such stripe, and store its info into @stripe.
1658 * Return >0 if there is no such stripe in the specified range.
1659 * Return <0 for error.
1660 */
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
					struct btrfs_path *extent_path,
					struct btrfs_path *csum_path,
					struct btrfs_device *dev, u64 physical,
					int mirror_num, u64 logical_start,
					u32 logical_len,
					struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
	const u64 logical_end = logical_start + logical_len;
	u64 cur_logical = logical_start;
	u64 stripe_end;
	u64 extent_start;
	u64 extent_len;
	u64 extent_flags;
	u64 extent_gen;
	int ret;

	if (unlikely(!extent_root || !csum_root)) {
		btrfs_err(fs_info, "scrub: no valid extent or csum root found");
		return -EUCLEAN;
	}
	/* Reset all per-sector info before filling in the new stripe. */
	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
				   stripe->nr_sectors);
	scrub_stripe_reset_bitmaps(stripe);

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg),
	       "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu",
	       bg->start, logical_start, logical_end, btrfs_block_group_end(bg));

	ret = find_first_extent_item(extent_root, extent_path, logical_start,
				     logical_len);
	/* Either error or not found. */
	if (ret)
		return ret;
	get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
			&extent_gen);
	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		stripe->nr_meta_extents++;
	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
		stripe->nr_data_extents++;
	cur_logical = max(extent_start, cur_logical);

	/*
	 * Round down to stripe boundary.
	 *
	 * The extra calculation against bg->start is to handle block groups
	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
	 */
	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
			  bg->start;
	stripe->physical = physical + stripe->logical - logical_start;
	stripe->dev = dev;
	stripe->bg = bg;
	stripe->mirror_num = mirror_num;
	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;

	/* Fill the first extent info into stripe->sectors[] array. */
	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
			     extent_flags, extent_gen);
	cur_logical = extent_start + extent_len;

	/* Fill the extent info for the remaining sectors. */
	while (cur_logical <= stripe_end) {
		ret = find_first_extent_item(extent_root, extent_path, cur_logical,
					     stripe_end - cur_logical + 1);
		if (ret < 0)
			return ret;
		/* No more extents inside this stripe, we are done. */
		if (ret > 0) {
			ret = 0;
			break;
		}
		get_extent_info(extent_path, &extent_start, &extent_len,
				&extent_flags, &extent_gen);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			stripe->nr_meta_extents++;
		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
			stripe->nr_data_extents++;
		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
				     extent_flags, extent_gen);
		cur_logical = extent_start + extent_len;
	}

	/* Now fill the data csum. */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
		int sector_nr;
		unsigned long csum_bitmap = 0;

		/* Csum space should have already been allocated. */
		ASSERT(stripe->csums);

		/*
		 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
		 * should contain at most 16 sectors.
		 */
		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);

		ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
						stripe->logical, stripe_end,
						stripe->csums, &csum_bitmap);
		if (ret < 0)
			return ret;
		if (ret > 0)
			ret = 0;

		/* Point each sector with a csum at its slot in stripe->csums. */
		for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
			stripe->sectors[sector_nr].csum = stripe->csums +
				sector_nr * fs_info->csum_size;
		}
	}
	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);

	return ret;
}
1778
scrub_reset_stripe(struct scrub_stripe * stripe)1779 static void scrub_reset_stripe(struct scrub_stripe *stripe)
1780 {
1781 scrub_stripe_reset_bitmaps(stripe);
1782
1783 stripe->nr_meta_extents = 0;
1784 stripe->nr_data_extents = 0;
1785 stripe->state = 0;
1786
1787 for (int i = 0; i < stripe->nr_sectors; i++) {
1788 stripe->sectors[i].csum = NULL;
1789 stripe->sectors[i].generation = 0;
1790 }
1791 }
1792
/*
 * Return how many bytes of this stripe are really covered by its block
 * group.  This is BTRFS_STRIPE_LEN except for the last stripe, which can
 * be truncated by the block group's end.
 */
static u32 stripe_length(const struct scrub_stripe *stripe)
{
	ASSERT(stripe->bg);

	return min(BTRFS_STRIPE_LEN,
		   stripe->bg->start + stripe->bg->length - stripe->logical);
}
1800
/*
 * Submit the initial read of a stripe for block groups tracked by the RAID
 * stripe tree (RST).
 *
 * Unlike the plain read path, we cannot submit one large bio covering the
 * whole stripe; every bio has to be split manually at RST boundaries, which
 * are discovered via btrfs_map_block().  Only sectors covered by an extent
 * are read.
 */
static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
	const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe);
	u64 stripe_len = BTRFS_STRIPE_LEN;
	int mirror = stripe->mirror_num;
	int i;

	/* Extra ref so the final dec below can detect full completion. */
	atomic_inc(&stripe->pending_io);

	for_each_set_bit(i, &has_extent, stripe->nr_sectors) {
		/* We're beyond the chunk boundary, no need to read anymore. */
		if (i >= nr_sectors)
			break;

		/* The current sector cannot be merged, submit the bio. */
		if (bbio &&
		    ((i > 0 && !test_bit(i - 1, &has_extent)) ||
		     bbio->bio.bi_iter.bi_size >= stripe_len)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bbio(bbio, mirror);
			bbio = NULL;
		}

		if (!bbio) {
			struct btrfs_io_stripe io_stripe = {};
			struct btrfs_io_context *bioc = NULL;
			const u64 logical = stripe->logical +
					    (i << fs_info->sectorsize_bits);
			int ret;

			io_stripe.rst_search_commit_root = true;
			stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
			/*
			 * For RST cases, we need to manually split the bbio to
			 * follow the RST boundary.
			 */
			ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
					      &stripe_len, &bioc, &io_stripe, &mirror);
			btrfs_put_bioc(bioc);
			if (ret < 0) {
				if (ret != -ENODATA) {
					/*
					 * Earlier btrfs_get_raid_extent_offset()
					 * returned -ENODATA, which means there's
					 * no entry for the corresponding range
					 * in the stripe tree.  But if it's in
					 * the extent tree, then it's a preallocated
					 * extent and not an error.
					 */
					scrub_bitmap_set_bit_io_error(stripe, i);
					scrub_bitmap_set_bit_error(stripe, i);
				}
				continue;
			}

			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
						logical, scrub_read_endio, stripe);
		}

		scrub_bio_add_sector(bbio, stripe, i);
	}

	/* Submit the last, possibly partial, bio. */
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
	}

	/* Drop the extra ref; if all bios already finished, queue the repair worker. */
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}
1879
/*
 * Submit the initial read for one stripe.
 *
 * RST (raid-stripe-tree) based block groups are delegated to
 * scrub_submit_extent_sector_read(); everything else reads all sectors
 * inside the chunk boundary with a single bio.
 */
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
				      struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_bio *bbio;
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
	int mirror = stripe->mirror_num;

	ASSERT(stripe->bg);
	ASSERT(stripe->mirror_num > 0);
	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));

	if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
		scrub_submit_extent_sector_read(stripe);
		return;
	}

	bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ,
				stripe->logical, scrub_read_endio, stripe);
	/* Read the whole range inside the chunk boundary. */
	for (unsigned int cur = 0; cur < nr_sectors; cur++)
		scrub_bio_add_sector(bbio, stripe, cur);
	atomic_inc(&stripe->pending_io);

	/*
	 * For dev-replace, either user asks to avoid the source dev, or
	 * the device is missing, we try the next mirror instead.
	 */
	if (sctx->is_dev_replace &&
	    (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
	     !stripe->dev->bdev)) {
		int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
						  stripe->bg->length);

		mirror = calc_next_mirror(mirror, num_copies);
	}
	btrfs_submit_bbio(bbio, mirror);
}
1920
stripe_has_metadata_error(struct scrub_stripe * stripe)1921 static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
1922 {
1923 const unsigned long error = scrub_bitmap_read_error(stripe);
1924 int i;
1925
1926 for_each_set_bit(i, &error, stripe->nr_sectors) {
1927 if (scrub_bitmap_test_bit_is_metadata(stripe, i)) {
1928 struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
1929
1930 btrfs_err(fs_info,
1931 "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
1932 stripe->logical,
1933 stripe->logical + (i << fs_info->sectorsize_bits));
1934 return true;
1935 }
1936 }
1937 return false;
1938 }
1939
submit_initial_group_read(struct scrub_ctx * sctx,unsigned int first_slot,unsigned int nr_stripes)1940 static void submit_initial_group_read(struct scrub_ctx *sctx,
1941 unsigned int first_slot,
1942 unsigned int nr_stripes)
1943 {
1944 struct blk_plug plug;
1945
1946 ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
1947 ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);
1948
1949 scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
1950 btrfs_stripe_nr_to_offset(nr_stripes));
1951 blk_start_plug(&plug);
1952 for (int i = 0; i < nr_stripes; i++) {
1953 struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];
1954
1955 /* Those stripes should be initialized. */
1956 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
1957 scrub_submit_initial_read(sctx, stripe);
1958 }
1959 blk_finish_plug(&plug);
1960 }
1961
/*
 * Flush all queued stripe slots.
 *
 * Submits any populated-but-unsubmitted stripes, waits for read/repair
 * to finish on every stripe, writes the good sectors to the replace
 * target for dev-replace runs, then resets all slots for reuse.
 *
 * Returns 0 on success, -EIO if dev-replace encountered unrepairable
 * metadata. Always resets sctx->cur_stripe to 0.
 */
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_stripe *stripe;
	const int nr_stripes = sctx->cur_stripe;
	int ret = 0;

	if (!nr_stripes)
		return 0;

	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

	/* Submit the stripes which are populated but not submitted. */
	if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
		const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);

		submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
	}

	/* Wait until read-repair has finished for every stripe. */
	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}

	/* Submit for dev-replace. */
	if (sctx->is_dev_replace) {
		/*
		 * For dev-replace, if we know there is something wrong with
		 * metadata, we should immediately abort.
		 */
		for (int i = 0; i < nr_stripes; i++) {
			if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) {
				ret = -EIO;
				goto out;
			}
		}
		for (int i = 0; i < nr_stripes; i++) {
			unsigned long good;
			unsigned long has_extent;
			unsigned long error;

			stripe = &sctx->stripes[i];

			ASSERT(stripe->dev == fs_info->dev_replace.srcdev);

			has_extent = scrub_bitmap_read_has_extent(stripe);
			error = scrub_bitmap_read_error(stripe);
			/* Only extent-covered sectors without error get copied. */
			bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors);
			scrub_write_sectors(sctx, stripe, good, true);
		}
	}

	/* Wait for the above writebacks to finish. */
	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_scrub_stripe_io(stripe);
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = stripe->physical + stripe_length(stripe);
		spin_unlock(&sctx->stat_lock);
		scrub_reset_stripe(stripe);
	}
out:
	sctx->cur_stripe = 0;
	return ret;
}
2030
raid56_scrub_wait_endio(struct bio * bio)2031 static void raid56_scrub_wait_endio(struct bio *bio)
2032 {
2033 complete(bio->bi_private);
2034 }
2035
/*
 * Find and queue the next stripe starting from @logical/@physical into
 * the next free slot, submitting the group (or flushing everything)
 * when the corresponding limits are hit.
 *
 * Returns 0 when a stripe was queued (*@found_logical_ret is set to its
 * logical bytenr), >0 when there are no more extents in the range, and
 * <0 on error (including errors from an implied flush_scrub_stripes()).
 */
static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
			      struct btrfs_device *dev, int mirror_num,
			      u64 logical, u32 length, u64 physical,
			      u64 *found_logical_ret)
{
	struct scrub_stripe *stripe;
	int ret;

	/*
	 * There should always be one slot left, as caller filling the last
	 * slot should flush them all.
	 */
	ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);

	/* @found_logical_ret must be specified. */
	ASSERT(found_logical_ret);

	stripe = &sctx->stripes[sctx->cur_stripe];
	scrub_reset_stripe(stripe);
	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
					   &sctx->csum_path, dev, physical,
					   mirror_num, logical, length, stripe);
	/* Either >0 as no more extents or <0 for error. */
	if (ret)
		return ret;
	*found_logical_ret = stripe->logical;
	sctx->cur_stripe++;

	/* We filled one group, submit it. */
	if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
		const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;

		submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
	}

	/* Last slot used, flush them all. */
	if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
		return flush_scrub_stripes(sctx);
	return 0;
}
2076
2077 /*
2078 * Return 0 if we should not cancel the scrub.
2079 * Return <0 if we need to cancel the scrub, returned value will
2080 * indicate the reason:
2081 * - -ECANCELED - Being explicitly canceled through ioctl.
2082 * - -EINTR - Being interrupted by signal or fs/process freezing.
2083 */
should_cancel_scrub(const struct scrub_ctx * sctx)2084 static int should_cancel_scrub(const struct scrub_ctx *sctx)
2085 {
2086 struct btrfs_fs_info *fs_info = sctx->fs_info;
2087
2088 if (atomic_read(&fs_info->scrub_cancel_req) ||
2089 atomic_read(&sctx->cancel_req))
2090 return -ECANCELED;
2091
2092 /*
2093 * The user (e.g. fsfreeze command) or power management (PM)
2094 * suspend/hibernate can freeze the fs. And PM suspend/hibernate will
2095 * also freeze all user processes.
2096 *
2097 * A user process can only be frozen when it is in user space, thus we
2098 * have to cancel the run so that the process can return to the user
2099 * space.
2100 *
2101 * Furthermore we have to check both filesystem and process freezing,
2102 * as PM can be configured to freeze the filesystems before processes.
2103 *
2104 * If we only check fs freezing, then suspend without fs freezing
2105 * will timeout, as the process is still in kernel space.
2106 *
2107 * If we only check process freezing, then suspend with fs freezing
2108 * will timeout, as the running scrub will prevent the fs from being frozen.
2109 */
2110 if (fs_info->sb->s_writers.frozen > SB_UNFROZEN ||
2111 freezing(current) || signal_pending(current))
2112 return -EINTR;
2113 return 0;
2114 }
2115
/*
 * Check and regenerate the P/Q stripes of one RAID56 full stripe.
 *
 * The data stripes in sctx->raid56_data_stripes have already been read
 * and repaired; they are handed to the raid56 layer as cache so the
 * rbio does not need to read them from disk again.
 *
 * @full_stripe_start: logical bytenr of the full stripe.
 * @extent_bitmap:     sectors (inside one BTRFS_STRIPE_LEN) covered by
 *                     extents; only those take part in the parity check.
 *
 * Returns 0 on success or a negative errno.
 */
static int scrub_raid56_cached_parity(struct scrub_ctx *sctx,
				      struct btrfs_device *scrub_dev,
				      struct btrfs_chunk_map *map,
				      u64 full_stripe_start,
				      unsigned long *extent_bitmap)
{
	DECLARE_COMPLETION_ONSTACK(io_done);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_raid_bio *rbio;
	struct bio bio;
	const int data_stripes = nr_data_stripes(map);
	u64 length = btrfs_stripe_nr_to_offset(data_stripes);
	int ret;

	/* On-stack bio only carries the logical start and the endio hook. */
	bio_init(&bio, NULL, NULL, 0, REQ_OP_READ);
	bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
	bio.bi_private = &io_done;
	bio.bi_end_io = raid56_scrub_wait_endio;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
			      &length, &bioc, NULL, NULL);
	if (ret < 0)
		goto out;
	/* For RAID56 write there must be an @bioc allocated. */
	ASSERT(bioc);
	rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap,
					      BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
	btrfs_put_bioc(bioc);
	if (!rbio) {
		ret = -ENOMEM;
		goto out;
	}
	/* Use the recovered stripes as cache to avoid read them from disk again. */
	for (int i = 0; i < data_stripes; i++) {
		struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i];

		raid56_parity_cache_data_folios(rbio, stripe->folios,
				full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT));
	}
	raid56_parity_submit_scrub_rbio(rbio);
	wait_for_completion_io(&io_done);
	ret = blk_status_to_errno(bio.bi_status);
out:
	btrfs_bio_counter_dec(fs_info);
	bio_uninit(&bio);
	return ret;
}
2165
/*
 * Scrub one RAID56 full stripe starting at @full_stripe_start.
 *
 * All data stripes are read (and repaired if possible) first; only when
 * every extent-covered sector is good do we verify/regenerate the P/Q
 * stripes, as rebuilding parity from corrupt data would make the
 * corruption permanent.
 *
 * Returns 0 on success (including the block group having been removed,
 * or all data stripes being empty), -EIO if unrepairable data sectors
 * remain, or another negative errno on failure.
 */
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
				      struct btrfs_device *scrub_dev,
				      struct btrfs_block_group *bg,
				      struct btrfs_chunk_map *map,
				      u64 full_stripe_start)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	BTRFS_PATH_AUTO_RELEASE(extent_path);
	BTRFS_PATH_AUTO_RELEASE(csum_path);
	struct scrub_stripe *stripe;
	bool all_empty = true;
	const int data_stripes = nr_data_stripes(map);
	unsigned long extent_bitmap = 0;
	int ret;

	ASSERT(sctx->raid56_data_stripes);

	ret = should_cancel_scrub(sctx);
	if (ret < 0)
		return ret;

	if (atomic_read(&fs_info->scrub_pause_req))
		scrub_blocked_if_needed(fs_info);

	/* The block group may have been removed in the meantime. */
	spin_lock(&bg->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
		spin_unlock(&bg->lock);
		return 0;
	}
	spin_unlock(&bg->lock);

	/*
	 * For data stripe search, we cannot reuse the same extent/csum paths,
	 * as the data stripe bytenr may be smaller than previous extent. Thus
	 * we have to use our own extent/csum paths.
	 */
	extent_path.search_commit_root = true;
	extent_path.skip_locking = true;
	csum_path.search_commit_root = true;
	csum_path.skip_locking = true;

	for (int i = 0; i < data_stripes; i++) {
		int stripe_index;
		int rot;
		u64 physical;

		stripe = &sctx->raid56_data_stripes[i];
		/* Map data stripe @i to its rotated physical location. */
		rot = div_u64(full_stripe_start - bg->start,
			      data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
		stripe_index = (i + rot) % map->num_stripes;
		physical = map->stripes[stripe_index].physical +
			   btrfs_stripe_nr_to_offset(rot);

		scrub_reset_stripe(stripe);
		set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
		ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
				map->stripes[stripe_index].dev, physical, 1,
				full_stripe_start + btrfs_stripe_nr_to_offset(i),
				BTRFS_STRIPE_LEN, stripe);
		if (ret < 0)
			return ret;
		/*
		 * No extent in this data stripe, need to manually mark them
		 * initialized to make later read submission happy.
		 */
		if (ret > 0) {
			stripe->logical = full_stripe_start +
					  btrfs_stripe_nr_to_offset(i);
			stripe->dev = map->stripes[stripe_index].dev;
			stripe->mirror_num = 1;
			set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
		}
	}

	/* Check if all data stripes are empty. */
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];
		if (!scrub_bitmap_empty_has_extent(stripe)) {
			all_empty = false;
			break;
		}
	}
	if (all_empty)
		return 0;

	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];
		scrub_submit_initial_read(sctx, stripe);
	}
	for (int i = 0; i < data_stripes; i++) {
		stripe = &sctx->raid56_data_stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}
	/* For now, no zoned support for RAID56. */
	ASSERT(!btrfs_is_zoned(sctx->fs_info));

	/*
	 * Now all data stripes are properly verified. Check if we have any
	 * unrepaired, if so abort immediately or we could further corrupt the
	 * P/Q stripes.
	 *
	 * During the loop, also populate extent_bitmap.
	 */
	for (int i = 0; i < data_stripes; i++) {
		unsigned long error;
		unsigned long has_extent;

		stripe = &sctx->raid56_data_stripes[i];

		error = scrub_bitmap_read_error(stripe);
		has_extent = scrub_bitmap_read_has_extent(stripe);

		/*
		 * We should only check the errors where there is an extent.
		 * As we may hit an empty data stripe while it's missing.
		 */
		bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
		if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) {
			btrfs_err(fs_info,
"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
				  full_stripe_start, i, stripe->nr_sectors,
				  &error);
			/*
			 * @ret still holds a stale value from the fill loop
			 * above (0, or >0 for "no extent"), so returning it
			 * here could report success despite the unrepaired
			 * corruption.  Return a real error instead.
			 */
			return -EIO;
		}
		bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent,
			  stripe->nr_sectors);
	}

	/* Now we can check and regenerate the P/Q stripe. */
	return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start,
					  &extent_bitmap);
}
2300
/*
 * Scrub one range which can only have a simple mirror based profile.
 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
 * RAID0/RAID10).
 *
 * Since we may need to handle a subset of a block group, we need the
 * @logical_start and @logical_length parameters.
 */
static int scrub_simple_mirror(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       u64 logical_start, u64 logical_length,
			       struct btrfs_device *device,
			       u64 physical, int mirror_num)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 logical_end = logical_start + logical_length;
	u64 cur_logical = logical_start;
	int ret = 0;

	/* The range must be inside the bg */
	ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg));

	/* Go through each extent items inside the logical range */
	while (cur_logical < logical_end) {
		u64 found_logical = U64_MAX;
		u64 cur_physical = physical + cur_logical - logical_start;

		/* Canceled? */
		ret = should_cancel_scrub(sctx);
		if (ret < 0)
			break;

		/* Paused? */
		if (atomic_read(&fs_info->scrub_pause_req))
			scrub_blocked_if_needed(fs_info);

		/* Block group removed? */
		spin_lock(&bg->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
			spin_unlock(&bg->lock);
			ret = 0;
			break;
		}
		spin_unlock(&bg->lock);

		ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
					 cur_logical, logical_end - cur_logical,
					 cur_physical, &found_logical);
		if (ret > 0) {
			/* No more extent, just update the accounting */
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = physical + logical_length;
			spin_unlock(&sctx->stat_lock);
			ret = 0;
			break;
		}
		if (ret < 0)
			break;

		/* queue_scrub_stripe() returned 0, @found_logical must be updated. */
		ASSERT(found_logical != U64_MAX);
		/* Continue right after the stripe that was just queued. */
		cur_logical = found_logical + BTRFS_STRIPE_LEN;

		/* Don't hold CPU for too long time */
		cond_resched();
	}
	return ret;
}
2366
2367 /* Calculate the full stripe length for simple stripe based profiles */
simple_stripe_full_stripe_len(const struct btrfs_chunk_map * map)2368 static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
2369 {
2370 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2371 BTRFS_BLOCK_GROUP_RAID10));
2372
2373 return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes);
2374 }
2375
2376 /* Get the logical bytenr for the stripe */
simple_stripe_get_logical(struct btrfs_chunk_map * map,struct btrfs_block_group * bg,int stripe_index)2377 static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
2378 struct btrfs_block_group *bg,
2379 int stripe_index)
2380 {
2381 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2382 BTRFS_BLOCK_GROUP_RAID10));
2383 ASSERT(stripe_index < map->num_stripes);
2384
2385 /*
2386 * (stripe_index / sub_stripes) gives how many data stripes we need to
2387 * skip.
2388 */
2389 return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) +
2390 bg->start;
2391 }
2392
2393 /* Get the mirror number for the stripe */
simple_stripe_mirror_num(struct btrfs_chunk_map * map,int stripe_index)2394 static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
2395 {
2396 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2397 BTRFS_BLOCK_GROUP_RAID10));
2398 ASSERT(stripe_index < map->num_stripes);
2399
2400 /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
2401 return stripe_index % map->sub_stripes + 1;
2402 }
2403
scrub_simple_stripe(struct scrub_ctx * sctx,struct btrfs_block_group * bg,struct btrfs_chunk_map * map,struct btrfs_device * device,int stripe_index)2404 static int scrub_simple_stripe(struct scrub_ctx *sctx,
2405 struct btrfs_block_group *bg,
2406 struct btrfs_chunk_map *map,
2407 struct btrfs_device *device,
2408 int stripe_index)
2409 {
2410 const u64 logical_increment = simple_stripe_full_stripe_len(map);
2411 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
2412 const u64 orig_physical = map->stripes[stripe_index].physical;
2413 const u64 end = btrfs_block_group_end(bg);
2414 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
2415 u64 cur_logical = orig_logical;
2416 u64 cur_physical = orig_physical;
2417 int ret = 0;
2418
2419 while (cur_logical < end) {
2420 /*
2421 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
2422 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
2423 * this stripe.
2424 */
2425 ret = scrub_simple_mirror(sctx, bg, cur_logical,
2426 BTRFS_STRIPE_LEN, device, cur_physical,
2427 mirror_num);
2428 if (ret)
2429 return ret;
2430 /* Skip to next stripe which belongs to the target device */
2431 cur_logical += logical_increment;
2432 /* For physical offset, we just go to next stripe */
2433 cur_physical += BTRFS_STRIPE_LEN;
2434 }
2435 return ret;
2436 }
2437
/*
 * Scrub the device stripe at @stripe_index of @map inside block group @bg.
 *
 * Dispatches per profile: plain mirrors (SINGLE/DUP/RAID1/RAID1C*) go
 * through scrub_simple_mirror(), RAID0/RAID10 through
 * scrub_simple_stripe(), and RAID56 iterates the stripes by physical
 * offset, handling parity stripes separately.
 */
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
					   struct btrfs_block_group *bg,
					   struct btrfs_chunk_map *map,
					   struct btrfs_device *scrub_dev,
					   int stripe_index)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	const u64 chunk_logical = bg->start;
	int ret;
	int ret2;
	u64 physical = map->stripes[stripe_index].physical;
	const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
	const u64 physical_end = physical + dev_stripe_len;
	u64 logical;
	u64 logic_end;
	/* The logical increment after finishing one stripe */
	u64 increment;
	/* Offset inside the chunk */
	u64 offset;
	u64 stripe_logical;

	/* Extent_path should be released by now. */
	ASSERT(sctx->extent_path.nodes[0] == NULL);

	scrub_blocked_if_needed(fs_info);

	/* For zoned dev-replace, start writing at this dev extent. */
	if (sctx->is_dev_replace &&
	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
		mutex_lock(&sctx->wr_lock);
		sctx->write_pointer = physical;
		mutex_unlock(&sctx->wr_lock);
	}

	/* Prepare the extra data stripes used by RAID56. */
	if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ASSERT(sctx->raid56_data_stripes == NULL);

		sctx->raid56_data_stripes = kzalloc_objs(struct scrub_stripe,
							 nr_data_stripes(map));
		if (!sctx->raid56_data_stripes) {
			ret = -ENOMEM;
			goto out;
		}
		for (int i = 0; i < nr_data_stripes(map); i++) {
			ret = init_scrub_stripe(fs_info,
						&sctx->raid56_data_stripes[i]);
			if (ret < 0)
				goto out;
			sctx->raid56_data_stripes[i].bg = bg;
			sctx->raid56_data_stripes[i].sctx = sctx;
		}
	}
	/*
	 * There used to be a big double loop to handle all profiles using the
	 * same routine, which grows larger and more gross over time.
	 *
	 * So here we handle each profile differently, so simpler profiles
	 * have simpler scrubbing function.
	 */
	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * Above check rules out all complex profile, the remaining
		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
		 * mirrored duplication without stripe.
		 *
		 * Only @physical and @mirror_num needs to calculated using
		 * @stripe_index.
		 */
		ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
				scrub_dev, map->stripes[stripe_index].physical,
				stripe_index + 1);
		offset = 0;
		goto out;
	}
	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
		offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
		goto out;
	}

	/* Only RAID56 goes through the old code */
	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
	ret = 0;

	/* Calculate the logical end of the stripe */
	get_raid56_logic_offset(physical_end, stripe_index,
				map, &logic_end, NULL);
	logic_end += chunk_logical;

	/* Initialize @offset in case we need to go to out: label */
	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
	increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

	/*
	 * Due to the rotation, for RAID56 it's better to iterate each stripe
	 * using their physical offset.
	 */
	while (physical < physical_end) {
		ret = get_raid56_logic_offset(physical, stripe_index, map,
					      &logical, &stripe_logical);
		logical += chunk_logical;
		if (ret) {
			/* it is parity strip */
			stripe_logical += chunk_logical;
			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
							 map, stripe_logical);
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
						       physical_end);
			spin_unlock(&sctx->stat_lock);
			if (ret)
				goto out;
			goto next;
		}

		/*
		 * Now we're at a data stripe, scrub each extents in the range.
		 *
		 * At this stage, if we ignore the repair part, inside each data
		 * stripe it is no different than SINGLE profile.
		 * We can reuse scrub_simple_mirror() here, as the repair part
		 * is still based on @mirror_num.
		 */
		ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
					  scrub_dev, physical, 1);
		if (ret < 0)
			goto out;
next:
		logical += increment;
		physical += BTRFS_STRIPE_LEN;
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = physical;
		spin_unlock(&sctx->stat_lock);
	}
out:
	ret2 = flush_scrub_stripes(sctx);
	if (!ret)
		ret = ret2;
	btrfs_release_path(&sctx->extent_path);
	btrfs_release_path(&sctx->csum_path);

	if (sctx->raid56_data_stripes) {
		for (int i = 0; i < nr_data_stripes(map); i++)
			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
		kfree(sctx->raid56_data_stripes);
		sctx->raid56_data_stripes = NULL;
	}

	if (sctx->is_dev_replace && ret >= 0) {
		ret2 = sync_write_pointer_for_zoned(sctx,
				chunk_logical + offset,
				map->stripes[stripe_index].physical,
				physical_end);
		if (ret2)
			ret = ret2;
	}

	return ret < 0 ? ret : 0;
}
2599
scrub_chunk(struct scrub_ctx * sctx,struct btrfs_block_group * bg,struct btrfs_device * scrub_dev,u64 dev_offset,u64 dev_extent_len)2600 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2601 struct btrfs_block_group *bg,
2602 struct btrfs_device *scrub_dev,
2603 u64 dev_offset,
2604 u64 dev_extent_len)
2605 {
2606 struct btrfs_fs_info *fs_info = sctx->fs_info;
2607 struct btrfs_chunk_map *map;
2608 int i;
2609 int ret = 0;
2610
2611 map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
2612 if (!map) {
2613 /*
2614 * Might have been an unused block group deleted by the cleaner
2615 * kthread or relocation.
2616 */
2617 spin_lock(&bg->lock);
2618 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
2619 ret = -EINVAL;
2620 spin_unlock(&bg->lock);
2621
2622 return ret;
2623 }
2624 if (map->start != bg->start)
2625 goto out;
2626 if (map->chunk_len < dev_extent_len)
2627 goto out;
2628
2629 for (i = 0; i < map->num_stripes; ++i) {
2630 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2631 map->stripes[i].physical == dev_offset) {
2632 ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
2633 if (ret)
2634 goto out;
2635 }
2636 }
2637 out:
2638 btrfs_free_chunk_map(map);
2639
2640 return ret;
2641 }
2642
/*
 * On zoned filesystems, wait for all pending extent writes into @cache
 * to settle and commit the current transaction. No-op on non-zoned
 * filesystems.
 */
static int finish_extent_writes_for_zoned(struct btrfs_root *root,
					  struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Flush reservations, nocow writers and ordered extents in order. */
	btrfs_wait_block_group_reservations(cache);
	btrfs_wait_nocow_writers(cache);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);

	return btrfs_commit_current_transaction(root);
}
2657
2658 static noinline_for_stack
scrub_enumerate_chunks(struct scrub_ctx * sctx,struct btrfs_device * scrub_dev,u64 start,u64 end)2659 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2660 struct btrfs_device *scrub_dev, u64 start, u64 end)
2661 {
2662 struct btrfs_dev_extent *dev_extent = NULL;
2663 BTRFS_PATH_AUTO_FREE(path);
2664 struct btrfs_fs_info *fs_info = sctx->fs_info;
2665 struct btrfs_root *root = fs_info->dev_root;
2666 u64 chunk_offset;
2667 int ret = 0;
2668 int ro_set;
2669 int slot;
2670 struct extent_buffer *l;
2671 struct btrfs_key key;
2672 struct btrfs_key found_key;
2673 struct btrfs_block_group *cache;
2674 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2675
2676 path = btrfs_alloc_path();
2677 if (!path)
2678 return -ENOMEM;
2679
2680 path->reada = READA_FORWARD;
2681 path->search_commit_root = true;
2682 path->skip_locking = true;
2683
2684 key.objectid = scrub_dev->devid;
2685 key.type = BTRFS_DEV_EXTENT_KEY;
2686 key.offset = 0ull;
2687
2688 while (1) {
2689 u64 dev_extent_len;
2690
2691 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2692 if (ret < 0)
2693 break;
2694 if (ret > 0) {
2695 if (path->slots[0] >=
2696 btrfs_header_nritems(path->nodes[0])) {
2697 ret = btrfs_next_leaf(root, path);
2698 if (ret < 0)
2699 break;
2700 if (ret > 0) {
2701 ret = 0;
2702 break;
2703 }
2704 } else {
2705 ret = 0;
2706 }
2707 }
2708
2709 l = path->nodes[0];
2710 slot = path->slots[0];
2711
2712 btrfs_item_key_to_cpu(l, &found_key, slot);
2713
2714 if (found_key.objectid != scrub_dev->devid)
2715 break;
2716
2717 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
2718 break;
2719
2720 if (found_key.offset >= end)
2721 break;
2722
2723 if (found_key.offset < key.offset)
2724 break;
2725
2726 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2727 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
2728
2729 if (found_key.offset + dev_extent_len <= start)
2730 goto skip;
2731
2732 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2733
2734 /*
2735 * get a reference on the corresponding block group to prevent
2736 * the chunk from going away while we scrub it
2737 */
2738 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2739
2740 /* some chunks are removed but not committed to disk yet,
2741 * continue scrubbing */
2742 if (!cache)
2743 goto skip;
2744
2745 ASSERT(cache->start <= chunk_offset);
2746 /*
2747 * We are using the commit root to search for device extents, so
2748 * that means we could have found a device extent item from a
2749 * block group that was deleted in the current transaction. The
2750 * logical start offset of the deleted block group, stored at
2751 * @chunk_offset, might be part of the logical address range of
2752 * a new block group (which uses different physical extents).
2753 * In this case btrfs_lookup_block_group() has returned the new
2754 * block group, and its start address is less than @chunk_offset.
2755 *
2756 * We skip such new block groups, because it's pointless to
2757 * process them, as we won't find their extents because we search
2758 * for them using the commit root of the extent tree. For a device
2759 * replace it's also fine to skip it, we won't miss copying them
2760 * to the target device because we have the write duplication
2761 * setup through the regular write path (by btrfs_map_block()),
2762 * and we have committed a transaction when we started the device
2763 * replace, right after setting up the device replace state.
2764 */
2765 if (cache->start < chunk_offset) {
2766 btrfs_put_block_group(cache);
2767 goto skip;
2768 }
2769
2770 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
2771 if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
2772 btrfs_put_block_group(cache);
2773 goto skip;
2774 }
2775 }
2776
2777 /*
2778 * Make sure that while we are scrubbing the corresponding block
2779 * group doesn't get its logical address and its device extents
2780 * reused for another block group, which can possibly be of a
2781 * different type and different profile. We do this to prevent
2782 * false error detections and crashes due to bogus attempts to
2783 * repair extents.
2784 */
2785 spin_lock(&cache->lock);
2786 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
2787 spin_unlock(&cache->lock);
2788 btrfs_put_block_group(cache);
2789 goto skip;
2790 }
2791 btrfs_freeze_block_group(cache);
2792 spin_unlock(&cache->lock);
2793
2794 /*
2795 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
2796 * to avoid deadlock caused by:
2797 * btrfs_inc_block_group_ro()
2798 * -> btrfs_wait_for_commit()
2799 * -> btrfs_commit_transaction()
2800 * -> btrfs_scrub_pause()
2801 */
2802 scrub_pause_on(fs_info);
2803
2804 /*
2805 * Don't do chunk preallocation for scrub.
2806 *
2807 * This is especially important for SYSTEM bgs, or we can hit
2808 * -EFBIG from btrfs_finish_chunk_alloc() like:
2809 * 1. The only SYSTEM bg is marked RO.
2810 * Since SYSTEM bg is small, that's pretty common.
2811 * 2. New SYSTEM bg will be allocated
2812 * Due to regular version will allocate new chunk.
2813 * 3. New SYSTEM bg is empty and will get cleaned up
2814 * Before cleanup really happens, it's marked RO again.
2815 * 4. Empty SYSTEM bg get scrubbed
2816 * We go back to 2.
2817 *
2818 * This can easily boost the amount of SYSTEM chunks if cleaner
2819 * thread can't be triggered fast enough, and use up all space
2820 * of btrfs_super_block::sys_chunk_array
2821 *
2822 * While for dev replace, we need to try our best to mark block
2823 * group RO, to prevent race between:
2824 * - Write duplication
2825 * Contains latest data
2826 * - Scrub copy
2827 * Contains data from commit tree
2828 *
2829 * If target block group is not marked RO, nocow writes can
2830 * be overwritten by scrub copy, causing data corruption.
2831 * So for dev-replace, it's not allowed to continue if a block
2832 * group is not RO.
2833 */
2834 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
2835 if (!ret && sctx->is_dev_replace) {
2836 ret = finish_extent_writes_for_zoned(root, cache);
2837 if (ret) {
2838 btrfs_dec_block_group_ro(cache);
2839 scrub_pause_off(fs_info);
2840 btrfs_put_block_group(cache);
2841 break;
2842 }
2843 }
2844
2845 if (ret == 0) {
2846 ro_set = 1;
2847 } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
2848 !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
2849 /*
2850 * btrfs_inc_block_group_ro return -ENOSPC when it
2851 * failed in creating new chunk for metadata.
2852 * It is not a problem for scrub, because
2853 * metadata are always cowed, and our scrub paused
2854 * commit_transactions.
2855 *
2856 * For RAID56 chunks, we have to mark them read-only
2857 * for scrub, as later we would use our own cache
2858 * out of RAID56 realm.
2859 * Thus we want the RAID56 bg to be marked RO to
2860 * prevent RMW from screwing up out cache.
2861 */
2862 ro_set = 0;
2863 } else if (ret == -ETXTBSY) {
2864 btrfs_warn(fs_info,
2865 "scrub: skipping scrub of block group %llu due to active swapfile",
2866 cache->start);
2867 scrub_pause_off(fs_info);
2868 ret = 0;
2869 goto skip_unfreeze;
2870 } else {
2871 btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
2872 ret);
2873 btrfs_unfreeze_block_group(cache);
2874 btrfs_put_block_group(cache);
2875 scrub_pause_off(fs_info);
2876 break;
2877 }
2878
2879 /*
2880 * Now the target block is marked RO, wait for nocow writes to
2881 * finish before dev-replace.
2882 * COW is fine, as COW never overwrites extents in commit tree.
2883 */
2884 if (sctx->is_dev_replace) {
2885 btrfs_wait_nocow_writers(cache);
2886 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
2887 }
2888
2889 scrub_pause_off(fs_info);
2890 down_write(&dev_replace->rwsem);
2891 dev_replace->cursor_right = found_key.offset + dev_extent_len;
2892 dev_replace->cursor_left = found_key.offset;
2893 dev_replace->item_needs_writeback = 1;
2894 up_write(&dev_replace->rwsem);
2895
2896 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
2897 dev_extent_len);
2898 if (sctx->is_dev_replace &&
2899 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
2900 cache, found_key.offset))
2901 ro_set = 0;
2902
2903 down_write(&dev_replace->rwsem);
2904 dev_replace->cursor_left = dev_replace->cursor_right;
2905 dev_replace->item_needs_writeback = 1;
2906 up_write(&dev_replace->rwsem);
2907
2908 if (ro_set)
2909 btrfs_dec_block_group_ro(cache);
2910
2911 /*
2912 * We might have prevented the cleaner kthread from deleting
2913 * this block group if it was already unused because we raced
2914 * and set it to RO mode first. So add it back to the unused
2915 * list, otherwise it might not ever be deleted unless a manual
2916 * balance is triggered or it becomes used and unused again.
2917 */
2918 spin_lock(&cache->lock);
2919 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
2920 !cache->ro && cache->reserved == 0 && cache->used == 0) {
2921 spin_unlock(&cache->lock);
2922 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
2923 btrfs_discard_queue_work(&fs_info->discard_ctl,
2924 cache);
2925 else
2926 btrfs_mark_bg_unused(cache);
2927 } else {
2928 spin_unlock(&cache->lock);
2929 }
2930 skip_unfreeze:
2931 btrfs_unfreeze_block_group(cache);
2932 btrfs_put_block_group(cache);
2933 if (ret)
2934 break;
2935 if (unlikely(sctx->is_dev_replace &&
2936 atomic64_read(&dev_replace->num_write_errors) > 0)) {
2937 ret = -EIO;
2938 break;
2939 }
2940 if (sctx->stat.malloc_errors > 0) {
2941 ret = -ENOMEM;
2942 break;
2943 }
2944 skip:
2945 key.offset = found_key.offset + dev_extent_len;
2946 btrfs_release_path(path);
2947 }
2948
2949 return ret;
2950 }
2951
/*
 * Read and verify one super block copy of @dev at byte offset @physical.
 *
 * The copy is read into @page, then its checksum is verified, its
 * generation is compared against the expected @generation, and finally the
 * full super block validation is run.
 *
 * Returns 0 if the copy is good, or a negative errno (-EIO for a bad
 * checksum, -EUCLEAN for a stale/wrong generation, or the error from the
 * read/validation) otherwise.
 */
static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
			   struct page *page, u64 physical, u64 generation)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_super_block *sb = page_address(page);
	int err;

	/* Pull the whole super block copy off the device. */
	err = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
			   BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
	if (err < 0)
		return err;

	/* A checksum mismatch means this copy is corrupted on disk. */
	if (unlikely(btrfs_check_super_csum(fs_info, sb) != 0)) {
		btrfs_err_rl(fs_info,
		"scrub: super block at physical %llu devid %llu has bad csum",
			     physical, dev->devid);
		return -EIO;
	}

	/* A generation mismatch indicates an old or partially written copy. */
	if (unlikely(btrfs_super_generation(sb) != generation)) {
		btrfs_err_rl(fs_info,
"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
			     physical, dev->devid,
			     btrfs_super_generation(sb), generation);
		return -EUCLEAN;
	}

	return btrfs_validate_super(fs_info, sb, -1);
}
2980
scrub_supers(struct scrub_ctx * sctx,struct btrfs_device * scrub_dev)2981 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2982 struct btrfs_device *scrub_dev)
2983 {
2984 int i;
2985 u64 bytenr;
2986 u64 gen;
2987 int ret = 0;
2988 struct page *page;
2989 struct btrfs_fs_info *fs_info = sctx->fs_info;
2990
2991 if (BTRFS_FS_ERROR(fs_info))
2992 return -EROFS;
2993
2994 page = alloc_page(GFP_KERNEL);
2995 if (!page) {
2996 spin_lock(&sctx->stat_lock);
2997 sctx->stat.malloc_errors++;
2998 spin_unlock(&sctx->stat_lock);
2999 return -ENOMEM;
3000 }
3001
3002 /* Seed devices of a new filesystem has their own generation. */
3003 if (scrub_dev->fs_devices != fs_info->fs_devices)
3004 gen = scrub_dev->generation;
3005 else
3006 gen = btrfs_get_last_trans_committed(fs_info);
3007
3008 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3009 ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr);
3010 if (ret == -ENOENT)
3011 break;
3012
3013 if (ret) {
3014 spin_lock(&sctx->stat_lock);
3015 sctx->stat.super_errors++;
3016 spin_unlock(&sctx->stat_lock);
3017 continue;
3018 }
3019
3020 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3021 scrub_dev->commit_total_bytes)
3022 break;
3023 if (!btrfs_check_super_location(scrub_dev, bytenr))
3024 continue;
3025
3026 ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
3027 if (ret) {
3028 spin_lock(&sctx->stat_lock);
3029 sctx->stat.super_errors++;
3030 spin_unlock(&sctx->stat_lock);
3031 }
3032 }
3033 __free_page(page);
3034 return 0;
3035 }
3036
scrub_workers_put(struct btrfs_fs_info * fs_info)3037 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3038 {
3039 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3040 &fs_info->scrub_lock)) {
3041 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
3042
3043 fs_info->scrub_workers = NULL;
3044 mutex_unlock(&fs_info->scrub_lock);
3045
3046 if (scrub_workers)
3047 destroy_workqueue(scrub_workers);
3048 }
3049 }
3050
3051 /*
3052 * get a reference count on fs_info->scrub_workers. start worker if necessary
3053 */
scrub_workers_get(struct btrfs_fs_info * fs_info)3054 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
3055 {
3056 struct workqueue_struct *scrub_workers = NULL;
3057 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3058 int max_active = fs_info->thread_pool_size;
3059 int ret = -ENOMEM;
3060
3061 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
3062 return 0;
3063
3064 scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
3065 if (!scrub_workers)
3066 return -ENOMEM;
3067
3068 mutex_lock(&fs_info->scrub_lock);
3069 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
3070 ASSERT(fs_info->scrub_workers == NULL);
3071 fs_info->scrub_workers = scrub_workers;
3072 refcount_set(&fs_info->scrub_workers_refcnt, 1);
3073 mutex_unlock(&fs_info->scrub_lock);
3074 return 0;
3075 }
3076 /* Other thread raced in and created the workers for us */
3077 refcount_inc(&fs_info->scrub_workers_refcnt);
3078 mutex_unlock(&fs_info->scrub_lock);
3079
3080 ret = 0;
3081
3082 destroy_workqueue(scrub_workers);
3083 return ret;
3084 }
3085
/*
 * Scrub (and, unless @readonly, repair) the allocated extents of one device.
 *
 * @fs_info:	    the filesystem to scrub
 * @devid:	    devid of the device to scrub
 * @start:	    first physical byte offset on the device to scrub
 * @end:	    end of the physical byte range to scrub
 * @progress:	    if non-NULL, receives the final scrub statistics
 * @readonly:	    only report errors, do not attempt any repair
 * @is_dev_replace: run as the read side of a device-replace operation
 *
 * Returns 0 on success or a negative errno: -EAGAIN if the fs is closing,
 * -ENODEV for an unknown devid, -EROFS for a repairing scrub on a
 * non-writable device, -EINPROGRESS when a scrub on this device or a
 * conflicting dev-replace is already running, or the error from chunk
 * enumeration / the final transaction commit.
 */
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
		    u64 end, struct btrfs_scrub_progress *progress,
		    bool readonly, bool is_dev_replace)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct scrub_ctx *sctx;
	int ret;
	struct btrfs_device *dev;
	unsigned int nofs_flag;
	bool need_commit = false;

	/* Set the basic fallback @last_physical before we got a sctx. */
	if (progress)
		progress->last_physical = start;

	if (btrfs_fs_closing(fs_info))
		return -EAGAIN;

	/* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);

	/*
	 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
	 * value (max nodesize / min sectorsize), thus nodesize should always
	 * be fine.
	 */
	ASSERT(fs_info->nodesize <=
	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);

	/* Allocate outside of device_list_mutex */
	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);
	sctx->stat.last_physical = start;

	ret = scrub_workers_get(fs_info);
	if (ret)
		goto out_free_ctx;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	/* A missing device is only acceptable for dev-replace. */
	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
		     !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -ENODEV;
		goto out;
	}

	/* A repairing (non-readonly) scrub needs a writable device. */
	if (!is_dev_replace && !readonly &&
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_err(fs_info,
			  "scrub: devid %llu: filesystem on %s is not writable",
			  devid, btrfs_dev_name(dev));
		ret = -EROFS;
		goto out;
	}

	mutex_lock(&fs_info->scrub_lock);
	/* Refuse devices not part of the fs metadata or replace targets. */
	if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EIO;
		goto out;
	}

	/*
	 * Only one scrub per device at a time, and a plain scrub must not
	 * run concurrently with an ongoing dev-replace.
	 */
	down_read(&fs_info->dev_replace.rwsem);
	if (dev->scrub_ctx ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		up_read(&fs_info->dev_replace.rwsem);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EINPROGRESS;
		goto out;
	}
	up_read(&fs_info->dev_replace.rwsem);

	sctx->readonly = readonly;
	dev->scrub_ctx = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * checking @scrub_pause_req here, we can avoid
	 * race between committing transaction and scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * In order to avoid deadlock with reclaim when there is a transaction
	 * trying to pause scrub, make sure we use GFP_NOFS for all the
	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
	 * invoked by our callees. The pausing request is done when the
	 * transaction commit starts, and it blocks the transaction until scrub
	 * is paused (done at specific points at scrub_stripe() or right above
	 * before incrementing fs_info->scrubs_running).
	 */
	nofs_flag = memalloc_nofs_save();
	if (!is_dev_replace) {
		u64 old_super_errors;

		spin_lock(&sctx->stat_lock);
		old_super_errors = sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);

		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
		/*
		 * by holding device list mutex, we can
		 * kick off writing super in log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		spin_lock(&sctx->stat_lock);
		/*
		 * Super block errors found, but we can not commit transaction
		 * at current context, since btrfs_commit_transaction() needs
		 * to pause the current running scrub (hold by ourselves).
		 */
		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
			need_commit = true;
		spin_unlock(&sctx->stat_lock);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	if (!is_dev_replace)
		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
			ret ? "not finished" : "finished", devid, ret);

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_workers_put(fs_info);
	scrub_put_ctx(sctx);

	/*
	 * We found some super block errors before, now try to force a
	 * transaction commit, as scrub has finished.
	 */
	if (need_commit) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(fs_info->tree_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_err(fs_info,
	"scrub: failed to start transaction to fix super block errors: %d", ret);
			return ret;
		}
		ret = btrfs_commit_transaction(trans);
		if (ret < 0)
			btrfs_err(fs_info,
	"scrub: failed to commit transaction to fix super block errors: %d", ret);
	}
	return ret;
out:
	scrub_workers_put(fs_info);
out_free_ctx:
	scrub_free_ctx(sctx);

	return ret;
}
3262
btrfs_scrub_pause(struct btrfs_fs_info * fs_info)3263 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
3264 {
3265 mutex_lock(&fs_info->scrub_lock);
3266 atomic_inc(&fs_info->scrub_pause_req);
3267 while (atomic_read(&fs_info->scrubs_paused) !=
3268 atomic_read(&fs_info->scrubs_running)) {
3269 mutex_unlock(&fs_info->scrub_lock);
3270 wait_event(fs_info->scrub_pause_wait,
3271 atomic_read(&fs_info->scrubs_paused) ==
3272 atomic_read(&fs_info->scrubs_running));
3273 mutex_lock(&fs_info->scrub_lock);
3274 }
3275 mutex_unlock(&fs_info->scrub_lock);
3276 }
3277
/*
 * Drop one pause request taken by btrfs_scrub_pause() and wake up scrubs
 * waiting to resume.
 */
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}
3283
btrfs_scrub_cancel(struct btrfs_fs_info * fs_info)3284 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3285 {
3286 mutex_lock(&fs_info->scrub_lock);
3287 if (!atomic_read(&fs_info->scrubs_running)) {
3288 mutex_unlock(&fs_info->scrub_lock);
3289 return -ENOTCONN;
3290 }
3291
3292 atomic_inc(&fs_info->scrub_cancel_req);
3293 while (atomic_read(&fs_info->scrubs_running)) {
3294 mutex_unlock(&fs_info->scrub_lock);
3295 wait_event(fs_info->scrub_pause_wait,
3296 atomic_read(&fs_info->scrubs_running) == 0);
3297 mutex_lock(&fs_info->scrub_lock);
3298 }
3299 atomic_dec(&fs_info->scrub_cancel_req);
3300 mutex_unlock(&fs_info->scrub_lock);
3301
3302 return 0;
3303 }
3304
btrfs_scrub_cancel_dev(struct btrfs_device * dev)3305 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
3306 {
3307 struct btrfs_fs_info *fs_info = dev->fs_info;
3308 struct scrub_ctx *sctx;
3309
3310 mutex_lock(&fs_info->scrub_lock);
3311 sctx = dev->scrub_ctx;
3312 if (!sctx) {
3313 mutex_unlock(&fs_info->scrub_lock);
3314 return -ENOTCONN;
3315 }
3316 atomic_inc(&sctx->cancel_req);
3317 while (dev->scrub_ctx) {
3318 mutex_unlock(&fs_info->scrub_lock);
3319 wait_event(fs_info->scrub_pause_wait,
3320 dev->scrub_ctx == NULL);
3321 mutex_lock(&fs_info->scrub_lock);
3322 }
3323 mutex_unlock(&fs_info->scrub_lock);
3324
3325 return 0;
3326 }
3327
/*
 * Copy the statistics of the scrub running on device @devid into @progress.
 *
 * Returns 0 on success, -ENODEV if the device does not exist, or -ENOTCONN
 * if no scrub is running on it (in which case @progress is left untouched).
 */
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;
	int ret = -ENODEV;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (dev) {
		sctx = dev->scrub_ctx;
		ret = sctx ? 0 : -ENOTCONN;
	}
	/* Copy under the mutex so the sctx cannot go away underneath us. */
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}
3345