1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011, 2012 STRATO. All rights reserved. 4 */ 5 6 #include <linux/blkdev.h> 7 #include <linux/ratelimit.h> 8 #include <linux/sched/mm.h> 9 #include "ctree.h" 10 #include "discard.h" 11 #include "volumes.h" 12 #include "disk-io.h" 13 #include "ordered-data.h" 14 #include "transaction.h" 15 #include "backref.h" 16 #include "extent_io.h" 17 #include "dev-replace.h" 18 #include "raid56.h" 19 #include "block-group.h" 20 #include "zoned.h" 21 #include "fs.h" 22 #include "accessors.h" 23 #include "file-item.h" 24 #include "scrub.h" 25 #include "raid-stripe-tree.h" 26 27 /* 28 * This is only the first step towards a full-features scrub. It reads all 29 * extent and super block and verifies the checksums. In case a bad checksum 30 * is found or the extent cannot be read, good data will be written back if 31 * any can be found. 32 * 33 * Future enhancements: 34 * - In case an unrepairable extent is encountered, track which files are 35 * affected and report them 36 * - track and record media errors, throw out bad devices 37 * - add a mode to also read unallocated space 38 */ 39 40 struct scrub_ctx; 41 42 /* 43 * The following value only influences the performance. 44 * 45 * This determines how many stripes would be submitted in one go, 46 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). 47 */ 48 #define SCRUB_STRIPES_PER_GROUP 8 49 50 /* 51 * How many groups we have for each sctx. 52 * 53 * This would be 8M per device, the same value as the old scrub in-flight bios 54 * size limit. 55 */ 56 #define SCRUB_GROUPS_PER_SCTX 16 57 58 #define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) 59 60 /* 61 * The following value times PAGE_SIZE needs to be large enough to match the 62 * largest node/leaf/sector size that shall be supported. 
63 */ 64 #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) 65 66 /* Represent one sector and its needed info to verify the content. */ 67 struct scrub_sector_verification { 68 union { 69 /* 70 * Csum pointer for data csum verification. Should point to a 71 * sector csum inside scrub_stripe::csums. 72 * 73 * NULL if this data sector has no csum. 74 */ 75 u8 *csum; 76 77 /* 78 * Extra info for metadata verification. All sectors inside a 79 * tree block share the same generation. 80 */ 81 u64 generation; 82 }; 83 }; 84 85 enum scrub_stripe_flags { 86 /* Set when @mirror_num, @dev, @physical and @logical are set. */ 87 SCRUB_STRIPE_FLAG_INITIALIZED, 88 89 /* Set when the read-repair is finished. */ 90 SCRUB_STRIPE_FLAG_REPAIR_DONE, 91 92 /* 93 * Set for data stripes if it's triggered from P/Q stripe. 94 * During such scrub, we should not report errors in data stripes, nor 95 * update the accounting. 96 */ 97 SCRUB_STRIPE_FLAG_NO_REPORT, 98 }; 99 100 /* 101 * We have multiple bitmaps for one scrub_stripe. 102 * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits, 103 * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64). 104 * 105 * So to reduce memory usage for each scrub_stripe, we pack those bitmaps 106 * into a larger one. 107 * 108 * These enum records where the sub-bitmap are inside the larger one. 109 * Each subbitmap starts at scrub_bitmap_nr_##name * nr_sectors bit. 110 */ 111 enum { 112 /* Which blocks are covered by extent items. */ 113 scrub_bitmap_nr_has_extent = 0, 114 115 /* Which blocks are metadata. */ 116 scrub_bitmap_nr_is_metadata, 117 118 /* 119 * Which blocks have errors, including IO, csum, and metadata 120 * errors. 121 * This sub-bitmap is the OR results of the next few error related 122 * sub-bitmaps. 
123 */ 124 scrub_bitmap_nr_error, 125 scrub_bitmap_nr_io_error, 126 scrub_bitmap_nr_csum_error, 127 scrub_bitmap_nr_meta_error, 128 scrub_bitmap_nr_meta_gen_error, 129 scrub_bitmap_nr_last, 130 }; 131 132 #define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE) 133 134 /* 135 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. 136 */ 137 struct scrub_stripe { 138 struct scrub_ctx *sctx; 139 struct btrfs_block_group *bg; 140 141 struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS]; 142 struct scrub_sector_verification *sectors; 143 144 struct btrfs_device *dev; 145 u64 logical; 146 u64 physical; 147 148 u16 mirror_num; 149 150 /* Should be BTRFS_STRIPE_LEN / sectorsize. */ 151 u16 nr_sectors; 152 153 /* 154 * How many data/meta extents are in this stripe. Only for scrub status 155 * reporting purposes. 156 */ 157 u16 nr_data_extents; 158 u16 nr_meta_extents; 159 160 atomic_t pending_io; 161 wait_queue_head_t io_wait; 162 wait_queue_head_t repair_wait; 163 164 /* 165 * Indicate the states of the stripe. Bits are defined in 166 * scrub_stripe_flags enum. 167 */ 168 unsigned long state; 169 170 /* The large bitmap contains all the sub-bitmaps. */ 171 unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last * 172 (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))]; 173 174 /* 175 * For writeback (repair or replace) error reporting. 176 * This one is protected by a spinlock, thus can not be packed into 177 * the larger bitmap. 178 */ 179 unsigned long write_error_bitmap; 180 181 /* Writeback can be concurrent, thus we need to protect the bitmap. */ 182 spinlock_t write_error_lock; 183 184 /* 185 * Checksum for the whole stripe if this stripe is inside a data block 186 * group. 
187 */ 188 u8 *csums; 189 190 struct work_struct work; 191 }; 192 193 struct scrub_ctx { 194 struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES]; 195 struct scrub_stripe *raid56_data_stripes; 196 struct btrfs_fs_info *fs_info; 197 struct btrfs_path extent_path; 198 struct btrfs_path csum_path; 199 int first_free; 200 int cur_stripe; 201 atomic_t cancel_req; 202 int readonly; 203 204 /* State of IO submission throttling affecting the associated device */ 205 ktime_t throttle_deadline; 206 u64 throttle_sent; 207 208 bool is_dev_replace; 209 u64 write_pointer; 210 211 struct mutex wr_lock; 212 struct btrfs_device *wr_tgtdev; 213 214 /* 215 * statistics 216 */ 217 struct btrfs_scrub_progress stat; 218 spinlock_t stat_lock; 219 220 /* 221 * Use a ref counter to avoid use-after-free issues. Scrub workers 222 * decrement bios_in_flight and workers_pending and then do a wakeup 223 * on the list_wait wait queue. We must ensure the main scrub task 224 * doesn't free the scrub context before or while the workers are 225 * doing the wakeup() call. 
226 */ 227 refcount_t refs; 228 }; 229 230 #define scrub_calc_start_bit(stripe, name, block_nr) \ 231 ({ \ 232 unsigned int __start_bit; \ 233 \ 234 ASSERT(block_nr < stripe->nr_sectors, \ 235 "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \ 236 __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \ 237 __start_bit; \ 238 }) 239 240 #define IMPLEMENT_SCRUB_BITMAP_OPS(name) \ 241 static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \ 242 unsigned int block_nr, \ 243 unsigned int nr_blocks) \ 244 { \ 245 const unsigned int start_bit = scrub_calc_start_bit(stripe, \ 246 name, block_nr); \ 247 \ 248 bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \ 249 } \ 250 static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \ 251 unsigned int block_nr, \ 252 unsigned int nr_blocks) \ 253 { \ 254 const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ 255 block_nr); \ 256 \ 257 bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \ 258 } \ 259 static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \ 260 unsigned int block_nr) \ 261 { \ 262 const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ 263 block_nr); \ 264 \ 265 return test_bit(start_bit, stripe->bitmaps); \ 266 } \ 267 static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \ 268 unsigned int block_nr) \ 269 { \ 270 const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ 271 block_nr); \ 272 \ 273 set_bit(start_bit, stripe->bitmaps); \ 274 } \ 275 static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \ 276 unsigned int block_nr) \ 277 { \ 278 const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ 279 block_nr); \ 280 \ 281 clear_bit(start_bit, stripe->bitmaps); \ 282 } \ 283 static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \ 284 { \ 285 const unsigned int nr_blocks = stripe->nr_sectors; \ 286 \ 287 
ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \ 288 "nr_blocks=%u BITS_PER_LONG=%u", \ 289 nr_blocks, BITS_PER_LONG); \ 290 \ 291 return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \ 292 stripe->nr_sectors); \ 293 } \ 294 static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \ 295 { \ 296 unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ 297 \ 298 return bitmap_empty(&bitmap, stripe->nr_sectors); \ 299 } \ 300 static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \ 301 { \ 302 unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ 303 \ 304 return bitmap_weight(&bitmap, stripe->nr_sectors); \ 305 } 306 IMPLEMENT_SCRUB_BITMAP_OPS(has_extent); 307 IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata); 308 IMPLEMENT_SCRUB_BITMAP_OPS(error); 309 IMPLEMENT_SCRUB_BITMAP_OPS(io_error); 310 IMPLEMENT_SCRUB_BITMAP_OPS(csum_error); 311 IMPLEMENT_SCRUB_BITMAP_OPS(meta_error); 312 IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error); 313 314 struct scrub_warning { 315 struct btrfs_path *path; 316 u64 extent_item_size; 317 const char *errstr; 318 u64 physical; 319 u64 logical; 320 struct btrfs_device *dev; 321 }; 322 323 struct scrub_error_records { 324 /* 325 * Bitmap recording which blocks hit errors (IO/csum/...) during the 326 * initial read. 
327 */ 328 unsigned long init_error_bitmap; 329 330 unsigned int nr_io_errors; 331 unsigned int nr_csum_errors; 332 unsigned int nr_meta_errors; 333 unsigned int nr_meta_gen_errors; 334 }; 335 336 static void release_scrub_stripe(struct scrub_stripe *stripe) 337 { 338 if (!stripe) 339 return; 340 341 for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) { 342 if (stripe->folios[i]) 343 folio_put(stripe->folios[i]); 344 stripe->folios[i] = NULL; 345 } 346 kfree(stripe->sectors); 347 kfree(stripe->csums); 348 stripe->sectors = NULL; 349 stripe->csums = NULL; 350 stripe->sctx = NULL; 351 stripe->state = 0; 352 } 353 354 static int init_scrub_stripe(struct btrfs_fs_info *fs_info, 355 struct scrub_stripe *stripe) 356 { 357 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 358 int ret; 359 360 memset(stripe, 0, sizeof(*stripe)); 361 362 stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; 363 stripe->state = 0; 364 365 init_waitqueue_head(&stripe->io_wait); 366 init_waitqueue_head(&stripe->repair_wait); 367 atomic_set(&stripe->pending_io, 0); 368 spin_lock_init(&stripe->write_error_lock); 369 370 ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS); 371 ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift, 372 fs_info->block_min_order, stripe->folios, 373 GFP_NOFS); 374 if (ret < 0) 375 goto error; 376 377 stripe->sectors = kzalloc_objs(struct scrub_sector_verification, 378 stripe->nr_sectors); 379 if (!stripe->sectors) 380 goto error; 381 382 stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, 383 fs_info->csum_size, GFP_KERNEL); 384 if (!stripe->csums) 385 goto error; 386 return 0; 387 error: 388 release_scrub_stripe(stripe); 389 return -ENOMEM; 390 } 391 392 static void wait_scrub_stripe_io(struct scrub_stripe *stripe) 393 { 394 wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); 395 } 396 397 static void scrub_put_ctx(struct scrub_ctx *sctx); 398 399 static void 
__scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 400 { 401 while (atomic_read(&fs_info->scrub_pause_req)) { 402 mutex_unlock(&fs_info->scrub_lock); 403 wait_event(fs_info->scrub_pause_wait, 404 atomic_read(&fs_info->scrub_pause_req) == 0); 405 mutex_lock(&fs_info->scrub_lock); 406 } 407 } 408 409 static void scrub_pause_on(struct btrfs_fs_info *fs_info) 410 { 411 atomic_inc(&fs_info->scrubs_paused); 412 wake_up(&fs_info->scrub_pause_wait); 413 } 414 415 static void scrub_pause_off(struct btrfs_fs_info *fs_info) 416 { 417 mutex_lock(&fs_info->scrub_lock); 418 __scrub_blocked_if_needed(fs_info); 419 atomic_dec(&fs_info->scrubs_paused); 420 mutex_unlock(&fs_info->scrub_lock); 421 422 wake_up(&fs_info->scrub_pause_wait); 423 } 424 425 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 426 { 427 scrub_pause_on(fs_info); 428 scrub_pause_off(fs_info); 429 } 430 431 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) 432 { 433 int i; 434 435 if (!sctx) 436 return; 437 438 for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) 439 release_scrub_stripe(&sctx->stripes[i]); 440 441 kvfree(sctx); 442 } 443 444 static void scrub_put_ctx(struct scrub_ctx *sctx) 445 { 446 if (refcount_dec_and_test(&sctx->refs)) 447 scrub_free_ctx(sctx); 448 } 449 450 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( 451 struct btrfs_fs_info *fs_info, bool is_dev_replace) 452 { 453 struct scrub_ctx *sctx; 454 int i; 455 456 /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use 457 * kvzalloc(). 
458 */ 459 sctx = kvzalloc_obj(*sctx); 460 if (!sctx) 461 goto nomem; 462 refcount_set(&sctx->refs, 1); 463 sctx->is_dev_replace = is_dev_replace; 464 sctx->fs_info = fs_info; 465 sctx->extent_path.search_commit_root = true; 466 sctx->extent_path.skip_locking = true; 467 sctx->csum_path.search_commit_root = true; 468 sctx->csum_path.skip_locking = true; 469 for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { 470 int ret; 471 472 ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); 473 if (ret < 0) 474 goto nomem; 475 sctx->stripes[i].sctx = sctx; 476 } 477 sctx->first_free = 0; 478 atomic_set(&sctx->cancel_req, 0); 479 480 spin_lock_init(&sctx->stat_lock); 481 sctx->throttle_deadline = 0; 482 483 mutex_init(&sctx->wr_lock); 484 if (is_dev_replace) { 485 WARN_ON(!fs_info->dev_replace.tgtdev); 486 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; 487 } 488 489 return sctx; 490 491 nomem: 492 scrub_free_ctx(sctx); 493 return ERR_PTR(-ENOMEM); 494 } 495 496 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, 497 u64 root, void *warn_ctx) 498 { 499 u32 nlink; 500 int ret; 501 int i; 502 unsigned nofs_flag; 503 struct extent_buffer *eb; 504 struct btrfs_inode_item *inode_item; 505 struct scrub_warning *swarn = warn_ctx; 506 struct btrfs_fs_info *fs_info = swarn->dev->fs_info; 507 struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; 508 struct btrfs_root *local_root; 509 struct btrfs_key key; 510 511 local_root = btrfs_get_fs_root(fs_info, root, true); 512 if (IS_ERR(local_root)) { 513 ret = PTR_ERR(local_root); 514 goto err; 515 } 516 517 /* 518 * this makes the path point to (inum INODE_ITEM ioff) 519 */ 520 key.objectid = inum; 521 key.type = BTRFS_INODE_ITEM_KEY; 522 key.offset = 0; 523 524 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); 525 if (ret) { 526 btrfs_put_root(local_root); 527 btrfs_release_path(swarn->path); 528 goto err; 529 } 530 531 eb = swarn->path->nodes[0]; 532 inode_item = btrfs_item_ptr(eb, 
swarn->path->slots[0], 533 struct btrfs_inode_item); 534 nlink = btrfs_inode_nlink(eb, inode_item); 535 btrfs_release_path(swarn->path); 536 537 /* 538 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub 539 * uses GFP_NOFS in this context, so we keep it consistent but it does 540 * not seem to be strictly necessary. 541 */ 542 nofs_flag = memalloc_nofs_save(); 543 ipath = init_ipath(4096, local_root, swarn->path); 544 memalloc_nofs_restore(nofs_flag); 545 if (IS_ERR(ipath)) { 546 btrfs_put_root(local_root); 547 ret = PTR_ERR(ipath); 548 ipath = NULL; 549 goto err; 550 } 551 ret = paths_from_inode(inum, ipath); 552 553 if (ret < 0) 554 goto err; 555 556 /* 557 * we deliberately ignore the bit ipath might have been too small to 558 * hold all of the paths here 559 */ 560 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 561 btrfs_warn(fs_info, 562 "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", 563 swarn->errstr, swarn->logical, 564 btrfs_dev_name(swarn->dev), 565 swarn->physical, 566 root, inum, offset, 567 fs_info->sectorsize, nlink, 568 (char *)(unsigned long)ipath->fspath->val[i]); 569 570 btrfs_put_root(local_root); 571 return 0; 572 573 err: 574 btrfs_warn(fs_info, 575 "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", 576 swarn->errstr, swarn->logical, 577 btrfs_dev_name(swarn->dev), 578 swarn->physical, 579 root, inum, offset, ret); 580 581 return 0; 582 } 583 584 static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, 585 bool is_super, u64 logical, u64 physical) 586 { 587 struct btrfs_fs_info *fs_info = dev->fs_info; 588 BTRFS_PATH_AUTO_FREE(path); 589 struct btrfs_key found_key; 590 struct extent_buffer *eb; 591 struct btrfs_extent_item *ei; 592 struct scrub_warning swarn; 593 u64 flags = 0; 594 u32 item_size; 595 int ret; 596 597 /* Super block error, no need to search extent 
tree. */ 598 if (is_super) { 599 btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu", 600 errstr, btrfs_dev_name(dev), physical); 601 return; 602 } 603 path = btrfs_alloc_path(); 604 if (!path) 605 return; 606 607 swarn.physical = physical; 608 swarn.logical = logical; 609 swarn.errstr = errstr; 610 swarn.dev = NULL; 611 612 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 613 &flags); 614 if (ret < 0) 615 return; 616 617 swarn.extent_item_size = found_key.offset; 618 619 eb = path->nodes[0]; 620 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 621 item_size = btrfs_item_size(eb, path->slots[0]); 622 623 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 624 unsigned long ptr = 0; 625 u8 ref_level; 626 u64 ref_root; 627 628 while (true) { 629 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, 630 item_size, &ref_root, 631 &ref_level); 632 if (ret < 0) { 633 btrfs_warn(fs_info, 634 "scrub: failed to resolve tree backref for logical %llu: %d", 635 swarn.logical, ret); 636 break; 637 } 638 if (ret > 0) 639 break; 640 btrfs_warn(fs_info, 641 "scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", 642 errstr, swarn.logical, btrfs_dev_name(dev), 643 swarn.physical, (ref_level ? 
"node" : "leaf"), 644 ref_level, ref_root); 645 } 646 btrfs_release_path(path); 647 } else { 648 struct btrfs_backref_walk_ctx ctx = { 0 }; 649 650 btrfs_release_path(path); 651 652 ctx.bytenr = found_key.objectid; 653 ctx.extent_item_pos = swarn.logical - found_key.objectid; 654 ctx.fs_info = fs_info; 655 656 swarn.path = path; 657 swarn.dev = dev; 658 659 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); 660 } 661 } 662 663 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) 664 { 665 int ret = 0; 666 u64 length; 667 668 if (!btrfs_is_zoned(sctx->fs_info)) 669 return 0; 670 671 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) 672 return 0; 673 674 if (sctx->write_pointer < physical) { 675 length = physical - sctx->write_pointer; 676 677 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, 678 sctx->write_pointer, length); 679 if (!ret) 680 sctx->write_pointer = physical; 681 } 682 return ret; 683 } 684 685 static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) 686 { 687 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 688 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 689 u32 offset = (sector_nr << fs_info->sectorsize_bits); 690 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 691 692 /* stripe->folios[] is allocated by us and no highmem is allowed. */ 693 ASSERT(folio); 694 ASSERT(!folio_test_highmem(folio)); 695 return folio_address(folio) + offset_in_folio(folio, offset); 696 } 697 698 static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) 699 { 700 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 701 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 702 u32 offset = (sector_nr << fs_info->sectorsize_bits); 703 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 704 705 /* stripe->folios[] is allocated by us and no highmem is allowed. 
*/ 706 ASSERT(folio); 707 ASSERT(!folio_test_highmem(folio)); 708 /* And the range must be contained inside the folio. */ 709 ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); 710 return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); 711 } 712 713 static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) 714 { 715 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 716 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; 717 const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); 718 void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); 719 struct btrfs_header *header = first_kaddr; 720 struct btrfs_csum_ctx csum; 721 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 722 u8 calculated_csum[BTRFS_CSUM_SIZE]; 723 724 /* 725 * Here we don't have a good way to attach the pages (and subpages) 726 * to a dummy extent buffer, thus we have to directly grab the members 727 * from pages. 
728 */ 729 memcpy(on_disk_csum, header->csum, fs_info->csum_size); 730 731 if (logical != btrfs_stack_header_bytenr(header)) { 732 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 733 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 734 btrfs_warn_rl(fs_info, 735 "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu", 736 logical, stripe->mirror_num, 737 btrfs_stack_header_bytenr(header), logical); 738 return; 739 } 740 if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, 741 BTRFS_FSID_SIZE) != 0) { 742 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 743 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 744 btrfs_warn_rl(fs_info, 745 "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", 746 logical, stripe->mirror_num, 747 header->fsid, fs_info->fs_devices->metadata_uuid); 748 return; 749 } 750 if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, 751 BTRFS_UUID_SIZE) != 0) { 752 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 753 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 754 btrfs_warn_rl(fs_info, 755 "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", 756 logical, stripe->mirror_num, 757 header->chunk_tree_uuid, fs_info->chunk_tree_uuid); 758 return; 759 } 760 761 /* Now check tree block csum. 
*/ 762 btrfs_csum_init(&csum, fs_info->csum_type); 763 btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE, 764 fs_info->sectorsize - BTRFS_CSUM_SIZE); 765 766 for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { 767 btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i), 768 fs_info->sectorsize); 769 } 770 771 btrfs_csum_final(&csum, calculated_csum); 772 if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { 773 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 774 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 775 btrfs_warn_rl(fs_info, 776 "scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT, 777 logical, stripe->mirror_num, 778 BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), 779 BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); 780 return; 781 } 782 if (stripe->sectors[sector_nr].generation != 783 btrfs_stack_header_generation(header)) { 784 scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); 785 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 786 btrfs_warn_rl(fs_info, 787 "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu", 788 logical, stripe->mirror_num, 789 btrfs_stack_header_generation(header), 790 stripe->sectors[sector_nr].generation); 791 return; 792 } 793 scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree); 794 scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree); 795 scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree); 796 scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree); 797 } 798 799 static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) 800 { 801 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 802 struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; 803 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; 804 phys_addr_t paddr = 
scrub_stripe_get_paddr(stripe, sector_nr); 805 u8 csum_buf[BTRFS_CSUM_SIZE]; 806 int ret; 807 808 ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); 809 810 /* Sector not utilized, skip it. */ 811 if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr)) 812 return; 813 814 /* IO error, no need to check. */ 815 if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) 816 return; 817 818 /* Metadata, verify the full tree block. */ 819 if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { 820 /* 821 * Check if the tree block crosses the stripe boundary. If 822 * crossed the boundary, we cannot verify it but only give a 823 * warning. 824 * 825 * This can only happen on a very old filesystem where chunks 826 * are not ensured to be stripe aligned. 827 */ 828 if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { 829 btrfs_warn_rl(fs_info, 830 "scrub: tree block at %llu crosses stripe boundary %llu", 831 stripe->logical + 832 (sector_nr << fs_info->sectorsize_bits), 833 stripe->logical); 834 return; 835 } 836 scrub_verify_one_metadata(stripe, sector_nr); 837 return; 838 } 839 840 /* 841 * Data is easier, we just verify the data csum (if we have it). For 842 * cases without csum, we have no other choice but to trust it. 843 */ 844 if (!sector->csum) { 845 scrub_bitmap_clear_bit_error(stripe, sector_nr); 846 return; 847 } 848 849 ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum); 850 if (ret < 0) { 851 scrub_bitmap_set_bit_csum_error(stripe, sector_nr); 852 scrub_bitmap_set_bit_error(stripe, sector_nr); 853 } else { 854 scrub_bitmap_clear_bit_csum_error(stripe, sector_nr); 855 scrub_bitmap_clear_bit_error(stripe, sector_nr); 856 } 857 } 858 859 /* Verify specified sectors of a stripe. 
*/ 860 static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap) 861 { 862 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 863 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; 864 int sector_nr; 865 866 for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { 867 scrub_verify_one_sector(stripe, sector_nr); 868 if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) 869 sector_nr += sectors_per_tree - 1; 870 } 871 } 872 873 static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec) 874 { 875 int i; 876 877 for (i = 0; i < stripe->nr_sectors; i++) { 878 if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec)) 879 break; 880 } 881 ASSERT(i < stripe->nr_sectors); 882 return i; 883 } 884 885 /* 886 * Repair read is different to the regular read: 887 * 888 * - Only reads the failed sectors 889 * - May have extra blocksize limits 890 */ 891 static void scrub_repair_read_endio(struct btrfs_bio *bbio) 892 { 893 struct scrub_stripe *stripe = bbio->private; 894 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 895 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 896 const u32 bio_size = bio_get_size(&bbio->bio); 897 898 ASSERT(sector_nr < stripe->nr_sectors); 899 900 if (bbio->bio.bi_status) { 901 scrub_bitmap_set_io_error(stripe, sector_nr, 902 bio_size >> fs_info->sectorsize_bits); 903 scrub_bitmap_set_error(stripe, sector_nr, 904 bio_size >> fs_info->sectorsize_bits); 905 } else { 906 scrub_bitmap_clear_io_error(stripe, sector_nr, 907 bio_size >> fs_info->sectorsize_bits); 908 } 909 bio_put(&bbio->bio); 910 if (atomic_dec_and_test(&stripe->pending_io)) 911 wake_up(&stripe->io_wait); 912 } 913 914 static int calc_next_mirror(int mirror, int num_copies) 915 { 916 ASSERT(mirror <= num_copies); 917 return (mirror + 1 > num_copies) ? 
1 : mirror + 1; 918 } 919 920 static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe, 921 int sector_nr) 922 { 923 struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 924 void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); 925 int ret; 926 927 ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize, 928 offset_in_page(kaddr)); 929 /* 930 * Caller should ensure the bbio has enough size. 931 * And we cannot use __bio_add_page(), which doesn't do any merge. 932 * 933 * Meanwhile for scrub_submit_initial_read() we fully rely on the merge 934 * to create the minimal amount of bio vectors, for fs block size < page 935 * size cases. 936 */ 937 ASSERT(ret == fs_info->sectorsize); 938 } 939 940 static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info, 941 unsigned int nr_vecs, blk_opf_t opf, 942 u64 logical, 943 btrfs_bio_end_io_t end_io, void *private) 944 { 945 struct btrfs_bio *bbio; 946 947 bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode), 948 logical, end_io, private); 949 bbio->is_scrub = true; 950 bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; 951 return bbio; 952 } 953 954 static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, 955 int mirror, int blocksize, bool wait) 956 { 957 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 958 struct btrfs_bio *bbio = NULL; 959 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 960 int i; 961 962 ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); 963 ASSERT(atomic_read(&stripe->pending_io) == 0, 964 "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io)); 965 966 for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { 967 /* The current sector cannot be merged, submit the bio. 
*/ 968 if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) || 969 bbio->bio.bi_iter.bi_size >= blocksize)) { 970 ASSERT(bbio->bio.bi_iter.bi_size); 971 atomic_inc(&stripe->pending_io); 972 btrfs_submit_bbio(bbio, mirror); 973 if (wait) 974 wait_scrub_stripe_io(stripe); 975 bbio = NULL; 976 } 977 978 if (!bbio) 979 bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, 980 stripe->logical + (i << fs_info->sectorsize_bits), 981 scrub_repair_read_endio, stripe); 982 983 scrub_bio_add_sector(bbio, stripe, i); 984 } 985 if (bbio) { 986 ASSERT(bbio->bio.bi_iter.bi_size); 987 atomic_inc(&stripe->pending_io); 988 btrfs_submit_bbio(bbio, mirror); 989 if (wait) 990 wait_scrub_stripe_io(stripe); 991 } 992 } 993 994 static void scrub_stripe_report_errors(struct scrub_ctx *sctx, 995 struct scrub_stripe *stripe, 996 const struct scrub_error_records *errors) 997 { 998 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, 999 DEFAULT_RATELIMIT_BURST); 1000 struct btrfs_fs_info *fs_info = sctx->fs_info; 1001 struct btrfs_device *dev = NULL; 1002 const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe); 1003 const unsigned long error_bitmap = scrub_bitmap_read_error(stripe); 1004 u64 physical = 0; 1005 int nr_data_sectors = 0; 1006 int nr_meta_sectors = 0; 1007 int nr_nodatacsum_sectors = 0; 1008 int nr_repaired_sectors = 0; 1009 int sector_nr; 1010 1011 if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) 1012 return; 1013 1014 /* 1015 * Init needed infos for error reporting. 1016 * 1017 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() 1018 * thus no need for dev/physical, error reporting still needs dev and physical. 
1019 */ 1020 if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) { 1021 u64 mapped_len = fs_info->sectorsize; 1022 struct btrfs_io_context *bioc = NULL; 1023 int stripe_index = stripe->mirror_num - 1; 1024 int ret; 1025 1026 /* For scrub, our mirror_num should always start at 1. */ 1027 ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); 1028 ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 1029 stripe->logical, &mapped_len, &bioc, 1030 NULL, NULL); 1031 /* 1032 * If we failed, dev will be NULL, and later detailed reports 1033 * will just be skipped. 1034 */ 1035 if (ret < 0) 1036 goto skip; 1037 physical = bioc->stripes[stripe_index].physical; 1038 dev = bioc->stripes[stripe_index].dev; 1039 btrfs_put_bioc(bioc); 1040 } 1041 1042 skip: 1043 for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) { 1044 bool repaired = false; 1045 1046 if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { 1047 nr_meta_sectors++; 1048 } else { 1049 nr_data_sectors++; 1050 if (!stripe->sectors[sector_nr].csum) 1051 nr_nodatacsum_sectors++; 1052 } 1053 1054 if (test_bit(sector_nr, &errors->init_error_bitmap) && 1055 !test_bit(sector_nr, &error_bitmap)) { 1056 nr_repaired_sectors++; 1057 repaired = true; 1058 } 1059 1060 /* Good sector from the beginning, nothing need to be done. */ 1061 if (!test_bit(sector_nr, &errors->init_error_bitmap)) 1062 continue; 1063 1064 /* 1065 * Report error for the corrupted sectors. If repaired, just 1066 * output the message of repaired message. 1067 */ 1068 if (repaired) { 1069 if (dev) { 1070 btrfs_err_rl(fs_info, 1071 "scrub: fixed up error at logical %llu on dev %s physical %llu", 1072 stripe->logical, btrfs_dev_name(dev), 1073 physical); 1074 } else { 1075 btrfs_err_rl(fs_info, 1076 "scrub: fixed up error at logical %llu on mirror %u", 1077 stripe->logical, stripe->mirror_num); 1078 } 1079 continue; 1080 } 1081 1082 /* The remaining are all for unrepaired. 
*/ 1083 if (dev) { 1084 btrfs_err_rl(fs_info, 1085 "scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", 1086 stripe->logical, btrfs_dev_name(dev), 1087 physical); 1088 } else { 1089 btrfs_err_rl(fs_info, 1090 "scrub: unable to fixup (regular) error at logical %llu on mirror %u", 1091 stripe->logical, stripe->mirror_num); 1092 } 1093 1094 if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) 1095 if (__ratelimit(&rs) && dev) 1096 scrub_print_common_warning("i/o error", dev, false, 1097 stripe->logical, physical); 1098 if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr)) 1099 if (__ratelimit(&rs) && dev) 1100 scrub_print_common_warning("checksum error", dev, false, 1101 stripe->logical, physical); 1102 if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr)) 1103 if (__ratelimit(&rs) && dev) 1104 scrub_print_common_warning("header error", dev, false, 1105 stripe->logical, physical); 1106 if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr)) 1107 if (__ratelimit(&rs) && dev) 1108 scrub_print_common_warning("generation error", dev, false, 1109 stripe->logical, physical); 1110 } 1111 1112 /* Update the device stats. */ 1113 for (int i = 0; i < errors->nr_io_errors; i++) 1114 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); 1115 for (int i = 0; i < errors->nr_csum_errors; i++) 1116 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); 1117 /* Generation mismatch error is based on each metadata, not each block. 
*/ 1118 for (int i = 0; i < errors->nr_meta_gen_errors; 1119 i += (fs_info->nodesize >> fs_info->sectorsize_bits)) 1120 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); 1121 1122 spin_lock(&sctx->stat_lock); 1123 sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; 1124 sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; 1125 sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; 1126 sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; 1127 sctx->stat.no_csum += nr_nodatacsum_sectors; 1128 sctx->stat.read_errors += errors->nr_io_errors; 1129 sctx->stat.csum_errors += errors->nr_csum_errors; 1130 sctx->stat.verify_errors += errors->nr_meta_errors + 1131 errors->nr_meta_gen_errors; 1132 sctx->stat.uncorrectable_errors += 1133 bitmap_weight(&error_bitmap, stripe->nr_sectors); 1134 sctx->stat.corrected_errors += nr_repaired_sectors; 1135 spin_unlock(&sctx->stat_lock); 1136 } 1137 1138 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, 1139 unsigned long write_bitmap, bool dev_replace); 1140 1141 /* 1142 * The main entrance for all read related scrub work, including: 1143 * 1144 * - Wait for the initial read to finish 1145 * - Verify and locate any bad sectors 1146 * - Go through the remaining mirrors and try to read as large blocksize as 1147 * possible 1148 * - Go through all mirrors (including the failed mirror) sector-by-sector 1149 * - Submit writeback for repaired sectors 1150 * 1151 * Writeback for dev-replace does not happen here, it needs extra 1152 * synchronization for zoned devices. 
1153 */ 1154 static void scrub_stripe_read_repair_worker(struct work_struct *work) 1155 { 1156 struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); 1157 struct scrub_ctx *sctx = stripe->sctx; 1158 struct btrfs_fs_info *fs_info = sctx->fs_info; 1159 struct scrub_error_records errors = { 0 }; 1160 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, 1161 stripe->bg->length); 1162 unsigned long repaired; 1163 unsigned long error; 1164 int mirror; 1165 int i; 1166 1167 ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); 1168 1169 wait_scrub_stripe_io(stripe); 1170 scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); 1171 /* Save the initial failed bitmap for later repair and report usage. */ 1172 errors.init_error_bitmap = scrub_bitmap_read_error(stripe); 1173 errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe); 1174 errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe); 1175 errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe); 1176 errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe); 1177 1178 if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors)) 1179 goto out; 1180 1181 /* 1182 * Try all remaining mirrors. 1183 * 1184 * Here we still try to read as large block as possible, as this is 1185 * faster and we have extra safety nets to rely on. 1186 */ 1187 for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); 1188 mirror != stripe->mirror_num; 1189 mirror = calc_next_mirror(mirror, num_copies)) { 1190 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 1191 1192 scrub_stripe_submit_repair_read(stripe, mirror, 1193 BTRFS_STRIPE_LEN, false); 1194 wait_scrub_stripe_io(stripe); 1195 scrub_verify_one_stripe(stripe, old_error_bitmap); 1196 if (scrub_bitmap_empty_error(stripe)) 1197 goto out; 1198 } 1199 1200 /* 1201 * Last safety net, try re-checking all mirrors, including the failed 1202 * one, sector-by-sector. 
1203 * 1204 * As if one sector failed the drive's internal csum, the whole read 1205 * containing the offending sector would be marked as error. 1206 * Thus here we do sector-by-sector read. 1207 * 1208 * This can be slow, thus we only try it as the last resort. 1209 */ 1210 1211 for (i = 0, mirror = stripe->mirror_num; 1212 i < num_copies; 1213 i++, mirror = calc_next_mirror(mirror, num_copies)) { 1214 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 1215 1216 scrub_stripe_submit_repair_read(stripe, mirror, 1217 fs_info->sectorsize, true); 1218 wait_scrub_stripe_io(stripe); 1219 scrub_verify_one_stripe(stripe, old_error_bitmap); 1220 if (scrub_bitmap_empty_error(stripe)) 1221 goto out; 1222 } 1223 out: 1224 error = scrub_bitmap_read_error(stripe); 1225 /* 1226 * Submit the repaired sectors. For zoned case, we cannot do repair 1227 * in-place, but queue the bg to be relocated. 1228 */ 1229 bitmap_andnot(&repaired, &errors.init_error_bitmap, &error, 1230 stripe->nr_sectors); 1231 if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { 1232 if (btrfs_is_zoned(fs_info)) { 1233 btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); 1234 } else { 1235 scrub_write_sectors(sctx, stripe, repaired, false); 1236 wait_scrub_stripe_io(stripe); 1237 } 1238 } 1239 1240 scrub_stripe_report_errors(sctx, stripe, &errors); 1241 set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); 1242 wake_up(&stripe->repair_wait); 1243 } 1244 1245 static void scrub_read_endio(struct btrfs_bio *bbio) 1246 { 1247 struct scrub_stripe *stripe = bbio->private; 1248 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1249 int num_sectors; 1250 const u32 bio_size = bio_get_size(&bbio->bio); 1251 1252 ASSERT(sector_nr < stripe->nr_sectors); 1253 num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; 1254 1255 if (bbio->bio.bi_status) { 1256 scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors); 1257 
scrub_bitmap_set_error(stripe, sector_nr, num_sectors); 1258 } else { 1259 scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors); 1260 } 1261 bio_put(&bbio->bio); 1262 if (atomic_dec_and_test(&stripe->pending_io)) { 1263 wake_up(&stripe->io_wait); 1264 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); 1265 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); 1266 } 1267 } 1268 1269 static void scrub_write_endio(struct btrfs_bio *bbio) 1270 { 1271 struct scrub_stripe *stripe = bbio->private; 1272 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1273 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1274 const u32 bio_size = bio_get_size(&bbio->bio); 1275 1276 if (bbio->bio.bi_status) { 1277 unsigned long flags; 1278 1279 spin_lock_irqsave(&stripe->write_error_lock, flags); 1280 bitmap_set(&stripe->write_error_bitmap, sector_nr, 1281 bio_size >> fs_info->sectorsize_bits); 1282 spin_unlock_irqrestore(&stripe->write_error_lock, flags); 1283 for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) 1284 btrfs_dev_stat_inc_and_print(stripe->dev, 1285 BTRFS_DEV_STAT_WRITE_ERRS); 1286 } 1287 bio_put(&bbio->bio); 1288 1289 if (atomic_dec_and_test(&stripe->pending_io)) 1290 wake_up(&stripe->io_wait); 1291 } 1292 1293 static void scrub_submit_write_bio(struct scrub_ctx *sctx, 1294 struct scrub_stripe *stripe, 1295 struct btrfs_bio *bbio, bool dev_replace) 1296 { 1297 struct btrfs_fs_info *fs_info = sctx->fs_info; 1298 u32 bio_len = bbio->bio.bi_iter.bi_size; 1299 u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) - 1300 stripe->logical; 1301 1302 fill_writer_pointer_gap(sctx, stripe->physical + bio_off); 1303 atomic_inc(&stripe->pending_io); 1304 btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); 1305 if (!btrfs_is_zoned(fs_info)) 1306 return; 1307 /* 1308 * For zoned writeback, queue depth must be 1, thus we must wait for 1309 * the write to finish before the next write. 
1310 */ 1311 wait_scrub_stripe_io(stripe); 1312 1313 /* 1314 * And also need to update the write pointer if write finished 1315 * successfully. 1316 */ 1317 if (!test_bit(bio_off >> fs_info->sectorsize_bits, 1318 &stripe->write_error_bitmap)) 1319 sctx->write_pointer += bio_len; 1320 } 1321 1322 /* 1323 * Submit the write bio(s) for the sectors specified by @write_bitmap. 1324 * 1325 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: 1326 * 1327 * - Only needs logical bytenr and mirror_num 1328 * Just like the scrub read path 1329 * 1330 * - Would only result in writes to the specified mirror 1331 * Unlike the regular writeback path, which would write back to all stripes 1332 * 1333 * - Handle dev-replace and read-repair writeback differently 1334 */ 1335 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, 1336 unsigned long write_bitmap, bool dev_replace) 1337 { 1338 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1339 struct btrfs_bio *bbio = NULL; 1340 int sector_nr; 1341 1342 for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { 1343 /* We should only writeback sectors covered by an extent. */ 1344 ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr)); 1345 1346 /* Cannot merge with previous sector, submit the current one. */ 1347 if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { 1348 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); 1349 bbio = NULL; 1350 } 1351 if (!bbio) 1352 bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE, 1353 stripe->logical + (sector_nr << fs_info->sectorsize_bits), 1354 scrub_write_endio, stripe); 1355 scrub_bio_add_sector(bbio, stripe, sector_nr); 1356 } 1357 if (bbio) 1358 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); 1359 } 1360 1361 /* 1362 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 1363 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. 
1364 */ 1365 static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, 1366 unsigned int bio_size) 1367 { 1368 const int time_slice = 1000; 1369 s64 delta; 1370 ktime_t now; 1371 u32 div; 1372 u64 bwlimit; 1373 1374 bwlimit = READ_ONCE(device->scrub_speed_max); 1375 if (bwlimit == 0) 1376 return; 1377 1378 /* 1379 * Slice is divided into intervals when the IO is submitted, adjust by 1380 * bwlimit and maximum of 64 intervals. 1381 */ 1382 div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); 1383 1384 /* Start new epoch, set deadline */ 1385 now = ktime_get(); 1386 if (sctx->throttle_deadline == 0) { 1387 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); 1388 sctx->throttle_sent = 0; 1389 } 1390 1391 /* Still in the time to send? */ 1392 if (ktime_before(now, sctx->throttle_deadline)) { 1393 /* If current bio is within the limit, send it */ 1394 sctx->throttle_sent += bio_size; 1395 if (sctx->throttle_sent <= div_u64(bwlimit, div)) 1396 return; 1397 1398 /* We're over the limit, sleep until the rest of the slice */ 1399 delta = ktime_ms_delta(sctx->throttle_deadline, now); 1400 } else { 1401 /* New request after deadline, start new epoch */ 1402 delta = 0; 1403 } 1404 1405 if (delta) { 1406 long timeout; 1407 1408 timeout = div_u64(delta * HZ, 1000); 1409 schedule_timeout_interruptible(timeout); 1410 } 1411 1412 /* Next call will start the deadline period */ 1413 sctx->throttle_deadline = 0; 1414 } 1415 1416 /* 1417 * Given a physical address, this will calculate it's 1418 * logical offset. if this is a parity stripe, it will return 1419 * the most left data stripe's logical offset. 1420 * 1421 * return 0 if it is a data stripe, 1 means parity stripe. 
1422 */ 1423 static int get_raid56_logic_offset(u64 physical, int num, 1424 struct btrfs_chunk_map *map, u64 *offset, 1425 u64 *stripe_start) 1426 { 1427 int i; 1428 int j = 0; 1429 u64 last_offset; 1430 const int data_stripes = nr_data_stripes(map); 1431 1432 last_offset = (physical - map->stripes[num].physical) * data_stripes; 1433 if (stripe_start) 1434 *stripe_start = last_offset; 1435 1436 *offset = last_offset; 1437 for (i = 0; i < data_stripes; i++) { 1438 u32 stripe_nr; 1439 u32 stripe_index; 1440 u32 rot; 1441 1442 *offset = last_offset + btrfs_stripe_nr_to_offset(i); 1443 1444 stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; 1445 1446 /* Work out the disk rotation on this stripe-set */ 1447 rot = stripe_nr % map->num_stripes; 1448 /* calculate which stripe this data locates */ 1449 rot += i; 1450 stripe_index = rot % map->num_stripes; 1451 if (stripe_index == num) 1452 return 0; 1453 if (stripe_index < num) 1454 j++; 1455 } 1456 *offset = last_offset + btrfs_stripe_nr_to_offset(j); 1457 return 1; 1458 } 1459 1460 /* 1461 * Return 0 if the extent item range covers any byte of the range. 1462 * Return <0 if the extent item is before @search_start. 1463 * Return >0 if the extent item is after @start_start + @search_len. 
1464 */ 1465 static int compare_extent_item_range(struct btrfs_path *path, 1466 u64 search_start, u64 search_len) 1467 { 1468 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; 1469 u64 len; 1470 struct btrfs_key key; 1471 1472 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1473 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || 1474 key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type); 1475 if (key.type == BTRFS_METADATA_ITEM_KEY) 1476 len = fs_info->nodesize; 1477 else 1478 len = key.offset; 1479 1480 if (key.objectid + len <= search_start) 1481 return -1; 1482 if (key.objectid >= search_start + search_len) 1483 return 1; 1484 return 0; 1485 } 1486 1487 /* 1488 * Locate one extent item which covers any byte in range 1489 * [@search_start, @search_start + @search_length) 1490 * 1491 * If the path is not initialized, we will initialize the search by doing 1492 * a btrfs_search_slot(). 1493 * If the path is already initialized, we will use the path as the initial 1494 * slot, to avoid duplicated btrfs_search_slot() calls. 1495 * 1496 * NOTE: If an extent item starts before @search_start, we will still 1497 * return the extent item. This is for data extent crossing stripe boundary. 1498 * 1499 * Return 0 if we found such extent item, and @path will point to the extent item. 1500 * Return >0 if no such extent item can be found, and @path will be released. 1501 * Return <0 if hit fatal error, and @path will be released. 
1502 */ 1503 static int find_first_extent_item(struct btrfs_root *extent_root, 1504 struct btrfs_path *path, 1505 u64 search_start, u64 search_len) 1506 { 1507 struct btrfs_fs_info *fs_info = extent_root->fs_info; 1508 struct btrfs_key key; 1509 int ret; 1510 1511 /* Continue using the existing path */ 1512 if (path->nodes[0]) 1513 goto search_forward; 1514 1515 key.objectid = search_start; 1516 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1517 key.type = BTRFS_METADATA_ITEM_KEY; 1518 else 1519 key.type = BTRFS_EXTENT_ITEM_KEY; 1520 key.offset = (u64)-1; 1521 1522 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 1523 if (ret < 0) 1524 return ret; 1525 if (unlikely(ret == 0)) { 1526 /* 1527 * Key with offset -1 found, there would have to exist an extent 1528 * item with such offset, but this is out of the valid range. 1529 */ 1530 btrfs_release_path(path); 1531 return -EUCLEAN; 1532 } 1533 1534 /* 1535 * Here we intentionally pass 0 as @min_objectid, as there could be 1536 * an extent item starting before @search_start. 1537 */ 1538 ret = btrfs_previous_extent_item(extent_root, path, 0); 1539 if (ret < 0) 1540 return ret; 1541 /* 1542 * No matter whether we have found an extent item, the next loop will 1543 * properly do every check on the key. 1544 */ 1545 search_forward: 1546 while (true) { 1547 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1548 if (key.objectid >= search_start + search_len) 1549 break; 1550 if (key.type != BTRFS_METADATA_ITEM_KEY && 1551 key.type != BTRFS_EXTENT_ITEM_KEY) 1552 goto next; 1553 1554 ret = compare_extent_item_range(path, search_start, search_len); 1555 if (ret == 0) 1556 return ret; 1557 if (ret > 0) 1558 break; 1559 next: 1560 ret = btrfs_next_item(extent_root, path); 1561 if (ret) { 1562 /* Either no more items or a fatal error. 
*/ 1563 btrfs_release_path(path); 1564 return ret; 1565 } 1566 } 1567 btrfs_release_path(path); 1568 return 1; 1569 } 1570 1571 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, 1572 u64 *size_ret, u64 *flags_ret, u64 *generation_ret) 1573 { 1574 struct btrfs_key key; 1575 struct btrfs_extent_item *ei; 1576 1577 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1578 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || 1579 key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type); 1580 *extent_start_ret = key.objectid; 1581 if (key.type == BTRFS_METADATA_ITEM_KEY) 1582 *size_ret = path->nodes[0]->fs_info->nodesize; 1583 else 1584 *size_ret = key.offset; 1585 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); 1586 *flags_ret = btrfs_extent_flags(path->nodes[0], ei); 1587 *generation_ret = btrfs_extent_generation(path->nodes[0], ei); 1588 } 1589 1590 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, 1591 u64 physical, u64 physical_end) 1592 { 1593 struct btrfs_fs_info *fs_info = sctx->fs_info; 1594 int ret = 0; 1595 1596 if (!btrfs_is_zoned(fs_info)) 1597 return 0; 1598 1599 mutex_lock(&sctx->wr_lock); 1600 if (sctx->write_pointer < physical_end) { 1601 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, 1602 physical, 1603 sctx->write_pointer); 1604 if (ret) 1605 btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); 1606 } 1607 mutex_unlock(&sctx->wr_lock); 1608 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); 1609 1610 return ret; 1611 } 1612 1613 static void fill_one_extent_info(struct btrfs_fs_info *fs_info, 1614 struct scrub_stripe *stripe, 1615 u64 extent_start, u64 extent_len, 1616 u64 extent_flags, u64 extent_gen) 1617 { 1618 for (u64 cur_logical = max(stripe->logical, extent_start); 1619 cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, 1620 extent_start + extent_len); 1621 cur_logical += fs_info->sectorsize) { 1622 const int nr_sector = 
(cur_logical - stripe->logical) >> 1623 fs_info->sectorsize_bits; 1624 struct scrub_sector_verification *sector = 1625 &stripe->sectors[nr_sector]; 1626 1627 scrub_bitmap_set_bit_has_extent(stripe, nr_sector); 1628 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1629 scrub_bitmap_set_bit_is_metadata(stripe, nr_sector); 1630 sector->generation = extent_gen; 1631 } 1632 } 1633 } 1634 1635 static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) 1636 { 1637 ASSERT(stripe->nr_sectors); 1638 bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors); 1639 } 1640 1641 /* 1642 * Locate one stripe which has at least one extent in its range. 1643 * 1644 * Return 0 if found such stripe, and store its info into @stripe. 1645 * Return >0 if there is no such stripe in the specified range. 1646 * Return <0 for error. 1647 */ 1648 static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, 1649 struct btrfs_path *extent_path, 1650 struct btrfs_path *csum_path, 1651 struct btrfs_device *dev, u64 physical, 1652 int mirror_num, u64 logical_start, 1653 u32 logical_len, 1654 struct scrub_stripe *stripe) 1655 { 1656 struct btrfs_fs_info *fs_info = bg->fs_info; 1657 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); 1658 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); 1659 const u64 logical_end = logical_start + logical_len; 1660 u64 cur_logical = logical_start; 1661 u64 stripe_end; 1662 u64 extent_start; 1663 u64 extent_len; 1664 u64 extent_flags; 1665 u64 extent_gen; 1666 int ret; 1667 1668 if (unlikely(!extent_root || !csum_root)) { 1669 btrfs_err(fs_info, "scrub: no valid extent or csum root found"); 1670 return -EUCLEAN; 1671 } 1672 memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * 1673 stripe->nr_sectors); 1674 scrub_stripe_reset_bitmaps(stripe); 1675 1676 /* The range must be inside the bg. 
*/ 1677 ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg), 1678 "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", 1679 bg->start, logical_start, logical_end, btrfs_block_group_end(bg)); 1680 1681 ret = find_first_extent_item(extent_root, extent_path, logical_start, 1682 logical_len); 1683 /* Either error or not found. */ 1684 if (ret) 1685 return ret; 1686 get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, 1687 &extent_gen); 1688 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1689 stripe->nr_meta_extents++; 1690 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) 1691 stripe->nr_data_extents++; 1692 cur_logical = max(extent_start, cur_logical); 1693 1694 /* 1695 * Round down to stripe boundary. 1696 * 1697 * The extra calculation against bg->start is to handle block groups 1698 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. 1699 */ 1700 stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + 1701 bg->start; 1702 stripe->physical = physical + stripe->logical - logical_start; 1703 stripe->dev = dev; 1704 stripe->bg = bg; 1705 stripe->mirror_num = mirror_num; 1706 stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; 1707 1708 /* Fill the first extent info into stripe->sectors[] array. */ 1709 fill_one_extent_info(fs_info, stripe, extent_start, extent_len, 1710 extent_flags, extent_gen); 1711 cur_logical = extent_start + extent_len; 1712 1713 /* Fill the extent info for the remaining sectors. 
*/ 1714 while (cur_logical <= stripe_end) { 1715 ret = find_first_extent_item(extent_root, extent_path, cur_logical, 1716 stripe_end - cur_logical + 1); 1717 if (ret < 0) 1718 return ret; 1719 if (ret > 0) { 1720 ret = 0; 1721 break; 1722 } 1723 get_extent_info(extent_path, &extent_start, &extent_len, 1724 &extent_flags, &extent_gen); 1725 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1726 stripe->nr_meta_extents++; 1727 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) 1728 stripe->nr_data_extents++; 1729 fill_one_extent_info(fs_info, stripe, extent_start, extent_len, 1730 extent_flags, extent_gen); 1731 cur_logical = extent_start + extent_len; 1732 } 1733 1734 /* Now fill the data csum. */ 1735 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { 1736 int sector_nr; 1737 unsigned long csum_bitmap = 0; 1738 1739 /* Csum space should have already been allocated. */ 1740 ASSERT(stripe->csums); 1741 1742 /* 1743 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN 1744 * should contain at most 16 sectors. 
1745 */ 1746 ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 1747 1748 ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, 1749 stripe->logical, stripe_end, 1750 stripe->csums, &csum_bitmap); 1751 if (ret < 0) 1752 return ret; 1753 if (ret > 0) 1754 ret = 0; 1755 1756 for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { 1757 stripe->sectors[sector_nr].csum = stripe->csums + 1758 sector_nr * fs_info->csum_size; 1759 } 1760 } 1761 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 1762 1763 return ret; 1764 } 1765 1766 static void scrub_reset_stripe(struct scrub_stripe *stripe) 1767 { 1768 scrub_stripe_reset_bitmaps(stripe); 1769 1770 stripe->nr_meta_extents = 0; 1771 stripe->nr_data_extents = 0; 1772 stripe->state = 0; 1773 1774 for (int i = 0; i < stripe->nr_sectors; i++) { 1775 stripe->sectors[i].csum = NULL; 1776 stripe->sectors[i].generation = 0; 1777 } 1778 } 1779 1780 static u32 stripe_length(const struct scrub_stripe *stripe) 1781 { 1782 ASSERT(stripe->bg); 1783 1784 return min(BTRFS_STRIPE_LEN, 1785 stripe->bg->start + stripe->bg->length - stripe->logical); 1786 } 1787 1788 static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) 1789 { 1790 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1791 struct btrfs_bio *bbio = NULL; 1792 unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; 1793 const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); 1794 u64 stripe_len = BTRFS_STRIPE_LEN; 1795 int mirror = stripe->mirror_num; 1796 int i; 1797 1798 atomic_inc(&stripe->pending_io); 1799 1800 for_each_set_bit(i, &has_extent, stripe->nr_sectors) { 1801 /* We're beyond the chunk boundary, no need to read anymore. */ 1802 if (i >= nr_sectors) 1803 break; 1804 1805 /* The current sector cannot be merged, submit the bio. 
*/ 1806 if (bbio && 1807 ((i > 0 && !test_bit(i - 1, &has_extent)) || 1808 bbio->bio.bi_iter.bi_size >= stripe_len)) { 1809 ASSERT(bbio->bio.bi_iter.bi_size); 1810 atomic_inc(&stripe->pending_io); 1811 btrfs_submit_bbio(bbio, mirror); 1812 bbio = NULL; 1813 } 1814 1815 if (!bbio) { 1816 struct btrfs_io_stripe io_stripe = {}; 1817 struct btrfs_io_context *bioc = NULL; 1818 const u64 logical = stripe->logical + 1819 (i << fs_info->sectorsize_bits); 1820 int ret; 1821 1822 io_stripe.rst_search_commit_root = true; 1823 stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; 1824 /* 1825 * For RST cases, we need to manually split the bbio to 1826 * follow the RST boundary. 1827 */ 1828 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 1829 &stripe_len, &bioc, &io_stripe, &mirror); 1830 btrfs_put_bioc(bioc); 1831 if (ret < 0) { 1832 if (ret != -ENODATA) { 1833 /* 1834 * Earlier btrfs_get_raid_extent_offset() 1835 * returned -ENODATA, which means there's 1836 * no entry for the corresponding range 1837 * in the stripe tree. But if it's in 1838 * the extent tree, then it's a preallocated 1839 * extent and not an error. 
*/
					scrub_bitmap_set_bit_io_error(stripe, i);
					scrub_bitmap_set_bit_error(stripe, i);
				}
				continue;
			}

			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
						logical, scrub_read_endio, stripe);
		}

		scrub_bio_add_sector(bbio, stripe, i);
	}

	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
	}

	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}

/*
 * Submit the initial read for @stripe.
 *
 * Block groups that need stripe tree updates are handed over to
 * scrub_submit_extent_sector_read().  Otherwise a single bio covering all
 * sectors of the stripe is built and submitted to stripe->mirror_num, or to
 * the next mirror when running dev-replace and the source device should be
 * avoided or has no bdev.
 *
 * The stripe must already be initialized (SCRUB_STRIPE_FLAG_INITIALIZED) and
 * have a block group and a valid (> 0) mirror number.
 */
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
				      struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_bio *bbio;
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
	int mirror = stripe->mirror_num;

	ASSERT(stripe->bg);
	ASSERT(stripe->mirror_num > 0);
	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));

	if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
		scrub_submit_extent_sector_read(stripe);
		return;
	}

	bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ,
				stripe->logical, scrub_read_endio, stripe);
	/* Read the whole range inside the chunk boundary. */
	for (unsigned int cur = 0; cur < nr_sectors; cur++)
		scrub_bio_add_sector(bbio, stripe, cur);
	/* Account for the bio we are about to submit. */
	atomic_inc(&stripe->pending_io);

	/*
	 * For dev-replace, either user asks to avoid the source dev, or
	 * the device is missing, we try the next mirror instead.
	 */
	if (sctx->is_dev_replace &&
	    (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
	     !stripe->dev->bdev)) {
		int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
						  stripe->bg->length);

		mirror = calc_next_mirror(mirror, num_copies);
	}
	btrfs_submit_bbio(bbio, mirror);
}

/*
 * Check whether @stripe still has an error on a metadata sector.
 *
 * Walks the set bits of the stripe's error bitmap; the first one that is also
 * marked as metadata is logged and makes the function return true.
 * Return false if no errored sector is metadata.
 */
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
	const unsigned long error = scrub_bitmap_read_error(stripe);
	int i;

	for_each_set_bit(i, &error, stripe->nr_sectors) {
		if (scrub_bitmap_test_bit_is_metadata(stripe, i)) {
			struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

			btrfs_err(fs_info,
		    "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
				  stripe->logical,
				  stripe->logical + (i << fs_info->sectorsize_bits));
			return true;
		}
	}
	return false;
}

/*
 * Submit the initial reads for @nr_stripes stripes of sctx->stripes[],
 * starting at slot @first_slot.
 *
 * The device IO is throttled first (using the device of sctx->stripes[0]),
 * then all reads are submitted inside one block plug.
 */
static void submit_initial_group_read(struct scrub_ctx *sctx,
				      unsigned int first_slot,
				      unsigned int nr_stripes)
{
	struct blk_plug plug;

	ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
	ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);

	scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
			      btrfs_stripe_nr_to_offset(nr_stripes));
	blk_start_plug(&plug);
	for (int i = 0; i < nr_stripes; i++) {
		struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];

		/* Those stripes should be initialized. */
		ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
		scrub_submit_initial_read(sctx, stripe);
	}
	blk_finish_plug(&plug);
}

/*
 * Finish all stripes queued in sctx->stripes[0 .. cur_stripe).
 *
 * Steps:
 * - submit the trailing, partially filled group that was not submitted yet
 * - wait until read-repair of every queued stripe is done
 * - for dev-replace, abort with -EIO if any stripe has a metadata error,
 *   otherwise write back the good sectors (has_extent & ~error)
 * - wait for the stripe IO, update stat.last_physical and reset each stripe
 *
 * sctx->cur_stripe is reset to 0 on every exit path past the empty check.
 * Note that on the -EIO path the stripes themselves are not reset here.
 *
 * Return 0 on success (or if nothing was queued), -EIO as described above.
 */
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_stripe *stripe;
	const int nr_stripes = sctx->cur_stripe;
	int ret = 0;

	if (!nr_stripes)
		return 0;

	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

	/* Submit the stripes which are populated but not submitted. */
	if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
		const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);

		submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
	}

	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}

	/* Submit for dev-replace. */
	if (sctx->is_dev_replace) {
		/*
		 * For dev-replace, if we know there is something wrong with
		 * metadata, we should immediately abort.
		 */
		for (int i = 0; i < nr_stripes; i++) {
			if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) {
				ret = -EIO;
				goto out;
			}
		}
		for (int i = 0; i < nr_stripes; i++) {
			unsigned long good;
			unsigned long has_extent;
			unsigned long error;

			stripe = &sctx->stripes[i];

			ASSERT(stripe->dev == fs_info->dev_replace.srcdev);

			has_extent = scrub_bitmap_read_has_extent(stripe);
			error = scrub_bitmap_read_error(stripe);
			/* Only sectors covered by an extent and without error. */
			bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors);
			scrub_write_sectors(sctx, stripe, good, true);
		}
	}

	/* Wait for the above writebacks to finish. */
	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_scrub_stripe_io(stripe);
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = stripe->physical + stripe_length(stripe);
		spin_unlock(&sctx->stat_lock);
		scrub_reset_stripe(stripe);
	}
out:
	sctx->cur_stripe = 0;
	return ret;
}

/* Bio end_io callback: signal the completion stored in bio->bi_private. */
static void raid56_scrub_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

/*
 * Fill the next free slot of sctx->stripes[] with the first stripe found in
 * the given logical range and queue it for scrubbing.
 *
 * A full group of SCRUB_STRIPES_PER_GROUP stripes is submitted as soon as it
 * is filled, and all stripes are flushed once the last slot is used.
 *
 * Return 0 if a stripe was queued, in which case *@found_logical_ret is set
 * to the logical start of that stripe.
 * Return >0 if there are no more extents in the range.
 * Return <0 on error (including errors from flushing).
 */
static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
			      struct btrfs_device *dev, int mirror_num,
			      u64 logical, u32 length, u64 physical,
			      u64 *found_logical_ret)
{
	struct scrub_stripe *stripe;
	int ret;

	/*
	 * There should always be one slot left, as caller filling the last
	 * slot should flush them all.
	 */
	ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);

	/* @found_logical_ret must be specified. */
	ASSERT(found_logical_ret);

	stripe = &sctx->stripes[sctx->cur_stripe];
	scrub_reset_stripe(stripe);
	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
					   &sctx->csum_path, dev, physical,
					   mirror_num, logical, length, stripe);
	/* Either >0 as no more extents or <0 for error. */
	if (ret)
		return ret;
	*found_logical_ret = stripe->logical;
	sctx->cur_stripe++;

	/* We filled one group, submit it. */
	if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
		const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;

		submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
	}

	/* Last slot used, flush them all. */
	if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
		return flush_scrub_stripes(sctx);
	return 0;
}

/*
 * Return 0 if we should not cancel the scrub.
2066 * Return <0 if we need to cancel the scrub, returned value will 2067 * indicate the reason: 2068 * - -ECANCELED - Being explicitly canceled through ioctl. 2069 * - -EINTR - Being interrupted by signal or fs/process freezing. 2070 */ 2071 static int should_cancel_scrub(const struct scrub_ctx *sctx) 2072 { 2073 struct btrfs_fs_info *fs_info = sctx->fs_info; 2074 2075 if (atomic_read(&fs_info->scrub_cancel_req) || 2076 atomic_read(&sctx->cancel_req)) 2077 return -ECANCELED; 2078 2079 /* 2080 * The user (e.g. fsfreeze command) or power management (PM) 2081 * suspend/hibernate can freeze the fs. And PM suspend/hibernate will 2082 * also freeze all user processes. 2083 * 2084 * A user process can only be frozen when it is in user space, thus we 2085 * have to cancel the run so that the process can return to the user 2086 * space. 2087 * 2088 * Furthermore we have to check both filesystem and process freezing, 2089 * as PM can be configured to freeze the filesystems before processes. 2090 * 2091 * If we only check fs freezing, then suspend without fs freezing 2092 * will timeout, as the process is still in kernel space. 2093 * 2094 * If we only check process freezing, then suspend with fs freezing 2095 * will timeout, as the running scrub will prevent the fs from being frozen. 
2096 */ 2097 if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || 2098 freezing(current) || signal_pending(current)) 2099 return -EINTR; 2100 return 0; 2101 } 2102 2103 static int scrub_raid56_cached_parity(struct scrub_ctx *sctx, 2104 struct btrfs_device *scrub_dev, 2105 struct btrfs_chunk_map *map, 2106 u64 full_stripe_start, 2107 unsigned long *extent_bitmap) 2108 { 2109 DECLARE_COMPLETION_ONSTACK(io_done); 2110 struct btrfs_fs_info *fs_info = sctx->fs_info; 2111 struct btrfs_io_context *bioc = NULL; 2112 struct btrfs_raid_bio *rbio; 2113 struct bio bio; 2114 const int data_stripes = nr_data_stripes(map); 2115 u64 length = btrfs_stripe_nr_to_offset(data_stripes); 2116 int ret; 2117 2118 bio_init(&bio, NULL, NULL, 0, REQ_OP_READ); 2119 bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; 2120 bio.bi_private = &io_done; 2121 bio.bi_end_io = raid56_scrub_wait_endio; 2122 2123 btrfs_bio_counter_inc_blocked(fs_info); 2124 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, 2125 &length, &bioc, NULL, NULL); 2126 if (ret < 0) 2127 goto out; 2128 /* For RAID56 write there must be an @bioc allocated. */ 2129 ASSERT(bioc); 2130 rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap, 2131 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 2132 btrfs_put_bioc(bioc); 2133 if (!rbio) { 2134 ret = -ENOMEM; 2135 goto out; 2136 } 2137 /* Use the recovered stripes as cache to avoid read them from disk again. 
*/ 2138 for (int i = 0; i < data_stripes; i++) { 2139 struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; 2140 2141 raid56_parity_cache_data_folios(rbio, stripe->folios, 2142 full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); 2143 } 2144 raid56_parity_submit_scrub_rbio(rbio); 2145 wait_for_completion_io(&io_done); 2146 ret = blk_status_to_errno(bio.bi_status); 2147 out: 2148 btrfs_bio_counter_dec(fs_info); 2149 bio_uninit(&bio); 2150 return ret; 2151 } 2152 2153 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, 2154 struct btrfs_device *scrub_dev, 2155 struct btrfs_block_group *bg, 2156 struct btrfs_chunk_map *map, 2157 u64 full_stripe_start) 2158 { 2159 struct btrfs_fs_info *fs_info = sctx->fs_info; 2160 BTRFS_PATH_AUTO_RELEASE(extent_path); 2161 BTRFS_PATH_AUTO_RELEASE(csum_path); 2162 struct scrub_stripe *stripe; 2163 bool all_empty = true; 2164 const int data_stripes = nr_data_stripes(map); 2165 unsigned long extent_bitmap = 0; 2166 int ret; 2167 2168 ASSERT(sctx->raid56_data_stripes); 2169 2170 ret = should_cancel_scrub(sctx); 2171 if (ret < 0) 2172 return ret; 2173 2174 if (atomic_read(&fs_info->scrub_pause_req)) 2175 scrub_blocked_if_needed(fs_info); 2176 2177 spin_lock(&bg->lock); 2178 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2179 spin_unlock(&bg->lock); 2180 return 0; 2181 } 2182 spin_unlock(&bg->lock); 2183 2184 /* 2185 * For data stripe search, we cannot reuse the same extent/csum paths, 2186 * as the data stripe bytenr may be smaller than previous extent. Thus 2187 * we have to use our own extent/csum paths. 
2188 */ 2189 extent_path.search_commit_root = true; 2190 extent_path.skip_locking = true; 2191 csum_path.search_commit_root = true; 2192 csum_path.skip_locking = true; 2193 2194 for (int i = 0; i < data_stripes; i++) { 2195 int stripe_index; 2196 int rot; 2197 u64 physical; 2198 2199 stripe = &sctx->raid56_data_stripes[i]; 2200 rot = div_u64(full_stripe_start - bg->start, 2201 data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; 2202 stripe_index = (i + rot) % map->num_stripes; 2203 physical = map->stripes[stripe_index].physical + 2204 btrfs_stripe_nr_to_offset(rot); 2205 2206 scrub_reset_stripe(stripe); 2207 set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); 2208 ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, 2209 map->stripes[stripe_index].dev, physical, 1, 2210 full_stripe_start + btrfs_stripe_nr_to_offset(i), 2211 BTRFS_STRIPE_LEN, stripe); 2212 if (ret < 0) 2213 return ret; 2214 /* 2215 * No extent in this data stripe, need to manually mark them 2216 * initialized to make later read submission happy. 2217 */ 2218 if (ret > 0) { 2219 stripe->logical = full_stripe_start + 2220 btrfs_stripe_nr_to_offset(i); 2221 stripe->dev = map->stripes[stripe_index].dev; 2222 stripe->mirror_num = 1; 2223 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 2224 } 2225 } 2226 2227 /* Check if all data stripes are empty. */ 2228 for (int i = 0; i < data_stripes; i++) { 2229 stripe = &sctx->raid56_data_stripes[i]; 2230 if (!scrub_bitmap_empty_has_extent(stripe)) { 2231 all_empty = false; 2232 break; 2233 } 2234 } 2235 if (all_empty) 2236 return 0; 2237 2238 for (int i = 0; i < data_stripes; i++) { 2239 stripe = &sctx->raid56_data_stripes[i]; 2240 scrub_submit_initial_read(sctx, stripe); 2241 } 2242 for (int i = 0; i < data_stripes; i++) { 2243 stripe = &sctx->raid56_data_stripes[i]; 2244 2245 wait_event(stripe->repair_wait, 2246 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); 2247 } 2248 /* For now, no zoned support for RAID56. 
*/ 2249 ASSERT(!btrfs_is_zoned(sctx->fs_info)); 2250 2251 /* 2252 * Now all data stripes are properly verified. Check if we have any 2253 * unrepaired, if so abort immediately or we could further corrupt the 2254 * P/Q stripes. 2255 * 2256 * During the loop, also populate extent_bitmap. 2257 */ 2258 for (int i = 0; i < data_stripes; i++) { 2259 unsigned long error; 2260 unsigned long has_extent; 2261 2262 stripe = &sctx->raid56_data_stripes[i]; 2263 2264 error = scrub_bitmap_read_error(stripe); 2265 has_extent = scrub_bitmap_read_has_extent(stripe); 2266 2267 /* 2268 * We should only check the errors where there is an extent. 2269 * As we may hit an empty data stripe while it's missing. 2270 */ 2271 bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); 2272 if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { 2273 btrfs_err(fs_info, 2274 "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", 2275 full_stripe_start, i, stripe->nr_sectors, 2276 &error); 2277 return ret; 2278 } 2279 bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, 2280 stripe->nr_sectors); 2281 } 2282 2283 /* Now we can check and regenerate the P/Q stripe. */ 2284 return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, 2285 &extent_bitmap); 2286 } 2287 2288 /* 2289 * Scrub one range which can only has simple mirror based profile. 2290 * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in 2291 * RAID0/RAID10). 2292 * 2293 * Since we may need to handle a subset of block group, we need @logical_start 2294 * and @logical_length parameter. 
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       u64 logical_start, u64 logical_length,
			       struct btrfs_device *device,
			       u64 physical, int mirror_num)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 logical_end = logical_start + logical_length;
	u64 cur_logical = logical_start;
	int ret = 0;

	/* The range must be inside the bg */
	ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg));

	/* Go through each extent items inside the logical range */
	while (cur_logical < logical_end) {
		u64 found_logical = U64_MAX;
		u64 cur_physical = physical + cur_logical - logical_start;

		ret = should_cancel_scrub(sctx);
		if (ret < 0)
			break;

		if (atomic_read(&fs_info->scrub_pause_req))
			scrub_blocked_if_needed(fs_info);

		/* A removed block group is silently skipped, not an error. */
		spin_lock(&bg->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
			spin_unlock(&bg->lock);
			ret = 0;
			break;
		}
		spin_unlock(&bg->lock);

		ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
					 cur_logical, logical_end - cur_logical,
					 cur_physical, &found_logical);
		if (ret > 0) {
			/* No more extent, just update the accounting */
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = physical + logical_length;
			spin_unlock(&sctx->stat_lock);
			ret = 0;
			break;
		}
		if (ret < 0)
			break;

		/* queue_scrub_stripe() returned 0, @found_logical must be updated. */
		ASSERT(found_logical != U64_MAX);
		cur_logical = found_logical + BTRFS_STRIPE_LEN;

		/* Don't hold CPU for too long time */
		cond_resched();
	}
	return ret;
}

/* Calculate the full stripe length for simple stripe based profiles */
static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));

	return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes);
}

/* Get the logical bytenr for the stripe */
static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
				     struct btrfs_block_group *bg,
				     int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/*
	 * (stripe_index / sub_stripes) gives how many data stripes we need to
	 * skip.
	 */
	return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) +
	       bg->start;
}

/* Get the mirror number for the stripe */
static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/*
	 * The result is (stripe_index % sub_stripes) + 1, i.e. mirror numbers
	 * start at 1 and cycle through 1..sub_stripes as @stripe_index grows.
	 */
	return stripe_index % map->sub_stripes + 1;
}

/*
 * Scrub all stripes of a RAID0/RAID10 chunk that live on @device at
 * @stripe_index.
 *
 * Walks the block group in steps of one full stripe length logically and
 * BTRFS_STRIPE_LEN physically, scrubbing each BTRFS_STRIPE_LEN sized piece
 * with scrub_simple_mirror().
 *
 * Return 0 on success, or the first non-zero value returned by
 * scrub_simple_mirror().
 */
static int scrub_simple_stripe(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct btrfs_chunk_map *map,
			       struct btrfs_device *device,
			       int stripe_index)
{
	const u64 logical_increment = simple_stripe_full_stripe_len(map);
	const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
	const u64 orig_physical = map->stripes[stripe_index].physical;
	const u64 end = btrfs_block_group_end(bg);
	const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
	u64 cur_logical = orig_logical;
	u64 cur_physical = orig_physical;
	int ret = 0;

	while (cur_logical < end) {
		/*
		 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
		 * this stripe.
		 */
		ret = scrub_simple_mirror(sctx, bg, cur_logical,
					  BTRFS_STRIPE_LEN, device, cur_physical,
					  mirror_num);
		if (ret)
			return ret;
		/* Skip to next stripe which belongs to the target device */
		cur_logical += logical_increment;
		/* For physical offset, we just go to next stripe */
		cur_physical += BTRFS_STRIPE_LEN;
	}
	return ret;
}

/*
 * Scrub the device stripe @stripe_index of the chunk described by @map on
 * @scrub_dev.
 *
 * Dispatches on the block group profile: profiles without striping go through
 * scrub_simple_mirror(), RAID0/RAID10 through scrub_simple_stripe(), and
 * RAID56 is walked by physical offset, scrubbing data stripes with
 * scrub_simple_mirror() and parity stripes with scrub_raid56_parity_stripe().
 *
 * All queued stripes are flushed before returning, the scrub paths are
 * released and the temporary RAID56 data stripes are freed.  For dev-replace
 * the zoned write pointer is synced afterwards unless an error happened.
 *
 * Return 0 on success, <0 on error.
 */
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
					   struct btrfs_block_group *bg,
					   struct btrfs_chunk_map *map,
					   struct btrfs_device *scrub_dev,
					   int stripe_index)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	const u64 chunk_logical = bg->start;
	int ret;
	int ret2;
	u64 physical = map->stripes[stripe_index].physical;
	const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
	const u64 physical_end = physical + dev_stripe_len;
	u64 logical;
	u64 logic_end;
	/* The logical increment after finishing one stripe */
	u64 increment;
	/* Offset inside the chunk */
	u64 offset;
	u64 stripe_logical;

	/* Extent_path should be released by now. */
	ASSERT(sctx->extent_path.nodes[0] == NULL);

	scrub_blocked_if_needed(fs_info);

	if (sctx->is_dev_replace &&
	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
		mutex_lock(&sctx->wr_lock);
		sctx->write_pointer = physical;
		mutex_unlock(&sctx->wr_lock);
	}

	/*
	 * Prepare the extra data stripes used by RAID56.
	 *
	 * On failure here @offset is not initialized yet; that is fine as it
	 * is only used at the out: label when ret >= 0.
	 */
	if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ASSERT(sctx->raid56_data_stripes == NULL);

		sctx->raid56_data_stripes = kzalloc_objs(struct scrub_stripe,
							 nr_data_stripes(map));
		if (!sctx->raid56_data_stripes) {
			ret = -ENOMEM;
			goto out;
		}
		for (int i = 0; i < nr_data_stripes(map); i++) {
			ret = init_scrub_stripe(fs_info,
						&sctx->raid56_data_stripes[i]);
			if (ret < 0)
				goto out;
			sctx->raid56_data_stripes[i].bg = bg;
			sctx->raid56_data_stripes[i].sctx = sctx;
		}
	}
	/*
	 * There used to be a big double loop to handle all profiles using the
	 * same routine, which grows larger and more gross over time.
	 *
	 * So here we handle each profile differently, so simpler profiles
	 * have simpler scrubbing function.
	 */
	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * Above check rules out all complex profile, the remaining
		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
		 * mirrored duplication without stripe.
		 *
		 * Only @physical and @mirror_num needs to calculated using
		 * @stripe_index.
		 */
		ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
				scrub_dev, map->stripes[stripe_index].physical,
				stripe_index + 1);
		offset = 0;
		goto out;
	}
	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
		offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
		goto out;
	}

	/* Only RAID56 goes through the old code */
	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
	ret = 0;

	/* Calculate the logical end of the stripe */
	get_raid56_logic_offset(physical_end, stripe_index,
				map, &logic_end, NULL);
	logic_end += chunk_logical;

	/* Initialize @offset in case we need to go to out: label */
	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
	increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

	/*
	 * Due to the rotation, for RAID56 it's better to iterate each stripe
	 * using their physical offset.
	 */
	while (physical < physical_end) {
		ret = get_raid56_logic_offset(physical, stripe_index, map,
					      &logical, &stripe_logical);
		logical += chunk_logical;
		if (ret) {
			/* it is parity strip */
			stripe_logical += chunk_logical;
			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
							 map, stripe_logical);
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
						       physical_end);
			spin_unlock(&sctx->stat_lock);
			if (ret)
				goto out;
			goto next;
		}

		/*
		 * Now we're at a data stripe, scrub each extents in the range.
		 *
		 * At this stage, if we ignore the repair part, inside each data
		 * stripe it is no different than SINGLE profile.
		 * We can reuse scrub_simple_mirror() here, as the repair part
		 * is still based on @mirror_num.
		 */
		ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
					  scrub_dev, physical, 1);
		if (ret < 0)
			goto out;
next:
		logical += increment;
		physical += BTRFS_STRIPE_LEN;
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = physical;
		spin_unlock(&sctx->stat_lock);
	}
out:
	/* Always flush; keep the first error if there already is one. */
	ret2 = flush_scrub_stripes(sctx);
	if (!ret)
		ret = ret2;
	btrfs_release_path(&sctx->extent_path);
	btrfs_release_path(&sctx->csum_path);

	if (sctx->raid56_data_stripes) {
		for (int i = 0; i < nr_data_stripes(map); i++)
			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
		kfree(sctx->raid56_data_stripes);
		sctx->raid56_data_stripes = NULL;
	}

	if (sctx->is_dev_replace && ret >= 0) {
		ret2 = sync_write_pointer_for_zoned(sctx,
				chunk_logical + offset,
				map->stripes[stripe_index].physical,
				physical_end);
		if (ret2)
			ret = ret2;
	}

	/* Positive values are internal only, never returned to the caller. */
	return ret < 0 ? ret : 0;
}

/*
 * Scrub the part of block group @bg that is stored in the device extent at
 * @dev_offset on @scrub_dev.
 *
 * Looks up the chunk map of @bg and calls scrub_stripe() for every stripe of
 * the map that matches both the bdev of @scrub_dev and @dev_offset.
 *
 * Return 0 on success, or if the chunk map is gone because the block group
 * was removed, or if the map does not match @bg / @dev_extent_len.
 * Return -EINVAL if the map is missing for a block group that is not removed,
 * or the error returned by scrub_stripe().
 */
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
					  struct btrfs_block_group *bg,
					  struct btrfs_device *scrub_dev,
					  u64 dev_offset,
					  u64 dev_extent_len)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_chunk_map *map;
	int i;
	int ret = 0;

	map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
	if (!map) {
		/*
		 * Might have been an unused block group deleted by the cleaner
		 * kthread or relocation.
		 */
		spin_lock(&bg->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
			ret = -EINVAL;
		spin_unlock(&bg->lock);

		return ret;
	}
	if (map->start != bg->start)
		goto out;
	if (map->chunk_len < dev_extent_len)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
			if (ret)
				goto out;
		}
	}
out:
	btrfs_free_chunk_map(map);

	return ret;
}

/*
 * On zoned filesystems, wait for all pending writes into @cache (block group
 * reservations, nocow writers and ordered extents) and commit the current
 * transaction.  Does nothing and returns 0 on non-zoned filesystems.
 *
 * Return the result of btrfs_commit_current_transaction() otherwise.
 */
static int finish_extent_writes_for_zoned(struct btrfs_root *root,
					  struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	btrfs_wait_block_group_reservations(cache);
	btrfs_wait_nocow_writers(cache);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);

	return btrfs_commit_current_transaction(root);
}

/*
 * Iterate the device extent items of @scrub_dev in the commit root of the
 * device tree and scrub every chunk whose device extent overlaps the physical
 * range [@start, @end).
 *
 * For each chunk the block group is looked up, frozen and (if possible)
 * marked read-only while it is scrubbed, and the dev-replace cursors are
 * updated around the scrub of the chunk.
 *
 * Return 0 when the iteration finished, <0 on error (including -EIO when
 * dev-replace hit write errors and -ENOMEM when allocation errors were
 * recorded in the scrub statistics).
 */
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
			   struct btrfs_device *scrub_dev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	u64 chunk_offset;
	int ret = 0;
	int ro_set;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group *cache;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->search_commit_root = true;
	path->skip_locking = true;

	key.objectid = scrub_dev->devid;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0ull;

	while (1) {
		u64 dev_extent_len;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					break;
				if (ret > 0) {
					ret = 0;
					break;
				}
			} else {
				ret = 0;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != scrub_dev->devid)
			break;

		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);

		/* Device extent ends before the requested range, skip it. */
		if (found_key.offset + dev_extent_len <= start)
			goto skip;

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);

		/* some chunks are removed but not committed to disk yet,
		 * continue scrubbing */
		if (!cache)
			goto skip;

		ASSERT(cache->start <= chunk_offset);
		/*
		 * We are using the commit root to search for device extents, so
		 * that means we could have found a device extent item from a
		 * block group that was deleted in the current transaction. The
		 * logical start offset of the deleted block group, stored at
		 * @chunk_offset, might be part of the logical address range of
		 * a new block group (which uses different physical extents).
		 * In this case btrfs_lookup_block_group() has returned the new
		 * block group, and its start address is less than @chunk_offset.
		 *
		 * We skip such new block groups, because it's pointless to
		 * process them, as we won't find their extents because we search
		 * for them using the commit root of the extent tree. For a device
		 * replace it's also fine to skip it, we won't miss copying them
		 * to the target device because we have the write duplication
		 * setup through the regular write path (by btrfs_map_block()),
		 * and we have committed a transaction when we started the device
		 * replace, right after setting up the device replace state.
		 */
		if (cache->start < chunk_offset) {
			btrfs_put_block_group(cache);
			goto skip;
		}

		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
				btrfs_put_block_group(cache);
				goto skip;
			}
		}

		/*
		 * Make sure that while we are scrubbing the corresponding block
		 * group doesn't get its logical address and its device extents
		 * reused for another block group, which can possibly be of a
		 * different type and different profile. We do this to prevent
		 * false error detections and crashes due to bogus attempts to
		 * repair extents.
		 */
		spin_lock(&cache->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
			spin_unlock(&cache->lock);
			btrfs_put_block_group(cache);
			goto skip;
		}
		btrfs_freeze_block_group(cache);
		spin_unlock(&cache->lock);

		/*
		 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
		 * to avoid deadlock caused by:
		 * btrfs_inc_block_group_ro()
		 * -> btrfs_wait_for_commit()
		 * -> btrfs_commit_transaction()
		 * -> btrfs_scrub_pause()
		 */
		scrub_pause_on(fs_info);

		/*
		 * Don't do chunk preallocation for scrub.
		 *
		 * This is especially important for SYSTEM bgs, or we can hit
		 * -EFBIG from btrfs_finish_chunk_alloc() like:
		 * 1. The only SYSTEM bg is marked RO.
		 *    Since SYSTEM bg is small, that's pretty common.
		 * 2. New SYSTEM bg will be allocated
		 *    Due to regular version will allocate new chunk.
		 * 3. New SYSTEM bg is empty and will get cleaned up
		 *    Before cleanup really happens, it's marked RO again.
		 * 4. Empty SYSTEM bg get scrubbed
		 *    We go back to 2.
		 *
		 * This can easily boost the amount of SYSTEM chunks if cleaner
		 * thread can't be triggered fast enough, and use up all space
		 * of btrfs_super_block::sys_chunk_array
		 *
		 * While for dev replace, we need to try our best to mark block
		 * group RO, to prevent race between:
		 * - Write duplication
		 *   Contains latest data
		 * - Scrub copy
		 *   Contains data from commit tree
		 *
		 * If target block group is not marked RO, nocow writes can
		 * be overwritten by scrub copy, causing data corruption.
		 * So for dev-replace, it's not allowed to continue if a block
		 * group is not RO.
		 */
		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
		if (!ret && sctx->is_dev_replace) {
			ret = finish_extent_writes_for_zoned(root, cache);
			if (ret) {
				btrfs_dec_block_group_ro(cache);
				scrub_pause_off(fs_info);
				btrfs_put_block_group(cache);
				break;
			}
		}

		if (ret == 0) {
			ro_set = 1;
		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
			/*
			 * btrfs_inc_block_group_ro return -ENOSPC when it
			 * failed in creating new chunk for metadata.
			 * It is not a problem for scrub, because
			 * metadata are always cowed, and our scrub paused
			 * commit_transactions.
			 *
			 * For RAID56 chunks, we have to mark them read-only
			 * for scrub, as later we would use our own cache
			 * out of RAID56 realm.
			 * Thus we want the RAID56 bg to be marked RO to
			 * prevent RMW from screwing up out cache.
			 */
			ro_set = 0;
		} else if (ret == -ETXTBSY) {
			btrfs_warn(fs_info,
	     "scrub: skipping scrub of block group %llu due to active swapfile",
				   cache->start);
			scrub_pause_off(fs_info);
			ret = 0;
			goto skip_unfreeze;
		} else {
			btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
				   ret);
			btrfs_unfreeze_block_group(cache);
			btrfs_put_block_group(cache);
			scrub_pause_off(fs_info);
			break;
		}

		/*
		 * Now the target block is marked RO, wait for nocow writes to
		 * finish before dev-replace.
		 * COW is fine, as COW never overwrites extents in commit tree.
		 */
		if (sctx->is_dev_replace) {
			btrfs_wait_nocow_writers(cache);
			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
		}

		scrub_pause_off(fs_info);
		/* Publish the physical range being processed for dev-replace. */
		down_write(&dev_replace->rwsem);
		dev_replace->cursor_right = found_key.offset + dev_extent_len;
		dev_replace->cursor_left = found_key.offset;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
				  dev_extent_len);
		if (sctx->is_dev_replace &&
		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
						      cache, found_key.offset))
			ro_set = 0;

		down_write(&dev_replace->rwsem);
		dev_replace->cursor_left = dev_replace->cursor_right;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		if (ro_set)
			btrfs_dec_block_group_ro(cache);

		/*
		 * We might have prevented the cleaner kthread from deleting
		 * this block group if it was already unused because we raced
		 * and set it to RO mode first. So add it back to the unused
		 * list, otherwise it might not ever be deleted unless a manual
		 * balance is triggered or it becomes used and unused again.
		 */
		spin_lock(&cache->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
		    !cache->ro && cache->reserved == 0 && cache->used == 0) {
			spin_unlock(&cache->lock);
			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
				btrfs_discard_queue_work(&fs_info->discard_ctl,
							 cache);
			else
				btrfs_mark_bg_unused(cache);
		} else {
			spin_unlock(&cache->lock);
		}
skip_unfreeze:
		btrfs_unfreeze_block_group(cache);
		btrfs_put_block_group(cache);
		if (ret)
			break;
		if (unlikely(sctx->is_dev_replace &&
			     atomic64_read(&dev_replace->num_write_errors) > 0)) {
			ret = -EIO;
			break;
		}
		if (sctx->stat.malloc_errors > 0) {
			ret = -ENOMEM;
			break;
		}
skip:
		/* Continue the search right after the current device extent. */
		key.offset = found_key.offset + dev_extent_len;
		btrfs_release_path(path);
	}

	return ret;
}

/*
 * Read one super block copy of @dev at @physical into @page and verify it.
 *
 * The checksum is verified first, then the generation is compared against
 * @generation, and finally the super block is validated.
 *
 * Return 0 if the copy is fine, -EIO on bad checksum, -EUCLEAN on generation
 * mismatch, or the error from the read or from btrfs_validate_super().
 */
static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
			   struct page *page, u64 physical, u64 generation)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_super_block *sb = page_address(page);
	int ret;

	ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
			   BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
	if (ret < 0)
		return ret;
	ret = btrfs_check_super_csum(fs_info, sb);
	if (unlikely(ret != 0)) {
		btrfs_err_rl(fs_info,
		  "scrub: super block at physical %llu devid %llu has bad csum",
			physical, dev->devid);
		return -EIO;
	}
	if (unlikely(btrfs_super_generation(sb) != generation)) {
		btrfs_err_rl(fs_info,
"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
			     physical, dev->devid,
			     btrfs_super_generation(sb), generation);
		return -EUCLEAN;
	}

	return btrfs_validate_super(fs_info, sb, -1);
}

/*
 * Verify all super block copies of @scrub_dev.
 *
 * Failures of individual copies are only counted in sctx->stat.super_errors
 * and do not fail the function.
 *
 * Return -EROFS if the filesystem is in an error state, -ENOMEM if the
 * temporary page cannot be allocated, 0 otherwise.
 */
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
					   struct btrfs_device *scrub_dev)
{
	int	i;
	u64	bytenr;
	u64	gen;
	int ret = 0;
	struct page *page;
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	if (unlikely(BTRFS_FS_ERROR(fs_info)))
		return -EROFS;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* Seed devices of a new filesystem has their own generation. */
	if (scrub_dev->fs_devices != fs_info->fs_devices)
		gen = scrub_dev->generation;
	else
		gen = btrfs_get_last_trans_committed(fs_info);

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr);
		if (ret == -ENOENT)
			break;

		if (ret) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.super_errors++;
			spin_unlock(&sctx->stat_lock);
			continue;
		}

		/* This copy would not fit inside the device, stop here. */
		if (bytenr + BTRFS_SUPER_INFO_SIZE >
		    scrub_dev->commit_total_bytes)
			break;
		if (!btrfs_check_super_location(scrub_dev, bytenr))
			continue;

		ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
		if (ret) {
			spin_lock(&sctx->stat_lock);
			sctx->stat.super_errors++;
			spin_unlock(&sctx->stat_lock);
		}
	}
	__free_page(page);
	return 0;
}

/*
 * Drop one reference on fs_info->scrub_workers.
 *
 * When the last reference is dropped, the workqueue pointer is cleared under
 * fs_info->scrub_lock and the workqueue is destroyed after the lock has been
 * released.
 */
static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
					&fs_info->scrub_lock)) {
		struct workqueue_struct *scrub_workers = fs_info->scrub_workers;

		fs_info->scrub_workers = NULL;
		mutex_unlock(&fs_info->scrub_lock);

		if (scrub_workers)
			destroy_workqueue(scrub_workers);
	}
}

/*
 * get a reference count on fs_info->scrub_workers.
start worker if necessary 3040 */ 3041 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) 3042 { 3043 struct workqueue_struct *scrub_workers = NULL; 3044 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 3045 int max_active = fs_info->thread_pool_size; 3046 int ret = -ENOMEM; 3047 3048 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) 3049 return 0; 3050 3051 scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); 3052 if (!scrub_workers) 3053 return -ENOMEM; 3054 3055 mutex_lock(&fs_info->scrub_lock); 3056 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { 3057 ASSERT(fs_info->scrub_workers == NULL); 3058 fs_info->scrub_workers = scrub_workers; 3059 refcount_set(&fs_info->scrub_workers_refcnt, 1); 3060 mutex_unlock(&fs_info->scrub_lock); 3061 return 0; 3062 } 3063 /* Other thread raced in and created the workers for us */ 3064 refcount_inc(&fs_info->scrub_workers_refcnt); 3065 mutex_unlock(&fs_info->scrub_lock); 3066 3067 ret = 0; 3068 3069 destroy_workqueue(scrub_workers); 3070 return ret; 3071 } 3072 3073 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3074 u64 end, struct btrfs_scrub_progress *progress, 3075 bool readonly, bool is_dev_replace) 3076 { 3077 struct btrfs_dev_lookup_args args = { .devid = devid }; 3078 struct scrub_ctx *sctx; 3079 int ret; 3080 struct btrfs_device *dev; 3081 unsigned int nofs_flag; 3082 bool need_commit = false; 3083 3084 /* Set the basic fallback @last_physical before we got a sctx. */ 3085 if (progress) 3086 progress->last_physical = start; 3087 3088 if (btrfs_fs_closing(fs_info)) 3089 return -EAGAIN; 3090 3091 /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ 3092 ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); 3093 3094 /* 3095 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible 3096 * value (max nodesize / min sectorsize), thus nodesize should always 3097 * be fine. 
3098 */ 3099 ASSERT(fs_info->nodesize <= 3100 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); 3101 3102 /* Allocate outside of device_list_mutex */ 3103 sctx = scrub_setup_ctx(fs_info, is_dev_replace); 3104 if (IS_ERR(sctx)) 3105 return PTR_ERR(sctx); 3106 sctx->stat.last_physical = start; 3107 3108 ret = scrub_workers_get(fs_info); 3109 if (ret) 3110 goto out_free_ctx; 3111 3112 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3113 dev = btrfs_find_device(fs_info->fs_devices, &args); 3114 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && 3115 !is_dev_replace)) { 3116 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3117 ret = -ENODEV; 3118 goto out; 3119 } 3120 3121 if (!is_dev_replace && !readonly && 3122 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 3123 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3124 btrfs_err(fs_info, 3125 "scrub: devid %llu: filesystem on %s is not writable", 3126 devid, btrfs_dev_name(dev)); 3127 ret = -EROFS; 3128 goto out; 3129 } 3130 3131 mutex_lock(&fs_info->scrub_lock); 3132 if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 3133 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) { 3134 mutex_unlock(&fs_info->scrub_lock); 3135 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3136 ret = -EIO; 3137 goto out; 3138 } 3139 3140 down_read(&fs_info->dev_replace.rwsem); 3141 if (dev->scrub_ctx || 3142 (!is_dev_replace && 3143 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3144 up_read(&fs_info->dev_replace.rwsem); 3145 mutex_unlock(&fs_info->scrub_lock); 3146 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3147 ret = -EINPROGRESS; 3148 goto out; 3149 } 3150 up_read(&fs_info->dev_replace.rwsem); 3151 3152 sctx->readonly = readonly; 3153 dev->scrub_ctx = sctx; 3154 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3155 3156 /* 3157 * checking @scrub_pause_req here, we can avoid 3158 * race between committing transaction and 
scrubbing. 3159 */ 3160 __scrub_blocked_if_needed(fs_info); 3161 atomic_inc(&fs_info->scrubs_running); 3162 mutex_unlock(&fs_info->scrub_lock); 3163 3164 /* 3165 * In order to avoid deadlock with reclaim when there is a transaction 3166 * trying to pause scrub, make sure we use GFP_NOFS for all the 3167 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() 3168 * invoked by our callees. The pausing request is done when the 3169 * transaction commit starts, and it blocks the transaction until scrub 3170 * is paused (done at specific points at scrub_stripe() or right above 3171 * before incrementing fs_info->scrubs_running). 3172 */ 3173 nofs_flag = memalloc_nofs_save(); 3174 if (!is_dev_replace) { 3175 u64 old_super_errors; 3176 3177 spin_lock(&sctx->stat_lock); 3178 old_super_errors = sctx->stat.super_errors; 3179 spin_unlock(&sctx->stat_lock); 3180 3181 btrfs_info(fs_info, "scrub: started on devid %llu", devid); 3182 /* 3183 * by holding device list mutex, we can 3184 * kick off writing super in log tree sync. 3185 */ 3186 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3187 ret = scrub_supers(sctx, dev); 3188 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3189 3190 spin_lock(&sctx->stat_lock); 3191 /* 3192 * Super block errors found, but we can not commit transaction 3193 * at current context, since btrfs_commit_transaction() needs 3194 * to pause the current running scrub (hold by ourselves). 3195 */ 3196 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) 3197 need_commit = true; 3198 spin_unlock(&sctx->stat_lock); 3199 } 3200 3201 if (!ret) 3202 ret = scrub_enumerate_chunks(sctx, dev, start, end); 3203 memalloc_nofs_restore(nofs_flag); 3204 3205 atomic_dec(&fs_info->scrubs_running); 3206 wake_up(&fs_info->scrub_pause_wait); 3207 3208 if (progress) 3209 memcpy(progress, &sctx->stat, sizeof(*progress)); 3210 3211 if (!is_dev_replace) 3212 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", 3213 ret ? 
"not finished" : "finished", devid, ret); 3214 3215 mutex_lock(&fs_info->scrub_lock); 3216 dev->scrub_ctx = NULL; 3217 mutex_unlock(&fs_info->scrub_lock); 3218 3219 scrub_workers_put(fs_info); 3220 scrub_put_ctx(sctx); 3221 3222 /* 3223 * We found some super block errors before, now try to force a 3224 * transaction commit, as scrub has finished. 3225 */ 3226 if (need_commit) { 3227 struct btrfs_trans_handle *trans; 3228 3229 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3230 if (IS_ERR(trans)) { 3231 ret = PTR_ERR(trans); 3232 btrfs_err(fs_info, 3233 "scrub: failed to start transaction to fix super block errors: %d", ret); 3234 return ret; 3235 } 3236 ret = btrfs_commit_transaction(trans); 3237 if (ret < 0) 3238 btrfs_err(fs_info, 3239 "scrub: failed to commit transaction to fix super block errors: %d", ret); 3240 } 3241 return ret; 3242 out: 3243 scrub_workers_put(fs_info); 3244 out_free_ctx: 3245 scrub_free_ctx(sctx); 3246 3247 return ret; 3248 } 3249 3250 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) 3251 { 3252 mutex_lock(&fs_info->scrub_lock); 3253 atomic_inc(&fs_info->scrub_pause_req); 3254 while (atomic_read(&fs_info->scrubs_paused) != 3255 atomic_read(&fs_info->scrubs_running)) { 3256 mutex_unlock(&fs_info->scrub_lock); 3257 wait_event(fs_info->scrub_pause_wait, 3258 atomic_read(&fs_info->scrubs_paused) == 3259 atomic_read(&fs_info->scrubs_running)); 3260 mutex_lock(&fs_info->scrub_lock); 3261 } 3262 mutex_unlock(&fs_info->scrub_lock); 3263 } 3264 3265 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) 3266 { 3267 atomic_dec(&fs_info->scrub_pause_req); 3268 wake_up(&fs_info->scrub_pause_wait); 3269 } 3270 3271 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 3272 { 3273 mutex_lock(&fs_info->scrub_lock); 3274 if (!atomic_read(&fs_info->scrubs_running)) { 3275 mutex_unlock(&fs_info->scrub_lock); 3276 return -ENOTCONN; 3277 } 3278 3279 atomic_inc(&fs_info->scrub_cancel_req); 3280 while (atomic_read(&fs_info->scrubs_running)) { 
3281 mutex_unlock(&fs_info->scrub_lock); 3282 wait_event(fs_info->scrub_pause_wait, 3283 atomic_read(&fs_info->scrubs_running) == 0); 3284 mutex_lock(&fs_info->scrub_lock); 3285 } 3286 atomic_dec(&fs_info->scrub_cancel_req); 3287 mutex_unlock(&fs_info->scrub_lock); 3288 3289 return 0; 3290 } 3291 3292 int btrfs_scrub_cancel_dev(struct btrfs_device *dev) 3293 { 3294 struct btrfs_fs_info *fs_info = dev->fs_info; 3295 struct scrub_ctx *sctx; 3296 3297 mutex_lock(&fs_info->scrub_lock); 3298 sctx = dev->scrub_ctx; 3299 if (!sctx) { 3300 mutex_unlock(&fs_info->scrub_lock); 3301 return -ENOTCONN; 3302 } 3303 atomic_inc(&sctx->cancel_req); 3304 while (dev->scrub_ctx) { 3305 mutex_unlock(&fs_info->scrub_lock); 3306 wait_event(fs_info->scrub_pause_wait, 3307 dev->scrub_ctx == NULL); 3308 mutex_lock(&fs_info->scrub_lock); 3309 } 3310 mutex_unlock(&fs_info->scrub_lock); 3311 3312 return 0; 3313 } 3314 3315 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, 3316 struct btrfs_scrub_progress *progress) 3317 { 3318 struct btrfs_dev_lookup_args args = { .devid = devid }; 3319 struct btrfs_device *dev; 3320 struct scrub_ctx *sctx = NULL; 3321 3322 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3323 dev = btrfs_find_device(fs_info->fs_devices, &args); 3324 if (dev) 3325 sctx = dev->scrub_ctx; 3326 if (sctx) 3327 memcpy(progress, &sctx->stat, sizeof(*progress)); 3328 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3329 3330 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; 3331 } 3332