// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
#include "raid-stripe-tree.h"

/*
 * This is only the first step towards a full-features scrub. It reads all
 * extent and super block and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 * - In case an unrepairable extent is encountered, track which files are
 *   affected and report them
 * - track and record media errors, throw out bad devices
 * - add a mode to also read unallocated space
 */

struct scrub_ctx;

/*
 * The following value only influences the performance.
 *
 * This determines how many stripes would be submitted in one go,
 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
 */
#define SCRUB_STRIPES_PER_GROUP		8

/*
 * How many groups we have for each sctx.
 *
 * This would be 8M per device, the same value as the old scrub in-flight bios
 * size limit.
 */
#define SCRUB_GROUPS_PER_SCTX		16

/* Total in-flight stripes per scrub context (currently 128). */
#define SCRUB_TOTAL_STRIPES		(SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	/*
	 * A sector is either data or metadata; which union member is valid
	 * is determined by the is_metadata sub-bitmap of the owning stripe.
	 */
	union {
		/*
		 * Csum pointer for data csum verification. Should point to a
		 * sector csum inside scrub_stripe::csums.
		 *
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;

		/*
		 * Extra info for metadata verification. All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};

enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * Set for data stripes if it's triggered from P/Q stripe.
	 * During such scrub, we should not report errors in data stripes, nor
	 * update the accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};

/*
 * We have multiple bitmaps for one scrub_stripe.
 * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
 * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64).
 *
 * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
 * into a larger one.
 *
 * These enum records where the sub-bitmap are inside the larger one.
 * Each subbitmap starts at scrub_bitmap_nr_##name * nr_sectors bit.
 */
enum {
	/* Which blocks are covered by extent items. */
	scrub_bitmap_nr_has_extent = 0,

	/* Which blocks are metadata. */
	scrub_bitmap_nr_is_metadata,

	/*
	 * Which blocks have errors, including IO, csum, and metadata
	 * errors.
	 * This sub-bitmap is the OR results of the next few error related
	 * sub-bitmaps.
	 */
	scrub_bitmap_nr_error,
	scrub_bitmap_nr_io_error,
	scrub_bitmap_nr_csum_error,
	scrub_bitmap_nr_meta_error,
	scrub_bitmap_nr_meta_gen_error,

	/* Number of sub-bitmaps; must stay last. */
	scrub_bitmap_nr_last,
};

#define SCRUB_STRIPE_MAX_FOLIOS		(BTRFS_STRIPE_LEN / PAGE_SIZE)

/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
	struct scrub_ctx *sctx;
	struct btrfs_block_group *bg;

	struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];
	struct scrub_sector_verification *sectors;

	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

	/*
	 * How many data/meta extents are in this stripe. Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

	/* Number of in-flight bios; io_wait is woken when it drops to zero. */
	atomic_t pending_io;
	wait_queue_head_t io_wait;
	wait_queue_head_t repair_wait;

	/*
	 * Indicate the states of the stripe. Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* The large bitmap contains all the sub-bitmaps. */
	unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
					    (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];

	/*
	 * For writeback (repair or replace) error reporting.
	 * This one is protected by a spinlock, thus can not be packed into
	 * the larger bitmap.
	 */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;

	struct work_struct work;
};

/* One scrub context per scrub or dev-replace run. */
struct scrub_ctx {
	struct scrub_stripe	stripes[SCRUB_TOTAL_STRIPES];
	struct scrub_stripe	*raid56_data_stripes;
	struct btrfs_fs_info	*fs_info;
	struct btrfs_path	extent_path;
	struct btrfs_path	csum_path;
	int			first_free;
	int			cur_stripe;
	atomic_t		cancel_req;
	int			readonly;

	/* State of IO submission throttling affecting the associated device */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

	bool			is_dev_replace;
	u64			write_pointer;

	struct mutex		wr_lock;
	struct btrfs_device	*wr_tgtdev;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};

/*
 * Translate (sub-bitmap @name, @block_nr) into an absolute bit offset
 * inside scrub_stripe::bitmaps. Asserts @block_nr is within the stripe.
 */
#define scrub_calc_start_bit(stripe, name, block_nr)			\
({									\
	unsigned int __start_bit;					\
									\
	ASSERT(block_nr < stripe->nr_sectors,				\
	       "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
	__start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
	__start_bit;							\
})

/*
 * Generate the per-sub-bitmap accessors (set/clear a range, set/clear/test a
 * single bit, read the whole sub-bitmap as an unsigned long, emptiness and
 * weight) that operate on the packed scrub_stripe::bitmaps array.
 */
#define IMPLEMENT_SCRUB_BITMAP_OPS(name)				\
static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe,	\
					   unsigned int block_nr,	\
					   unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe,	\
							    name, block_nr); \
									\
	bitmap_set(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
					     unsigned int block_nr,	\
					     unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	bitmap_clear(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
						unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	return test_bit(start_bit, stripe->bitmaps);			\
}									\
static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
					       unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	set_bit(start_bit, stripe->bitmaps);				\
}									\
static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
						 unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	clear_bit(start_bit, stripe->bitmaps);				\
}									\
static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
{									\
	const unsigned int nr_blocks = stripe->nr_sectors;		\
									\
	ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG,		\
	       "nr_blocks=%u BITS_PER_LONG=%u",				\
	       nr_blocks, BITS_PER_LONG);				\
									\
	return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
			   stripe->nr_sectors);				\
}									\
static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_empty(&bitmap, stripe->nr_sectors);		\
}									\
static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_weight(&bitmap, stripe->nr_sectors);		\
}
IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
IMPLEMENT_SCRUB_BITMAP_OPS(error);
IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);

/* Context passed to the backref walk when printing error warnings. */
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};

/* Error counts/bitmap snapshotted from the initial read of a stripe. */
struct scrub_error_records {
	/*
	 * Bitmap recording which blocks hit errors (IO/csum/...) during the
	 * initial read.
	 */
	unsigned long init_error_bitmap;

	unsigned int nr_io_errors;
	unsigned int nr_csum_errors;
	unsigned int nr_meta_errors;
	unsigned int nr_meta_gen_errors;
};

/*
 * Free everything attached to @stripe (folios, sector array, csum buffer)
 * and reset its pointers/state. Safe on NULL and on a partially
 * initialized stripe, so it doubles as the error-path cleanup for
 * init_scrub_stripe().
 */
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
	if (!stripe)
		return;

	for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) {
		if (stripe->folios[i])
			folio_put(stripe->folios[i]);
		stripe->folios[i] = NULL;
	}
	kfree(stripe->sectors);
	kfree(stripe->csums);
	stripe->sectors = NULL;
	stripe->csums = NULL;
	stripe->sctx = NULL;
	stripe->state = 0;
}

/*
 * Initialize one stripe: zero it, set up wait queues/locks and allocate the
 * backing folios, the per-sector verification array and the csum buffer.
 *
 * Returns 0 on success or -ENOMEM; on failure all partial allocations are
 * released.
 */
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
			     struct scrub_stripe *stripe)
{
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	int ret;

	memset(stripe, 0, sizeof(*stripe));

	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	stripe->state = 0;

	init_waitqueue_head(&stripe->io_wait);
	init_waitqueue_head(&stripe->repair_wait);
	atomic_set(&stripe->pending_io, 0);
	spin_lock_init(&stripe->write_error_lock);

	ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
	ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
				      fs_info->block_min_order, stripe->folios);
	if (ret < 0)
		goto error;

	stripe->sectors = kzalloc_objs(struct scrub_sector_verification,
				       stripe->nr_sectors, GFP_KERNEL);
	if (!stripe->sectors)
		goto error;

	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
				fs_info->csum_size, GFP_KERNEL);
	if (!stripe->csums)
		goto error;
	return 0;
error:
	release_scrub_stripe(stripe);
	return -ENOMEM;
}

/* Wait until every in-flight bio of @stripe has completed. */
static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}

static void scrub_put_ctx(struct scrub_ctx *sctx);

/*
 * Block while a scrub pause is requested.
 *
 * Must be called with fs_info->scrub_lock held; the lock is dropped while
 * waiting and re-taken before returning.
 */
static void
__scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

/* Mark this scrub as paused and notify waiters of the state change. */
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

/* Leave the paused state, first honoring any pending pause request. */
static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

/* Convenience helper: briefly enter and leave the paused state. */
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

/* Free the scrub context and all its stripes. Safe on NULL. */
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
		release_scrub_stripe(&sctx->stripes[i]);

	kvfree(sctx);
}

/* Drop one reference; frees the context when the last ref goes away. */
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

/*
 * Allocate and initialize a scrub context (with one reference held).
 * Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, bool is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;

	/* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use
	 * kvzalloc().
	 */
	sctx = kvzalloc_obj(*sctx);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->fs_info = fs_info;
	/* Both paths only read the commit root, no locking needed. */
	sctx->extent_path.search_commit_root = true;
	sctx->extent_path.skip_locking = true;
	sctx->csum_path.search_commit_root = true;
	sctx->csum_path.skip_locking = true;
	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
		int ret;

		ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
		if (ret < 0)
			goto nomem;
		sctx->stripes[i].sctx = sctx;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->stat_lock);
	sctx->throttle_deadline = 0;

	mutex_init(&sctx->wr_lock);
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

/*
 * Backref walk callback (see iterate_extent_inodes()): resolve all file
 * paths referencing the corrupted data extent and print one warning line
 * per path. Always returns 0 so the iteration over other inodes continues
 * even if resolving this one fails (a fallback warning is printed instead).
 */
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
				     u64 root, void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		/* ret > 0 (item not found) is reported the same as a failure. */
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		/* Clear it so the __free() cleanup doesn't see an ERR_PTR. */
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the bit ipath might have been too small to
	 * hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
			   swarn->errstr, swarn->logical,
			   btrfs_dev_name(swarn->dev),
			   swarn->physical,
			   root, inum, offset,
			   fs_info->sectorsize, nlink,
			   (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	return 0;

err:
	btrfs_warn(fs_info,
		   "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
		   swarn->errstr, swarn->logical,
		   btrfs_dev_name(swarn->dev),
		   swarn->physical,
		   root, inum, offset, ret);

	return 0;
}

/*
 * Print a human readable warning about a corruption found at
 * @logical/@physical on @dev, trying to resolve what the corrupted bytes
 * belong to (superblock, tree block or file data).
 */
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
				       bool is_super, u64 logical, u64 physical)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	u64 flags = 0;
	u32 item_size;
	int ret;

	/* Super block error, no need to search extent
	 * tree. */
	if (is_super) {
		btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu",
			   errstr, btrfs_dev_name(dev), physical);
		return;
	}
	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = physical;
	swarn.logical = logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		return;

	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		/* Metadata: print one line per tree backref of the block. */
		unsigned long ptr = 0;
		u8 ref_level;
		u64 ref_root;

		while (true) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret < 0) {
				btrfs_warn(fs_info,
				"scrub: failed to resolve tree backref for logical %llu: %d",
					   swarn.logical, ret);
				break;
			}
			if (ret > 0)
				break;
			btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				   errstr, swarn.logical, btrfs_dev_name(dev),
				   swarn.physical, (ref_level ? "node" : "leaf"),
				   ref_level, ref_root);
		}
		btrfs_release_path(path);
	} else {
		/* Data: resolve the file paths referencing the extent. */
		struct btrfs_backref_walk_ctx ctx = { 0 };

		btrfs_release_path(path);

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = swarn.logical - found_key.objectid;
		ctx.fs_info = fs_info;

		swarn.path = path;
		swarn.dev = dev;

		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
	}
}

/*
 * On zoned filesystems writes must be sequential. If the repair/replace
 * target position jumped ahead of the zone write pointer, zero-fill the gap
 * so the next write lands exactly at @physical.
 *
 * Returns 0 on success (or when nothing needs to be done) or the zeroout
 * error.
 */
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
	int ret = 0;
	u64 length;

	if (!btrfs_is_zoned(sctx->fs_info))
		return 0;

	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
		return 0;

	if (sctx->write_pointer < physical) {
		length = physical - sctx->write_pointer;

		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
						sctx->write_pointer, length);
		if (!ret)
			sctx->write_pointer = physical;
	}
	return ret;
}

/* Return the kernel virtual address of sector @sector_nr inside @stripe. */
static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	u32 offset = (sector_nr << fs_info->sectorsize_bits);
	const struct folio *folio = stripe->folios[offset >> min_folio_shift];

	/* stripe->folios[] is allocated by us and no highmem is allowed. */
	ASSERT(folio);
	ASSERT(!folio_test_highmem(folio));
	return folio_address(folio) + offset_in_folio(folio, offset);
}

/* Return the physical address of sector @sector_nr inside @stripe. */
static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	u32 offset = (sector_nr << fs_info->sectorsize_bits);
	const struct folio *folio = stripe->folios[offset >> min_folio_shift];

	/* stripe->folios[] is allocated by us and no highmem is allowed.
	 */
	ASSERT(folio);
	ASSERT(!folio_test_highmem(folio));
	/* And the range must be contained inside the folio. */
	ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
	return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
}

/*
 * Verify one whole tree block starting at sector @sector_nr of @stripe:
 * bytenr, fsid, chunk tree uuid, checksum and transid are all checked.
 * On any mismatch the meta/meta_gen and generic error bits are set for every
 * sector of the tree block; on full success all error bits are cleared.
 */
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
	void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	struct btrfs_header *header = first_kaddr;
	struct btrfs_csum_ctx csum;
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	u8 calculated_csum[BTRFS_CSUM_SIZE];

	/*
	 * Here we don't have a good way to attach the pages (and subpages)
	 * to a dummy extent buffer, thus we have to directly grab the members
	 * from pages.
	 */
	memcpy(on_disk_csum, header->csum, fs_info->csum_size);

	if (logical != btrfs_stack_header_bytenr(header)) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_bytenr(header), logical);
		return;
	}
	if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
		   BTRFS_FSID_SIZE) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->fsid, fs_info->fs_devices->fsid);
		return;
	}
	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		return;
	}

	/* Now check tree block csum. */
	btrfs_csum_init(&csum, fs_info->csum_type);
	/* The csum covers everything after the csum field of the first sector. */
	btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE,
			  fs_info->sectorsize - BTRFS_CSUM_SIZE);

	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
		btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i),
				  fs_info->sectorsize);
	}

	btrfs_csum_final(&csum, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT,
			      logical, stripe->mirror_num,
			      BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		return;
	}
	if (stripe->sectors[sector_nr].generation !=
	    btrfs_stack_header_generation(header)) {
		scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_generation(header),
			      stripe->sectors[sector_nr].generation);
		return;
	}
	/* All checks passed, clear any stale error bits for the whole block. */
	scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
}

/*
 * Verify one sector of @stripe: dispatch to full tree block verification for
 * metadata, or to data csum verification (if a csum exists) for data.
 */
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
		return;

	/* IO error, no need to check. */
	if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
		return;

	/* Metadata, verify the full tree block. */
	if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
		/*
		 * Check if the tree block crosses the stripe boundary. If
		 * crossed the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"scrub: tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it). For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
		return;
	}

	ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
	if (ret < 0) {
		scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_set_bit_error(stripe, sector_nr);
	} else {
		scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
	}
}

/* Verify specified sectors of a stripe.
 */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		/* A metadata hit verified the whole tree block, skip past it. */
		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
			sector_nr += sectors_per_tree - 1;
	}
}

/*
 * Map the first bvec of a completed bio back to the sector index inside
 * @stripe by matching its virtual address. The bvec must belong to the
 * stripe (asserted).
 */
static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}

/*
 * Repair read is different to the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		scrub_bitmap_set_io_error(stripe, sector_nr,
					  bio_size >> fs_info->sectorsize_bits);
		scrub_bitmap_set_error(stripe, sector_nr,
				       bio_size >> fs_info->sectorsize_bits);
	} else {
		/* Read succeeded, the covered sectors are no longer IO errors. */
		scrub_bitmap_clear_io_error(stripe, sector_nr,
					    bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

/* Return the mirror number after @mirror, wrapping back to 1 (not 0). */
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

/* Append one stripe sector to @bbio. The bbio must have room (asserted). */
static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
				 int sector_nr)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	int ret;

	ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize,
			   offset_in_page(kaddr));
	/*
	 * Caller should ensure the bbio has enough size.
	 * And we cannot use __bio_add_page(), which doesn't do any merge.
	 *
	 * Meanwhile for scrub_submit_initial_read() we fully rely on the merge
	 * to create the minimal amount of bio vectors, for fs block size < page
	 * size cases.
	 */
	ASSERT(ret == fs_info->sectorsize);
}

/* Allocate a scrub-marked bbio with @nr_vecs vectors targeting @logical. */
static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info,
					  unsigned int nr_vecs, blk_opf_t opf,
					  u64 logical,
					  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode),
			       logical, end_io, private);
	bbio->is_scrub = true;
	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	return bbio;
}

/*
 * Re-read all currently-bad sectors of @stripe from mirror @mirror, merging
 * contiguous bad sectors into bios of at most @blocksize bytes. With @wait
 * set, each bio is submitted and completed synchronously, one at a time.
 */
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
	ASSERT(atomic_read(&stripe->pending_io) == 0,
	       "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io));

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		/* The current sector cannot be merged, submit the bio.
*/ 972 if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) || 973 bbio->bio.bi_iter.bi_size >= blocksize)) { 974 ASSERT(bbio->bio.bi_iter.bi_size); 975 atomic_inc(&stripe->pending_io); 976 btrfs_submit_bbio(bbio, mirror); 977 if (wait) 978 wait_scrub_stripe_io(stripe); 979 bbio = NULL; 980 } 981 982 if (!bbio) 983 bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, 984 stripe->logical + (i << fs_info->sectorsize_bits), 985 scrub_repair_read_endio, stripe); 986 987 scrub_bio_add_sector(bbio, stripe, i); 988 } 989 if (bbio) { 990 ASSERT(bbio->bio.bi_iter.bi_size); 991 atomic_inc(&stripe->pending_io); 992 btrfs_submit_bbio(bbio, mirror); 993 if (wait) 994 wait_scrub_stripe_io(stripe); 995 } 996 } 997 998 static void scrub_stripe_report_errors(struct scrub_ctx *sctx, 999 struct scrub_stripe *stripe, 1000 const struct scrub_error_records *errors) 1001 { 1002 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, 1003 DEFAULT_RATELIMIT_BURST); 1004 struct btrfs_fs_info *fs_info = sctx->fs_info; 1005 struct btrfs_device *dev = NULL; 1006 const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe); 1007 const unsigned long error_bitmap = scrub_bitmap_read_error(stripe); 1008 u64 physical = 0; 1009 int nr_data_sectors = 0; 1010 int nr_meta_sectors = 0; 1011 int nr_nodatacsum_sectors = 0; 1012 int nr_repaired_sectors = 0; 1013 int sector_nr; 1014 1015 if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) 1016 return; 1017 1018 /* 1019 * Init needed infos for error reporting. 1020 * 1021 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() 1022 * thus no need for dev/physical, error reporting still needs dev and physical. 
	 */
	if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
		u64 mapped_len = fs_info->sectorsize;
		struct btrfs_io_context *bioc = NULL;
		int stripe_index = stripe->mirror_num - 1;
		int ret;

		/* For scrub, our mirror_num should always start at 1. */
		ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				      stripe->logical, &mapped_len, &bioc,
				      NULL, NULL);
		/*
		 * If we failed, dev will be NULL, and later detailed reports
		 * will just be skipped.
		 */
		if (ret < 0)
			goto skip;
		physical = bioc->stripes[stripe_index].physical;
		dev = bioc->stripes[stripe_index].dev;
		btrfs_put_bioc(bioc);
	}

skip:
	for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
		bool repaired = false;

		/* Classify the sector for the accounting below. */
		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
			nr_meta_sectors++;
		} else {
			nr_data_sectors++;
			if (!stripe->sectors[sector_nr].csum)
				nr_nodatacsum_sectors++;
		}

		/* Initially bad but no longer bad means it got repaired. */
		if (test_bit(sector_nr, &errors->init_error_bitmap) &&
		    !test_bit(sector_nr, &error_bitmap)) {
			nr_repaired_sectors++;
			repaired = true;
		}

		/* Good sector from the beginning, nothing needs to be done. */
		if (!test_bit(sector_nr, &errors->init_error_bitmap))
			continue;

		/*
		 * Report error for the corrupted sectors.  If repaired, just
		 * output a message saying it has been fixed up.
		 */
		if (repaired) {
			if (dev) {
				btrfs_err_rl(fs_info,
			"scrub: fixed up error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
			} else {
				btrfs_err_rl(fs_info,
			"scrub: fixed up error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
			}
			continue;
		}

		/* The remaining are all for unrepaired sectors. */
		if (dev) {
			btrfs_err_rl(fs_info,
	"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
				    stripe->logical, btrfs_dev_name(dev),
				    physical);
		} else {
			btrfs_err_rl(fs_info,
	"scrub: unable to fixup (regular) error at logical %llu on mirror %u",
				    stripe->logical, stripe->mirror_num);
		}

		/* Detailed warnings need @dev; they are skipped when mapping failed. */
		if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("i/o error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("checksum error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("header error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("generation error", dev, false,
						     stripe->logical, physical);
	}

	/* Update the device stats. */
	for (int i = 0; i < errors->nr_io_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
	for (int i = 0; i < errors->nr_csum_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
	/*
	 * Generation mismatch error is based on each metadata block, not each
	 * sector, thus the step is nodesize worth of sectors.
	 */
	for (int i = 0; i < errors->nr_meta_gen_errors;
	     i += (fs_info->nodesize >> fs_info->sectorsize_bits))
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);

	spin_lock(&sctx->stat_lock);
	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
	sctx->stat.no_csum += nr_nodatacsum_sectors;
	sctx->stat.read_errors += errors->nr_io_errors;
	sctx->stat.csum_errors += errors->nr_csum_errors;
	sctx->stat.verify_errors += errors->nr_meta_errors +
				    errors->nr_meta_gen_errors;
	sctx->stat.uncorrectable_errors +=
		bitmap_weight(&error_bitmap, stripe->nr_sectors);
	sctx->stat.corrected_errors += nr_repaired_sectors;
	spin_unlock(&sctx->stat_lock);
}

static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace);

/*
 * The main entrance for all read related scrub work, including:
 *
 * - Wait for the initial read to finish
 * - Verify and locate any bad sectors
 * - Go through the remaining mirrors and try to read as large blocksize as
 *   possible
 * - Go through all mirrors (including the failed mirror) sector-by-sector
 * - Submit writeback for repaired sectors
 *
 * Writeback for dev-replace does not happen here, it needs extra
 * synchronization for zoned devices.
1157 */ 1158 static void scrub_stripe_read_repair_worker(struct work_struct *work) 1159 { 1160 struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); 1161 struct scrub_ctx *sctx = stripe->sctx; 1162 struct btrfs_fs_info *fs_info = sctx->fs_info; 1163 struct scrub_error_records errors = { 0 }; 1164 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, 1165 stripe->bg->length); 1166 unsigned long repaired; 1167 unsigned long error; 1168 int mirror; 1169 int i; 1170 1171 ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); 1172 1173 wait_scrub_stripe_io(stripe); 1174 scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); 1175 /* Save the initial failed bitmap for later repair and report usage. */ 1176 errors.init_error_bitmap = scrub_bitmap_read_error(stripe); 1177 errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe); 1178 errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe); 1179 errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe); 1180 errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe); 1181 1182 if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors)) 1183 goto out; 1184 1185 /* 1186 * Try all remaining mirrors. 1187 * 1188 * Here we still try to read as large block as possible, as this is 1189 * faster and we have extra safety nets to rely on. 1190 */ 1191 for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); 1192 mirror != stripe->mirror_num; 1193 mirror = calc_next_mirror(mirror, num_copies)) { 1194 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 1195 1196 scrub_stripe_submit_repair_read(stripe, mirror, 1197 BTRFS_STRIPE_LEN, false); 1198 wait_scrub_stripe_io(stripe); 1199 scrub_verify_one_stripe(stripe, old_error_bitmap); 1200 if (scrub_bitmap_empty_error(stripe)) 1201 goto out; 1202 } 1203 1204 /* 1205 * Last safety net, try re-checking all mirrors, including the failed 1206 * one, sector-by-sector. 
1207 * 1208 * As if one sector failed the drive's internal csum, the whole read 1209 * containing the offending sector would be marked as error. 1210 * Thus here we do sector-by-sector read. 1211 * 1212 * This can be slow, thus we only try it as the last resort. 1213 */ 1214 1215 for (i = 0, mirror = stripe->mirror_num; 1216 i < num_copies; 1217 i++, mirror = calc_next_mirror(mirror, num_copies)) { 1218 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 1219 1220 scrub_stripe_submit_repair_read(stripe, mirror, 1221 fs_info->sectorsize, true); 1222 wait_scrub_stripe_io(stripe); 1223 scrub_verify_one_stripe(stripe, old_error_bitmap); 1224 if (scrub_bitmap_empty_error(stripe)) 1225 goto out; 1226 } 1227 out: 1228 error = scrub_bitmap_read_error(stripe); 1229 /* 1230 * Submit the repaired sectors. For zoned case, we cannot do repair 1231 * in-place, but queue the bg to be relocated. 1232 */ 1233 bitmap_andnot(&repaired, &errors.init_error_bitmap, &error, 1234 stripe->nr_sectors); 1235 if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { 1236 if (btrfs_is_zoned(fs_info)) { 1237 btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); 1238 } else { 1239 scrub_write_sectors(sctx, stripe, repaired, false); 1240 wait_scrub_stripe_io(stripe); 1241 } 1242 } 1243 1244 scrub_stripe_report_errors(sctx, stripe, &errors); 1245 set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); 1246 wake_up(&stripe->repair_wait); 1247 } 1248 1249 static void scrub_read_endio(struct btrfs_bio *bbio) 1250 { 1251 struct scrub_stripe *stripe = bbio->private; 1252 struct bio_vec *bvec; 1253 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1254 int num_sectors; 1255 u32 bio_size = 0; 1256 int i; 1257 1258 ASSERT(sector_nr < stripe->nr_sectors); 1259 bio_for_each_bvec_all(bvec, &bbio->bio, i) 1260 bio_size += bvec->bv_len; 1261 num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; 1262 1263 if (bbio->bio.bi_status) { 1264 
		scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors);
		scrub_bitmap_set_error(stripe, sector_nr, num_sectors);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors);
	}
	bio_put(&bbio->bio);
	/* Last pending IO of the stripe, kick off the repair worker. */
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}

/*
 * End io handler for scrub writeback.
 *
 * On failure, records the affected sectors in write_error_bitmap (under
 * write_error_lock, as this runs in IRQ context) and bumps the device
 * write-error stat for each affected sector.
 */
static void scrub_write_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		unsigned long flags;

		spin_lock_irqsave(&stripe->write_error_lock, flags);
		bitmap_set(&stripe->write_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
		for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
			btrfs_dev_stat_inc_and_print(stripe->dev,
						     BTRFS_DEV_STAT_WRITE_ERRS);
	}
	bio_put(&bbio->bio);

	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

/*
 * Submit one repair/replace write bio of @stripe.
 *
 * For zoned filesystems this also waits for the write to finish (zoned
 * writeback must have queue depth 1) and advances the write pointer when
 * the write succeeded.
 */
static void scrub_submit_write_bio(struct scrub_ctx *sctx,
				   struct scrub_stripe *stripe,
				   struct btrfs_bio *bbio, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u32 bio_len = bbio->bio.bi_iter.bi_size;
	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
		      stripe->logical;

	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
	atomic_inc(&stripe->pending_io);
	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
	if (!btrfs_is_zoned(fs_info))
		return;
	/*
	 * For zoned writeback, queue depth must be 1, thus we must wait for
	 * the write to finish before the next write.
	 */
	wait_scrub_stripe_io(stripe);

	/*
	 * And also need to update the write pointer if write finished
	 * successfully.
	 */
	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
		      &stripe->write_error_bitmap))
		sctx->write_pointer += bio_len;
}

/*
 * Submit the write bio(s) for the sectors specified by @write_bitmap.
 *
 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
 *
 * - Only needs logical bytenr and mirror_num
 *   Just like the scrub read path
 *
 * - Would only result in writes to the specified mirror
 *   Unlike the regular writeback path, which would write back to all stripes
 *
 * - Handle dev-replace and read-repair writeback differently
 */
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	int sector_nr;

	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
		/* We should only writeback sectors covered by an extent. */
		ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr));

		/* Cannot merge with previous sector, submit the current one.
		 */
		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
			scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
			bbio = NULL;
		}
		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE,
				stripe->logical + (sector_nr << fs_info->sectorsize_bits),
				scrub_write_endio, stripe);
		scrub_bio_add_sector(bbio, stripe, sector_nr);
	}
	/* Submit the last built but not yet submitted bio. */
	if (bbio)
		scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
}

/*
 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
 * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
 */
static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
				  unsigned int bio_size)
{
	const int time_slice = 1000;
	s64 delta;
	ktime_t now;
	u32 div;
	u64 bwlimit;

	bwlimit = READ_ONCE(device->scrub_speed_max);
	/* Zero means no limit configured, nothing to throttle. */
	if (bwlimit == 0)
		return;

	/*
	 * Slice is divided into intervals when the IO is submitted, adjust by
	 * bwlimit and maximum of 64 intervals.
	 */
	div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);

	/* Start new epoch, set deadline */
	now = ktime_get();
	if (sctx->throttle_deadline == 0) {
		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
		sctx->throttle_sent = 0;
	}

	/* Still in the time to send? */
	if (ktime_before(now, sctx->throttle_deadline)) {
		/* If current bio is within the limit, send it */
		sctx->throttle_sent += bio_size;
		if (sctx->throttle_sent <= div_u64(bwlimit, div))
			return;

		/* We're over the limit, sleep until the rest of the slice */
		delta = ktime_ms_delta(sctx->throttle_deadline, now);
	} else {
		/* New request after deadline, start new epoch */
		delta = 0;
	}

	if (delta) {
		long timeout;

		timeout = div_u64(delta * HZ, 1000);
		schedule_timeout_interruptible(timeout);
	}

	/* Next call will start the deadline period */
	sctx->throttle_deadline = 0;
}

/*
 * Given a physical address, this will calculate its
 * logical offset.  If this is a parity stripe, it will return
 * the most left data stripe's logical offset.
 *
 * Return 0 if it is a data stripe, 1 means parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct btrfs_chunk_map *map, u64 *offset,
				   u64 *stripe_start)
{
	int i;
	int j = 0;
	u64 last_offset;
	const int data_stripes = nr_data_stripes(map);

	last_offset = (physical - map->stripes[num].physical) * data_stripes;
	if (stripe_start)
		*stripe_start = last_offset;

	*offset = last_offset;
	for (i = 0; i < data_stripes; i++) {
		u32 stripe_nr;
		u32 stripe_index;
		u32 rot;

		*offset = last_offset + btrfs_stripe_nr_to_offset(i);

		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

		/* Work out the disk rotation on this stripe-set */
		rot = stripe_nr % map->num_stripes;
		/* calculate which stripe this data locates */
		rot += i;
		stripe_index = rot % map->num_stripes;
		if (stripe_index == num)
			return 0;
		if (stripe_index < num)
			j++;
	}
	/* @num is a parity stripe, return the leftmost data stripe's offset. */
	*offset = last_offset + btrfs_stripe_nr_to_offset(j);
	return 1;
}
/*
 * Return 0 if the extent item range covers any byte of the range.
 * Return <0 if the extent item is before @search_start.
 * Return >0 if the extent item is after @search_start + @search_len.
 */
static int compare_extent_item_range(struct btrfs_path *path,
				     u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
	u64 len;
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
	       key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type);
	/* METADATA_ITEM keys carry the level in offset, the length is nodesize. */
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		len = fs_info->nodesize;
	else
		len = key.offset;

	if (key.objectid + len <= search_start)
		return -1;
	if (key.objectid >= search_start + search_len)
		return 1;
	return 0;
}

/*
 * Locate one extent item which covers any byte in range
 * [@search_start, @search_start + @search_len)
 *
 * If the path is not initialized, we will initialize the search by doing
 * a btrfs_search_slot().
 * If the path is already initialized, we will use the path as the initial
 * slot, to avoid duplicated btrfs_search_slot() calls.
 *
 * NOTE: If an extent item starts before @search_start, we will still
 * return the extent item. This is for data extent crossing stripe boundary.
 *
 * Return 0 if we found such extent item, and @path will point to the extent item.
 * Return >0 if no such extent item can be found, and @path will be released.
 * Return <0 if hit fatal error, and @path will be released.
 */
static int find_first_extent_item(struct btrfs_root *extent_root,
				  struct btrfs_path *path,
				  u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	struct btrfs_key key;
	int ret;

	/* Continue using the existing path */
	if (path->nodes[0])
		goto search_forward;

	key.objectid = search_start;
	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (unlikely(ret == 0)) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		btrfs_release_path(path);
		return -EUCLEAN;
	}

	/*
	 * Here we intentionally pass 0 as @min_objectid, as there could be
	 * an extent item starting before @search_start.
	 */
	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret < 0)
		return ret;
	/*
	 * No matter whether we have found an extent item, the next loop will
	 * properly do every check on the key.
	 */
search_forward:
	while (true) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid >= search_start + search_len)
			break;
		/* Skip non-extent items (e.g. backrefs) in the tree. */
		if (key.type != BTRFS_METADATA_ITEM_KEY &&
		    key.type != BTRFS_EXTENT_ITEM_KEY)
			goto next;

		ret = compare_extent_item_range(path, search_start, search_len);
		if (ret == 0)
			return ret;
		if (ret > 0)
			break;
next:
		ret = btrfs_next_item(extent_root, path);
		if (ret) {
			/* Either no more items or a fatal error. */
			btrfs_release_path(path);
			return ret;
		}
	}
	btrfs_release_path(path);
	return 1;
}

/*
 * Extract start/size/flags/generation of the extent item @path points at.
 * For METADATA_ITEM keys the size is the nodesize.
 */
static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
	       key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type);
	*extent_start_ret = key.objectid;
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		*size_ret = path->nodes[0]->fs_info->nodesize;
	else
		*size_ret = key.offset;
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
	*flags_ret = btrfs_extent_flags(path->nodes[0], ei);
	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
}

/*
 * For zoned dev-replace, sync the target device's zone write pointer with
 * the position scrub has written up to.  No-op on non-zoned filesystems.
 */
static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
					u64 physical, u64 physical_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int ret = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	mutex_lock(&sctx->wr_lock);
	if (sctx->write_pointer < physical_end) {
		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
						    physical,
						    sctx->write_pointer);
		if (ret)
			btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
	}
	mutex_unlock(&sctx->wr_lock);
	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

	return ret;
}

/*
 * Mark the sectors of @stripe covered by [@extent_start, @extent_start +
 * @extent_len) as having an extent, and for tree blocks also record the
 * expected generation.  The range is clamped to the stripe boundary.
 */
static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
				 struct scrub_stripe *stripe,
				 u64 extent_start, u64 extent_len,
				 u64 extent_flags, u64 extent_gen)
{
	for (u64 cur_logical = max(stripe->logical, extent_start);
	     cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
			       extent_start + extent_len);
	     cur_logical += fs_info->sectorsize) {
		const int nr_sector =
			(cur_logical - stripe->logical) >>
			fs_info->sectorsize_bits;
		struct scrub_sector_verification *sector =
			&stripe->sectors[nr_sector];

		scrub_bitmap_set_bit_has_extent(stripe, nr_sector);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			scrub_bitmap_set_bit_is_metadata(stripe, nr_sector);
			sector->generation = extent_gen;
		}
	}
}

/* Clear all the packed sub-bitmaps of @stripe in one go. */
static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
	ASSERT(stripe->nr_sectors);
	bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors);
}

/*
 * Locate one stripe which has at least one extent in its range.
 *
 * Return 0 if found such stripe, and store its info into @stripe.
 * Return >0 if there is no such stripe in the specified range.
 * Return <0 for error.
 */
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
					struct btrfs_path *extent_path,
					struct btrfs_path *csum_path,
					struct btrfs_device *dev, u64 physical,
					int mirror_num, u64 logical_start,
					u32 logical_len,
					struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
	const u64 logical_end = logical_start + logical_len;
	u64 cur_logical = logical_start;
	u64 stripe_end;
	u64 extent_start;
	u64 extent_len;
	u64 extent_flags;
	u64 extent_gen;
	int ret;

	if (unlikely(!extent_root || !csum_root)) {
		btrfs_err(fs_info, "scrub: no valid extent or csum root found");
		return -EUCLEAN;
	}
	/* Start from a clean slate for the per-sector info and bitmaps. */
	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
				   stripe->nr_sectors);
	scrub_stripe_reset_bitmaps(stripe);

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg),
	       "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu",
	       bg->start, logical_start, logical_end, btrfs_block_group_end(bg));

	ret = find_first_extent_item(extent_root, extent_path, logical_start,
				     logical_len);
	/* Either error or not found. */
	if (ret)
		return ret;
	get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
			&extent_gen);
	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		stripe->nr_meta_extents++;
	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
		stripe->nr_data_extents++;
	cur_logical = max(extent_start, cur_logical);

	/*
	 * Round down to stripe boundary.
	 *
	 * The extra calculation against bg->start is to handle block groups
	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
	 */
	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
			  bg->start;
	stripe->physical = physical + stripe->logical - logical_start;
	stripe->dev = dev;
	stripe->bg = bg;
	stripe->mirror_num = mirror_num;
	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;

	/* Fill the first extent info into stripe->sectors[] array. */
	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
			     extent_flags, extent_gen);
	cur_logical = extent_start + extent_len;

	/* Fill the extent info for the remaining sectors. */
	while (cur_logical <= stripe_end) {
		ret = find_first_extent_item(extent_root, extent_path, cur_logical,
					     stripe_end - cur_logical + 1);
		if (ret < 0)
			return ret;
		if (ret > 0) {
			ret = 0;
			break;
		}
		get_extent_info(extent_path, &extent_start, &extent_len,
				&extent_flags, &extent_gen);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			stripe->nr_meta_extents++;
		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
			stripe->nr_data_extents++;
		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
				     extent_flags, extent_gen);
		cur_logical = extent_start + extent_len;
	}

	/* Now fill the data csum. */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
		int sector_nr;
		unsigned long csum_bitmap = 0;

		/* Csum space should have already been allocated. */
		ASSERT(stripe->csums);

		/*
		 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
		 * should contain at most 16 sectors.
		 */
		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);

		ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
						stripe->logical, stripe_end,
						stripe->csums, &csum_bitmap);
		if (ret < 0)
			return ret;
		if (ret > 0)
			ret = 0;

		/* Point each sector with a csum into the packed csum buffer. */
		for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
			stripe->sectors[sector_nr].csum = stripe->csums +
				sector_nr * fs_info->csum_size;
		}
	}
	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);

	return ret;
}

/* Reset all per-stripe state so the slot can be reused for the next range. */
static void scrub_reset_stripe(struct scrub_stripe *stripe)
{
	scrub_stripe_reset_bitmaps(stripe);

	stripe->nr_meta_extents = 0;
	stripe->nr_data_extents = 0;
	stripe->state = 0;

	for (int i = 0; i < stripe->nr_sectors; i++) {
		stripe->sectors[i].csum = NULL;
		stripe->sectors[i].generation = 0;
	}
}

/*
 * Length of @stripe, clamped to the end of its block group (the last stripe
 * of a chunk can be shorter than BTRFS_STRIPE_LEN).
 */
static u32 stripe_length(const struct scrub_stripe *stripe)
{
	ASSERT(stripe->bg);

	return min(BTRFS_STRIPE_LEN,
		   stripe->bg->start + stripe->bg->length - stripe->logical);
}

/*
 * Submit the initial read for a stripe of a RAID-stripe-tree (RST) backed
 * block group.  Unlike the plain path, each bio must be manually split at
 * RST boundaries, determined via btrfs_map_block().
 */
static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
	const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe);
	u64 stripe_len = BTRFS_STRIPE_LEN;
	int mirror = stripe->mirror_num;
	int i;

	/* Extra hold so the endio cannot fire before all bios are submitted. */
	atomic_inc(&stripe->pending_io);

	for_each_set_bit(i, &has_extent, stripe->nr_sectors) {
		/* We're beyond the chunk boundary, no need to read anymore. */
		if (i >= nr_sectors)
			break;

		/* The current sector cannot be merged, submit the bio.
		 */
		if (bbio &&
		    ((i > 0 && !test_bit(i - 1, &has_extent)) ||
		     bbio->bio.bi_iter.bi_size >= stripe_len)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bbio(bbio, mirror);
			bbio = NULL;
		}

		if (!bbio) {
			struct btrfs_io_stripe io_stripe = {};
			struct btrfs_io_context *bioc = NULL;
			const u64 logical = stripe->logical +
					    (i << fs_info->sectorsize_bits);
			int ret;

			io_stripe.rst_search_commit_root = true;
			stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
			/*
			 * For RST cases, we need to manually split the bbio to
			 * follow the RST boundary.
			 */
			ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
					      &stripe_len, &bioc, &io_stripe, &mirror);
			btrfs_put_bioc(bioc);
			if (ret < 0) {
				/*
				 * -ENODATA means btrfs_get_raid_extent_offset()
				 * found no entry for this range in the stripe
				 * tree.  Since the range is in the extent tree,
				 * it is a preallocated extent, not an error, so
				 * only mark errors for other failures.
				 */
				if (ret != -ENODATA) {
					scrub_bitmap_set_bit_io_error(stripe, i);
					scrub_bitmap_set_bit_error(stripe, i);
				}
				continue;
			}

			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
						logical, scrub_read_endio, stripe);
		}

		scrub_bio_add_sector(bbio, stripe, i);
	}

	/* Submit the last built but not yet submitted bio. */
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
	}

	/* Drop the extra hold; if all IO already finished, queue the worker. */
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}

/*
 * Submit the initial read of a fully initialized stripe, covering the whole
 * range inside the chunk boundary with a single bbio (RST block groups are
 * dispatched to scrub_submit_extent_sector_read() instead).
 */
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
				      struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_bio *bbio;
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
	int mirror = stripe->mirror_num;

	ASSERT(stripe->bg);
	ASSERT(stripe->mirror_num > 0);
	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));

	if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
		scrub_submit_extent_sector_read(stripe);
		return;
	}

	bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ,
				stripe->logical, scrub_read_endio, stripe);
	/* Read the whole range inside the chunk boundary. */
	for (unsigned int cur = 0; cur < nr_sectors; cur++)
		scrub_bio_add_sector(bbio, stripe, cur);
	atomic_inc(&stripe->pending_io);

	/*
	 * For dev-replace, either user asks to avoid the source dev, or
	 * the device is missing, we try the next mirror instead.
1908 */ 1909 if (sctx->is_dev_replace && 1910 (fs_info->dev_replace.cont_reading_from_srcdev_mode == 1911 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || 1912 !stripe->dev->bdev)) { 1913 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, 1914 stripe->bg->length); 1915 1916 mirror = calc_next_mirror(mirror, num_copies); 1917 } 1918 btrfs_submit_bbio(bbio, mirror); 1919 } 1920 1921 static bool stripe_has_metadata_error(struct scrub_stripe *stripe) 1922 { 1923 const unsigned long error = scrub_bitmap_read_error(stripe); 1924 int i; 1925 1926 for_each_set_bit(i, &error, stripe->nr_sectors) { 1927 if (scrub_bitmap_test_bit_is_metadata(stripe, i)) { 1928 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1929 1930 btrfs_err(fs_info, 1931 "scrub: stripe %llu has unrepaired metadata sector at logical %llu", 1932 stripe->logical, 1933 stripe->logical + (i << fs_info->sectorsize_bits)); 1934 return true; 1935 } 1936 } 1937 return false; 1938 } 1939 1940 static void submit_initial_group_read(struct scrub_ctx *sctx, 1941 unsigned int first_slot, 1942 unsigned int nr_stripes) 1943 { 1944 struct blk_plug plug; 1945 1946 ASSERT(first_slot < SCRUB_TOTAL_STRIPES); 1947 ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); 1948 1949 scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, 1950 btrfs_stripe_nr_to_offset(nr_stripes)); 1951 blk_start_plug(&plug); 1952 for (int i = 0; i < nr_stripes; i++) { 1953 struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; 1954 1955 /* Those stripes should be initialized. 
 */
		ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
		scrub_submit_initial_read(sctx, stripe);
	}
	blk_finish_plug(&plug);
}

/*
 * Submit any populated-but-unsubmitted stripes, wait for their read-repair
 * to finish, do the dev-replace writeback if needed, then reset all slots.
 *
 * Return 0 on success, <0 on error (e.g. unrepairable metadata found
 * during dev-replace).
 */
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_stripe *stripe;
	const int nr_stripes = sctx->cur_stripe;
	int ret = 0;

	if (!nr_stripes)
		return 0;

	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

	/* Submit the stripes which are populated but not submitted. */
	if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
		const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);

		submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
	}

	/* Wait for read-repair of every queued stripe to complete. */
	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}

	/* Submit for dev-replace. */
	if (sctx->is_dev_replace) {
		/*
		 * For dev-replace, if we know there is something wrong with
		 * metadata, we should immediately abort.
		 */
		for (int i = 0; i < nr_stripes; i++) {
			if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) {
				ret = -EIO;
				goto out;
			}
		}
		for (int i = 0; i < nr_stripes; i++) {
			unsigned long good;
			unsigned long has_extent;
			unsigned long error;

			stripe = &sctx->stripes[i];

			ASSERT(stripe->dev == fs_info->dev_replace.srcdev);

			/* Copy only sectors that have an extent and no error. */
			has_extent = scrub_bitmap_read_has_extent(stripe);
			error = scrub_bitmap_read_error(stripe);
			bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors);
			scrub_write_sectors(sctx, stripe, good, true);
		}
	}

	/* Wait for the above writebacks to finish.
 */
	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_scrub_stripe_io(stripe);
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = stripe->physical + stripe_length(stripe);
		spin_unlock(&sctx->stat_lock);
		scrub_reset_stripe(stripe);
	}
out:
	sctx->cur_stripe = 0;
	return ret;
}

/* Completion callback used while synchronously waiting for a raid56 bio. */
static void raid56_scrub_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

/*
 * Find the next stripe containing extents at or after @logical and queue it
 * into the next free slot, submitting a whole group (or flushing everything)
 * when the relevant slot boundaries are reached.
 *
 * Return 0 and set @found_logical_ret when a stripe was queued, >0 when no
 * more extents exist in the range, <0 on error.
 */
static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
			      struct btrfs_device *dev, int mirror_num,
			      u64 logical, u32 length, u64 physical,
			      u64 *found_logical_ret)
{
	struct scrub_stripe *stripe;
	int ret;

	/*
	 * There should always be one slot left, as caller filling the last
	 * slot should flush them all.
	 */
	ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);

	/* @found_logical_ret must be specified. */
	ASSERT(found_logical_ret);

	stripe = &sctx->stripes[sctx->cur_stripe];
	scrub_reset_stripe(stripe);
	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
					   &sctx->csum_path, dev, physical,
					   mirror_num, logical, length, stripe);
	/* Either >0 as no more extents or <0 for error. */
	if (ret)
		return ret;
	*found_logical_ret = stripe->logical;
	sctx->cur_stripe++;

	/* We filled one group, submit it. */
	if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
		const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;

		submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
	}

	/* Last slot used, flush them all. */
	if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
		return flush_scrub_stripes(sctx);
	return 0;
}

/*
 * Return 0 if we should not cancel the scrub.
 * Return <0 if we need to cancel the scrub, returned value will
 * indicate the reason:
 * - -ECANCELED - Being explicitly canceled through ioctl.
 * - -EINTR - Being interrupted by signal or fs/process freezing.
 */
static int should_cancel_scrub(const struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	/* Explicit cancel, either fs-wide or for this scrub context only. */
	if (atomic_read(&fs_info->scrub_cancel_req) ||
	    atomic_read(&sctx->cancel_req))
		return -ECANCELED;

	/*
	 * The user (e.g. fsfreeze command) or power management (PM)
	 * suspend/hibernate can freeze the fs. And PM suspend/hibernate will
	 * also freeze all user processes.
	 *
	 * A user process can only be frozen when it is in user space, thus we
	 * have to cancel the run so that the process can return to the user
	 * space.
	 *
	 * Furthermore we have to check both filesystem and process freezing,
	 * as PM can be configured to freeze the filesystems before processes.
	 *
	 * If we only check fs freezing, then suspend without fs freezing
	 * will timeout, as the process is still in kernel space.
	 *
	 * If we only check process freezing, then suspend with fs freezing
	 * will timeout, as the running scrub will prevent the fs from being frozen.
	 */
	if (fs_info->sb->s_writers.frozen > SB_UNFROZEN ||
	    freezing(current) || signal_pending(current))
		return -EINTR;
	return 0;
}

/*
 * Check/regenerate the P/Q stripes of one RAID56 full stripe, feeding the
 * already-verified data stripes into the scrub rbio as cache.
 *
 * The caller must have verified and repaired all data stripes first (see
 * scrub_raid56_parity_stripe()).
 */
static int scrub_raid56_cached_parity(struct scrub_ctx *sctx,
				      struct btrfs_device *scrub_dev,
				      struct btrfs_chunk_map *map,
				      u64 full_stripe_start,
				      unsigned long *extent_bitmap)
{
	DECLARE_COMPLETION_ONSTACK(io_done);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_raid_bio *rbio;
	struct bio bio;
	const int data_stripes = nr_data_stripes(map);
	u64 length = btrfs_stripe_nr_to_offset(data_stripes);
	int ret;

	bio_init(&bio, NULL, NULL, 0, REQ_OP_READ);
	bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
	bio.bi_private = &io_done;
	bio.bi_end_io = raid56_scrub_wait_endio;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
			      &length, &bioc, NULL, NULL);
	if (ret < 0)
		goto out;
	/* For RAID56 write there must be an @bioc allocated. */
	ASSERT(bioc);
	rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap,
				BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
	btrfs_put_bioc(bioc);
	if (!rbio) {
		ret = -ENOMEM;
		goto out;
	}
	/* Use the recovered stripes as cache to avoid read them from disk again.
*/ 2151 for (int i = 0; i < data_stripes; i++) { 2152 struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; 2153 2154 raid56_parity_cache_data_folios(rbio, stripe->folios, 2155 full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); 2156 } 2157 raid56_parity_submit_scrub_rbio(rbio); 2158 wait_for_completion_io(&io_done); 2159 ret = blk_status_to_errno(bio.bi_status); 2160 out: 2161 btrfs_bio_counter_dec(fs_info); 2162 bio_uninit(&bio); 2163 return ret; 2164 } 2165 2166 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, 2167 struct btrfs_device *scrub_dev, 2168 struct btrfs_block_group *bg, 2169 struct btrfs_chunk_map *map, 2170 u64 full_stripe_start) 2171 { 2172 struct btrfs_fs_info *fs_info = sctx->fs_info; 2173 BTRFS_PATH_AUTO_RELEASE(extent_path); 2174 BTRFS_PATH_AUTO_RELEASE(csum_path); 2175 struct scrub_stripe *stripe; 2176 bool all_empty = true; 2177 const int data_stripes = nr_data_stripes(map); 2178 unsigned long extent_bitmap = 0; 2179 int ret; 2180 2181 ASSERT(sctx->raid56_data_stripes); 2182 2183 ret = should_cancel_scrub(sctx); 2184 if (ret < 0) 2185 return ret; 2186 2187 if (atomic_read(&fs_info->scrub_pause_req)) 2188 scrub_blocked_if_needed(fs_info); 2189 2190 spin_lock(&bg->lock); 2191 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2192 spin_unlock(&bg->lock); 2193 return 0; 2194 } 2195 spin_unlock(&bg->lock); 2196 2197 /* 2198 * For data stripe search, we cannot reuse the same extent/csum paths, 2199 * as the data stripe bytenr may be smaller than previous extent. Thus 2200 * we have to use our own extent/csum paths. 
2201 */ 2202 extent_path.search_commit_root = true; 2203 extent_path.skip_locking = true; 2204 csum_path.search_commit_root = true; 2205 csum_path.skip_locking = true; 2206 2207 for (int i = 0; i < data_stripes; i++) { 2208 int stripe_index; 2209 int rot; 2210 u64 physical; 2211 2212 stripe = &sctx->raid56_data_stripes[i]; 2213 rot = div_u64(full_stripe_start - bg->start, 2214 data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; 2215 stripe_index = (i + rot) % map->num_stripes; 2216 physical = map->stripes[stripe_index].physical + 2217 btrfs_stripe_nr_to_offset(rot); 2218 2219 scrub_reset_stripe(stripe); 2220 set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); 2221 ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, 2222 map->stripes[stripe_index].dev, physical, 1, 2223 full_stripe_start + btrfs_stripe_nr_to_offset(i), 2224 BTRFS_STRIPE_LEN, stripe); 2225 if (ret < 0) 2226 return ret; 2227 /* 2228 * No extent in this data stripe, need to manually mark them 2229 * initialized to make later read submission happy. 2230 */ 2231 if (ret > 0) { 2232 stripe->logical = full_stripe_start + 2233 btrfs_stripe_nr_to_offset(i); 2234 stripe->dev = map->stripes[stripe_index].dev; 2235 stripe->mirror_num = 1; 2236 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 2237 } 2238 } 2239 2240 /* Check if all data stripes are empty. */ 2241 for (int i = 0; i < data_stripes; i++) { 2242 stripe = &sctx->raid56_data_stripes[i]; 2243 if (!scrub_bitmap_empty_has_extent(stripe)) { 2244 all_empty = false; 2245 break; 2246 } 2247 } 2248 if (all_empty) 2249 return 0; 2250 2251 for (int i = 0; i < data_stripes; i++) { 2252 stripe = &sctx->raid56_data_stripes[i]; 2253 scrub_submit_initial_read(sctx, stripe); 2254 } 2255 for (int i = 0; i < data_stripes; i++) { 2256 stripe = &sctx->raid56_data_stripes[i]; 2257 2258 wait_event(stripe->repair_wait, 2259 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); 2260 } 2261 /* For now, no zoned support for RAID56. 
*/ 2262 ASSERT(!btrfs_is_zoned(sctx->fs_info)); 2263 2264 /* 2265 * Now all data stripes are properly verified. Check if we have any 2266 * unrepaired, if so abort immediately or we could further corrupt the 2267 * P/Q stripes. 2268 * 2269 * During the loop, also populate extent_bitmap. 2270 */ 2271 for (int i = 0; i < data_stripes; i++) { 2272 unsigned long error; 2273 unsigned long has_extent; 2274 2275 stripe = &sctx->raid56_data_stripes[i]; 2276 2277 error = scrub_bitmap_read_error(stripe); 2278 has_extent = scrub_bitmap_read_has_extent(stripe); 2279 2280 /* 2281 * We should only check the errors where there is an extent. 2282 * As we may hit an empty data stripe while it's missing. 2283 */ 2284 bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); 2285 if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { 2286 btrfs_err(fs_info, 2287 "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", 2288 full_stripe_start, i, stripe->nr_sectors, 2289 &error); 2290 return ret; 2291 } 2292 bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, 2293 stripe->nr_sectors); 2294 } 2295 2296 /* Now we can check and regenerate the P/Q stripe. */ 2297 return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, 2298 &extent_bitmap); 2299 } 2300 2301 /* 2302 * Scrub one range which can only has simple mirror based profile. 2303 * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in 2304 * RAID0/RAID10). 2305 * 2306 * Since we may need to handle a subset of block group, we need @logical_start 2307 * and @logical_length parameter. 
 */
static int scrub_simple_mirror(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       u64 logical_start, u64 logical_length,
			       struct btrfs_device *device,
			       u64 physical, int mirror_num)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 logical_end = logical_start + logical_length;
	u64 cur_logical = logical_start;
	int ret = 0;

	/* The range must be inside the bg */
	ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg));

	/* Go through each extent items inside the logical range */
	while (cur_logical < logical_end) {
		u64 found_logical = U64_MAX;
		u64 cur_physical = physical + cur_logical - logical_start;

		/* Canceled by ioctl, signal or freezing? */
		ret = should_cancel_scrub(sctx);
		if (ret < 0)
			break;

		if (atomic_read(&fs_info->scrub_pause_req))
			scrub_blocked_if_needed(fs_info);

		/* Block group removed? We are done with this range. */
		spin_lock(&bg->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
			spin_unlock(&bg->lock);
			ret = 0;
			break;
		}
		spin_unlock(&bg->lock);

		ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
					 cur_logical, logical_end - cur_logical,
					 cur_physical, &found_logical);
		if (ret > 0) {
			/* No more extent, just update the accounting */
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = physical + logical_length;
			spin_unlock(&sctx->stat_lock);
			ret = 0;
			break;
		}
		if (ret < 0)
			break;

		/* queue_scrub_stripe() returned 0, @found_logical must be updated.
 */
		ASSERT(found_logical != U64_MAX);
		cur_logical = found_logical + BTRFS_STRIPE_LEN;

		/* Don't hold CPU for too long time */
		cond_resched();
	}
	return ret;
}

/* Calculate the full stripe length for simple stripe based profiles */
static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));

	return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes);
}

/* Get the logical bytenr for the stripe */
static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
				     struct btrfs_block_group *bg,
				     int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/*
	 * (stripe_index / sub_stripes) gives how many data stripes we need to
	 * skip.
	 */
	return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) +
	       bg->start;
}

/* Get the mirror number for the stripe */
static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
	ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			    BTRFS_BLOCK_GROUP_RAID10));
	ASSERT(stripe_index < map->num_stripes);

	/* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1...
 */
	return stripe_index % map->sub_stripes + 1;
}

/*
 * Scrub all stripes of one device stripe in a RAID0/RAID10 chunk.
 *
 * Inside each device stripe the data is SINGLE (RAID0) or RAID1 (RAID10),
 * so every BTRFS_STRIPE_LEN range is handed to scrub_simple_mirror().
 */
static int scrub_simple_stripe(struct scrub_ctx *sctx,
			       struct btrfs_block_group *bg,
			       struct btrfs_chunk_map *map,
			       struct btrfs_device *device,
			       int stripe_index)
{
	const u64 logical_increment = simple_stripe_full_stripe_len(map);
	const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
	const u64 orig_physical = map->stripes[stripe_index].physical;
	const u64 end = btrfs_block_group_end(bg);
	const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
	u64 cur_logical = orig_logical;
	u64 cur_physical = orig_physical;
	int ret = 0;

	while (cur_logical < end) {
		/*
		 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
		 * this stripe.
		 */
		ret = scrub_simple_mirror(sctx, bg, cur_logical,
					  BTRFS_STRIPE_LEN, device, cur_physical,
					  mirror_num);
		if (ret)
			return ret;
		/* Skip to next stripe which belongs to the target device */
		cur_logical += logical_increment;
		/* For physical offset, we just go to next stripe */
		cur_physical += BTRFS_STRIPE_LEN;
	}
	return ret;
}

/*
 * Scrub one device stripe (@stripe_index) of a chunk, dispatching to the
 * profile specific helper.
 */
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
					   struct btrfs_block_group *bg,
					   struct btrfs_chunk_map *map,
					   struct btrfs_device *scrub_dev,
					   int stripe_index)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	const u64 chunk_logical = bg->start;
	int ret;
	int ret2;
	u64 physical = map->stripes[stripe_index].physical;
	const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
	const u64 physical_end = physical + dev_stripe_len;
	u64 logical;
	u64 logic_end;
	/* The logical increment after finishing one stripe */
	u64 increment;
	/* Offset inside
the chunk */
	u64 offset;
	u64 stripe_logical;

	/* Extent_path should be released by now. */
	ASSERT(sctx->extent_path.nodes[0] == NULL);

	scrub_blocked_if_needed(fs_info);

	/* For a sequential (zoned) dev-replace target, init the write pointer. */
	if (sctx->is_dev_replace &&
	    btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
		mutex_lock(&sctx->wr_lock);
		sctx->write_pointer = physical;
		mutex_unlock(&sctx->wr_lock);
	}

	/* Prepare the extra data stripes used by RAID56. */
	if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ASSERT(sctx->raid56_data_stripes == NULL);

		sctx->raid56_data_stripes = kzalloc_objs(struct scrub_stripe,
							 nr_data_stripes(map),
							 GFP_KERNEL);
		if (!sctx->raid56_data_stripes) {
			ret = -ENOMEM;
			goto out;
		}
		for (int i = 0; i < nr_data_stripes(map); i++) {
			ret = init_scrub_stripe(fs_info,
						&sctx->raid56_data_stripes[i]);
			if (ret < 0)
				goto out;
			sctx->raid56_data_stripes[i].bg = bg;
			sctx->raid56_data_stripes[i].sctx = sctx;
		}
	}
	/*
	 * There used to be a big double loop to handle all profiles using the
	 * same routine, which grows larger and more gross over time.
	 *
	 * So here we handle each profile differently, so simpler profiles
	 * have simpler scrubbing function.
	 */
	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * Above check rules out all complex profile, the remaining
		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
		 * mirrored duplication without stripe.
		 *
		 * Only @physical and @mirror_num needs to calculated using
		 * @stripe_index.
		 */
		ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
					  scrub_dev, map->stripes[stripe_index].physical,
					  stripe_index + 1);
		offset = 0;
		goto out;
	}
	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
		offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
		goto out;
	}

	/* Only RAID56 goes through the old code */
	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
	ret = 0;

	/* Calculate the logical end of the stripe */
	get_raid56_logic_offset(physical_end, stripe_index,
				map, &logic_end, NULL);
	logic_end += chunk_logical;

	/* Initialize @offset in case we need to go to out: label */
	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
	increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

	/*
	 * Due to the rotation, for RAID56 it's better to iterate each stripe
	 * using their physical offset.
	 */
	while (physical < physical_end) {
		ret = get_raid56_logic_offset(physical, stripe_index, map,
					      &logical, &stripe_logical);
		logical += chunk_logical;
		if (ret) {
			/* it is parity strip */
			stripe_logical += chunk_logical;
			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
							 map, stripe_logical);
			spin_lock(&sctx->stat_lock);
			sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
						       physical_end);
			spin_unlock(&sctx->stat_lock);
			if (ret)
				goto out;
			goto next;
		}

		/*
		 * Now we're at a data stripe, scrub each extents in the range.
		 *
		 * At this stage, if we ignore the repair part, inside each data
		 * stripe it is no different than SINGLE profile.
		 * We can reuse scrub_simple_mirror() here, as the repair part
		 * is still based on @mirror_num.
		 */
		ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
					  scrub_dev, physical, 1);
		if (ret < 0)
			goto out;
next:
		logical += increment;
		physical += BTRFS_STRIPE_LEN;
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = physical;
		spin_unlock(&sctx->stat_lock);
	}
out:
	/* Flush queued stripes; keep the first error, if any. */
	ret2 = flush_scrub_stripes(sctx);
	if (!ret)
		ret = ret2;
	btrfs_release_path(&sctx->extent_path);
	btrfs_release_path(&sctx->csum_path);

	if (sctx->raid56_data_stripes) {
		for (int i = 0; i < nr_data_stripes(map); i++)
			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
		kfree(sctx->raid56_data_stripes);
		sctx->raid56_data_stripes = NULL;
	}

	if (sctx->is_dev_replace && ret >= 0) {
		ret2 = sync_write_pointer_for_zoned(sctx,
						    chunk_logical + offset,
						    map->stripes[stripe_index].physical,
						    physical_end);
		if (ret2)
			ret = ret2;
	}

	return ret < 0 ? ret : 0;
}

/*
 * Scrub one device extent of a chunk: look up the chunk map and scrub every
 * stripe of it that lives at @dev_offset on @scrub_dev.
 */
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
					  struct btrfs_block_group *bg,
					  struct btrfs_device *scrub_dev,
					  u64 dev_offset,
					  u64 dev_extent_len)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_chunk_map *map;
	int i;
	int ret = 0;

	map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
	if (!map) {
		/*
		 * Might have been an unused block group deleted by the cleaner
		 * kthread or relocation.
 */
		spin_lock(&bg->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
			ret = -EINVAL;
		spin_unlock(&bg->lock);

		return ret;
	}
	if (map->start != bg->start)
		goto out;
	if (map->chunk_len < dev_extent_len)
		goto out;

	/* Scrub every stripe of this chunk located on @scrub_dev. */
	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
			if (ret)
				goto out;
		}
	}
out:
	btrfs_free_chunk_map(map);

	return ret;
}

/*
 * On zoned filesystems, get all pending extent writes of the block group on
 * disk (waiting reservations, nocow writers and ordered extents, then
 * committing the transaction) before dev-replace copies it.
 */
static int finish_extent_writes_for_zoned(struct btrfs_root *root,
					  struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	btrfs_wait_block_group_reservations(cache);
	btrfs_wait_nocow_writers(cache);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);

	return btrfs_commit_current_transaction(root);
}

/*
 * Walk all device extents of @scrub_dev inside [start, end) and scrub the
 * chunks they belong to.
 */
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
			   struct btrfs_device *scrub_dev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	u64 chunk_offset;
	int ret = 0;
	int ro_set;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group *cache;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->search_commit_root = true;
	path->skip_locking = true;

	key.objectid = scrub_dev->devid;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0ull;

	while (1) {
		u64 dev_extent_len;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					break;
				if (ret > 0) {
					ret = 0;
					break;
				}
			} else {
				ret = 0;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != scrub_dev->devid)
			break;

		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		dev_extent_len = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + dev_extent_len <= start)
			goto skip;

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);

		/* some chunks are removed but not committed to disk yet,
		 * continue scrubbing */
		if (!cache)
			goto skip;

		ASSERT(cache->start <= chunk_offset);
		/*
		 * We are using the commit root to search for device extents, so
		 * that means we could have found a device extent item from a
		 * block group that was deleted in the current transaction. The
		 * logical start offset of the deleted block group, stored at
		 * @chunk_offset, might be part of the logical address range of
		 * a new block group (which uses different physical extents).
		 * In this case btrfs_lookup_block_group() has returned the new
		 * block group, and its start address is less than @chunk_offset.
		 *
		 * We skip such new block groups, because it's pointless to
		 * process them, as we won't find their extents because we search
		 * for them using the commit root of the extent tree. For a device
		 * replace it's also fine to skip it, we won't miss copying them
		 * to the target device because we have the write duplication
		 * setup through the regular write path (by btrfs_map_block()),
		 * and we have committed a transaction when we started the device
		 * replace, right after setting up the device replace state.
		 */
		if (cache->start < chunk_offset) {
			btrfs_put_block_group(cache);
			goto skip;
		}

		/* Zoned dev-replace copies only block groups flagged TO_COPY. */
		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
				btrfs_put_block_group(cache);
				goto skip;
			}
		}

		/*
		 * Make sure that while we are scrubbing the corresponding block
		 * group doesn't get its logical address and its device extents
		 * reused for another block group, which can possibly be of a
		 * different type and different profile. We do this to prevent
		 * false error detections and crashes due to bogus attempts to
		 * repair extents.
		 */
		spin_lock(&cache->lock);
		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
			spin_unlock(&cache->lock);
			btrfs_put_block_group(cache);
			goto skip;
		}
		btrfs_freeze_block_group(cache);
		spin_unlock(&cache->lock);

		/*
		 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
		 * to avoid deadlock caused by:
		 * btrfs_inc_block_group_ro()
		 * -> btrfs_wait_for_commit()
		 * -> btrfs_commit_transaction()
		 * -> btrfs_scrub_pause()
		 */
		scrub_pause_on(fs_info);

		/*
		 * Don't do chunk preallocation for scrub.
		 *
		 * This is especially important for SYSTEM bgs, or we can hit
		 * -EFBIG from btrfs_finish_chunk_alloc() like:
		 * 1. The only SYSTEM bg is marked RO.
		 *    Since SYSTEM bg is small, that's pretty common.
		 * 2. New SYSTEM bg will be allocated
		 *    Due to regular version will allocate new chunk.
		 * 3. New SYSTEM bg is empty and will get cleaned up
		 *    Before cleanup really happens, it's marked RO again.
		 * 4. Empty SYSTEM bg get scrubbed
		 *    We go back to 2.
		 *
		 * This can easily boost the amount of SYSTEM chunks if cleaner
		 * thread can't be triggered fast enough, and use up all space
		 * of btrfs_super_block::sys_chunk_array
		 *
		 * While for dev replace, we need to try our best to mark block
		 * group RO, to prevent race between:
		 * - Write duplication
		 *   Contains latest data
		 * - Scrub copy
		 *   Contains data from commit tree
		 *
		 * If target block group is not marked RO, nocow writes can
		 * be overwritten by scrub copy, causing data corruption.
		 * So for dev-replace, it's not allowed to continue if a block
		 * group is not RO.
		 */
		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
		if (!ret && sctx->is_dev_replace) {
			ret = finish_extent_writes_for_zoned(root, cache);
			if (ret) {
				btrfs_dec_block_group_ro(cache);
				scrub_pause_off(fs_info);
				btrfs_put_block_group(cache);
				break;
			}
		}

		if (ret == 0) {
			ro_set = 1;
		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
			/*
			 * btrfs_inc_block_group_ro return -ENOSPC when it
			 * failed in creating new chunk for metadata.
			 * It is not a problem for scrub, because
			 * metadata are always cowed, and our scrub paused
			 * commit_transactions.
			 *
			 * For RAID56 chunks, we have to mark them read-only
			 * for scrub, as later we would use our own cache
			 * out of RAID56 realm.
			 * Thus we want the RAID56 bg to be marked RO to
			 * prevent RMW from screwing up out cache.
			 */
			ro_set = 0;
		} else if (ret == -ETXTBSY) {
			btrfs_warn(fs_info,
		   "scrub: skipping scrub of block group %llu due to active swapfile",
				   cache->start);
			scrub_pause_off(fs_info);
			ret = 0;
			goto skip_unfreeze;
		} else {
			btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
				   ret);
			btrfs_unfreeze_block_group(cache);
			btrfs_put_block_group(cache);
			scrub_pause_off(fs_info);
			break;
		}

		/*
		 * Now the target block is marked RO, wait for nocow writes to
		 * finish before dev-replace.
		 * COW is fine, as COW never overwrites extents in commit tree.
		 */
		if (sctx->is_dev_replace) {
			btrfs_wait_nocow_writers(cache);
			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
		}

		scrub_pause_off(fs_info);
		down_write(&dev_replace->rwsem);
		dev_replace->cursor_right = found_key.offset + dev_extent_len;
		dev_replace->cursor_left = found_key.offset;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
				  dev_extent_len);
		if (sctx->is_dev_replace &&
		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
						      cache, found_key.offset))
			ro_set = 0;

		down_write(&dev_replace->rwsem);
		dev_replace->cursor_left = dev_replace->cursor_right;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		if (ro_set)
			btrfs_dec_block_group_ro(cache);

		/*
		 * We might have prevented the cleaner kthread from deleting
		 * this block group if it was already unused because we raced
		 * and set it to RO mode first. So add it back to the unused
		 * list, otherwise it might not ever be deleted unless a manual
		 * balance is triggered or it becomes used and unused again.
		 */
		spin_lock(&cache->lock);
		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
		    !cache->ro && cache->reserved == 0 && cache->used == 0) {
			spin_unlock(&cache->lock);
			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
				btrfs_discard_queue_work(&fs_info->discard_ctl,
							 cache);
			else
				btrfs_mark_bg_unused(cache);
		} else {
			spin_unlock(&cache->lock);
		}
skip_unfreeze:
		btrfs_unfreeze_block_group(cache);
		btrfs_put_block_group(cache);
		if (ret)
			break;
		if (unlikely(sctx->is_dev_replace &&
			     atomic64_read(&dev_replace->num_write_errors) > 0)) {
			ret = -EIO;
			break;
		}
		if (sctx->stat.malloc_errors > 0) {
			ret = -ENOMEM;
			break;
		}
skip:
		key.offset = found_key.offset + dev_extent_len;
		btrfs_release_path(path);
	}

	return ret;
}

/*
 * Read and verify one super block copy at @physical on @dev: check the
 * checksum, the generation against @generation, then overall validity.
 */
static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
			   struct page *page, u64 physical, u64 generation)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_super_block *sb = page_address(page);
	int ret;

	ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
			   BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
	if (ret < 0)
		return ret;
	ret = btrfs_check_super_csum(fs_info, sb);
	if (unlikely(ret != 0)) {
		btrfs_err_rl(fs_info,
			"scrub: super block at physical %llu devid %llu has bad csum",
			     physical, dev->devid);
		return -EIO;
	}
	if (unlikely(btrfs_super_generation(sb) != generation)) {
		btrfs_err_rl(fs_info,
"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
			     physical, dev->devid,
			     btrfs_super_generation(sb), generation);
		return -EUCLEAN;
	}

	return btrfs_validate_super(fs_info, sb, -1);
}

/* Scrub all super block copies of @scrub_dev, accounting errors in stats. */
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
					   struct btrfs_device *scrub_dev)
{
2985 int i; 2986 u64 bytenr; 2987 u64 gen; 2988 int ret = 0; 2989 struct page *page; 2990 struct btrfs_fs_info *fs_info = sctx->fs_info; 2991 2992 if (BTRFS_FS_ERROR(fs_info)) 2993 return -EROFS; 2994 2995 page = alloc_page(GFP_KERNEL); 2996 if (!page) { 2997 spin_lock(&sctx->stat_lock); 2998 sctx->stat.malloc_errors++; 2999 spin_unlock(&sctx->stat_lock); 3000 return -ENOMEM; 3001 } 3002 3003 /* Seed devices of a new filesystem has their own generation. */ 3004 if (scrub_dev->fs_devices != fs_info->fs_devices) 3005 gen = scrub_dev->generation; 3006 else 3007 gen = btrfs_get_last_trans_committed(fs_info); 3008 3009 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 3010 ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); 3011 if (ret == -ENOENT) 3012 break; 3013 3014 if (ret) { 3015 spin_lock(&sctx->stat_lock); 3016 sctx->stat.super_errors++; 3017 spin_unlock(&sctx->stat_lock); 3018 continue; 3019 } 3020 3021 if (bytenr + BTRFS_SUPER_INFO_SIZE > 3022 scrub_dev->commit_total_bytes) 3023 break; 3024 if (!btrfs_check_super_location(scrub_dev, bytenr)) 3025 continue; 3026 3027 ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); 3028 if (ret) { 3029 spin_lock(&sctx->stat_lock); 3030 sctx->stat.super_errors++; 3031 spin_unlock(&sctx->stat_lock); 3032 } 3033 } 3034 __free_page(page); 3035 return 0; 3036 } 3037 3038 static void scrub_workers_put(struct btrfs_fs_info *fs_info) 3039 { 3040 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, 3041 &fs_info->scrub_lock)) { 3042 struct workqueue_struct *scrub_workers = fs_info->scrub_workers; 3043 3044 fs_info->scrub_workers = NULL; 3045 mutex_unlock(&fs_info->scrub_lock); 3046 3047 if (scrub_workers) 3048 destroy_workqueue(scrub_workers); 3049 } 3050 } 3051 3052 /* 3053 * get a reference count on fs_info->scrub_workers. 
start worker if necessary 3054 */ 3055 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) 3056 { 3057 struct workqueue_struct *scrub_workers = NULL; 3058 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 3059 int max_active = fs_info->thread_pool_size; 3060 int ret = -ENOMEM; 3061 3062 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) 3063 return 0; 3064 3065 scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); 3066 if (!scrub_workers) 3067 return -ENOMEM; 3068 3069 mutex_lock(&fs_info->scrub_lock); 3070 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { 3071 ASSERT(fs_info->scrub_workers == NULL); 3072 fs_info->scrub_workers = scrub_workers; 3073 refcount_set(&fs_info->scrub_workers_refcnt, 1); 3074 mutex_unlock(&fs_info->scrub_lock); 3075 return 0; 3076 } 3077 /* Other thread raced in and created the workers for us */ 3078 refcount_inc(&fs_info->scrub_workers_refcnt); 3079 mutex_unlock(&fs_info->scrub_lock); 3080 3081 ret = 0; 3082 3083 destroy_workqueue(scrub_workers); 3084 return ret; 3085 } 3086 3087 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3088 u64 end, struct btrfs_scrub_progress *progress, 3089 bool readonly, bool is_dev_replace) 3090 { 3091 struct btrfs_dev_lookup_args args = { .devid = devid }; 3092 struct scrub_ctx *sctx; 3093 int ret; 3094 struct btrfs_device *dev; 3095 unsigned int nofs_flag; 3096 bool need_commit = false; 3097 3098 /* Set the basic fallback @last_physical before we got a sctx. */ 3099 if (progress) 3100 progress->last_physical = start; 3101 3102 if (btrfs_fs_closing(fs_info)) 3103 return -EAGAIN; 3104 3105 /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ 3106 ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); 3107 3108 /* 3109 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible 3110 * value (max nodesize / min sectorsize), thus nodesize should always 3111 * be fine. 
	 */
	ASSERT(fs_info->nodesize <=
	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);

	/* Allocate outside of device_list_mutex */
	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);
	sctx->stat.last_physical = start;

	ret = scrub_workers_get(fs_info);
	if (ret)
		goto out_free_ctx;

	/* Lock order: device_list_mutex, then scrub_lock, then dev_replace rwsem. */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
		     !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -ENODEV;
		goto out;
	}

	if (!is_dev_replace && !readonly &&
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_err(fs_info,
			  "scrub: devid %llu: filesystem on %s is not writable",
			  devid, btrfs_dev_name(dev));
		ret = -EROFS;
		goto out;
	}

	mutex_lock(&fs_info->scrub_lock);
	if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EIO;
		goto out;
	}

	/* Only one scrub per device, and no scrub while replace is running. */
	down_read(&fs_info->dev_replace.rwsem);
	if (dev->scrub_ctx ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		up_read(&fs_info->dev_replace.rwsem);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EINPROGRESS;
		goto out;
	}
	up_read(&fs_info->dev_replace.rwsem);

	sctx->readonly = readonly;
	dev->scrub_ctx = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * Checking @scrub_pause_req here, we can avoid the race between
	 * committing transaction and scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * In order to avoid deadlock with reclaim when there is a transaction
	 * trying to pause scrub, make sure we use GFP_NOFS for all the
	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
	 * invoked by our callees. The pausing request is done when the
	 * transaction commit starts, and it blocks the transaction until scrub
	 * is paused (done at specific points at scrub_stripe() or right above
	 * before incrementing fs_info->scrubs_running).
	 */
	nofs_flag = memalloc_nofs_save();
	if (!is_dev_replace) {
		u64 old_super_errors;

		spin_lock(&sctx->stat_lock);
		old_super_errors = sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);

		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
		/*
		 * by holding device list mutex, we can
		 * kick off writing super in log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		spin_lock(&sctx->stat_lock);
		/*
		 * Super block errors found, but we can not commit transaction
		 * at current context, since btrfs_commit_transaction() needs
		 * to pause the current running scrub (hold by ourselves).
		 */
		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
			need_commit = true;
		spin_unlock(&sctx->stat_lock);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	if (!is_dev_replace)
		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
			   ret ?
			   "not finished" : "finished", devid, ret);

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_workers_put(fs_info);
	scrub_put_ctx(sctx);

	/*
	 * We found some super block errors before, now try to force a
	 * transaction commit, as scrub has finished.
	 */
	if (need_commit) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(fs_info->tree_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_err(fs_info,
	"scrub: failed to start transaction to fix super block errors: %d", ret);
			return ret;
		}
		ret = btrfs_commit_transaction(trans);
		if (ret < 0)
			btrfs_err(fs_info,
	"scrub: failed to commit transaction to fix super block errors: %d", ret);
	}
	return ret;
out:
	scrub_workers_put(fs_info);
out_free_ctx:
	scrub_free_ctx(sctx);

	return ret;
}

/*
 * Pause all running scrubs and wait until they have reached a paused state.
 * Called before transaction commit; btrfs_scrub_continue() undoes it.
 */
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	/* Drop the lock while waiting so scrubs can make progress to pause. */
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}

/* Resume scrubs paused by btrfs_scrub_pause(). */
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}

/*
 * Cancel all running scrubs and wait until they have exited.
 *
 * Return 0 on success, -ENOTCONN if no scrub was running.
 */
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	/* Drop the lock while waiting so scrubs can finish and wake us. */
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

/*
 * Cancel the scrub running on @dev, if any, and wait for it to exit.
 *
 * Return 0 on success, -ENOTCONN if no scrub was running on the device.
 */
int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct scrub_ctx *sctx;

	mutex_lock(&fs_info->scrub_lock);
	sctx = dev->scrub_ctx;
	if (!sctx) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sctx->cancel_req);
	/* dev->scrub_ctx is cleared by btrfs_scrub_dev() when it exits. */
	while (dev->scrub_ctx) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_ctx == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

/*
 * Copy the current scrub statistics of device @devid into @progress.
 *
 * Return 0 on success, -ENODEV if the device does not exist, -ENOTCONN if
 * the device is not being scrubbed.
 */
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (dev)
		sctx = dev->scrub_ctx;
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}