// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
#include "raid-stripe-tree.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 * - In case an unrepairable extent is encountered, track which files are
 *   affected and report them
 * - track and record media errors, throw out bad devices
 * - add a mode to also read unallocated space
 */

struct scrub_ctx;

/*
 * The following value only influences the performance.
 *
 * This determines how many stripes would be submitted in one go,
 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
 */
#define SCRUB_STRIPES_PER_GROUP		8

/*
 * How many groups we have for each sctx.
 *
 * This would be 8M per device, the same value as the old scrub in-flight bios
 * size limit.
 */
#define SCRUB_GROUPS_PER_SCTX		16

#define SCRUB_TOTAL_STRIPES		(SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	union {
		/*
		 * Csum pointer for data csum verification. Should point to a
		 * sector csum inside scrub_stripe::csums.
		 *
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;

		/*
		 * Extra info for metadata verification. All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};

enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * Set for data stripes if it's triggered from P/Q stripe.
	 * During such scrub, we should not report errors in data stripes, nor
	 * update the accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};

/*
 * We have multiple bitmaps for one scrub_stripe.
 * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
 * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64).
 *
 * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
 * into a larger one.
 *
 * This enum records where the sub-bitmaps are inside the larger one.
 * Each sub-bitmap starts at the scrub_bitmap_nr_##name * nr_sectors bit.
 */
enum {
	/* Which blocks are covered by extent items. */
	scrub_bitmap_nr_has_extent = 0,

	/* Which blocks are metadata. */
	scrub_bitmap_nr_is_metadata,

	/*
	 * Which blocks have errors, including IO, csum, and metadata
	 * errors.
	 * This sub-bitmap is the OR result of the next few error related
	 * sub-bitmaps.
	 */
	scrub_bitmap_nr_error,
	scrub_bitmap_nr_io_error,
	scrub_bitmap_nr_csum_error,
	scrub_bitmap_nr_meta_error,
	scrub_bitmap_nr_meta_gen_error,
	scrub_bitmap_nr_last,
};
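/*
 * Illustration only (assuming a 4K block size): nr_sectors is
 * BTRFS_STRIPE_LEN / 4K = 16, so the has_extent sub-bitmap occupies bits
 * [0, 16), the is_metadata sub-bitmap bits [16, 32), and so on, all packed
 * into scrub_stripe::bitmaps.
 */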
#define SCRUB_STRIPE_MAX_FOLIOS		(BTRFS_STRIPE_LEN / PAGE_SIZE)

/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
	struct scrub_ctx *sctx;
	struct btrfs_block_group *bg;

	struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];
	struct scrub_sector_verification *sectors;

	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

	/*
	 * How many data/meta extents are in this stripe. Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

	atomic_t pending_io;
	wait_queue_head_t io_wait;
	wait_queue_head_t repair_wait;

	/*
	 * Indicate the states of the stripe. Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* The large bitmap contains all the sub-bitmaps. */
	unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
					    (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];

	/*
	 * For writeback (repair or replace) error reporting.
	 * This one is protected by a spinlock, thus can not be packed into
	 * the larger bitmap.
	 */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;

	struct work_struct work;
};

struct scrub_ctx {
	struct scrub_stripe	stripes[SCRUB_TOTAL_STRIPES];
	struct scrub_stripe	*raid56_data_stripes;
	struct btrfs_fs_info	*fs_info;
	struct btrfs_path	extent_path;
	struct btrfs_path	csum_path;
	int			first_free;
	int			cur_stripe;
	atomic_t		cancel_req;
	int			readonly;

	/* State of IO submission throttling affecting the associated device. */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

	bool			is_dev_replace;
	u64			write_pointer;

	struct mutex		wr_lock;
	struct btrfs_device	*wr_tgtdev;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};
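/*
 * Helper to calculate the bit offset of block @block_nr inside the
 * sub-bitmap @name of scrub_stripe::bitmaps (see the packed layout described
 * above the sub-bitmap enum).
 */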
#define scrub_calc_start_bit(stripe, name, block_nr)			\
({									\
	unsigned int __start_bit;					\
									\
	ASSERT(block_nr < stripe->nr_sectors,				\
	       "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
	__start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
	__start_bit;							\
})

#define IMPLEMENT_SCRUB_BITMAP_OPS(name)				\
static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \
					   unsigned int block_nr,	\
					   unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe,	\
							    name, block_nr); \
									\
	bitmap_set(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
					     unsigned int block_nr,	\
					     unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	bitmap_clear(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
						unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	return test_bit(start_bit, stripe->bitmaps);			\
}									\
static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
					       unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	set_bit(start_bit, stripe->bitmaps);				\
}									\
static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
						 unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	clear_bit(start_bit, stripe->bitmaps);				\
}									\
static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
{									\
	const unsigned int nr_blocks = stripe->nr_sectors;		\
									\
	ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG,		\
	       "nr_blocks=%u BITS_PER_LONG=%u",				\
	       nr_blocks, BITS_PER_LONG);				\
									\
	return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
			   stripe->nr_sectors);				\
}									\
static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_empty(&bitmap, stripe->nr_sectors);		\
}									\
static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_weight(&bitmap, stripe->nr_sectors);		\
}
IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
IMPLEMENT_SCRUB_BITMAP_OPS(error);
IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);

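/*
 * A usage sketch of the generated helpers above, for illustration only:
 *
 *	scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
 *	if (scrub_bitmap_test_bit_error(stripe, sector_nr))
 *		nr_errors = scrub_bitmap_weight_error(stripe);
 */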
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};

struct scrub_error_records {
	/*
	 * Bitmap recording which blocks hit errors (IO/csum/...) during the
	 * initial read.
	 */
	unsigned long init_error_bitmap;

	unsigned int nr_io_errors;
	unsigned int nr_csum_errors;
	unsigned int nr_meta_errors;
	unsigned int nr_meta_gen_errors;
};

static void release_scrub_stripe(struct scrub_stripe *stripe)
{
	if (!stripe)
		return;

	for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) {
		if (stripe->folios[i])
			folio_put(stripe->folios[i]);
		stripe->folios[i] = NULL;
	}
	kfree(stripe->sectors);
	kfree(stripe->csums);
	stripe->sectors = NULL;
	stripe->csums = NULL;
	stripe->sctx = NULL;
	stripe->state = 0;
}

static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
			     struct scrub_stripe *stripe)
{
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	int ret;

	memset(stripe, 0, sizeof(*stripe));

	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	stripe->state = 0;

	init_waitqueue_head(&stripe->io_wait);
	init_waitqueue_head(&stripe->repair_wait);
	atomic_set(&stripe->pending_io, 0);
	spin_lock_init(&stripe->write_error_lock);

	ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
	ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
				      fs_info->block_min_order, stripe->folios);
	if (ret < 0)
		goto error;

	stripe->sectors = kcalloc(stripe->nr_sectors,
				  sizeof(struct scrub_sector_verification),
				  GFP_KERNEL);
	if (!stripe->sectors)
		goto error;

	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
				fs_info->csum_size, GFP_KERNEL);
	if (!stripe->csums)
		goto error;
	return 0;
error:
	release_scrub_stripe(stripe);
	return -ENOMEM;
}

static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}

static void scrub_put_ctx(struct scrub_ctx *sctx);

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
		release_scrub_stripe(&sctx->stripes[i]);

	kvfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, bool is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;

	/*
	 * Since sctx has inline 128 stripes, it can go beyond 64K easily.
	 * Use kvzalloc().
	 */
	sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->fs_info = fs_info;
	sctx->extent_path.search_commit_root = true;
	sctx->extent_path.skip_locking = true;
	sctx->csum_path.search_commit_root = true;
	sctx->csum_path.skip_locking = true;
	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
		int ret;

		ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
		if (ret < 0)
			goto nomem;
		sctx->stripes[i].sctx = sctx;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->stat_lock);
	sctx->throttle_deadline = 0;

	mutex_init(&sctx->wr_lock);
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
				     u64 root, void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * This makes the path point to (inum INODE_ITEM ioff).
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * We deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here.
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
			   swarn->errstr, swarn->logical,
			   btrfs_dev_name(swarn->dev),
			   swarn->physical,
			   root, inum, offset,
			   fs_info->sectorsize, nlink,
			   (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	return 0;

err:
	btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
		   swarn->errstr, swarn->logical,
		   btrfs_dev_name(swarn->dev),
		   swarn->physical,
		   root, inum, offset, ret);

	return 0;
}

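/*
 * Print a warning for a corrupted sector. For metadata this resolves the
 * owning tree via backrefs, for data it resolves the affected file paths.
 */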
"node" : "leaf"), 644 ref_level, ref_root); 645 } 646 btrfs_release_path(path); 647 } else { 648 struct btrfs_backref_walk_ctx ctx = { 0 }; 649 650 btrfs_release_path(path); 651 652 ctx.bytenr = found_key.objectid; 653 ctx.extent_item_pos = swarn.logical - found_key.objectid; 654 ctx.fs_info = fs_info; 655 656 swarn.path = path; 657 swarn.dev = dev; 658 659 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); 660 } 661 } 662 663 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) 664 { 665 int ret = 0; 666 u64 length; 667 668 if (!btrfs_is_zoned(sctx->fs_info)) 669 return 0; 670 671 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) 672 return 0; 673 674 if (sctx->write_pointer < physical) { 675 length = physical - sctx->write_pointer; 676 677 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, 678 sctx->write_pointer, length); 679 if (!ret) 680 sctx->write_pointer = physical; 681 } 682 return ret; 683 } 684 685 static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) 686 { 687 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 688 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 689 u32 offset = (sector_nr << fs_info->sectorsize_bits); 690 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 691 692 /* stripe->folios[] is allocated by us and no highmem is allowed. */ 693 ASSERT(folio); 694 ASSERT(!folio_test_highmem(folio)); 695 return folio_address(folio) + offset_in_folio(folio, offset); 696 } 697 698 static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) 699 { 700 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 701 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 702 u32 offset = (sector_nr << fs_info->sectorsize_bits); 703 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 704 705 /* stripe->folios[] is allocated by us and no highmem is allowed. */ 706 ASSERT(folio); 707 ASSERT(!folio_test_highmem(folio)); 708 /* And the range must be contained inside the folio. */ 709 ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); 710 return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); 711 } 712 713 static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) 714 { 715 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 716 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; 717 const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); 718 void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); 719 struct btrfs_header *header = first_kaddr; 720 struct btrfs_csum_ctx csum; 721 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 722 u8 calculated_csum[BTRFS_CSUM_SIZE]; 723 724 /* 725 * Here we don't have a good way to attach the pages (and subpages) 726 * to a dummy extent buffer, thus we have to directly grab the members 727 * from pages. 
	memcpy(on_disk_csum, header->csum, fs_info->csum_size);

	if (logical != btrfs_stack_header_bytenr(header)) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_bytenr(header), logical);
		return;
	}
	if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
		   BTRFS_FSID_SIZE) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->fsid, fs_info->fs_devices->fsid);
		return;
	}
	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		return;
	}

	/* Now check tree block csum. */
	btrfs_csum_init(&csum, fs_info->csum_type);
	btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE,
			  fs_info->sectorsize - BTRFS_CSUM_SIZE);

	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
		btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i),
				  fs_info->sectorsize);
	}

	btrfs_csum_final(&csum, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT,
			      logical, stripe->mirror_num,
			      BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		return;
	}
	if (stripe->sectors[sector_nr].generation !=
	    btrfs_stack_header_generation(header)) {
		scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_generation(header),
			      stripe->sectors[sector_nr].generation);
		return;
	}
	scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
}

static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
		return;

	/* IO error, no need to check. */
	if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
		return;

	/* Metadata, verify the full tree block. */
	if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
		/*
		 * Check if the tree block crosses the stripe boundary. If
		 * crossed the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"scrub: tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it). For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
		return;
	}

	ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
	if (ret < 0) {
		scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_set_bit_error(stripe, sector_nr);
	} else {
		scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
	}
}

/* Verify specified sectors of a stripe. */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
			sector_nr += sectors_per_tree - 1;
	}
}

static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}

/*
 * Repair read is different from the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		scrub_bitmap_set_io_error(stripe, sector_nr,
					  bio_size >> fs_info->sectorsize_bits);
		scrub_bitmap_set_error(stripe, sector_nr,
				       bio_size >> fs_info->sectorsize_bits);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr,
					    bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

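/*
 * For example (illustrative values): with num_copies == 2, mirrors are
 * tried in the order 1 -> 2 -> 1 -> ..., wrapping back to 1 after the
 * last copy.
 */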
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
				 int sector_nr)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	int ret;

	ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize,
			   offset_in_page(kaddr));
	/*
	 * Caller should ensure the bbio has enough size.
	 * And we cannot use __bio_add_page(), which doesn't do any merge.
	 *
	 * Meanwhile for scrub_submit_initial_read() we fully rely on the merge
	 * to create the minimal amount of bio vectors, for fs block size < page
	 * size cases.
	 */
	ASSERT(ret == fs_info->sectorsize);
}

static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info,
					  unsigned int nr_vecs, blk_opf_t opf,
					  u64 logical,
					  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode),
			       logical, end_io, private);
	bbio->is_scrub = true;
	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	return bbio;
}

static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
	ASSERT(atomic_read(&stripe->pending_io) == 0,
	       "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io));

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		/* The current sector cannot be merged, submit the bio. */
		if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
			     bbio->bio.bi_iter.bi_size >= blocksize)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bbio(bbio, mirror);
			if (wait)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}

		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
					stripe->logical + (i << fs_info->sectorsize_bits),
					scrub_repair_read_endio, stripe);

		scrub_bio_add_sector(bbio, stripe, i);
	}
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
		if (wait)
			wait_scrub_stripe_io(stripe);
	}
}

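/*
 * Report the errors of one stripe (after the read-repair has finished) and
 * update the device stats and scrub progress accounting.
 */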
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
				       struct scrub_stripe *stripe,
				       const struct scrub_error_records *errors)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_device *dev = NULL;
	const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
	const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
	u64 physical = 0;
	int nr_data_sectors = 0;
	int nr_meta_sectors = 0;
	int nr_nodatacsum_sectors = 0;
	int nr_repaired_sectors = 0;
	int sector_nr;

	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
		return;

	/*
	 * Init needed infos for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on
	 * btrfs_submit_bio() and thus does not need dev/physical, error
	 * reporting still needs them.
	 */
	if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
		u64 mapped_len = fs_info->sectorsize;
		struct btrfs_io_context *bioc = NULL;
		int stripe_index = stripe->mirror_num - 1;
		int ret;

		/* For scrub, our mirror_num should always start at 1. */
		ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				      stripe->logical, &mapped_len, &bioc,
				      NULL, NULL);
		/*
		 * If we failed, dev will be NULL, and later detailed reports
		 * will just be skipped.
		 */
		if (ret < 0)
			goto skip;
		physical = bioc->stripes[stripe_index].physical;
		dev = bioc->stripes[stripe_index].dev;
		btrfs_put_bioc(bioc);
	}

skip:
	for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
		bool repaired = false;

		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
			nr_meta_sectors++;
		} else {
			nr_data_sectors++;
			if (!stripe->sectors[sector_nr].csum)
				nr_nodatacsum_sectors++;
		}

		if (test_bit(sector_nr, &errors->init_error_bitmap) &&
		    !test_bit(sector_nr, &error_bitmap)) {
			nr_repaired_sectors++;
			repaired = true;
		}

		/* Good sector from the beginning, nothing needs to be done. */
		if (!test_bit(sector_nr, &errors->init_error_bitmap))
			continue;

		/*
		 * Report error for the corrupted sectors. If repaired, just
		 * output a message saying it has been fixed.
		 */
		if (repaired) {
			if (dev) {
				btrfs_err_rl(fs_info,
			"scrub: fixed up error at logical %llu on dev %s physical %llu",
					     stripe->logical, btrfs_dev_name(dev),
					     physical);
			} else {
				btrfs_err_rl(fs_info,
			"scrub: fixed up error at logical %llu on mirror %u",
					     stripe->logical, stripe->mirror_num);
			}
			continue;
		}

		/* The remaining are all for unrepaired. */
		if (dev) {
			btrfs_err_rl(fs_info,
	"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
				     stripe->logical, btrfs_dev_name(dev),
				     physical);
		} else {
			btrfs_err_rl(fs_info,
	"scrub: unable to fixup (regular) error at logical %llu on mirror %u",
				     stripe->logical, stripe->mirror_num);
		}

		if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("i/o error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("checksum error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("header error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("generation error", dev, false,
						     stripe->logical, physical);
	}

	/* Update the device stats. */
	for (int i = 0; i < errors->nr_io_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
	for (int i = 0; i < errors->nr_csum_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
	/* Generation mismatch error is based on each metadata, not each block. */
	for (int i = 0; i < errors->nr_meta_gen_errors;
	     i += (fs_info->nodesize >> fs_info->sectorsize_bits))
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);

	spin_lock(&sctx->stat_lock);
	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
	sctx->stat.no_csum += nr_nodatacsum_sectors;
	sctx->stat.read_errors += errors->nr_io_errors;
	sctx->stat.csum_errors += errors->nr_csum_errors;
	sctx->stat.verify_errors += errors->nr_meta_errors +
				    errors->nr_meta_gen_errors;
	sctx->stat.uncorrectable_errors +=
		bitmap_weight(&error_bitmap, stripe->nr_sectors);
	sctx->stat.corrected_errors += nr_repaired_sectors;
	spin_unlock(&sctx->stat_lock);
}

static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace);

/*
 * The main entry point for all read-related scrub work, including:
 *
 * - Wait for the initial read to finish
 * - Verify and locate any bad sectors
 * - Go through the remaining mirrors and try to read as large blocksize as
 *   possible
 * - Go through all mirrors (including the failed mirror) sector-by-sector
 * - Submit writeback for repaired sectors
 *
 * Writeback for dev-replace does not happen here, it needs extra
 * synchronization for zoned devices.
 */
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
	struct scrub_ctx *sctx = stripe->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_error_records errors = { 0 };
	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
					  stripe->bg->length);
	unsigned long repaired;
	unsigned long error;
	int mirror;
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);

	wait_scrub_stripe_io(stripe);
	scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe));
	/* Save the initial failed bitmap for later repair and report usage. */
	errors.init_error_bitmap = scrub_bitmap_read_error(stripe);
	errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe);
	errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe);
	errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe);
	errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe);

	if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors))
		goto out;

	/*
	 * Try all remaining mirrors.
	 *
	 * Here we still try to read as large block as possible, as this is
	 * faster and we have extra safety nets to rely on.
	 */
	for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
	     mirror != stripe->mirror_num;
	     mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);

		scrub_stripe_submit_repair_read(stripe, mirror,
						BTRFS_STRIPE_LEN, false);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (scrub_bitmap_empty_error(stripe))
			goto out;
	}

	/*
	 * Last safety net, try re-checking all mirrors, including the failed
	 * one, sector-by-sector.
	 *
	 * If one sector fails the drive's internal csum, the whole read
	 * containing the offending sector would be marked as an error.
	 * Thus here we do sector-by-sector reads.
	 *
	 * This can be slow, thus we only try it as the last resort.
	 */

	for (i = 0, mirror = stripe->mirror_num;
	     i < num_copies;
	     i++, mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);

		scrub_stripe_submit_repair_read(stripe, mirror,
						fs_info->sectorsize, true);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (scrub_bitmap_empty_error(stripe))
			goto out;
	}
out:
	error = scrub_bitmap_read_error(stripe);
	/*
	 * Submit the repaired sectors. For zoned case, we cannot do repair
	 * in-place, but queue the bg to be relocated.
	 */
	bitmap_andnot(&repaired, &errors.init_error_bitmap, &error,
		      stripe->nr_sectors);
	if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
		if (btrfs_is_zoned(fs_info)) {
			btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
		} else {
			scrub_write_sectors(sctx, stripe, repaired, false);
			wait_scrub_stripe_io(stripe);
		}
	}

	scrub_stripe_report_errors(sctx, stripe, &errors);
	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
	wake_up(&stripe->repair_wait);
}

static void scrub_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	int num_sectors;
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);
	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;
	num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;

	if (bbio->bio.bi_status) {
		scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors);
		scrub_bitmap_set_error(stripe, sector_nr, num_sectors);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}

static void scrub_write_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		unsigned long flags;

		spin_lock_irqsave(&stripe->write_error_lock, flags);
		bitmap_set(&stripe->write_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
		for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
			btrfs_dev_stat_inc_and_print(stripe->dev,
						     BTRFS_DEV_STAT_WRITE_ERRS);
	}
	bio_put(&bbio->bio);

	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

static void scrub_submit_write_bio(struct scrub_ctx *sctx,
				   struct scrub_stripe *stripe,
				   struct btrfs_bio *bbio, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u32 bio_len = bbio->bio.bi_iter.bi_size;
	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
		      stripe->logical;

	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
	atomic_inc(&stripe->pending_io);
	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
	if (!btrfs_is_zoned(fs_info))
		return;
	/*
	 * For zoned writeback, queue depth must be 1, thus we must wait for
	 * the write to finish before the next write.
	 */
	wait_scrub_stripe_io(stripe);

	/*
	 * And also need to update the write pointer if write finished
	 * successfully.
	 */
	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
		      &stripe->write_error_bitmap))
		sctx->write_pointer += bio_len;
}

/*
 * Submit the write bio(s) for the sectors specified by @write_bitmap.
 *
 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
 *
 * - Only needs logical bytenr and mirror_num
 *   Just like the scrub read path
 *
 * - Would only result in writes to the specified mirror
 *   Unlike the regular writeback path, which would write back to all stripes
 *
 * - Handle dev-replace and read-repair writeback differently
 */
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	int sector_nr;

	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
		/* We should only writeback sectors covered by an extent. */
		ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr));

		/* Cannot merge with previous sector, submit the current one. */
		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
			scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
			bbio = NULL;
		}
		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE,
					stripe->logical + (sector_nr << fs_info->sectorsize_bits),
					scrub_write_endio, stripe);
		scrub_bio_add_sector(bbio, stripe, sector_nr);
	}
	if (bbio)
		scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
}

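/*
 * Illustrative numbers for the throttling below: with scrub_speed_max set
 * to 32MiB/s, div = clamp(32M / 16M, 1, 64) = 2, so each 500ms interval
 * allows roughly 16MiB to be submitted before sleeping until the deadline.
 */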
/*
 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
 */
static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
				  unsigned int bio_size)
{
	const int time_slice = 1000;
	s64 delta;
	ktime_t now;
	u32 div;
	u64 bwlimit;

	bwlimit = READ_ONCE(device->scrub_speed_max);
	if (bwlimit == 0)
		return;

	/*
	 * Slice is divided into intervals when the IO is submitted, adjust by
	 * bwlimit and maximum of 64 intervals.
	 */
	div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);

	/* Start new epoch, set deadline */
	now = ktime_get();
	if (sctx->throttle_deadline == 0) {
		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
		sctx->throttle_sent = 0;
	}

	/* Still in the time to send? */
	if (ktime_before(now, sctx->throttle_deadline)) {
		/* If current bio is within the limit, send it */
		sctx->throttle_sent += bio_size;
		if (sctx->throttle_sent <= div_u64(bwlimit, div))
			return;

		/* We're over the limit, sleep until the rest of the slice */
		delta = ktime_ms_delta(sctx->throttle_deadline, now);
	} else {
		/* New request after deadline, start new epoch */
		delta = 0;
	}

	if (delta) {
		long timeout;

		timeout = div_u64(delta * HZ, 1000);
		schedule_timeout_interruptible(timeout);
	}

	/* Next call will start the deadline period */
	sctx->throttle_deadline = 0;
}

/*
 * Given a physical address, this will calculate its
 * logical offset. If this is a parity stripe, it will return
 * the most left data stripe's logical offset.
 *
 * Return 0 if it is a data stripe, 1 means parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct btrfs_chunk_map *map, u64 *offset,
				   u64 *stripe_start)
{
	int i;
	int j = 0;
	u64 last_offset;
	const int data_stripes = nr_data_stripes(map);

	last_offset = (physical - map->stripes[num].physical) * data_stripes;
	if (stripe_start)
		*stripe_start = last_offset;

	*offset = last_offset;
	for (i = 0; i < data_stripes; i++) {
		u32 stripe_nr;
		u32 stripe_index;
		u32 rot;

		*offset = last_offset + btrfs_stripe_nr_to_offset(i);

		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

		/* Work out the disk rotation on this stripe-set */
		rot = stripe_nr % map->num_stripes;
		/* calculate which stripe this data locates */
		rot += i;
		stripe_index = rot % map->num_stripes;
		if (stripe_index == num)
			return 0;
		if (stripe_index < num)
			j++;
	}
	*offset = last_offset + btrfs_stripe_nr_to_offset(j);
	return 1;
}

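/*
 * Example for the helper above (illustration only, 3-device RAID5 with 2
 * data stripes): physical offset 0 on stripe 0 is a data stripe at chunk
 * logical offset 0 (return 0), while physical offset 0 on stripe 2 holds
 * parity for that rotation (return 1).
 */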
/*
 * Return 0 if the extent item range covers any byte of the range.
 * Return <0 if the extent item is before @search_start.
 * Return >0 if the extent item is after @search_start + @search_len.
 */
static int compare_extent_item_range(struct btrfs_path *path,
				     u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
	u64 len;
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
	       key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type);
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		len = fs_info->nodesize;
	else
		len = key.offset;

	if (key.objectid + len <= search_start)
		return -1;
	if (key.objectid >= search_start + search_len)
		return 1;
	return 0;
}

/*
 * Locate one extent item which covers any byte in range
 * [@search_start, @search_start + @search_length)
 *
 * If the path is not initialized, we will initialize the search by doing
 * a btrfs_search_slot().
 * If the path is already initialized, we will use the path as the initial
 * slot, to avoid duplicated btrfs_search_slot() calls.
 *
 * NOTE: If an extent item starts before @search_start, we will still
 * return the extent item. This is for data extent crossing stripe boundary.
 *
 * Return 0 if we found such extent item, and @path will point to the extent item.
 * Return >0 if no such extent item can be found, and @path will be released.
 * Return <0 if hit fatal error, and @path will be released.
 */
static int find_first_extent_item(struct btrfs_root *extent_root,
				  struct btrfs_path *path,
				  u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	struct btrfs_key key;
	int ret;

	/* Continue using the existing path */
	if (path->nodes[0])
		goto search_forward;

	key.objectid = search_start;
	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (unlikely(ret == 0)) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		btrfs_release_path(path);
		return -EUCLEAN;
	}

	/*
	 * Here we intentionally pass 0 as @min_objectid, as there could be
	 * an extent item starting before @search_start.
	 */
	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret < 0)
		return ret;
	/*
	 * No matter whether we have found an extent item, the next loop will
	 * properly do every check on the key.
	 */
search_forward:
	while (true) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid >= search_start + search_len)
			break;
		if (key.type != BTRFS_METADATA_ITEM_KEY &&
		    key.type != BTRFS_EXTENT_ITEM_KEY)
			goto next;

		ret = compare_extent_item_range(path, search_start, search_len);
		if (ret == 0)
			return ret;
		if (ret > 0)
			break;
next:
		ret = btrfs_next_item(extent_root, path);
		if (ret) {
			/* Either no more items or a fatal error. */
			btrfs_release_path(path);
			return ret;
		}
	}
	btrfs_release_path(path);
	return 1;
}

static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
	       key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type);
	*extent_start_ret = key.objectid;
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		*size_ret = path->nodes[0]->fs_info->nodesize;
	else
		*size_ret = key.offset;
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
	*flags_ret = btrfs_extent_flags(path->nodes[0], ei);
	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
}

static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
					u64 physical, u64 physical_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int ret = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	mutex_lock(&sctx->wr_lock);
	if (sctx->write_pointer < physical_end) {
		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
						    physical,
						    sctx->write_pointer);
		if (ret)
			btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
	}
	mutex_unlock(&sctx->wr_lock);
	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

	return ret;
}

static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
				 struct scrub_stripe *stripe,
				 u64 extent_start, u64 extent_len,
				 u64 extent_flags, u64 extent_gen)
{
	for (u64 cur_logical = max(stripe->logical, extent_start);
	     cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
			       extent_start + extent_len);
	     cur_logical += fs_info->sectorsize) {
		const int nr_sector = (cur_logical - stripe->logical) >>
				      fs_info->sectorsize_bits;
		struct scrub_sector_verification *sector =
						&stripe->sectors[nr_sector];

		scrub_bitmap_set_bit_has_extent(stripe, nr_sector);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			scrub_bitmap_set_bit_is_metadata(stripe, nr_sector);
			sector->generation = extent_gen;
		}
	}
}

static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
	ASSERT(stripe->nr_sectors);
	bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors);
}

/*
 * Locate one stripe which has at least one extent in its range.
 *
 * Return 0 if found such stripe, and store its info into @stripe.
 * Return >0 if there is no such stripe in the specified range.
 * Return <0 for error.
 */
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
					struct btrfs_path *extent_path,
					struct btrfs_path *csum_path,
					struct btrfs_device *dev, u64 physical,
					int mirror_num, u64 logical_start,
					u32 logical_len,
					struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
	const u64 logical_end = logical_start + logical_len;
	u64 cur_logical = logical_start;
	u64 stripe_end;
	u64 extent_start;
	u64 extent_len;
	u64 extent_flags;
	u64 extent_gen;
	int ret;

	if (unlikely(!extent_root || !csum_root)) {
		btrfs_err(fs_info, "scrub: no valid extent or csum root found");
		return -EUCLEAN;
	}
	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
				   stripe->nr_sectors);
	scrub_stripe_reset_bitmaps(stripe);

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg),
	       "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu",
	       bg->start, logical_start, logical_end, btrfs_block_group_end(bg));

	ret = find_first_extent_item(extent_root, extent_path, logical_start,
				     logical_len);
	/* Either error or not found. */
	if (ret)
		return ret;
	get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
			&extent_gen);
	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		stripe->nr_meta_extents++;
	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
		stripe->nr_data_extents++;
	cur_logical = max(extent_start, cur_logical);

	/*
	 * Round down to stripe boundary.
	 *
	 * The extra calculation against bg->start is to handle block groups
	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
	 */
	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
			  bg->start;
	stripe->physical = physical + stripe->logical - logical_start;
	stripe->dev = dev;
	stripe->bg = bg;
	stripe->mirror_num = mirror_num;
	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;

	/* Fill the first extent info into stripe->sectors[] array. */
	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
			     extent_flags, extent_gen);
	cur_logical = extent_start + extent_len;

	/* Fill the extent info for the remaining sectors. */
	while (cur_logical <= stripe_end) {
		ret = find_first_extent_item(extent_root, extent_path, cur_logical,
					     stripe_end - cur_logical + 1);
		if (ret < 0)
			return ret;
		if (ret > 0) {
			ret = 0;
			break;
		}
		get_extent_info(extent_path, &extent_start, &extent_len,
				&extent_flags, &extent_gen);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			stripe->nr_meta_extents++;
		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
			stripe->nr_data_extents++;
		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
				     extent_flags, extent_gen);
		cur_logical = extent_start + extent_len;
	}

	/* Now fill the data csum. */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
		int sector_nr;
		unsigned long csum_bitmap = 0;

		/* Csum space should have already been allocated. */
		ASSERT(stripe->csums);

		/*
		 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
		 * should contain at most 16 sectors.
		 */
		ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);

		ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
						stripe->logical, stripe_end,
						stripe->csums, &csum_bitmap);
		if (ret < 0)
			return ret;
		if (ret > 0)
			ret = 0;

		for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
			stripe->sectors[sector_nr].csum = stripe->csums +
				sector_nr * fs_info->csum_size;
		}
	}
	set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);

	return ret;
}

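/* Reset a stripe so that it can be reused for another logical range. */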
*/ 1754 ASSERT(stripe->csums); 1755 1756 /* 1757 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN 1758 * should contain at most 16 sectors. 1759 */ 1760 ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 1761 1762 ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, 1763 stripe->logical, stripe_end, 1764 stripe->csums, &csum_bitmap); 1765 if (ret < 0) 1766 return ret; 1767 if (ret > 0) 1768 ret = 0; 1769 1770 for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { 1771 stripe->sectors[sector_nr].csum = stripe->csums + 1772 sector_nr * fs_info->csum_size; 1773 } 1774 } 1775 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 1776 1777 return ret; 1778 } 1779 1780 static void scrub_reset_stripe(struct scrub_stripe *stripe) 1781 { 1782 scrub_stripe_reset_bitmaps(stripe); 1783 1784 stripe->nr_meta_extents = 0; 1785 stripe->nr_data_extents = 0; 1786 stripe->state = 0; 1787 1788 for (int i = 0; i < stripe->nr_sectors; i++) { 1789 stripe->sectors[i].csum = NULL; 1790 stripe->sectors[i].generation = 0; 1791 } 1792 } 1793 1794 static u32 stripe_length(const struct scrub_stripe *stripe) 1795 { 1796 ASSERT(stripe->bg); 1797 1798 return min(BTRFS_STRIPE_LEN, 1799 stripe->bg->start + stripe->bg->length - stripe->logical); 1800 } 1801 1802 static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) 1803 { 1804 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1805 struct btrfs_bio *bbio = NULL; 1806 unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; 1807 const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); 1808 u64 stripe_len = BTRFS_STRIPE_LEN; 1809 int mirror = stripe->mirror_num; 1810 int i; 1811 1812 atomic_inc(&stripe->pending_io); 1813 1814 for_each_set_bit(i, &has_extent, stripe->nr_sectors) { 1815 /* We're beyond the chunk boundary, no need to read anymore. */ 1816 if (i >= nr_sectors) 1817 break; 1818 1819 /* The current sector cannot be merged, submit the bio. */ 1820 if (bbio && 1821 ((i > 0 && !test_bit(i - 1, &has_extent)) || 1822 bbio->bio.bi_iter.bi_size >= stripe_len)) { 1823 ASSERT(bbio->bio.bi_iter.bi_size); 1824 atomic_inc(&stripe->pending_io); 1825 btrfs_submit_bbio(bbio, mirror); 1826 bbio = NULL; 1827 } 1828 1829 if (!bbio) { 1830 struct btrfs_io_stripe io_stripe = {}; 1831 struct btrfs_io_context *bioc = NULL; 1832 const u64 logical = stripe->logical + 1833 (i << fs_info->sectorsize_bits); 1834 int ret; 1835 1836 io_stripe.rst_search_commit_root = true; 1837 stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; 1838 /* 1839 * For RST cases, we need to manually split the bbio to 1840 * follow the RST boundary. 1841 */ 1842 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 1843 &stripe_len, &bioc, &io_stripe, &mirror); 1844 btrfs_put_bioc(bioc); 1845 if (ret < 0) { 1846 if (ret != -ENODATA) { 1847 /* 1848 * Earlier btrfs_get_raid_extent_offset() 1849 * returned -ENODATA, which means there's 1850 * no entry for the corresponding range 1851 * in the stripe tree. But if it's in 1852 * the extent tree, then it's a preallocated 1853 * extent and not an error. 
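				 * So only the non-ENODATA failures are
				 * recorded as errors below, while the
				 * -ENODATA case just skips this sector.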
1854 */ 1855 scrub_bitmap_set_bit_io_error(stripe, i); 1856 scrub_bitmap_set_bit_error(stripe, i); 1857 } 1858 continue; 1859 } 1860 1861 bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, 1862 logical, scrub_read_endio, stripe); 1863 } 1864 1865 scrub_bio_add_sector(bbio, stripe, i); 1866 } 1867 1868 if (bbio) { 1869 ASSERT(bbio->bio.bi_iter.bi_size); 1870 atomic_inc(&stripe->pending_io); 1871 btrfs_submit_bbio(bbio, mirror); 1872 } 1873 1874 if (atomic_dec_and_test(&stripe->pending_io)) { 1875 wake_up(&stripe->io_wait); 1876 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); 1877 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); 1878 } 1879 } 1880 1881 static void scrub_submit_initial_read(struct scrub_ctx *sctx, 1882 struct scrub_stripe *stripe) 1883 { 1884 struct btrfs_fs_info *fs_info = sctx->fs_info; 1885 struct btrfs_bio *bbio; 1886 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 1887 unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; 1888 int mirror = stripe->mirror_num; 1889 1890 ASSERT(stripe->bg); 1891 ASSERT(stripe->mirror_num > 0); 1892 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); 1893 1894 if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { 1895 scrub_submit_extent_sector_read(stripe); 1896 return; 1897 } 1898 1899 bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, 1900 stripe->logical, scrub_read_endio, stripe); 1901 /* Read the whole range inside the chunk boundary. */ 1902 for (unsigned int cur = 0; cur < nr_sectors; cur++) 1903 scrub_bio_add_sector(bbio, stripe, cur); 1904 atomic_inc(&stripe->pending_io); 1905 1906 /* 1907 * For dev-replace, either user asks to avoid the source dev, or 1908 * the device is missing, we try the next mirror instead. 1909 */ 1910 if (sctx->is_dev_replace && 1911 (fs_info->dev_replace.cont_reading_from_srcdev_mode == 1912 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || 1913 !stripe->dev->bdev)) { 1914 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, 1915 stripe->bg->length); 1916 1917 mirror = calc_next_mirror(mirror, num_copies); 1918 } 1919 btrfs_submit_bbio(bbio, mirror); 1920 } 1921 1922 static bool stripe_has_metadata_error(struct scrub_stripe *stripe) 1923 { 1924 const unsigned long error = scrub_bitmap_read_error(stripe); 1925 int i; 1926 1927 for_each_set_bit(i, &error, stripe->nr_sectors) { 1928 if (scrub_bitmap_test_bit_is_metadata(stripe, i)) { 1929 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1930 1931 btrfs_err(fs_info, 1932 "scrub: stripe %llu has unrepaired metadata sector at logical %llu", 1933 stripe->logical, 1934 stripe->logical + (i << fs_info->sectorsize_bits)); 1935 return true; 1936 } 1937 } 1938 return false; 1939 } 1940 1941 static void submit_initial_group_read(struct scrub_ctx *sctx, 1942 unsigned int first_slot, 1943 unsigned int nr_stripes) 1944 { 1945 struct blk_plug plug; 1946 1947 ASSERT(first_slot < SCRUB_TOTAL_STRIPES); 1948 ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); 1949 1950 scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, 1951 btrfs_stripe_nr_to_offset(nr_stripes)); 1952 blk_start_plug(&plug); 1953 for (int i = 0; i < nr_stripes; i++) { 1954 struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; 1955 1956 /* Those stripes should be initialized. 
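	 * queue_scrub_stripe() only advances cur_stripe after
	 * scrub_find_fill_first_stripe() has set SCRUB_STRIPE_FLAG_INITIALIZED.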
*/ 1957 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); 1958 scrub_submit_initial_read(sctx, stripe); 1959 } 1960 blk_finish_plug(&plug); 1961 } 1962 1963 static int flush_scrub_stripes(struct scrub_ctx *sctx) 1964 { 1965 struct btrfs_fs_info *fs_info = sctx->fs_info; 1966 struct scrub_stripe *stripe; 1967 const int nr_stripes = sctx->cur_stripe; 1968 int ret = 0; 1969 1970 if (!nr_stripes) 1971 return 0; 1972 1973 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); 1974 1975 /* Submit the stripes which are populated but not submitted. */ 1976 if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { 1977 const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); 1978 1979 submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); 1980 } 1981 1982 for (int i = 0; i < nr_stripes; i++) { 1983 stripe = &sctx->stripes[i]; 1984 1985 wait_event(stripe->repair_wait, 1986 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); 1987 } 1988 1989 /* Submit for dev-replace. */ 1990 if (sctx->is_dev_replace) { 1991 /* 1992 * For dev-replace, if we know there is something wrong with 1993 * metadata, we should immediately abort. 1994 */ 1995 for (int i = 0; i < nr_stripes; i++) { 1996 if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) { 1997 ret = -EIO; 1998 goto out; 1999 } 2000 } 2001 for (int i = 0; i < nr_stripes; i++) { 2002 unsigned long good; 2003 unsigned long has_extent; 2004 unsigned long error; 2005 2006 stripe = &sctx->stripes[i]; 2007 2008 ASSERT(stripe->dev == fs_info->dev_replace.srcdev); 2009 2010 has_extent = scrub_bitmap_read_has_extent(stripe); 2011 error = scrub_bitmap_read_error(stripe); 2012 bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors); 2013 scrub_write_sectors(sctx, stripe, good, true); 2014 } 2015 } 2016 2017 /* Wait for the above writebacks to finish. */ 2018 for (int i = 0; i < nr_stripes; i++) { 2019 stripe = &sctx->stripes[i]; 2020 2021 wait_scrub_stripe_io(stripe); 2022 spin_lock(&sctx->stat_lock); 2023 sctx->stat.last_physical = stripe->physical + stripe_length(stripe); 2024 spin_unlock(&sctx->stat_lock); 2025 scrub_reset_stripe(stripe); 2026 } 2027 out: 2028 sctx->cur_stripe = 0; 2029 return ret; 2030 } 2031 2032 static void raid56_scrub_wait_endio(struct bio *bio) 2033 { 2034 complete(bio->bi_private); 2035 } 2036 2037 static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, 2038 struct btrfs_device *dev, int mirror_num, 2039 u64 logical, u32 length, u64 physical, 2040 u64 *found_logical_ret) 2041 { 2042 struct scrub_stripe *stripe; 2043 int ret; 2044 2045 /* 2046 * There should always be one slot left, as caller filling the last 2047 * slot should flush them all. 2048 */ 2049 ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); 2050 2051 /* @found_logical_ret must be specified. */ 2052 ASSERT(found_logical_ret); 2053 2054 stripe = &sctx->stripes[sctx->cur_stripe]; 2055 scrub_reset_stripe(stripe); 2056 ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, 2057 &sctx->csum_path, dev, physical, 2058 mirror_num, logical, length, stripe); 2059 /* Either >0 as no more extents or <0 for error. */ 2060 if (ret) 2061 return ret; 2062 *found_logical_ret = stripe->logical; 2063 sctx->cur_stripe++; 2064 2065 /* We filled one group, submit it. 
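	 * E.g. when cur_stripe reaches SCRUB_STRIPES_PER_GROUP, the slots
	 * [0, SCRUB_STRIPES_PER_GROUP) are submitted; the next full group
	 * submits the following SCRUB_STRIPES_PER_GROUP slots, and so on.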
*/ 2066 if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { 2067 const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; 2068 2069 submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); 2070 } 2071 2072 /* Last slot used, flush them all. */ 2073 if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) 2074 return flush_scrub_stripes(sctx); 2075 return 0; 2076 } 2077 2078 /* 2079 * Return 0 if we should not cancel the scrub. 2080 * Return <0 if we need to cancel the scrub, returned value will 2081 * indicate the reason: 2082 * - -ECANCELED - Being explicitly canceled through ioctl. 2083 * - -EINTR - Being interrupted by signal or fs/process freezing. 2084 */ 2085 static int should_cancel_scrub(const struct scrub_ctx *sctx) 2086 { 2087 struct btrfs_fs_info *fs_info = sctx->fs_info; 2088 2089 if (atomic_read(&fs_info->scrub_cancel_req) || 2090 atomic_read(&sctx->cancel_req)) 2091 return -ECANCELED; 2092 2093 /* 2094 * The user (e.g. fsfreeze command) or power management (PM) 2095 * suspend/hibernate can freeze the fs. And PM suspend/hibernate will 2096 * also freeze all user processes. 2097 * 2098 * A user process can only be frozen when it is in user space, thus we 2099 * have to cancel the run so that the process can return to the user 2100 * space. 2101 * 2102 * Furthermore we have to check both filesystem and process freezing, 2103 * as PM can be configured to freeze the filesystems before processes. 2104 * 2105 * If we only check fs freezing, then suspend without fs freezing 2106 * will timeout, as the process is still in kernel space. 2107 * 2108 * If we only check process freezing, then suspend with fs freezing 2109 * will timeout, as the running scrub will prevent the fs from being frozen. 2110 */ 2111 if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || 2112 freezing(current) || signal_pending(current)) 2113 return -EINTR; 2114 return 0; 2115 } 2116 2117 static int scrub_raid56_cached_parity(struct scrub_ctx *sctx, 2118 struct btrfs_device *scrub_dev, 2119 struct btrfs_chunk_map *map, 2120 u64 full_stripe_start, 2121 unsigned long *extent_bitmap) 2122 { 2123 DECLARE_COMPLETION_ONSTACK(io_done); 2124 struct btrfs_fs_info *fs_info = sctx->fs_info; 2125 struct btrfs_io_context *bioc = NULL; 2126 struct btrfs_raid_bio *rbio; 2127 struct bio bio; 2128 const int data_stripes = nr_data_stripes(map); 2129 u64 length = btrfs_stripe_nr_to_offset(data_stripes); 2130 int ret; 2131 2132 bio_init(&bio, NULL, NULL, 0, REQ_OP_READ); 2133 bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; 2134 bio.bi_private = &io_done; 2135 bio.bi_end_io = raid56_scrub_wait_endio; 2136 2137 btrfs_bio_counter_inc_blocked(fs_info); 2138 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, 2139 &length, &bioc, NULL, NULL); 2140 if (ret < 0) 2141 goto out; 2142 /* For RAID56 write there must be an @bioc allocated. */ 2143 ASSERT(bioc); 2144 rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap, 2145 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 2146 btrfs_put_bioc(bioc); 2147 if (!rbio) { 2148 ret = -ENOMEM; 2149 goto out; 2150 } 2151 /* Use the recovered stripes as cache to avoid read them from disk again. 
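	 * The data stripes were already read and repaired by the caller, so
	 * their folios are handed to the rbio via
	 * raid56_parity_cache_data_folios() instead of being read from disk
	 * once more.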
*/ 2152 for (int i = 0; i < data_stripes; i++) { 2153 struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; 2154 2155 raid56_parity_cache_data_folios(rbio, stripe->folios, 2156 full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); 2157 } 2158 raid56_parity_submit_scrub_rbio(rbio); 2159 wait_for_completion_io(&io_done); 2160 ret = blk_status_to_errno(bio.bi_status); 2161 out: 2162 btrfs_bio_counter_dec(fs_info); 2163 bio_uninit(&bio); 2164 return ret; 2165 } 2166 2167 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, 2168 struct btrfs_device *scrub_dev, 2169 struct btrfs_block_group *bg, 2170 struct btrfs_chunk_map *map, 2171 u64 full_stripe_start) 2172 { 2173 struct btrfs_fs_info *fs_info = sctx->fs_info; 2174 BTRFS_PATH_AUTO_RELEASE(extent_path); 2175 BTRFS_PATH_AUTO_RELEASE(csum_path); 2176 struct scrub_stripe *stripe; 2177 bool all_empty = true; 2178 const int data_stripes = nr_data_stripes(map); 2179 unsigned long extent_bitmap = 0; 2180 int ret; 2181 2182 ASSERT(sctx->raid56_data_stripes); 2183 2184 ret = should_cancel_scrub(sctx); 2185 if (ret < 0) 2186 return ret; 2187 2188 if (atomic_read(&fs_info->scrub_pause_req)) 2189 scrub_blocked_if_needed(fs_info); 2190 2191 spin_lock(&bg->lock); 2192 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2193 spin_unlock(&bg->lock); 2194 return 0; 2195 } 2196 spin_unlock(&bg->lock); 2197 2198 /* 2199 * For data stripe search, we cannot reuse the same extent/csum paths, 2200 * as the data stripe bytenr may be smaller than previous extent. Thus 2201 * we have to use our own extent/csum paths. 2202 */ 2203 extent_path.search_commit_root = true; 2204 extent_path.skip_locking = true; 2205 csum_path.search_commit_root = true; 2206 csum_path.skip_locking = true; 2207 2208 for (int i = 0; i < data_stripes; i++) { 2209 int stripe_index; 2210 int rot; 2211 u64 physical; 2212 2213 stripe = &sctx->raid56_data_stripes[i]; 2214 rot = div_u64(full_stripe_start - bg->start, 2215 data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; 2216 stripe_index = (i + rot) % map->num_stripes; 2217 physical = map->stripes[stripe_index].physical + 2218 btrfs_stripe_nr_to_offset(rot); 2219 2220 scrub_reset_stripe(stripe); 2221 set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); 2222 ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, 2223 map->stripes[stripe_index].dev, physical, 1, 2224 full_stripe_start + btrfs_stripe_nr_to_offset(i), 2225 BTRFS_STRIPE_LEN, stripe); 2226 if (ret < 0) 2227 return ret; 2228 /* 2229 * No extent in this data stripe, need to manually mark them 2230 * initialized to make later read submission happy. 2231 */ 2232 if (ret > 0) { 2233 stripe->logical = full_stripe_start + 2234 btrfs_stripe_nr_to_offset(i); 2235 stripe->dev = map->stripes[stripe_index].dev; 2236 stripe->mirror_num = 1; 2237 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 2238 } 2239 } 2240 2241 /* Check if all data stripes are empty. 
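	 * If no data stripe contains any extent, there is nothing to verify
	 * and the P/Q stripes do not need to be regenerated either.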
	 */
2242 	for (int i = 0; i < data_stripes; i++) {
2243 		stripe = &sctx->raid56_data_stripes[i];
2244 		if (!scrub_bitmap_empty_has_extent(stripe)) {
2245 			all_empty = false;
2246 			break;
2247 		}
2248 	}
2249 	if (all_empty)
2250 		return 0;
2251 
2252 	for (int i = 0; i < data_stripes; i++) {
2253 		stripe = &sctx->raid56_data_stripes[i];
2254 		scrub_submit_initial_read(sctx, stripe);
2255 	}
2256 	for (int i = 0; i < data_stripes; i++) {
2257 		stripe = &sctx->raid56_data_stripes[i];
2258 
2259 		wait_event(stripe->repair_wait,
2260 			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
2261 	}
2262 	/* For now, no zoned support for RAID56. */
2263 	ASSERT(!btrfs_is_zoned(sctx->fs_info));
2264 
2265 	/*
2266 	 * Now all data stripes are properly verified. Check if we have any
2267 	 * unrepaired ones; if so, abort immediately or we could further corrupt
2268 	 * the P/Q stripes.
2269 	 *
2270 	 * During the loop, also populate extent_bitmap.
2271 	 */
2272 	for (int i = 0; i < data_stripes; i++) {
2273 		unsigned long error;
2274 		unsigned long has_extent;
2275 
2276 		stripe = &sctx->raid56_data_stripes[i];
2277 
2278 		error = scrub_bitmap_read_error(stripe);
2279 		has_extent = scrub_bitmap_read_has_extent(stripe);
2280 
2281 		/*
2282 		 * We should only check the errors where there is an extent,
2283 		 * as we may hit an empty data stripe whose device is missing.
2284 		 */
2285 		bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
2286 		if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) {
2287 			btrfs_err(fs_info,
2288 	"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
2289 				  full_stripe_start, i, stripe->nr_sectors,
2290 				  &error);
2291 			return -EIO;
2292 		}
2293 		bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent,
2294 			  stripe->nr_sectors);
2295 	}
2296 
2297 	/* Now we can check and regenerate the P/Q stripe. */
2298 	return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start,
2299 					  &extent_bitmap);
2300 }
2301 
2302 /*
2303  * Scrub one range which can only have a simple mirror based profile.
2304  * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
2305  * RAID0/RAID10).
2306  *
2307  * Since we may need to handle a subset of a block group, we need the
2308  * @logical_start and @logical_length parameters.
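 *
 * For example, for the simple mirrored profiles scrub_stripe() covers the
 * whole block group with a single call, roughly:
 *
 *   scrub_simple_mirror(sctx, bg, bg->start, bg->length,
 *                       scrub_dev, map->stripes[stripe_index].physical,
 *                       stripe_index + 1);
 *
 * while the RAID0/RAID10 and RAID56 callers pass one BTRFS_STRIPE_LEN sized
 * range at a time.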
2309 */ 2310 static int scrub_simple_mirror(struct scrub_ctx *sctx, 2311 struct btrfs_block_group *bg, 2312 u64 logical_start, u64 logical_length, 2313 struct btrfs_device *device, 2314 u64 physical, int mirror_num) 2315 { 2316 struct btrfs_fs_info *fs_info = sctx->fs_info; 2317 const u64 logical_end = logical_start + logical_length; 2318 u64 cur_logical = logical_start; 2319 int ret = 0; 2320 2321 /* The range must be inside the bg */ 2322 ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg)); 2323 2324 /* Go through each extent items inside the logical range */ 2325 while (cur_logical < logical_end) { 2326 u64 found_logical = U64_MAX; 2327 u64 cur_physical = physical + cur_logical - logical_start; 2328 2329 ret = should_cancel_scrub(sctx); 2330 if (ret < 0) 2331 break; 2332 2333 if (atomic_read(&fs_info->scrub_pause_req)) 2334 scrub_blocked_if_needed(fs_info); 2335 2336 spin_lock(&bg->lock); 2337 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2338 spin_unlock(&bg->lock); 2339 ret = 0; 2340 break; 2341 } 2342 spin_unlock(&bg->lock); 2343 2344 ret = queue_scrub_stripe(sctx, bg, device, mirror_num, 2345 cur_logical, logical_end - cur_logical, 2346 cur_physical, &found_logical); 2347 if (ret > 0) { 2348 /* No more extent, just update the accounting */ 2349 spin_lock(&sctx->stat_lock); 2350 sctx->stat.last_physical = physical + logical_length; 2351 spin_unlock(&sctx->stat_lock); 2352 ret = 0; 2353 break; 2354 } 2355 if (ret < 0) 2356 break; 2357 2358 /* queue_scrub_stripe() returned 0, @found_logical must be updated. */ 2359 ASSERT(found_logical != U64_MAX); 2360 cur_logical = found_logical + BTRFS_STRIPE_LEN; 2361 2362 /* Don't hold CPU for too long time */ 2363 cond_resched(); 2364 } 2365 return ret; 2366 } 2367 2368 /* Calculate the full stripe length for simple stripe based profiles */ 2369 static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) 2370 { 2371 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2372 BTRFS_BLOCK_GROUP_RAID10)); 2373 2374 return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); 2375 } 2376 2377 /* Get the logical bytenr for the stripe */ 2378 static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, 2379 struct btrfs_block_group *bg, 2380 int stripe_index) 2381 { 2382 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2383 BTRFS_BLOCK_GROUP_RAID10)); 2384 ASSERT(stripe_index < map->num_stripes); 2385 2386 /* 2387 * (stripe_index / sub_stripes) gives how many data stripes we need to 2388 * skip. 2389 */ 2390 return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + 2391 bg->start; 2392 } 2393 2394 /* Get the mirror number for the stripe */ 2395 static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) 2396 { 2397 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2398 BTRFS_BLOCK_GROUP_RAID10)); 2399 ASSERT(stripe_index < map->num_stripes); 2400 2401 /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... 
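	 * The returned mirror_num is stripe_index % sub_stripes + 1, i.e.
	 * always 1 for RAID0 (sub_stripes == 1) and 1,2,1,2,... for RAID10
	 * (sub_stripes == 2).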
*/ 2402 return stripe_index % map->sub_stripes + 1; 2403 } 2404 2405 static int scrub_simple_stripe(struct scrub_ctx *sctx, 2406 struct btrfs_block_group *bg, 2407 struct btrfs_chunk_map *map, 2408 struct btrfs_device *device, 2409 int stripe_index) 2410 { 2411 const u64 logical_increment = simple_stripe_full_stripe_len(map); 2412 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); 2413 const u64 orig_physical = map->stripes[stripe_index].physical; 2414 const u64 end = btrfs_block_group_end(bg); 2415 const int mirror_num = simple_stripe_mirror_num(map, stripe_index); 2416 u64 cur_logical = orig_logical; 2417 u64 cur_physical = orig_physical; 2418 int ret = 0; 2419 2420 while (cur_logical < end) { 2421 /* 2422 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is 2423 * just RAID1, so we can reuse scrub_simple_mirror() to scrub 2424 * this stripe. 2425 */ 2426 ret = scrub_simple_mirror(sctx, bg, cur_logical, 2427 BTRFS_STRIPE_LEN, device, cur_physical, 2428 mirror_num); 2429 if (ret) 2430 return ret; 2431 /* Skip to next stripe which belongs to the target device */ 2432 cur_logical += logical_increment; 2433 /* For physical offset, we just go to next stripe */ 2434 cur_physical += BTRFS_STRIPE_LEN; 2435 } 2436 return ret; 2437 } 2438 2439 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2440 struct btrfs_block_group *bg, 2441 struct btrfs_chunk_map *map, 2442 struct btrfs_device *scrub_dev, 2443 int stripe_index) 2444 { 2445 struct btrfs_fs_info *fs_info = sctx->fs_info; 2446 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; 2447 const u64 chunk_logical = bg->start; 2448 int ret; 2449 int ret2; 2450 u64 physical = map->stripes[stripe_index].physical; 2451 const u64 dev_stripe_len = btrfs_calc_stripe_length(map); 2452 const u64 physical_end = physical + dev_stripe_len; 2453 u64 logical; 2454 u64 logic_end; 2455 /* The logical increment after finishing one stripe */ 2456 u64 increment; 2457 /* Offset inside the chunk */ 2458 u64 offset; 2459 u64 stripe_logical; 2460 2461 /* Extent_path should be released by now. */ 2462 ASSERT(sctx->extent_path.nodes[0] == NULL); 2463 2464 scrub_blocked_if_needed(fs_info); 2465 2466 if (sctx->is_dev_replace && 2467 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { 2468 mutex_lock(&sctx->wr_lock); 2469 sctx->write_pointer = physical; 2470 mutex_unlock(&sctx->wr_lock); 2471 } 2472 2473 /* Prepare the extra data stripes used by RAID56. */ 2474 if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { 2475 ASSERT(sctx->raid56_data_stripes == NULL); 2476 2477 sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map), 2478 sizeof(struct scrub_stripe), 2479 GFP_KERNEL); 2480 if (!sctx->raid56_data_stripes) { 2481 ret = -ENOMEM; 2482 goto out; 2483 } 2484 for (int i = 0; i < nr_data_stripes(map); i++) { 2485 ret = init_scrub_stripe(fs_info, 2486 &sctx->raid56_data_stripes[i]); 2487 if (ret < 0) 2488 goto out; 2489 sctx->raid56_data_stripes[i].bg = bg; 2490 sctx->raid56_data_stripes[i].sctx = sctx; 2491 } 2492 } 2493 /* 2494 * There used to be a big double loop to handle all profiles using the 2495 * same routine, which grows larger and more gross over time. 2496 * 2497 * So here we handle each profile differently, so simpler profiles 2498 * have simpler scrubbing function. 
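	 *
	 * Roughly:
	 * - SINGLE/DUP/RAID1/RAID1C*: one scrub_simple_mirror() call for the
	 *   whole block group
	 * - RAID0/RAID10: scrub_simple_stripe()
	 * - RAID5/RAID6: the per-stripe rotation loop below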
2499 	 */
2500 	if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
2501 			 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2502 		/*
2503 		 * The above check rules out all complex profiles; the remaining
2504 		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
2505 		 * mirrored duplication without striping.
2506 		 *
2507 		 * Only @physical and @mirror_num need to be calculated using
2508 		 * @stripe_index.
2509 		 */
2510 		ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
2511 				scrub_dev, map->stripes[stripe_index].physical,
2512 				stripe_index + 1);
2513 		offset = 0;
2514 		goto out;
2515 	}
2516 	if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
2517 		ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
2518 		offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
2519 		goto out;
2520 	}
2521 
2522 	/* Only RAID56 goes through the old code */
2523 	ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
2524 	ret = 0;
2525 
2526 	/* Calculate the logical end of the stripe */
2527 	get_raid56_logic_offset(physical_end, stripe_index,
2528 				map, &logic_end, NULL);
2529 	logic_end += chunk_logical;
2530 
2531 	/* Initialize @offset in case we need to go to the out: label */
2532 	get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
2533 	increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
2534 
2535 	/*
2536 	 * Due to the rotation, for RAID56 it's better to iterate the stripes
2537 	 * using their physical offsets.
2538 	 */
2539 	while (physical < physical_end) {
2540 		ret = get_raid56_logic_offset(physical, stripe_index, map,
2541 					      &logical, &stripe_logical);
2542 		logical += chunk_logical;
2543 		if (ret) {
2544 			/* It is a parity stripe */
2545 			stripe_logical += chunk_logical;
2546 			ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
2547 							 map, stripe_logical);
2548 			spin_lock(&sctx->stat_lock);
2549 			sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
2550 						       physical_end);
2551 			spin_unlock(&sctx->stat_lock);
2552 			if (ret)
2553 				goto out;
2554 			goto next;
2555 		}
2556 
2557 		/*
2558 		 * Now we're at a data stripe, scrub each extent in the range.
2559 		 *
2560 		 * At this stage, if we ignore the repair part, inside each data
2561 		 * stripe it is no different from the SINGLE profile.
2562 		 * We can reuse scrub_simple_mirror() here, as the repair part
2563 		 * is still based on @mirror_num.
2564 		 */
2565 		ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
2566 					  scrub_dev, physical, 1);
2567 		if (ret < 0)
2568 			goto out;
2569 next:
2570 		logical += increment;
2571 		physical += BTRFS_STRIPE_LEN;
2572 		spin_lock(&sctx->stat_lock);
2573 		sctx->stat.last_physical = physical;
2574 		spin_unlock(&sctx->stat_lock);
2575 	}
2576 out:
2577 	ret2 = flush_scrub_stripes(sctx);
2578 	if (!ret)
2579 		ret = ret2;
2580 	btrfs_release_path(&sctx->extent_path);
2581 	btrfs_release_path(&sctx->csum_path);
2582 
2583 	if (sctx->raid56_data_stripes) {
2584 		for (int i = 0; i < nr_data_stripes(map); i++)
2585 			release_scrub_stripe(&sctx->raid56_data_stripes[i]);
2586 		kfree(sctx->raid56_data_stripes);
2587 		sctx->raid56_data_stripes = NULL;
2588 	}
2589 
2590 	if (sctx->is_dev_replace && ret >= 0) {
2591 		ret2 = sync_write_pointer_for_zoned(sctx,
2592 				chunk_logical + offset,
2593 				map->stripes[stripe_index].physical,
2594 				physical_end);
2595 		if (ret2)
2596 			ret = ret2;
2597 	}
2598 
2599 	return ret < 0 ?
ret : 0; 2600 } 2601 2602 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 2603 struct btrfs_block_group *bg, 2604 struct btrfs_device *scrub_dev, 2605 u64 dev_offset, 2606 u64 dev_extent_len) 2607 { 2608 struct btrfs_fs_info *fs_info = sctx->fs_info; 2609 struct btrfs_chunk_map *map; 2610 int i; 2611 int ret = 0; 2612 2613 map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); 2614 if (!map) { 2615 /* 2616 * Might have been an unused block group deleted by the cleaner 2617 * kthread or relocation. 2618 */ 2619 spin_lock(&bg->lock); 2620 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) 2621 ret = -EINVAL; 2622 spin_unlock(&bg->lock); 2623 2624 return ret; 2625 } 2626 if (map->start != bg->start) 2627 goto out; 2628 if (map->chunk_len < dev_extent_len) 2629 goto out; 2630 2631 for (i = 0; i < map->num_stripes; ++i) { 2632 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 2633 map->stripes[i].physical == dev_offset) { 2634 ret = scrub_stripe(sctx, bg, map, scrub_dev, i); 2635 if (ret) 2636 goto out; 2637 } 2638 } 2639 out: 2640 btrfs_free_chunk_map(map); 2641 2642 return ret; 2643 } 2644 2645 static int finish_extent_writes_for_zoned(struct btrfs_root *root, 2646 struct btrfs_block_group *cache) 2647 { 2648 struct btrfs_fs_info *fs_info = cache->fs_info; 2649 2650 if (!btrfs_is_zoned(fs_info)) 2651 return 0; 2652 2653 btrfs_wait_block_group_reservations(cache); 2654 btrfs_wait_nocow_writers(cache); 2655 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); 2656 2657 return btrfs_commit_current_transaction(root); 2658 } 2659 2660 static noinline_for_stack 2661 int scrub_enumerate_chunks(struct scrub_ctx *sctx, 2662 struct btrfs_device *scrub_dev, u64 start, u64 end) 2663 { 2664 struct btrfs_dev_extent *dev_extent = NULL; 2665 BTRFS_PATH_AUTO_FREE(path); 2666 struct btrfs_fs_info *fs_info = sctx->fs_info; 2667 struct btrfs_root *root = fs_info->dev_root; 2668 u64 chunk_offset; 2669 int ret = 0; 2670 int ro_set; 2671 int slot; 2672 struct extent_buffer *l; 2673 struct btrfs_key key; 2674 struct btrfs_key found_key; 2675 struct btrfs_block_group *cache; 2676 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 2677 2678 path = btrfs_alloc_path(); 2679 if (!path) 2680 return -ENOMEM; 2681 2682 path->reada = READA_FORWARD; 2683 path->search_commit_root = true; 2684 path->skip_locking = true; 2685 2686 key.objectid = scrub_dev->devid; 2687 key.type = BTRFS_DEV_EXTENT_KEY; 2688 key.offset = 0ull; 2689 2690 while (1) { 2691 u64 dev_extent_len; 2692 2693 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2694 if (ret < 0) 2695 break; 2696 if (ret > 0) { 2697 if (path->slots[0] >= 2698 btrfs_header_nritems(path->nodes[0])) { 2699 ret = btrfs_next_leaf(root, path); 2700 if (ret < 0) 2701 break; 2702 if (ret > 0) { 2703 ret = 0; 2704 break; 2705 } 2706 } else { 2707 ret = 0; 2708 } 2709 } 2710 2711 l = path->nodes[0]; 2712 slot = path->slots[0]; 2713 2714 btrfs_item_key_to_cpu(l, &found_key, slot); 2715 2716 if (found_key.objectid != scrub_dev->devid) 2717 break; 2718 2719 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 2720 break; 2721 2722 if (found_key.offset >= end) 2723 break; 2724 2725 if (found_key.offset < key.offset) 2726 break; 2727 2728 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2729 dev_extent_len = btrfs_dev_extent_length(l, dev_extent); 2730 2731 if (found_key.offset + dev_extent_len <= start) 2732 goto skip; 2733 2734 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2735 2736 /* 2737 * get a reference on the 
corresponding block group to prevent 2738 * the chunk from going away while we scrub it 2739 */ 2740 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2741 2742 /* some chunks are removed but not committed to disk yet, 2743 * continue scrubbing */ 2744 if (!cache) 2745 goto skip; 2746 2747 ASSERT(cache->start <= chunk_offset); 2748 /* 2749 * We are using the commit root to search for device extents, so 2750 * that means we could have found a device extent item from a 2751 * block group that was deleted in the current transaction. The 2752 * logical start offset of the deleted block group, stored at 2753 * @chunk_offset, might be part of the logical address range of 2754 * a new block group (which uses different physical extents). 2755 * In this case btrfs_lookup_block_group() has returned the new 2756 * block group, and its start address is less than @chunk_offset. 2757 * 2758 * We skip such new block groups, because it's pointless to 2759 * process them, as we won't find their extents because we search 2760 * for them using the commit root of the extent tree. For a device 2761 * replace it's also fine to skip it, we won't miss copying them 2762 * to the target device because we have the write duplication 2763 * setup through the regular write path (by btrfs_map_block()), 2764 * and we have committed a transaction when we started the device 2765 * replace, right after setting up the device replace state. 2766 */ 2767 if (cache->start < chunk_offset) { 2768 btrfs_put_block_group(cache); 2769 goto skip; 2770 } 2771 2772 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { 2773 if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { 2774 btrfs_put_block_group(cache); 2775 goto skip; 2776 } 2777 } 2778 2779 /* 2780 * Make sure that while we are scrubbing the corresponding block 2781 * group doesn't get its logical address and its device extents 2782 * reused for another block group, which can possibly be of a 2783 * different type and different profile. We do this to prevent 2784 * false error detections and crashes due to bogus attempts to 2785 * repair extents. 2786 */ 2787 spin_lock(&cache->lock); 2788 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { 2789 spin_unlock(&cache->lock); 2790 btrfs_put_block_group(cache); 2791 goto skip; 2792 } 2793 btrfs_freeze_block_group(cache); 2794 spin_unlock(&cache->lock); 2795 2796 /* 2797 * we need call btrfs_inc_block_group_ro() with scrubs_paused, 2798 * to avoid deadlock caused by: 2799 * btrfs_inc_block_group_ro() 2800 * -> btrfs_wait_for_commit() 2801 * -> btrfs_commit_transaction() 2802 * -> btrfs_scrub_pause() 2803 */ 2804 scrub_pause_on(fs_info); 2805 2806 /* 2807 * Don't do chunk preallocation for scrub. 2808 * 2809 * This is especially important for SYSTEM bgs, or we can hit 2810 * -EFBIG from btrfs_finish_chunk_alloc() like: 2811 * 1. The only SYSTEM bg is marked RO. 2812 * Since SYSTEM bg is small, that's pretty common. 2813 * 2. New SYSTEM bg will be allocated 2814 * Due to regular version will allocate new chunk. 2815 * 3. New SYSTEM bg is empty and will get cleaned up 2816 * Before cleanup really happens, it's marked RO again. 2817 * 4. Empty SYSTEM bg get scrubbed 2818 * We go back to 2. 
2819 * 2820 * This can easily boost the amount of SYSTEM chunks if cleaner 2821 * thread can't be triggered fast enough, and use up all space 2822 * of btrfs_super_block::sys_chunk_array 2823 * 2824 * While for dev replace, we need to try our best to mark block 2825 * group RO, to prevent race between: 2826 * - Write duplication 2827 * Contains latest data 2828 * - Scrub copy 2829 * Contains data from commit tree 2830 * 2831 * If target block group is not marked RO, nocow writes can 2832 * be overwritten by scrub copy, causing data corruption. 2833 * So for dev-replace, it's not allowed to continue if a block 2834 * group is not RO. 2835 */ 2836 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); 2837 if (!ret && sctx->is_dev_replace) { 2838 ret = finish_extent_writes_for_zoned(root, cache); 2839 if (ret) { 2840 btrfs_dec_block_group_ro(cache); 2841 scrub_pause_off(fs_info); 2842 btrfs_put_block_group(cache); 2843 break; 2844 } 2845 } 2846 2847 if (ret == 0) { 2848 ro_set = 1; 2849 } else if (ret == -ENOSPC && !sctx->is_dev_replace && 2850 !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { 2851 /* 2852 * btrfs_inc_block_group_ro return -ENOSPC when it 2853 * failed in creating new chunk for metadata. 2854 * It is not a problem for scrub, because 2855 * metadata are always cowed, and our scrub paused 2856 * commit_transactions. 2857 * 2858 * For RAID56 chunks, we have to mark them read-only 2859 * for scrub, as later we would use our own cache 2860 * out of RAID56 realm. 2861 * Thus we want the RAID56 bg to be marked RO to 2862 * prevent RMW from screwing up out cache. 2863 */ 2864 ro_set = 0; 2865 } else if (ret == -ETXTBSY) { 2866 btrfs_warn(fs_info, 2867 "scrub: skipping scrub of block group %llu due to active swapfile", 2868 cache->start); 2869 scrub_pause_off(fs_info); 2870 ret = 0; 2871 goto skip_unfreeze; 2872 } else { 2873 btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", 2874 ret); 2875 btrfs_unfreeze_block_group(cache); 2876 btrfs_put_block_group(cache); 2877 scrub_pause_off(fs_info); 2878 break; 2879 } 2880 2881 /* 2882 * Now the target block is marked RO, wait for nocow writes to 2883 * finish before dev-replace. 2884 * COW is fine, as COW never overwrites extents in commit tree. 2885 */ 2886 if (sctx->is_dev_replace) { 2887 btrfs_wait_nocow_writers(cache); 2888 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); 2889 } 2890 2891 scrub_pause_off(fs_info); 2892 down_write(&dev_replace->rwsem); 2893 dev_replace->cursor_right = found_key.offset + dev_extent_len; 2894 dev_replace->cursor_left = found_key.offset; 2895 dev_replace->item_needs_writeback = 1; 2896 up_write(&dev_replace->rwsem); 2897 2898 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, 2899 dev_extent_len); 2900 if (sctx->is_dev_replace && 2901 !btrfs_finish_block_group_to_copy(dev_replace->srcdev, 2902 cache, found_key.offset)) 2903 ro_set = 0; 2904 2905 down_write(&dev_replace->rwsem); 2906 dev_replace->cursor_left = dev_replace->cursor_right; 2907 dev_replace->item_needs_writeback = 1; 2908 up_write(&dev_replace->rwsem); 2909 2910 if (ro_set) 2911 btrfs_dec_block_group_ro(cache); 2912 2913 /* 2914 * We might have prevented the cleaner kthread from deleting 2915 * this block group if it was already unused because we raced 2916 * and set it to RO mode first. So add it back to the unused 2917 * list, otherwise it might not ever be deleted unless a manual 2918 * balance is triggered or it becomes used and unused again. 
2919 */ 2920 spin_lock(&cache->lock); 2921 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && 2922 !cache->ro && cache->reserved == 0 && cache->used == 0) { 2923 spin_unlock(&cache->lock); 2924 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) 2925 btrfs_discard_queue_work(&fs_info->discard_ctl, 2926 cache); 2927 else 2928 btrfs_mark_bg_unused(cache); 2929 } else { 2930 spin_unlock(&cache->lock); 2931 } 2932 skip_unfreeze: 2933 btrfs_unfreeze_block_group(cache); 2934 btrfs_put_block_group(cache); 2935 if (ret) 2936 break; 2937 if (unlikely(sctx->is_dev_replace && 2938 atomic64_read(&dev_replace->num_write_errors) > 0)) { 2939 ret = -EIO; 2940 break; 2941 } 2942 if (sctx->stat.malloc_errors > 0) { 2943 ret = -ENOMEM; 2944 break; 2945 } 2946 skip: 2947 key.offset = found_key.offset + dev_extent_len; 2948 btrfs_release_path(path); 2949 } 2950 2951 return ret; 2952 } 2953 2954 static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, 2955 struct page *page, u64 physical, u64 generation) 2956 { 2957 struct btrfs_fs_info *fs_info = sctx->fs_info; 2958 struct btrfs_super_block *sb = page_address(page); 2959 int ret; 2960 2961 ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb, 2962 BTRFS_SUPER_INFO_SIZE, REQ_OP_READ); 2963 if (ret < 0) 2964 return ret; 2965 ret = btrfs_check_super_csum(fs_info, sb); 2966 if (unlikely(ret != 0)) { 2967 btrfs_err_rl(fs_info, 2968 "scrub: super block at physical %llu devid %llu has bad csum", 2969 physical, dev->devid); 2970 return -EIO; 2971 } 2972 if (unlikely(btrfs_super_generation(sb) != generation)) { 2973 btrfs_err_rl(fs_info, 2974 "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", 2975 physical, dev->devid, 2976 btrfs_super_generation(sb), generation); 2977 return -EUCLEAN; 2978 } 2979 2980 return btrfs_validate_super(fs_info, sb, -1); 2981 } 2982 2983 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 2984 struct btrfs_device *scrub_dev) 2985 { 2986 int i; 2987 u64 bytenr; 2988 u64 gen; 2989 int ret = 0; 2990 struct page *page; 2991 struct btrfs_fs_info *fs_info = sctx->fs_info; 2992 2993 if (BTRFS_FS_ERROR(fs_info)) 2994 return -EROFS; 2995 2996 page = alloc_page(GFP_KERNEL); 2997 if (!page) { 2998 spin_lock(&sctx->stat_lock); 2999 sctx->stat.malloc_errors++; 3000 spin_unlock(&sctx->stat_lock); 3001 return -ENOMEM; 3002 } 3003 3004 /* Seed devices of a new filesystem has their own generation. 
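	 * A seed device belongs to a different fs_devices than the filesystem
	 * being scrubbed, so use the generation stored in the device instead
	 * of this filesystem's last committed transaction.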
*/ 3005 if (scrub_dev->fs_devices != fs_info->fs_devices) 3006 gen = scrub_dev->generation; 3007 else 3008 gen = btrfs_get_last_trans_committed(fs_info); 3009 3010 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 3011 ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); 3012 if (ret == -ENOENT) 3013 break; 3014 3015 if (ret) { 3016 spin_lock(&sctx->stat_lock); 3017 sctx->stat.super_errors++; 3018 spin_unlock(&sctx->stat_lock); 3019 continue; 3020 } 3021 3022 if (bytenr + BTRFS_SUPER_INFO_SIZE > 3023 scrub_dev->commit_total_bytes) 3024 break; 3025 if (!btrfs_check_super_location(scrub_dev, bytenr)) 3026 continue; 3027 3028 ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); 3029 if (ret) { 3030 spin_lock(&sctx->stat_lock); 3031 sctx->stat.super_errors++; 3032 spin_unlock(&sctx->stat_lock); 3033 } 3034 } 3035 __free_page(page); 3036 return 0; 3037 } 3038 3039 static void scrub_workers_put(struct btrfs_fs_info *fs_info) 3040 { 3041 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, 3042 &fs_info->scrub_lock)) { 3043 struct workqueue_struct *scrub_workers = fs_info->scrub_workers; 3044 3045 fs_info->scrub_workers = NULL; 3046 mutex_unlock(&fs_info->scrub_lock); 3047 3048 if (scrub_workers) 3049 destroy_workqueue(scrub_workers); 3050 } 3051 } 3052 3053 /* 3054 * get a reference count on fs_info->scrub_workers. start worker if necessary 3055 */ 3056 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) 3057 { 3058 struct workqueue_struct *scrub_workers = NULL; 3059 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 3060 int max_active = fs_info->thread_pool_size; 3061 int ret = -ENOMEM; 3062 3063 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) 3064 return 0; 3065 3066 scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); 3067 if (!scrub_workers) 3068 return -ENOMEM; 3069 3070 mutex_lock(&fs_info->scrub_lock); 3071 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { 3072 ASSERT(fs_info->scrub_workers == NULL); 3073 fs_info->scrub_workers = scrub_workers; 3074 refcount_set(&fs_info->scrub_workers_refcnt, 1); 3075 mutex_unlock(&fs_info->scrub_lock); 3076 return 0; 3077 } 3078 /* Other thread raced in and created the workers for us */ 3079 refcount_inc(&fs_info->scrub_workers_refcnt); 3080 mutex_unlock(&fs_info->scrub_lock); 3081 3082 ret = 0; 3083 3084 destroy_workqueue(scrub_workers); 3085 return ret; 3086 } 3087 3088 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3089 u64 end, struct btrfs_scrub_progress *progress, 3090 bool readonly, bool is_dev_replace) 3091 { 3092 struct btrfs_dev_lookup_args args = { .devid = devid }; 3093 struct scrub_ctx *sctx; 3094 int ret; 3095 struct btrfs_device *dev; 3096 unsigned int nofs_flag; 3097 bool need_commit = false; 3098 3099 /* Set the basic fallback @last_physical before we got a sctx. */ 3100 if (progress) 3101 progress->last_physical = start; 3102 3103 if (btrfs_fs_closing(fs_info)) 3104 return -EAGAIN; 3105 3106 /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ 3107 ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); 3108 3109 /* 3110 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible 3111 * value (max nodesize / min sectorsize), thus nodesize should always 3112 * be fine. 
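	 * E.g. the largest 64K nodesize combined with the smallest 4K
	 * sectorsize needs 16 sectors for one tree block.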
3113 */ 3114 ASSERT(fs_info->nodesize <= 3115 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); 3116 3117 /* Allocate outside of device_list_mutex */ 3118 sctx = scrub_setup_ctx(fs_info, is_dev_replace); 3119 if (IS_ERR(sctx)) 3120 return PTR_ERR(sctx); 3121 sctx->stat.last_physical = start; 3122 3123 ret = scrub_workers_get(fs_info); 3124 if (ret) 3125 goto out_free_ctx; 3126 3127 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3128 dev = btrfs_find_device(fs_info->fs_devices, &args); 3129 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && 3130 !is_dev_replace)) { 3131 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3132 ret = -ENODEV; 3133 goto out; 3134 } 3135 3136 if (!is_dev_replace && !readonly && 3137 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 3138 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3139 btrfs_err(fs_info, 3140 "scrub: devid %llu: filesystem on %s is not writable", 3141 devid, btrfs_dev_name(dev)); 3142 ret = -EROFS; 3143 goto out; 3144 } 3145 3146 mutex_lock(&fs_info->scrub_lock); 3147 if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 3148 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) { 3149 mutex_unlock(&fs_info->scrub_lock); 3150 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3151 ret = -EIO; 3152 goto out; 3153 } 3154 3155 down_read(&fs_info->dev_replace.rwsem); 3156 if (dev->scrub_ctx || 3157 (!is_dev_replace && 3158 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3159 up_read(&fs_info->dev_replace.rwsem); 3160 mutex_unlock(&fs_info->scrub_lock); 3161 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3162 ret = -EINPROGRESS; 3163 goto out; 3164 } 3165 up_read(&fs_info->dev_replace.rwsem); 3166 3167 sctx->readonly = readonly; 3168 dev->scrub_ctx = sctx; 3169 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3170 3171 /* 3172 * checking @scrub_pause_req here, we can avoid 3173 * race between committing transaction and scrubbing. 3174 */ 3175 __scrub_blocked_if_needed(fs_info); 3176 atomic_inc(&fs_info->scrubs_running); 3177 mutex_unlock(&fs_info->scrub_lock); 3178 3179 /* 3180 * In order to avoid deadlock with reclaim when there is a transaction 3181 * trying to pause scrub, make sure we use GFP_NOFS for all the 3182 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() 3183 * invoked by our callees. The pausing request is done when the 3184 * transaction commit starts, and it blocks the transaction until scrub 3185 * is paused (done at specific points at scrub_stripe() or right above 3186 * before incrementing fs_info->scrubs_running). 3187 */ 3188 nofs_flag = memalloc_nofs_save(); 3189 if (!is_dev_replace) { 3190 u64 old_super_errors; 3191 3192 spin_lock(&sctx->stat_lock); 3193 old_super_errors = sctx->stat.super_errors; 3194 spin_unlock(&sctx->stat_lock); 3195 3196 btrfs_info(fs_info, "scrub: started on devid %llu", devid); 3197 /* 3198 * by holding device list mutex, we can 3199 * kick off writing super in log tree sync. 3200 */ 3201 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3202 ret = scrub_supers(sctx, dev); 3203 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3204 3205 spin_lock(&sctx->stat_lock); 3206 /* 3207 * Super block errors found, but we can not commit transaction 3208 * at current context, since btrfs_commit_transaction() needs 3209 * to pause the current running scrub (hold by ourselves). 
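	 * Instead just record it in @need_commit and do the commit near the
	 * end of this function, once the scrub context has been released.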
3210 */ 3211 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) 3212 need_commit = true; 3213 spin_unlock(&sctx->stat_lock); 3214 } 3215 3216 if (!ret) 3217 ret = scrub_enumerate_chunks(sctx, dev, start, end); 3218 memalloc_nofs_restore(nofs_flag); 3219 3220 atomic_dec(&fs_info->scrubs_running); 3221 wake_up(&fs_info->scrub_pause_wait); 3222 3223 if (progress) 3224 memcpy(progress, &sctx->stat, sizeof(*progress)); 3225 3226 if (!is_dev_replace) 3227 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", 3228 ret ? "not finished" : "finished", devid, ret); 3229 3230 mutex_lock(&fs_info->scrub_lock); 3231 dev->scrub_ctx = NULL; 3232 mutex_unlock(&fs_info->scrub_lock); 3233 3234 scrub_workers_put(fs_info); 3235 scrub_put_ctx(sctx); 3236 3237 /* 3238 * We found some super block errors before, now try to force a 3239 * transaction commit, as scrub has finished. 3240 */ 3241 if (need_commit) { 3242 struct btrfs_trans_handle *trans; 3243 3244 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3245 if (IS_ERR(trans)) { 3246 ret = PTR_ERR(trans); 3247 btrfs_err(fs_info, 3248 "scrub: failed to start transaction to fix super block errors: %d", ret); 3249 return ret; 3250 } 3251 ret = btrfs_commit_transaction(trans); 3252 if (ret < 0) 3253 btrfs_err(fs_info, 3254 "scrub: failed to commit transaction to fix super block errors: %d", ret); 3255 } 3256 return ret; 3257 out: 3258 scrub_workers_put(fs_info); 3259 out_free_ctx: 3260 scrub_free_ctx(sctx); 3261 3262 return ret; 3263 } 3264 3265 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) 3266 { 3267 mutex_lock(&fs_info->scrub_lock); 3268 atomic_inc(&fs_info->scrub_pause_req); 3269 while (atomic_read(&fs_info->scrubs_paused) != 3270 atomic_read(&fs_info->scrubs_running)) { 3271 mutex_unlock(&fs_info->scrub_lock); 3272 wait_event(fs_info->scrub_pause_wait, 3273 atomic_read(&fs_info->scrubs_paused) == 3274 atomic_read(&fs_info->scrubs_running)); 3275 mutex_lock(&fs_info->scrub_lock); 3276 } 3277 mutex_unlock(&fs_info->scrub_lock); 3278 } 3279 3280 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) 3281 { 3282 atomic_dec(&fs_info->scrub_pause_req); 3283 wake_up(&fs_info->scrub_pause_wait); 3284 } 3285 3286 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 3287 { 3288 mutex_lock(&fs_info->scrub_lock); 3289 if (!atomic_read(&fs_info->scrubs_running)) { 3290 mutex_unlock(&fs_info->scrub_lock); 3291 return -ENOTCONN; 3292 } 3293 3294 atomic_inc(&fs_info->scrub_cancel_req); 3295 while (atomic_read(&fs_info->scrubs_running)) { 3296 mutex_unlock(&fs_info->scrub_lock); 3297 wait_event(fs_info->scrub_pause_wait, 3298 atomic_read(&fs_info->scrubs_running) == 0); 3299 mutex_lock(&fs_info->scrub_lock); 3300 } 3301 atomic_dec(&fs_info->scrub_cancel_req); 3302 mutex_unlock(&fs_info->scrub_lock); 3303 3304 return 0; 3305 } 3306 3307 int btrfs_scrub_cancel_dev(struct btrfs_device *dev) 3308 { 3309 struct btrfs_fs_info *fs_info = dev->fs_info; 3310 struct scrub_ctx *sctx; 3311 3312 mutex_lock(&fs_info->scrub_lock); 3313 sctx = dev->scrub_ctx; 3314 if (!sctx) { 3315 mutex_unlock(&fs_info->scrub_lock); 3316 return -ENOTCONN; 3317 } 3318 atomic_inc(&sctx->cancel_req); 3319 while (dev->scrub_ctx) { 3320 mutex_unlock(&fs_info->scrub_lock); 3321 wait_event(fs_info->scrub_pause_wait, 3322 dev->scrub_ctx == NULL); 3323 mutex_lock(&fs_info->scrub_lock); 3324 } 3325 mutex_unlock(&fs_info->scrub_lock); 3326 3327 return 0; 3328 } 3329 3330 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, 3331 struct btrfs_scrub_progress 
*progress) 3332 { 3333 struct btrfs_dev_lookup_args args = { .devid = devid }; 3334 struct btrfs_device *dev; 3335 struct scrub_ctx *sctx = NULL; 3336 3337 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3338 dev = btrfs_find_device(fs_info->fs_devices, &args); 3339 if (dev) 3340 sctx = dev->scrub_ctx; 3341 if (sctx) 3342 memcpy(progress, &sctx->stat, sizeof(*progress)); 3343 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3344 3345 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; 3346 } 3347