1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011, 2012 STRATO. All rights reserved. 4 */ 5 6 #include <linux/blkdev.h> 7 #include <linux/ratelimit.h> 8 #include <linux/sched/mm.h> 9 #include "ctree.h" 10 #include "discard.h" 11 #include "volumes.h" 12 #include "disk-io.h" 13 #include "ordered-data.h" 14 #include "transaction.h" 15 #include "backref.h" 16 #include "extent_io.h" 17 #include "dev-replace.h" 18 #include "raid56.h" 19 #include "block-group.h" 20 #include "zoned.h" 21 #include "fs.h" 22 #include "accessors.h" 23 #include "file-item.h" 24 #include "scrub.h" 25 #include "raid-stripe-tree.h" 26 27 /* 28 * This is only the first step towards a full-features scrub. It reads all 29 * extent and super block and verifies the checksums. In case a bad checksum 30 * is found or the extent cannot be read, good data will be written back if 31 * any can be found. 32 * 33 * Future enhancements: 34 * - In case an unrepairable extent is encountered, track which files are 35 * affected and report them 36 * - track and record media errors, throw out bad devices 37 * - add a mode to also read unallocated space 38 */ 39 40 struct scrub_ctx; 41 42 /* 43 * The following value only influences the performance. 44 * 45 * This determines how many stripes would be submitted in one go, 46 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). 47 */ 48 #define SCRUB_STRIPES_PER_GROUP 8 49 50 /* 51 * How many groups we have for each sctx. 52 * 53 * This would be 8M per device, the same value as the old scrub in-flight bios 54 * size limit. 55 */ 56 #define SCRUB_GROUPS_PER_SCTX 16 57 58 #define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) 59 60 /* 61 * The following value times PAGE_SIZE needs to be large enough to match the 62 * largest node/leaf/sector size that shall be supported. 
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	union {
		/*
		 * Csum pointer for data csum verification. Should point to a
		 * sector csum inside scrub_stripe::csums.
		 *
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;

		/*
		 * Extra info for metadata verification. All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};

enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * Set for data stripes if it's triggered from P/Q stripe.
	 * During such scrub, we should not report errors in data stripes, nor
	 * update the accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};

/*
 * We have multiple bitmaps for one scrub_stripe.
 * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
 * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64).
 *
 * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
 * into a larger one.
 *
 * These enum records where the sub-bitmap are inside the larger one.
 * Each subbitmap starts at scrub_bitmap_nr_##name * nr_sectors bit.
 */
enum {
	/* Which blocks are covered by extent items. */
	scrub_bitmap_nr_has_extent = 0,

	/* Which blocks are metadata. */
	scrub_bitmap_nr_is_metadata,

	/*
	 * Which blocks have errors, including IO, csum, and metadata
	 * errors.
	 * This sub-bitmap is the OR results of the next few error related
	 * sub-bitmaps.
	 */
	scrub_bitmap_nr_error,
	scrub_bitmap_nr_io_error,
	scrub_bitmap_nr_csum_error,
	scrub_bitmap_nr_meta_error,
	scrub_bitmap_nr_meta_gen_error,

	/* Number of sub-bitmaps; used to size scrub_stripe::bitmaps. */
	scrub_bitmap_nr_last,
};

#define SCRUB_STRIPE_MAX_FOLIOS		(BTRFS_STRIPE_LEN / PAGE_SIZE)

/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
	struct scrub_ctx *sctx;
	struct btrfs_block_group *bg;

	struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];
	struct scrub_sector_verification *sectors;

	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

	/*
	 * How many data/meta extents are in this stripe. Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

	/* Number of in-flight bios; io_wait is woken when it drops to 0. */
	atomic_t pending_io;
	wait_queue_head_t io_wait;
	wait_queue_head_t repair_wait;

	/*
	 * Indicate the states of the stripe. Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* The large bitmap contains all the sub-bitmaps. */
	unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
					    (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];

	/*
	 * For writeback (repair or replace) error reporting.
	 * This one is protected by a spinlock, thus can not be packed into
	 * the larger bitmap.
	 */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;

	struct work_struct work;
};

struct scrub_ctx {
	struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES];
	struct scrub_stripe *raid56_data_stripes;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path extent_path;
	struct btrfs_path csum_path;
	int first_free;
	int cur_stripe;
	atomic_t cancel_req;
	int readonly;

	/* State of IO submission throttling affecting the associated device */
	ktime_t throttle_deadline;
	u64 throttle_sent;

	bool is_dev_replace;
	u64 write_pointer;

	struct mutex wr_lock;
	struct btrfs_device *wr_tgtdev;

	/*
	 * Statistics reported back to user space; @stat_lock presumably
	 * serializes updates to @stat (NOTE(review): confirm against the
	 * stat update sites).
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t refs;
};

/*
 * Calculate the starting bit of sub-bitmap @name for @block_nr inside
 * scrub_stripe::bitmaps. Asserts @block_nr is within the stripe.
 */
#define scrub_calc_start_bit(stripe, name, block_nr)			\
({									\
	unsigned int __start_bit;					\
									\
	ASSERT(block_nr < stripe->nr_sectors,				\
	       "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
	__start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
	__start_bit;							\
})

/*
 * Generate the accessors for sub-bitmap @name:
 * set/clear (range), set_bit/clear_bit/test_bit (single bit), and
 * read/empty/weight (whole sub-bitmap, which must fit in one long).
 */
#define IMPLEMENT_SCRUB_BITMAP_OPS(name)				\
static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \
					   unsigned int block_nr,	\
					   unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe,	\
							    name, block_nr); \
									\
	bitmap_set(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
					     unsigned int block_nr,	\
					     unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	bitmap_clear(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
						unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	return test_bit(start_bit, stripe->bitmaps);			\
}									\
static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
					       unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	set_bit(start_bit, stripe->bitmaps);				\
}									\
static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
						 unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	clear_bit(start_bit, stripe->bitmaps);				\
}									\
static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
{									\
	const unsigned int nr_blocks = stripe->nr_sectors;		\
									\
	ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG,		\
	       "nr_blocks=%u BITS_PER_LONG=%u",				\
	       nr_blocks, BITS_PER_LONG);				\
									\
	return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
			   stripe->nr_sectors);				\
}									\
static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_empty(&bitmap, stripe->nr_sectors);		\
}									\
static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_weight(&bitmap, stripe->nr_sectors);		\
}
IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
IMPLEMENT_SCRUB_BITMAP_OPS(error);
IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);

/* Context handed to scrub_print_warning_inode() for error reporting. */
struct scrub_warning {
	struct btrfs_path *path;
	u64 extent_item_size;
	const char *errstr;
	u64 physical;
	u64 logical;
	struct btrfs_device *dev;
};

struct scrub_error_records {
	/*
	 * Bitmap recording which blocks hit errors (IO/csum/...) during the
	 * initial read.
327 */ 328 unsigned long init_error_bitmap; 329 330 unsigned int nr_io_errors; 331 unsigned int nr_csum_errors; 332 unsigned int nr_meta_errors; 333 unsigned int nr_meta_gen_errors; 334 }; 335 336 static void release_scrub_stripe(struct scrub_stripe *stripe) 337 { 338 if (!stripe) 339 return; 340 341 for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) { 342 if (stripe->folios[i]) 343 folio_put(stripe->folios[i]); 344 stripe->folios[i] = NULL; 345 } 346 kfree(stripe->sectors); 347 kfree(stripe->csums); 348 stripe->sectors = NULL; 349 stripe->csums = NULL; 350 stripe->sctx = NULL; 351 stripe->state = 0; 352 } 353 354 static int init_scrub_stripe(struct btrfs_fs_info *fs_info, 355 struct scrub_stripe *stripe) 356 { 357 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 358 int ret; 359 360 memset(stripe, 0, sizeof(*stripe)); 361 362 stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; 363 stripe->state = 0; 364 365 init_waitqueue_head(&stripe->io_wait); 366 init_waitqueue_head(&stripe->repair_wait); 367 atomic_set(&stripe->pending_io, 0); 368 spin_lock_init(&stripe->write_error_lock); 369 370 ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS); 371 ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift, 372 fs_info->block_min_order, stripe->folios); 373 if (ret < 0) 374 goto error; 375 376 stripe->sectors = kzalloc_objs(struct scrub_sector_verification, 377 stripe->nr_sectors); 378 if (!stripe->sectors) 379 goto error; 380 381 stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, 382 fs_info->csum_size, GFP_KERNEL); 383 if (!stripe->csums) 384 goto error; 385 return 0; 386 error: 387 release_scrub_stripe(stripe); 388 return -ENOMEM; 389 } 390 391 static void wait_scrub_stripe_io(struct scrub_stripe *stripe) 392 { 393 wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); 394 } 395 396 static void scrub_put_ctx(struct scrub_ctx *sctx); 397 398 static void 
__scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 399 { 400 while (atomic_read(&fs_info->scrub_pause_req)) { 401 mutex_unlock(&fs_info->scrub_lock); 402 wait_event(fs_info->scrub_pause_wait, 403 atomic_read(&fs_info->scrub_pause_req) == 0); 404 mutex_lock(&fs_info->scrub_lock); 405 } 406 } 407 408 static void scrub_pause_on(struct btrfs_fs_info *fs_info) 409 { 410 atomic_inc(&fs_info->scrubs_paused); 411 wake_up(&fs_info->scrub_pause_wait); 412 } 413 414 static void scrub_pause_off(struct btrfs_fs_info *fs_info) 415 { 416 mutex_lock(&fs_info->scrub_lock); 417 __scrub_blocked_if_needed(fs_info); 418 atomic_dec(&fs_info->scrubs_paused); 419 mutex_unlock(&fs_info->scrub_lock); 420 421 wake_up(&fs_info->scrub_pause_wait); 422 } 423 424 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 425 { 426 scrub_pause_on(fs_info); 427 scrub_pause_off(fs_info); 428 } 429 430 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) 431 { 432 int i; 433 434 if (!sctx) 435 return; 436 437 for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) 438 release_scrub_stripe(&sctx->stripes[i]); 439 440 kvfree(sctx); 441 } 442 443 static void scrub_put_ctx(struct scrub_ctx *sctx) 444 { 445 if (refcount_dec_and_test(&sctx->refs)) 446 scrub_free_ctx(sctx); 447 } 448 449 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( 450 struct btrfs_fs_info *fs_info, bool is_dev_replace) 451 { 452 struct scrub_ctx *sctx; 453 int i; 454 455 /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use 456 * kvzalloc(). 
457 */ 458 sctx = kvzalloc_obj(*sctx); 459 if (!sctx) 460 goto nomem; 461 refcount_set(&sctx->refs, 1); 462 sctx->is_dev_replace = is_dev_replace; 463 sctx->fs_info = fs_info; 464 sctx->extent_path.search_commit_root = true; 465 sctx->extent_path.skip_locking = true; 466 sctx->csum_path.search_commit_root = true; 467 sctx->csum_path.skip_locking = true; 468 for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { 469 int ret; 470 471 ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); 472 if (ret < 0) 473 goto nomem; 474 sctx->stripes[i].sctx = sctx; 475 } 476 sctx->first_free = 0; 477 atomic_set(&sctx->cancel_req, 0); 478 479 spin_lock_init(&sctx->stat_lock); 480 sctx->throttle_deadline = 0; 481 482 mutex_init(&sctx->wr_lock); 483 if (is_dev_replace) { 484 WARN_ON(!fs_info->dev_replace.tgtdev); 485 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; 486 } 487 488 return sctx; 489 490 nomem: 491 scrub_free_ctx(sctx); 492 return ERR_PTR(-ENOMEM); 493 } 494 495 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, 496 u64 root, void *warn_ctx) 497 { 498 u32 nlink; 499 int ret; 500 int i; 501 unsigned nofs_flag; 502 struct extent_buffer *eb; 503 struct btrfs_inode_item *inode_item; 504 struct scrub_warning *swarn = warn_ctx; 505 struct btrfs_fs_info *fs_info = swarn->dev->fs_info; 506 struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; 507 struct btrfs_root *local_root; 508 struct btrfs_key key; 509 510 local_root = btrfs_get_fs_root(fs_info, root, true); 511 if (IS_ERR(local_root)) { 512 ret = PTR_ERR(local_root); 513 goto err; 514 } 515 516 /* 517 * this makes the path point to (inum INODE_ITEM ioff) 518 */ 519 key.objectid = inum; 520 key.type = BTRFS_INODE_ITEM_KEY; 521 key.offset = 0; 522 523 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); 524 if (ret) { 525 btrfs_put_root(local_root); 526 btrfs_release_path(swarn->path); 527 goto err; 528 } 529 530 eb = swarn->path->nodes[0]; 531 inode_item = btrfs_item_ptr(eb, 
swarn->path->slots[0], 532 struct btrfs_inode_item); 533 nlink = btrfs_inode_nlink(eb, inode_item); 534 btrfs_release_path(swarn->path); 535 536 /* 537 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub 538 * uses GFP_NOFS in this context, so we keep it consistent but it does 539 * not seem to be strictly necessary. 540 */ 541 nofs_flag = memalloc_nofs_save(); 542 ipath = init_ipath(4096, local_root, swarn->path); 543 memalloc_nofs_restore(nofs_flag); 544 if (IS_ERR(ipath)) { 545 btrfs_put_root(local_root); 546 ret = PTR_ERR(ipath); 547 ipath = NULL; 548 goto err; 549 } 550 ret = paths_from_inode(inum, ipath); 551 552 if (ret < 0) 553 goto err; 554 555 /* 556 * we deliberately ignore the bit ipath might have been too small to 557 * hold all of the paths here 558 */ 559 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 560 btrfs_warn(fs_info, 561 "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", 562 swarn->errstr, swarn->logical, 563 btrfs_dev_name(swarn->dev), 564 swarn->physical, 565 root, inum, offset, 566 fs_info->sectorsize, nlink, 567 (char *)(unsigned long)ipath->fspath->val[i]); 568 569 btrfs_put_root(local_root); 570 return 0; 571 572 err: 573 btrfs_warn(fs_info, 574 "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", 575 swarn->errstr, swarn->logical, 576 btrfs_dev_name(swarn->dev), 577 swarn->physical, 578 root, inum, offset, ret); 579 580 return 0; 581 } 582 583 static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, 584 bool is_super, u64 logical, u64 physical) 585 { 586 struct btrfs_fs_info *fs_info = dev->fs_info; 587 BTRFS_PATH_AUTO_FREE(path); 588 struct btrfs_key found_key; 589 struct extent_buffer *eb; 590 struct btrfs_extent_item *ei; 591 struct scrub_warning swarn; 592 u64 flags = 0; 593 u32 item_size; 594 int ret; 595 596 /* Super block error, no need to search extent 
tree. */ 597 if (is_super) { 598 btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu", 599 errstr, btrfs_dev_name(dev), physical); 600 return; 601 } 602 path = btrfs_alloc_path(); 603 if (!path) 604 return; 605 606 swarn.physical = physical; 607 swarn.logical = logical; 608 swarn.errstr = errstr; 609 swarn.dev = NULL; 610 611 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 612 &flags); 613 if (ret < 0) 614 return; 615 616 swarn.extent_item_size = found_key.offset; 617 618 eb = path->nodes[0]; 619 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 620 item_size = btrfs_item_size(eb, path->slots[0]); 621 622 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 623 unsigned long ptr = 0; 624 u8 ref_level; 625 u64 ref_root; 626 627 while (true) { 628 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, 629 item_size, &ref_root, 630 &ref_level); 631 if (ret < 0) { 632 btrfs_warn(fs_info, 633 "scrub: failed to resolve tree backref for logical %llu: %d", 634 swarn.logical, ret); 635 break; 636 } 637 if (ret > 0) 638 break; 639 btrfs_warn(fs_info, 640 "scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", 641 errstr, swarn.logical, btrfs_dev_name(dev), 642 swarn.physical, (ref_level ? 
"node" : "leaf"), 643 ref_level, ref_root); 644 } 645 btrfs_release_path(path); 646 } else { 647 struct btrfs_backref_walk_ctx ctx = { 0 }; 648 649 btrfs_release_path(path); 650 651 ctx.bytenr = found_key.objectid; 652 ctx.extent_item_pos = swarn.logical - found_key.objectid; 653 ctx.fs_info = fs_info; 654 655 swarn.path = path; 656 swarn.dev = dev; 657 658 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); 659 } 660 } 661 662 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) 663 { 664 int ret = 0; 665 u64 length; 666 667 if (!btrfs_is_zoned(sctx->fs_info)) 668 return 0; 669 670 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) 671 return 0; 672 673 if (sctx->write_pointer < physical) { 674 length = physical - sctx->write_pointer; 675 676 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, 677 sctx->write_pointer, length); 678 if (!ret) 679 sctx->write_pointer = physical; 680 } 681 return ret; 682 } 683 684 static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) 685 { 686 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 687 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 688 u32 offset = (sector_nr << fs_info->sectorsize_bits); 689 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 690 691 /* stripe->folios[] is allocated by us and no highmem is allowed. */ 692 ASSERT(folio); 693 ASSERT(!folio_test_highmem(folio)); 694 return folio_address(folio) + offset_in_folio(folio, offset); 695 } 696 697 static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) 698 { 699 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 700 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 701 u32 offset = (sector_nr << fs_info->sectorsize_bits); 702 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 703 704 /* stripe->folios[] is allocated by us and no highmem is allowed. 
	 */
	ASSERT(folio);
	ASSERT(!folio_test_highmem(folio));
	/* And the range must be contained inside the folio. */
	ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio));
	return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset);
}

/*
 * Verify the tree block starting at @sector_nr: bytenr, fsid, chunk tree
 * uuid, checksum and generation, in that order.
 *
 * On the first mismatch the meta/meta_gen and error sub-bitmaps are set for
 * all sectors of the tree block and a rate-limited warning is printed; on
 * full success all error sub-bitmaps for the block are cleared.
 */
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
	void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	struct btrfs_header *header = first_kaddr;
	struct btrfs_csum_ctx csum;
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	u8 calculated_csum[BTRFS_CSUM_SIZE];

	/*
	 * Here we don't have a good way to attach the pages (and subpages)
	 * to a dummy extent buffer, thus we have to directly grab the members
	 * from pages.
	 */
	memcpy(on_disk_csum, header->csum, fs_info->csum_size);

	if (logical != btrfs_stack_header_bytenr(header)) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_bytenr(header), logical);
		return;
	}
	if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
		   BTRFS_FSID_SIZE) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->fsid, fs_info->fs_devices->fsid);
		return;
	}
	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		return;
	}

	/* Now check tree block csum. */
	btrfs_csum_init(&csum, fs_info->csum_type);
	/* The first sector skips the on-disk csum area itself. */
	btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE,
			  fs_info->sectorsize - BTRFS_CSUM_SIZE);

	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
		btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i),
				  fs_info->sectorsize);
	}

	btrfs_csum_final(&csum, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT,
			      logical, stripe->mirror_num,
			      BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		return;
	}
	if (stripe->sectors[sector_nr].generation !=
	    btrfs_stack_header_generation(header)) {
		scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_generation(header),
			      stripe->sectors[sector_nr].generation);
		return;
	}
	/* All good, clear every error sub-bitmap for the whole tree block. */
	scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
}

/*
 * Verify one sector of the stripe: dispatch to full tree block verification
 * for metadata, or data csum verification for data sectors.
 *
 * Sectors without an extent, with an IO error, or data without csum are
 * skipped (the latter after clearing the error bit, as we have to trust it).
 */
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
		return;

	/* IO error, no need to check. */
	if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
		return;

	/* Metadata, verify the full tree block. */
	if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
		/*
		 * Check if the tree block crosses the stripe boundary. If
		 * crossed the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"scrub: tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it). For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
		return;
	}

	ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
	if (ret < 0) {
		scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_set_bit_error(stripe, sector_nr);
	} else {
		scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
	}
}

/* Verify specified sectors of a stripe.
 */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	/* @bitmap is a local copy, so advancing sector_nr here is safe. */
	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		/* A metadata sector was verified as a whole tree block, skip it. */
		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
			sector_nr += sectors_per_tree - 1;
	}
}

/*
 * Map the first bvec of a bio back to its sector number inside the stripe
 * by comparing kernel virtual addresses. The address must belong to the
 * stripe (enforced by the ASSERT).
 */
static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}

/*
 * Repair read is different to the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		/* The whole bio failed, mark every covered sector. */
		scrub_bitmap_set_io_error(stripe, sector_nr,
					  bio_size >> fs_info->sectorsize_bits);
		scrub_bitmap_set_error(stripe, sector_nr,
				       bio_size >> fs_info->sectorsize_bits);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr,
					    bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

/* Return the next mirror number after @mirror, wrapping back to 1. */
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

/* Add one sector of @stripe to @bbio, relying on bio_add_page() merging. */
static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
				 int sector_nr)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	int ret;

	ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize,
			   offset_in_page(kaddr));
	/*
	 * Caller should ensure the bbio has enough size.
	 * And we cannot use __bio_add_page(), which doesn't do any merge.
	 *
	 * Meanwhile for scrub_submit_initial_read() we fully rely on the merge
	 * to create the minimal amount of bio vectors, for fs block size < page
	 * size cases.
	 */
	ASSERT(ret == fs_info->sectorsize);
}

/*
 * Allocate a scrub btrfs_bio for @logical, flagged as scrub IO and with the
 * device sector already set from the logical address.
 */
static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info,
					  unsigned int nr_vecs, blk_opf_t opf,
					  u64 logical,
					  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode),
			       logical, end_io, private);
	bbio->is_scrub = true;
	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	return bbio;
}

/*
 * Submit repair reads from @mirror for all sectors currently marked in the
 * error sub-bitmap, merging contiguous sectors up to @blocksize per bio.
 *
 * With @wait set each bio is submitted and waited for synchronously.
 * Caller must ensure no IO is in flight on the stripe (see ASSERT below).
 */
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
	ASSERT(atomic_read(&stripe->pending_io) == 0,
	       "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io));

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		/* The current sector cannot be merged, submit the bio.
		 */
		if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
			     bbio->bio.bi_iter.bi_size >= blocksize)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bbio(bbio, mirror);
			if (wait)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}

		/* Start a new bio at the current sector if none is open. */
		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
					stripe->logical + (i << fs_info->sectorsize_bits),
					scrub_repair_read_endio, stripe);

		scrub_bio_add_sector(bbio, stripe, i);
	}
	/* Flush the last partially filled bio, if any. */
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
		if (wait)
			wait_scrub_stripe_io(stripe);
	}
}

static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
				       struct scrub_stripe *stripe,
				       const struct scrub_error_records *errors)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_device *dev = NULL;
	const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
	const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
	u64 physical = 0;
	int nr_data_sectors = 0;
	int nr_meta_sectors = 0;
	int nr_nodatacsum_sectors = 0;
	int nr_repaired_sectors = 0;
	int sector_nr;

	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
		return;

	/*
	 * Init needed infos for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
	 * thus no need for dev/physical, error reporting still needs dev and physical.
1023 */ 1024 if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) { 1025 u64 mapped_len = fs_info->sectorsize; 1026 struct btrfs_io_context *bioc = NULL; 1027 int stripe_index = stripe->mirror_num - 1; 1028 int ret; 1029 1030 /* For scrub, our mirror_num should always start at 1. */ 1031 ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); 1032 ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 1033 stripe->logical, &mapped_len, &bioc, 1034 NULL, NULL); 1035 /* 1036 * If we failed, dev will be NULL, and later detailed reports 1037 * will just be skipped. 1038 */ 1039 if (ret < 0) 1040 goto skip; 1041 physical = bioc->stripes[stripe_index].physical; 1042 dev = bioc->stripes[stripe_index].dev; 1043 btrfs_put_bioc(bioc); 1044 } 1045 1046 skip: 1047 for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) { 1048 bool repaired = false; 1049 1050 if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { 1051 nr_meta_sectors++; 1052 } else { 1053 nr_data_sectors++; 1054 if (!stripe->sectors[sector_nr].csum) 1055 nr_nodatacsum_sectors++; 1056 } 1057 1058 if (test_bit(sector_nr, &errors->init_error_bitmap) && 1059 !test_bit(sector_nr, &error_bitmap)) { 1060 nr_repaired_sectors++; 1061 repaired = true; 1062 } 1063 1064 /* Good sector from the beginning, nothing need to be done. */ 1065 if (!test_bit(sector_nr, &errors->init_error_bitmap)) 1066 continue; 1067 1068 /* 1069 * Report error for the corrupted sectors. If repaired, just 1070 * output the message of repaired message. 1071 */ 1072 if (repaired) { 1073 if (dev) { 1074 btrfs_err_rl(fs_info, 1075 "scrub: fixed up error at logical %llu on dev %s physical %llu", 1076 stripe->logical, btrfs_dev_name(dev), 1077 physical); 1078 } else { 1079 btrfs_err_rl(fs_info, 1080 "scrub: fixed up error at logical %llu on mirror %u", 1081 stripe->logical, stripe->mirror_num); 1082 } 1083 continue; 1084 } 1085 1086 /* The remaining are all for unrepaired. 
*/ 1087 if (dev) { 1088 btrfs_err_rl(fs_info, 1089 "scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", 1090 stripe->logical, btrfs_dev_name(dev), 1091 physical); 1092 } else { 1093 btrfs_err_rl(fs_info, 1094 "scrub: unable to fixup (regular) error at logical %llu on mirror %u", 1095 stripe->logical, stripe->mirror_num); 1096 } 1097 1098 if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) 1099 if (__ratelimit(&rs) && dev) 1100 scrub_print_common_warning("i/o error", dev, false, 1101 stripe->logical, physical); 1102 if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr)) 1103 if (__ratelimit(&rs) && dev) 1104 scrub_print_common_warning("checksum error", dev, false, 1105 stripe->logical, physical); 1106 if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr)) 1107 if (__ratelimit(&rs) && dev) 1108 scrub_print_common_warning("header error", dev, false, 1109 stripe->logical, physical); 1110 if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr)) 1111 if (__ratelimit(&rs) && dev) 1112 scrub_print_common_warning("generation error", dev, false, 1113 stripe->logical, physical); 1114 } 1115 1116 /* Update the device stats. */ 1117 for (int i = 0; i < errors->nr_io_errors; i++) 1118 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); 1119 for (int i = 0; i < errors->nr_csum_errors; i++) 1120 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); 1121 /* Generation mismatch error is based on each metadata, not each block. 
*/ 1122 for (int i = 0; i < errors->nr_meta_gen_errors; 1123 i += (fs_info->nodesize >> fs_info->sectorsize_bits)) 1124 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); 1125 1126 spin_lock(&sctx->stat_lock); 1127 sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; 1128 sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; 1129 sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; 1130 sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; 1131 sctx->stat.no_csum += nr_nodatacsum_sectors; 1132 sctx->stat.read_errors += errors->nr_io_errors; 1133 sctx->stat.csum_errors += errors->nr_csum_errors; 1134 sctx->stat.verify_errors += errors->nr_meta_errors + 1135 errors->nr_meta_gen_errors; 1136 sctx->stat.uncorrectable_errors += 1137 bitmap_weight(&error_bitmap, stripe->nr_sectors); 1138 sctx->stat.corrected_errors += nr_repaired_sectors; 1139 spin_unlock(&sctx->stat_lock); 1140 } 1141 1142 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, 1143 unsigned long write_bitmap, bool dev_replace); 1144 1145 /* 1146 * The main entrance for all read related scrub work, including: 1147 * 1148 * - Wait for the initial read to finish 1149 * - Verify and locate any bad sectors 1150 * - Go through the remaining mirrors and try to read as large blocksize as 1151 * possible 1152 * - Go through all mirrors (including the failed mirror) sector-by-sector 1153 * - Submit writeback for repaired sectors 1154 * 1155 * Writeback for dev-replace does not happen here, it needs extra 1156 * synchronization for zoned devices. 
1157 */ 1158 static void scrub_stripe_read_repair_worker(struct work_struct *work) 1159 { 1160 struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); 1161 struct scrub_ctx *sctx = stripe->sctx; 1162 struct btrfs_fs_info *fs_info = sctx->fs_info; 1163 struct scrub_error_records errors = { 0 }; 1164 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, 1165 stripe->bg->length); 1166 unsigned long repaired; 1167 unsigned long error; 1168 int mirror; 1169 int i; 1170 1171 ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); 1172 1173 wait_scrub_stripe_io(stripe); 1174 scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); 1175 /* Save the initial failed bitmap for later repair and report usage. */ 1176 errors.init_error_bitmap = scrub_bitmap_read_error(stripe); 1177 errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe); 1178 errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe); 1179 errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe); 1180 errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe); 1181 1182 if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors)) 1183 goto out; 1184 1185 /* 1186 * Try all remaining mirrors. 1187 * 1188 * Here we still try to read as large block as possible, as this is 1189 * faster and we have extra safety nets to rely on. 1190 */ 1191 for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); 1192 mirror != stripe->mirror_num; 1193 mirror = calc_next_mirror(mirror, num_copies)) { 1194 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 1195 1196 scrub_stripe_submit_repair_read(stripe, mirror, 1197 BTRFS_STRIPE_LEN, false); 1198 wait_scrub_stripe_io(stripe); 1199 scrub_verify_one_stripe(stripe, old_error_bitmap); 1200 if (scrub_bitmap_empty_error(stripe)) 1201 goto out; 1202 } 1203 1204 /* 1205 * Last safety net, try re-checking all mirrors, including the failed 1206 * one, sector-by-sector. 
1207 * 1208 * As if one sector failed the drive's internal csum, the whole read 1209 * containing the offending sector would be marked as error. 1210 * Thus here we do sector-by-sector read. 1211 * 1212 * This can be slow, thus we only try it as the last resort. 1213 */ 1214 1215 for (i = 0, mirror = stripe->mirror_num; 1216 i < num_copies; 1217 i++, mirror = calc_next_mirror(mirror, num_copies)) { 1218 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 1219 1220 scrub_stripe_submit_repair_read(stripe, mirror, 1221 fs_info->sectorsize, true); 1222 wait_scrub_stripe_io(stripe); 1223 scrub_verify_one_stripe(stripe, old_error_bitmap); 1224 if (scrub_bitmap_empty_error(stripe)) 1225 goto out; 1226 } 1227 out: 1228 error = scrub_bitmap_read_error(stripe); 1229 /* 1230 * Submit the repaired sectors. For zoned case, we cannot do repair 1231 * in-place, but queue the bg to be relocated. 1232 */ 1233 bitmap_andnot(&repaired, &errors.init_error_bitmap, &error, 1234 stripe->nr_sectors); 1235 if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { 1236 if (btrfs_is_zoned(fs_info)) { 1237 btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); 1238 } else { 1239 scrub_write_sectors(sctx, stripe, repaired, false); 1240 wait_scrub_stripe_io(stripe); 1241 } 1242 } 1243 1244 scrub_stripe_report_errors(sctx, stripe, &errors); 1245 set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); 1246 wake_up(&stripe->repair_wait); 1247 } 1248 1249 static void scrub_read_endio(struct btrfs_bio *bbio) 1250 { 1251 struct scrub_stripe *stripe = bbio->private; 1252 struct bio_vec *bvec; 1253 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1254 int num_sectors; 1255 u32 bio_size = 0; 1256 int i; 1257 1258 ASSERT(sector_nr < stripe->nr_sectors); 1259 bio_for_each_bvec_all(bvec, &bbio->bio, i) 1260 bio_size += bvec->bv_len; 1261 num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; 1262 1263 if (bbio->bio.bi_status) { 1264 
scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors); 1265 scrub_bitmap_set_error(stripe, sector_nr, num_sectors); 1266 } else { 1267 scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors); 1268 } 1269 bio_put(&bbio->bio); 1270 if (atomic_dec_and_test(&stripe->pending_io)) { 1271 wake_up(&stripe->io_wait); 1272 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); 1273 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); 1274 } 1275 } 1276 1277 static void scrub_write_endio(struct btrfs_bio *bbio) 1278 { 1279 struct scrub_stripe *stripe = bbio->private; 1280 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1281 struct bio_vec *bvec; 1282 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1283 u32 bio_size = 0; 1284 int i; 1285 1286 bio_for_each_bvec_all(bvec, &bbio->bio, i) 1287 bio_size += bvec->bv_len; 1288 1289 if (bbio->bio.bi_status) { 1290 unsigned long flags; 1291 1292 spin_lock_irqsave(&stripe->write_error_lock, flags); 1293 bitmap_set(&stripe->write_error_bitmap, sector_nr, 1294 bio_size >> fs_info->sectorsize_bits); 1295 spin_unlock_irqrestore(&stripe->write_error_lock, flags); 1296 for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) 1297 btrfs_dev_stat_inc_and_print(stripe->dev, 1298 BTRFS_DEV_STAT_WRITE_ERRS); 1299 } 1300 bio_put(&bbio->bio); 1301 1302 if (atomic_dec_and_test(&stripe->pending_io)) 1303 wake_up(&stripe->io_wait); 1304 } 1305 1306 static void scrub_submit_write_bio(struct scrub_ctx *sctx, 1307 struct scrub_stripe *stripe, 1308 struct btrfs_bio *bbio, bool dev_replace) 1309 { 1310 struct btrfs_fs_info *fs_info = sctx->fs_info; 1311 u32 bio_len = bbio->bio.bi_iter.bi_size; 1312 u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) - 1313 stripe->logical; 1314 1315 fill_writer_pointer_gap(sctx, stripe->physical + bio_off); 1316 atomic_inc(&stripe->pending_io); 1317 btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); 1318 if (!btrfs_is_zoned(fs_info)) 1319 
return; 1320 /* 1321 * For zoned writeback, queue depth must be 1, thus we must wait for 1322 * the write to finish before the next write. 1323 */ 1324 wait_scrub_stripe_io(stripe); 1325 1326 /* 1327 * And also need to update the write pointer if write finished 1328 * successfully. 1329 */ 1330 if (!test_bit(bio_off >> fs_info->sectorsize_bits, 1331 &stripe->write_error_bitmap)) 1332 sctx->write_pointer += bio_len; 1333 } 1334 1335 /* 1336 * Submit the write bio(s) for the sectors specified by @write_bitmap. 1337 * 1338 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: 1339 * 1340 * - Only needs logical bytenr and mirror_num 1341 * Just like the scrub read path 1342 * 1343 * - Would only result in writes to the specified mirror 1344 * Unlike the regular writeback path, which would write back to all stripes 1345 * 1346 * - Handle dev-replace and read-repair writeback differently 1347 */ 1348 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, 1349 unsigned long write_bitmap, bool dev_replace) 1350 { 1351 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1352 struct btrfs_bio *bbio = NULL; 1353 int sector_nr; 1354 1355 for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { 1356 /* We should only writeback sectors covered by an extent. */ 1357 ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr)); 1358 1359 /* Cannot merge with previous sector, submit the current one. 
*/ 1360 if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { 1361 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); 1362 bbio = NULL; 1363 } 1364 if (!bbio) 1365 bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE, 1366 stripe->logical + (sector_nr << fs_info->sectorsize_bits), 1367 scrub_write_endio, stripe); 1368 scrub_bio_add_sector(bbio, stripe, sector_nr); 1369 } 1370 if (bbio) 1371 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); 1372 } 1373 1374 /* 1375 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 1376 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. 1377 */ 1378 static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, 1379 unsigned int bio_size) 1380 { 1381 const int time_slice = 1000; 1382 s64 delta; 1383 ktime_t now; 1384 u32 div; 1385 u64 bwlimit; 1386 1387 bwlimit = READ_ONCE(device->scrub_speed_max); 1388 if (bwlimit == 0) 1389 return; 1390 1391 /* 1392 * Slice is divided into intervals when the IO is submitted, adjust by 1393 * bwlimit and maximum of 64 intervals. 1394 */ 1395 div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); 1396 1397 /* Start new epoch, set deadline */ 1398 now = ktime_get(); 1399 if (sctx->throttle_deadline == 0) { 1400 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); 1401 sctx->throttle_sent = 0; 1402 } 1403 1404 /* Still in the time to send? 
*/ 1405 if (ktime_before(now, sctx->throttle_deadline)) { 1406 /* If current bio is within the limit, send it */ 1407 sctx->throttle_sent += bio_size; 1408 if (sctx->throttle_sent <= div_u64(bwlimit, div)) 1409 return; 1410 1411 /* We're over the limit, sleep until the rest of the slice */ 1412 delta = ktime_ms_delta(sctx->throttle_deadline, now); 1413 } else { 1414 /* New request after deadline, start new epoch */ 1415 delta = 0; 1416 } 1417 1418 if (delta) { 1419 long timeout; 1420 1421 timeout = div_u64(delta * HZ, 1000); 1422 schedule_timeout_interruptible(timeout); 1423 } 1424 1425 /* Next call will start the deadline period */ 1426 sctx->throttle_deadline = 0; 1427 } 1428 1429 /* 1430 * Given a physical address, this will calculate it's 1431 * logical offset. if this is a parity stripe, it will return 1432 * the most left data stripe's logical offset. 1433 * 1434 * return 0 if it is a data stripe, 1 means parity stripe. 1435 */ 1436 static int get_raid56_logic_offset(u64 physical, int num, 1437 struct btrfs_chunk_map *map, u64 *offset, 1438 u64 *stripe_start) 1439 { 1440 int i; 1441 int j = 0; 1442 u64 last_offset; 1443 const int data_stripes = nr_data_stripes(map); 1444 1445 last_offset = (physical - map->stripes[num].physical) * data_stripes; 1446 if (stripe_start) 1447 *stripe_start = last_offset; 1448 1449 *offset = last_offset; 1450 for (i = 0; i < data_stripes; i++) { 1451 u32 stripe_nr; 1452 u32 stripe_index; 1453 u32 rot; 1454 1455 *offset = last_offset + btrfs_stripe_nr_to_offset(i); 1456 1457 stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; 1458 1459 /* Work out the disk rotation on this stripe-set */ 1460 rot = stripe_nr % map->num_stripes; 1461 /* calculate which stripe this data locates */ 1462 rot += i; 1463 stripe_index = rot % map->num_stripes; 1464 if (stripe_index == num) 1465 return 0; 1466 if (stripe_index < num) 1467 j++; 1468 } 1469 *offset = last_offset + btrfs_stripe_nr_to_offset(j); 1470 return 1; 1471 } 1472 1473 
/* 1474 * Return 0 if the extent item range covers any byte of the range. 1475 * Return <0 if the extent item is before @search_start. 1476 * Return >0 if the extent item is after @start_start + @search_len. 1477 */ 1478 static int compare_extent_item_range(struct btrfs_path *path, 1479 u64 search_start, u64 search_len) 1480 { 1481 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; 1482 u64 len; 1483 struct btrfs_key key; 1484 1485 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1486 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || 1487 key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type); 1488 if (key.type == BTRFS_METADATA_ITEM_KEY) 1489 len = fs_info->nodesize; 1490 else 1491 len = key.offset; 1492 1493 if (key.objectid + len <= search_start) 1494 return -1; 1495 if (key.objectid >= search_start + search_len) 1496 return 1; 1497 return 0; 1498 } 1499 1500 /* 1501 * Locate one extent item which covers any byte in range 1502 * [@search_start, @search_start + @search_length) 1503 * 1504 * If the path is not initialized, we will initialize the search by doing 1505 * a btrfs_search_slot(). 1506 * If the path is already initialized, we will use the path as the initial 1507 * slot, to avoid duplicated btrfs_search_slot() calls. 1508 * 1509 * NOTE: If an extent item starts before @search_start, we will still 1510 * return the extent item. This is for data extent crossing stripe boundary. 1511 * 1512 * Return 0 if we found such extent item, and @path will point to the extent item. 1513 * Return >0 if no such extent item can be found, and @path will be released. 1514 * Return <0 if hit fatal error, and @path will be released. 
1515 */ 1516 static int find_first_extent_item(struct btrfs_root *extent_root, 1517 struct btrfs_path *path, 1518 u64 search_start, u64 search_len) 1519 { 1520 struct btrfs_fs_info *fs_info = extent_root->fs_info; 1521 struct btrfs_key key; 1522 int ret; 1523 1524 /* Continue using the existing path */ 1525 if (path->nodes[0]) 1526 goto search_forward; 1527 1528 key.objectid = search_start; 1529 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1530 key.type = BTRFS_METADATA_ITEM_KEY; 1531 else 1532 key.type = BTRFS_EXTENT_ITEM_KEY; 1533 key.offset = (u64)-1; 1534 1535 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 1536 if (ret < 0) 1537 return ret; 1538 if (unlikely(ret == 0)) { 1539 /* 1540 * Key with offset -1 found, there would have to exist an extent 1541 * item with such offset, but this is out of the valid range. 1542 */ 1543 btrfs_release_path(path); 1544 return -EUCLEAN; 1545 } 1546 1547 /* 1548 * Here we intentionally pass 0 as @min_objectid, as there could be 1549 * an extent item starting before @search_start. 1550 */ 1551 ret = btrfs_previous_extent_item(extent_root, path, 0); 1552 if (ret < 0) 1553 return ret; 1554 /* 1555 * No matter whether we have found an extent item, the next loop will 1556 * properly do every check on the key. 1557 */ 1558 search_forward: 1559 while (true) { 1560 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1561 if (key.objectid >= search_start + search_len) 1562 break; 1563 if (key.type != BTRFS_METADATA_ITEM_KEY && 1564 key.type != BTRFS_EXTENT_ITEM_KEY) 1565 goto next; 1566 1567 ret = compare_extent_item_range(path, search_start, search_len); 1568 if (ret == 0) 1569 return ret; 1570 if (ret > 0) 1571 break; 1572 next: 1573 ret = btrfs_next_item(extent_root, path); 1574 if (ret) { 1575 /* Either no more items or a fatal error. 
*/ 1576 btrfs_release_path(path); 1577 return ret; 1578 } 1579 } 1580 btrfs_release_path(path); 1581 return 1; 1582 } 1583 1584 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, 1585 u64 *size_ret, u64 *flags_ret, u64 *generation_ret) 1586 { 1587 struct btrfs_key key; 1588 struct btrfs_extent_item *ei; 1589 1590 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1591 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || 1592 key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type); 1593 *extent_start_ret = key.objectid; 1594 if (key.type == BTRFS_METADATA_ITEM_KEY) 1595 *size_ret = path->nodes[0]->fs_info->nodesize; 1596 else 1597 *size_ret = key.offset; 1598 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); 1599 *flags_ret = btrfs_extent_flags(path->nodes[0], ei); 1600 *generation_ret = btrfs_extent_generation(path->nodes[0], ei); 1601 } 1602 1603 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, 1604 u64 physical, u64 physical_end) 1605 { 1606 struct btrfs_fs_info *fs_info = sctx->fs_info; 1607 int ret = 0; 1608 1609 if (!btrfs_is_zoned(fs_info)) 1610 return 0; 1611 1612 mutex_lock(&sctx->wr_lock); 1613 if (sctx->write_pointer < physical_end) { 1614 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, 1615 physical, 1616 sctx->write_pointer); 1617 if (ret) 1618 btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); 1619 } 1620 mutex_unlock(&sctx->wr_lock); 1621 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); 1622 1623 return ret; 1624 } 1625 1626 static void fill_one_extent_info(struct btrfs_fs_info *fs_info, 1627 struct scrub_stripe *stripe, 1628 u64 extent_start, u64 extent_len, 1629 u64 extent_flags, u64 extent_gen) 1630 { 1631 for (u64 cur_logical = max(stripe->logical, extent_start); 1632 cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, 1633 extent_start + extent_len); 1634 cur_logical += fs_info->sectorsize) { 1635 const int nr_sector = 
(cur_logical - stripe->logical) >> 1636 fs_info->sectorsize_bits; 1637 struct scrub_sector_verification *sector = 1638 &stripe->sectors[nr_sector]; 1639 1640 scrub_bitmap_set_bit_has_extent(stripe, nr_sector); 1641 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1642 scrub_bitmap_set_bit_is_metadata(stripe, nr_sector); 1643 sector->generation = extent_gen; 1644 } 1645 } 1646 } 1647 1648 static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) 1649 { 1650 ASSERT(stripe->nr_sectors); 1651 bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors); 1652 } 1653 1654 /* 1655 * Locate one stripe which has at least one extent in its range. 1656 * 1657 * Return 0 if found such stripe, and store its info into @stripe. 1658 * Return >0 if there is no such stripe in the specified range. 1659 * Return <0 for error. 1660 */ 1661 static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, 1662 struct btrfs_path *extent_path, 1663 struct btrfs_path *csum_path, 1664 struct btrfs_device *dev, u64 physical, 1665 int mirror_num, u64 logical_start, 1666 u32 logical_len, 1667 struct scrub_stripe *stripe) 1668 { 1669 struct btrfs_fs_info *fs_info = bg->fs_info; 1670 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); 1671 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); 1672 const u64 logical_end = logical_start + logical_len; 1673 u64 cur_logical = logical_start; 1674 u64 stripe_end; 1675 u64 extent_start; 1676 u64 extent_len; 1677 u64 extent_flags; 1678 u64 extent_gen; 1679 int ret; 1680 1681 if (unlikely(!extent_root || !csum_root)) { 1682 btrfs_err(fs_info, "scrub: no valid extent or csum root found"); 1683 return -EUCLEAN; 1684 } 1685 memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * 1686 stripe->nr_sectors); 1687 scrub_stripe_reset_bitmaps(stripe); 1688 1689 /* The range must be inside the bg. 
*/ 1690 ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg), 1691 "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", 1692 bg->start, logical_start, logical_end, btrfs_block_group_end(bg)); 1693 1694 ret = find_first_extent_item(extent_root, extent_path, logical_start, 1695 logical_len); 1696 /* Either error or not found. */ 1697 if (ret) 1698 return ret; 1699 get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, 1700 &extent_gen); 1701 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1702 stripe->nr_meta_extents++; 1703 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) 1704 stripe->nr_data_extents++; 1705 cur_logical = max(extent_start, cur_logical); 1706 1707 /* 1708 * Round down to stripe boundary. 1709 * 1710 * The extra calculation against bg->start is to handle block groups 1711 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. 1712 */ 1713 stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + 1714 bg->start; 1715 stripe->physical = physical + stripe->logical - logical_start; 1716 stripe->dev = dev; 1717 stripe->bg = bg; 1718 stripe->mirror_num = mirror_num; 1719 stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; 1720 1721 /* Fill the first extent info into stripe->sectors[] array. */ 1722 fill_one_extent_info(fs_info, stripe, extent_start, extent_len, 1723 extent_flags, extent_gen); 1724 cur_logical = extent_start + extent_len; 1725 1726 /* Fill the extent info for the remaining sectors. 
*/ 1727 while (cur_logical <= stripe_end) { 1728 ret = find_first_extent_item(extent_root, extent_path, cur_logical, 1729 stripe_end - cur_logical + 1); 1730 if (ret < 0) 1731 return ret; 1732 if (ret > 0) { 1733 ret = 0; 1734 break; 1735 } 1736 get_extent_info(extent_path, &extent_start, &extent_len, 1737 &extent_flags, &extent_gen); 1738 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1739 stripe->nr_meta_extents++; 1740 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) 1741 stripe->nr_data_extents++; 1742 fill_one_extent_info(fs_info, stripe, extent_start, extent_len, 1743 extent_flags, extent_gen); 1744 cur_logical = extent_start + extent_len; 1745 } 1746 1747 /* Now fill the data csum. */ 1748 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { 1749 int sector_nr; 1750 unsigned long csum_bitmap = 0; 1751 1752 /* Csum space should have already been allocated. */ 1753 ASSERT(stripe->csums); 1754 1755 /* 1756 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN 1757 * should contain at most 16 sectors. 
1758 */ 1759 ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 1760 1761 ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, 1762 stripe->logical, stripe_end, 1763 stripe->csums, &csum_bitmap); 1764 if (ret < 0) 1765 return ret; 1766 if (ret > 0) 1767 ret = 0; 1768 1769 for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { 1770 stripe->sectors[sector_nr].csum = stripe->csums + 1771 sector_nr * fs_info->csum_size; 1772 } 1773 } 1774 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 1775 1776 return ret; 1777 } 1778 1779 static void scrub_reset_stripe(struct scrub_stripe *stripe) 1780 { 1781 scrub_stripe_reset_bitmaps(stripe); 1782 1783 stripe->nr_meta_extents = 0; 1784 stripe->nr_data_extents = 0; 1785 stripe->state = 0; 1786 1787 for (int i = 0; i < stripe->nr_sectors; i++) { 1788 stripe->sectors[i].csum = NULL; 1789 stripe->sectors[i].generation = 0; 1790 } 1791 } 1792 1793 static u32 stripe_length(const struct scrub_stripe *stripe) 1794 { 1795 ASSERT(stripe->bg); 1796 1797 return min(BTRFS_STRIPE_LEN, 1798 stripe->bg->start + stripe->bg->length - stripe->logical); 1799 } 1800 1801 static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) 1802 { 1803 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1804 struct btrfs_bio *bbio = NULL; 1805 unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; 1806 const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); 1807 u64 stripe_len = BTRFS_STRIPE_LEN; 1808 int mirror = stripe->mirror_num; 1809 int i; 1810 1811 atomic_inc(&stripe->pending_io); 1812 1813 for_each_set_bit(i, &has_extent, stripe->nr_sectors) { 1814 /* We're beyond the chunk boundary, no need to read anymore. */ 1815 if (i >= nr_sectors) 1816 break; 1817 1818 /* The current sector cannot be merged, submit the bio. 
*/ 1819 if (bbio && 1820 ((i > 0 && !test_bit(i - 1, &has_extent)) || 1821 bbio->bio.bi_iter.bi_size >= stripe_len)) { 1822 ASSERT(bbio->bio.bi_iter.bi_size); 1823 atomic_inc(&stripe->pending_io); 1824 btrfs_submit_bbio(bbio, mirror); 1825 bbio = NULL; 1826 } 1827 1828 if (!bbio) { 1829 struct btrfs_io_stripe io_stripe = {}; 1830 struct btrfs_io_context *bioc = NULL; 1831 const u64 logical = stripe->logical + 1832 (i << fs_info->sectorsize_bits); 1833 int ret; 1834 1835 io_stripe.rst_search_commit_root = true; 1836 stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; 1837 /* 1838 * For RST cases, we need to manually split the bbio to 1839 * follow the RST boundary. 1840 */ 1841 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 1842 &stripe_len, &bioc, &io_stripe, &mirror); 1843 btrfs_put_bioc(bioc); 1844 if (ret < 0) { 1845 if (ret != -ENODATA) { 1846 /* 1847 * Earlier btrfs_get_raid_extent_offset() 1848 * returned -ENODATA, which means there's 1849 * no entry for the corresponding range 1850 * in the stripe tree. But if it's in 1851 * the extent tree, then it's a preallocated 1852 * extent and not an error. 
1853 */ 1854 scrub_bitmap_set_bit_io_error(stripe, i); 1855 scrub_bitmap_set_bit_error(stripe, i); 1856 } 1857 continue; 1858 } 1859 1860 bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, 1861 logical, scrub_read_endio, stripe); 1862 } 1863 1864 scrub_bio_add_sector(bbio, stripe, i); 1865 } 1866 1867 if (bbio) { 1868 ASSERT(bbio->bio.bi_iter.bi_size); 1869 atomic_inc(&stripe->pending_io); 1870 btrfs_submit_bbio(bbio, mirror); 1871 } 1872 1873 if (atomic_dec_and_test(&stripe->pending_io)) { 1874 wake_up(&stripe->io_wait); 1875 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); 1876 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); 1877 } 1878 } 1879 1880 static void scrub_submit_initial_read(struct scrub_ctx *sctx, 1881 struct scrub_stripe *stripe) 1882 { 1883 struct btrfs_fs_info *fs_info = sctx->fs_info; 1884 struct btrfs_bio *bbio; 1885 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 1886 unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; 1887 int mirror = stripe->mirror_num; 1888 1889 ASSERT(stripe->bg); 1890 ASSERT(stripe->mirror_num > 0); 1891 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); 1892 1893 if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { 1894 scrub_submit_extent_sector_read(stripe); 1895 return; 1896 } 1897 1898 bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, 1899 stripe->logical, scrub_read_endio, stripe); 1900 /* Read the whole range inside the chunk boundary. */ 1901 for (unsigned int cur = 0; cur < nr_sectors; cur++) 1902 scrub_bio_add_sector(bbio, stripe, cur); 1903 atomic_inc(&stripe->pending_io); 1904 1905 /* 1906 * For dev-replace, either user asks to avoid the source dev, or 1907 * the device is missing, we try the next mirror instead. 
1908 */ 1909 if (sctx->is_dev_replace && 1910 (fs_info->dev_replace.cont_reading_from_srcdev_mode == 1911 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || 1912 !stripe->dev->bdev)) { 1913 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, 1914 stripe->bg->length); 1915 1916 mirror = calc_next_mirror(mirror, num_copies); 1917 } 1918 btrfs_submit_bbio(bbio, mirror); 1919 } 1920 1921 static bool stripe_has_metadata_error(struct scrub_stripe *stripe) 1922 { 1923 const unsigned long error = scrub_bitmap_read_error(stripe); 1924 int i; 1925 1926 for_each_set_bit(i, &error, stripe->nr_sectors) { 1927 if (scrub_bitmap_test_bit_is_metadata(stripe, i)) { 1928 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1929 1930 btrfs_err(fs_info, 1931 "scrub: stripe %llu has unrepaired metadata sector at logical %llu", 1932 stripe->logical, 1933 stripe->logical + (i << fs_info->sectorsize_bits)); 1934 return true; 1935 } 1936 } 1937 return false; 1938 } 1939 1940 static void submit_initial_group_read(struct scrub_ctx *sctx, 1941 unsigned int first_slot, 1942 unsigned int nr_stripes) 1943 { 1944 struct blk_plug plug; 1945 1946 ASSERT(first_slot < SCRUB_TOTAL_STRIPES); 1947 ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); 1948 1949 scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, 1950 btrfs_stripe_nr_to_offset(nr_stripes)); 1951 blk_start_plug(&plug); 1952 for (int i = 0; i < nr_stripes; i++) { 1953 struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; 1954 1955 /* Those stripes should be initialized. 
*/ 1956 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); 1957 scrub_submit_initial_read(sctx, stripe); 1958 } 1959 blk_finish_plug(&plug); 1960 } 1961 1962 static int flush_scrub_stripes(struct scrub_ctx *sctx) 1963 { 1964 struct btrfs_fs_info *fs_info = sctx->fs_info; 1965 struct scrub_stripe *stripe; 1966 const int nr_stripes = sctx->cur_stripe; 1967 int ret = 0; 1968 1969 if (!nr_stripes) 1970 return 0; 1971 1972 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); 1973 1974 /* Submit the stripes which are populated but not submitted. */ 1975 if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { 1976 const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); 1977 1978 submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); 1979 } 1980 1981 for (int i = 0; i < nr_stripes; i++) { 1982 stripe = &sctx->stripes[i]; 1983 1984 wait_event(stripe->repair_wait, 1985 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); 1986 } 1987 1988 /* Submit for dev-replace. */ 1989 if (sctx->is_dev_replace) { 1990 /* 1991 * For dev-replace, if we know there is something wrong with 1992 * metadata, we should immediately abort. 1993 */ 1994 for (int i = 0; i < nr_stripes; i++) { 1995 if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) { 1996 ret = -EIO; 1997 goto out; 1998 } 1999 } 2000 for (int i = 0; i < nr_stripes; i++) { 2001 unsigned long good; 2002 unsigned long has_extent; 2003 unsigned long error; 2004 2005 stripe = &sctx->stripes[i]; 2006 2007 ASSERT(stripe->dev == fs_info->dev_replace.srcdev); 2008 2009 has_extent = scrub_bitmap_read_has_extent(stripe); 2010 error = scrub_bitmap_read_error(stripe); 2011 bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors); 2012 scrub_write_sectors(sctx, stripe, good, true); 2013 } 2014 } 2015 2016 /* Wait for the above writebacks to finish. 
*/ 2017 for (int i = 0; i < nr_stripes; i++) { 2018 stripe = &sctx->stripes[i]; 2019 2020 wait_scrub_stripe_io(stripe); 2021 spin_lock(&sctx->stat_lock); 2022 sctx->stat.last_physical = stripe->physical + stripe_length(stripe); 2023 spin_unlock(&sctx->stat_lock); 2024 scrub_reset_stripe(stripe); 2025 } 2026 out: 2027 sctx->cur_stripe = 0; 2028 return ret; 2029 } 2030 2031 static void raid56_scrub_wait_endio(struct bio *bio) 2032 { 2033 complete(bio->bi_private); 2034 } 2035 2036 static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, 2037 struct btrfs_device *dev, int mirror_num, 2038 u64 logical, u32 length, u64 physical, 2039 u64 *found_logical_ret) 2040 { 2041 struct scrub_stripe *stripe; 2042 int ret; 2043 2044 /* 2045 * There should always be one slot left, as caller filling the last 2046 * slot should flush them all. 2047 */ 2048 ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); 2049 2050 /* @found_logical_ret must be specified. */ 2051 ASSERT(found_logical_ret); 2052 2053 stripe = &sctx->stripes[sctx->cur_stripe]; 2054 scrub_reset_stripe(stripe); 2055 ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, 2056 &sctx->csum_path, dev, physical, 2057 mirror_num, logical, length, stripe); 2058 /* Either >0 as no more extents or <0 for error. */ 2059 if (ret) 2060 return ret; 2061 *found_logical_ret = stripe->logical; 2062 sctx->cur_stripe++; 2063 2064 /* We filled one group, submit it. */ 2065 if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { 2066 const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; 2067 2068 submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); 2069 } 2070 2071 /* Last slot used, flush them all. */ 2072 if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) 2073 return flush_scrub_stripes(sctx); 2074 return 0; 2075 } 2076 2077 /* 2078 * Return 0 if we should not cancel the scrub. 
2079 * Return <0 if we need to cancel the scrub, returned value will 2080 * indicate the reason: 2081 * - -ECANCELED - Being explicitly canceled through ioctl. 2082 * - -EINTR - Being interrupted by signal or fs/process freezing. 2083 */ 2084 static int should_cancel_scrub(const struct scrub_ctx *sctx) 2085 { 2086 struct btrfs_fs_info *fs_info = sctx->fs_info; 2087 2088 if (atomic_read(&fs_info->scrub_cancel_req) || 2089 atomic_read(&sctx->cancel_req)) 2090 return -ECANCELED; 2091 2092 /* 2093 * The user (e.g. fsfreeze command) or power management (PM) 2094 * suspend/hibernate can freeze the fs. And PM suspend/hibernate will 2095 * also freeze all user processes. 2096 * 2097 * A user process can only be frozen when it is in user space, thus we 2098 * have to cancel the run so that the process can return to the user 2099 * space. 2100 * 2101 * Furthermore we have to check both filesystem and process freezing, 2102 * as PM can be configured to freeze the filesystems before processes. 2103 * 2104 * If we only check fs freezing, then suspend without fs freezing 2105 * will timeout, as the process is still in kernel space. 2106 * 2107 * If we only check process freezing, then suspend with fs freezing 2108 * will timeout, as the running scrub will prevent the fs from being frozen. 
2109 */ 2110 if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || 2111 freezing(current) || signal_pending(current)) 2112 return -EINTR; 2113 return 0; 2114 } 2115 2116 static int scrub_raid56_cached_parity(struct scrub_ctx *sctx, 2117 struct btrfs_device *scrub_dev, 2118 struct btrfs_chunk_map *map, 2119 u64 full_stripe_start, 2120 unsigned long *extent_bitmap) 2121 { 2122 DECLARE_COMPLETION_ONSTACK(io_done); 2123 struct btrfs_fs_info *fs_info = sctx->fs_info; 2124 struct btrfs_io_context *bioc = NULL; 2125 struct btrfs_raid_bio *rbio; 2126 struct bio bio; 2127 const int data_stripes = nr_data_stripes(map); 2128 u64 length = btrfs_stripe_nr_to_offset(data_stripes); 2129 int ret; 2130 2131 bio_init(&bio, NULL, NULL, 0, REQ_OP_READ); 2132 bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; 2133 bio.bi_private = &io_done; 2134 bio.bi_end_io = raid56_scrub_wait_endio; 2135 2136 btrfs_bio_counter_inc_blocked(fs_info); 2137 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, 2138 &length, &bioc, NULL, NULL); 2139 if (ret < 0) 2140 goto out; 2141 /* For RAID56 write there must be an @bioc allocated. */ 2142 ASSERT(bioc); 2143 rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap, 2144 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 2145 btrfs_put_bioc(bioc); 2146 if (!rbio) { 2147 ret = -ENOMEM; 2148 goto out; 2149 } 2150 /* Use the recovered stripes as cache to avoid read them from disk again. 
*/ 2151 for (int i = 0; i < data_stripes; i++) { 2152 struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; 2153 2154 raid56_parity_cache_data_folios(rbio, stripe->folios, 2155 full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); 2156 } 2157 raid56_parity_submit_scrub_rbio(rbio); 2158 wait_for_completion_io(&io_done); 2159 ret = blk_status_to_errno(bio.bi_status); 2160 out: 2161 btrfs_bio_counter_dec(fs_info); 2162 bio_uninit(&bio); 2163 return ret; 2164 } 2165 2166 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, 2167 struct btrfs_device *scrub_dev, 2168 struct btrfs_block_group *bg, 2169 struct btrfs_chunk_map *map, 2170 u64 full_stripe_start) 2171 { 2172 struct btrfs_fs_info *fs_info = sctx->fs_info; 2173 BTRFS_PATH_AUTO_RELEASE(extent_path); 2174 BTRFS_PATH_AUTO_RELEASE(csum_path); 2175 struct scrub_stripe *stripe; 2176 bool all_empty = true; 2177 const int data_stripes = nr_data_stripes(map); 2178 unsigned long extent_bitmap = 0; 2179 int ret; 2180 2181 ASSERT(sctx->raid56_data_stripes); 2182 2183 ret = should_cancel_scrub(sctx); 2184 if (ret < 0) 2185 return ret; 2186 2187 if (atomic_read(&fs_info->scrub_pause_req)) 2188 scrub_blocked_if_needed(fs_info); 2189 2190 spin_lock(&bg->lock); 2191 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2192 spin_unlock(&bg->lock); 2193 return 0; 2194 } 2195 spin_unlock(&bg->lock); 2196 2197 /* 2198 * For data stripe search, we cannot reuse the same extent/csum paths, 2199 * as the data stripe bytenr may be smaller than previous extent. Thus 2200 * we have to use our own extent/csum paths. 
2201 */ 2202 extent_path.search_commit_root = true; 2203 extent_path.skip_locking = true; 2204 csum_path.search_commit_root = true; 2205 csum_path.skip_locking = true; 2206 2207 for (int i = 0; i < data_stripes; i++) { 2208 int stripe_index; 2209 int rot; 2210 u64 physical; 2211 2212 stripe = &sctx->raid56_data_stripes[i]; 2213 rot = div_u64(full_stripe_start - bg->start, 2214 data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; 2215 stripe_index = (i + rot) % map->num_stripes; 2216 physical = map->stripes[stripe_index].physical + 2217 btrfs_stripe_nr_to_offset(rot); 2218 2219 scrub_reset_stripe(stripe); 2220 set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); 2221 ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, 2222 map->stripes[stripe_index].dev, physical, 1, 2223 full_stripe_start + btrfs_stripe_nr_to_offset(i), 2224 BTRFS_STRIPE_LEN, stripe); 2225 if (ret < 0) 2226 return ret; 2227 /* 2228 * No extent in this data stripe, need to manually mark them 2229 * initialized to make later read submission happy. 2230 */ 2231 if (ret > 0) { 2232 stripe->logical = full_stripe_start + 2233 btrfs_stripe_nr_to_offset(i); 2234 stripe->dev = map->stripes[stripe_index].dev; 2235 stripe->mirror_num = 1; 2236 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 2237 } 2238 } 2239 2240 /* Check if all data stripes are empty. */ 2241 for (int i = 0; i < data_stripes; i++) { 2242 stripe = &sctx->raid56_data_stripes[i]; 2243 if (!scrub_bitmap_empty_has_extent(stripe)) { 2244 all_empty = false; 2245 break; 2246 } 2247 } 2248 if (all_empty) 2249 return 0; 2250 2251 for (int i = 0; i < data_stripes; i++) { 2252 stripe = &sctx->raid56_data_stripes[i]; 2253 scrub_submit_initial_read(sctx, stripe); 2254 } 2255 for (int i = 0; i < data_stripes; i++) { 2256 stripe = &sctx->raid56_data_stripes[i]; 2257 2258 wait_event(stripe->repair_wait, 2259 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); 2260 } 2261 /* For now, no zoned support for RAID56. 
*/ 2262 ASSERT(!btrfs_is_zoned(sctx->fs_info)); 2263 2264 /* 2265 * Now all data stripes are properly verified. Check if we have any 2266 * unrepaired, if so abort immediately or we could further corrupt the 2267 * P/Q stripes. 2268 * 2269 * During the loop, also populate extent_bitmap. 2270 */ 2271 for (int i = 0; i < data_stripes; i++) { 2272 unsigned long error; 2273 unsigned long has_extent; 2274 2275 stripe = &sctx->raid56_data_stripes[i]; 2276 2277 error = scrub_bitmap_read_error(stripe); 2278 has_extent = scrub_bitmap_read_has_extent(stripe); 2279 2280 /* 2281 * We should only check the errors where there is an extent. 2282 * As we may hit an empty data stripe while it's missing. 2283 */ 2284 bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); 2285 if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { 2286 btrfs_err(fs_info, 2287 "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", 2288 full_stripe_start, i, stripe->nr_sectors, 2289 &error); 2290 return ret; 2291 } 2292 bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, 2293 stripe->nr_sectors); 2294 } 2295 2296 /* Now we can check and regenerate the P/Q stripe. */ 2297 return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, 2298 &extent_bitmap); 2299 } 2300 2301 /* 2302 * Scrub one range which can only has simple mirror based profile. 2303 * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in 2304 * RAID0/RAID10). 2305 * 2306 * Since we may need to handle a subset of block group, we need @logical_start 2307 * and @logical_length parameter. 
2308 */ 2309 static int scrub_simple_mirror(struct scrub_ctx *sctx, 2310 struct btrfs_block_group *bg, 2311 u64 logical_start, u64 logical_length, 2312 struct btrfs_device *device, 2313 u64 physical, int mirror_num) 2314 { 2315 struct btrfs_fs_info *fs_info = sctx->fs_info; 2316 const u64 logical_end = logical_start + logical_length; 2317 u64 cur_logical = logical_start; 2318 int ret = 0; 2319 2320 /* The range must be inside the bg */ 2321 ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg)); 2322 2323 /* Go through each extent items inside the logical range */ 2324 while (cur_logical < logical_end) { 2325 u64 found_logical = U64_MAX; 2326 u64 cur_physical = physical + cur_logical - logical_start; 2327 2328 ret = should_cancel_scrub(sctx); 2329 if (ret < 0) 2330 break; 2331 2332 if (atomic_read(&fs_info->scrub_pause_req)) 2333 scrub_blocked_if_needed(fs_info); 2334 2335 spin_lock(&bg->lock); 2336 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2337 spin_unlock(&bg->lock); 2338 ret = 0; 2339 break; 2340 } 2341 spin_unlock(&bg->lock); 2342 2343 ret = queue_scrub_stripe(sctx, bg, device, mirror_num, 2344 cur_logical, logical_end - cur_logical, 2345 cur_physical, &found_logical); 2346 if (ret > 0) { 2347 /* No more extent, just update the accounting */ 2348 spin_lock(&sctx->stat_lock); 2349 sctx->stat.last_physical = physical + logical_length; 2350 spin_unlock(&sctx->stat_lock); 2351 ret = 0; 2352 break; 2353 } 2354 if (ret < 0) 2355 break; 2356 2357 /* queue_scrub_stripe() returned 0, @found_logical must be updated. 
*/ 2358 ASSERT(found_logical != U64_MAX); 2359 cur_logical = found_logical + BTRFS_STRIPE_LEN; 2360 2361 /* Don't hold CPU for too long time */ 2362 cond_resched(); 2363 } 2364 return ret; 2365 } 2366 2367 /* Calculate the full stripe length for simple stripe based profiles */ 2368 static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) 2369 { 2370 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2371 BTRFS_BLOCK_GROUP_RAID10)); 2372 2373 return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); 2374 } 2375 2376 /* Get the logical bytenr for the stripe */ 2377 static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, 2378 struct btrfs_block_group *bg, 2379 int stripe_index) 2380 { 2381 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2382 BTRFS_BLOCK_GROUP_RAID10)); 2383 ASSERT(stripe_index < map->num_stripes); 2384 2385 /* 2386 * (stripe_index / sub_stripes) gives how many data stripes we need to 2387 * skip. 2388 */ 2389 return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + 2390 bg->start; 2391 } 2392 2393 /* Get the mirror number for the stripe */ 2394 static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) 2395 { 2396 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2397 BTRFS_BLOCK_GROUP_RAID10)); 2398 ASSERT(stripe_index < map->num_stripes); 2399 2400 /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... 
*/ 2401 return stripe_index % map->sub_stripes + 1; 2402 } 2403 2404 static int scrub_simple_stripe(struct scrub_ctx *sctx, 2405 struct btrfs_block_group *bg, 2406 struct btrfs_chunk_map *map, 2407 struct btrfs_device *device, 2408 int stripe_index) 2409 { 2410 const u64 logical_increment = simple_stripe_full_stripe_len(map); 2411 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); 2412 const u64 orig_physical = map->stripes[stripe_index].physical; 2413 const u64 end = btrfs_block_group_end(bg); 2414 const int mirror_num = simple_stripe_mirror_num(map, stripe_index); 2415 u64 cur_logical = orig_logical; 2416 u64 cur_physical = orig_physical; 2417 int ret = 0; 2418 2419 while (cur_logical < end) { 2420 /* 2421 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is 2422 * just RAID1, so we can reuse scrub_simple_mirror() to scrub 2423 * this stripe. 2424 */ 2425 ret = scrub_simple_mirror(sctx, bg, cur_logical, 2426 BTRFS_STRIPE_LEN, device, cur_physical, 2427 mirror_num); 2428 if (ret) 2429 return ret; 2430 /* Skip to next stripe which belongs to the target device */ 2431 cur_logical += logical_increment; 2432 /* For physical offset, we just go to next stripe */ 2433 cur_physical += BTRFS_STRIPE_LEN; 2434 } 2435 return ret; 2436 } 2437 2438 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2439 struct btrfs_block_group *bg, 2440 struct btrfs_chunk_map *map, 2441 struct btrfs_device *scrub_dev, 2442 int stripe_index) 2443 { 2444 struct btrfs_fs_info *fs_info = sctx->fs_info; 2445 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; 2446 const u64 chunk_logical = bg->start; 2447 int ret; 2448 int ret2; 2449 u64 physical = map->stripes[stripe_index].physical; 2450 const u64 dev_stripe_len = btrfs_calc_stripe_length(map); 2451 const u64 physical_end = physical + dev_stripe_len; 2452 u64 logical; 2453 u64 logic_end; 2454 /* The logical increment after finishing one stripe */ 2455 u64 increment; 2456 /* Offset inside 
the chunk */ 2457 u64 offset; 2458 u64 stripe_logical; 2459 2460 /* Extent_path should be released by now. */ 2461 ASSERT(sctx->extent_path.nodes[0] == NULL); 2462 2463 scrub_blocked_if_needed(fs_info); 2464 2465 if (sctx->is_dev_replace && 2466 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { 2467 mutex_lock(&sctx->wr_lock); 2468 sctx->write_pointer = physical; 2469 mutex_unlock(&sctx->wr_lock); 2470 } 2471 2472 /* Prepare the extra data stripes used by RAID56. */ 2473 if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { 2474 ASSERT(sctx->raid56_data_stripes == NULL); 2475 2476 sctx->raid56_data_stripes = kzalloc_objs(struct scrub_stripe, 2477 nr_data_stripes(map)); 2478 if (!sctx->raid56_data_stripes) { 2479 ret = -ENOMEM; 2480 goto out; 2481 } 2482 for (int i = 0; i < nr_data_stripes(map); i++) { 2483 ret = init_scrub_stripe(fs_info, 2484 &sctx->raid56_data_stripes[i]); 2485 if (ret < 0) 2486 goto out; 2487 sctx->raid56_data_stripes[i].bg = bg; 2488 sctx->raid56_data_stripes[i].sctx = sctx; 2489 } 2490 } 2491 /* 2492 * There used to be a big double loop to handle all profiles using the 2493 * same routine, which grows larger and more gross over time. 2494 * 2495 * So here we handle each profile differently, so simpler profiles 2496 * have simpler scrubbing function. 2497 */ 2498 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | 2499 BTRFS_BLOCK_GROUP_RAID56_MASK))) { 2500 /* 2501 * Above check rules out all complex profile, the remaining 2502 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple 2503 * mirrored duplication without stripe. 2504 * 2505 * Only @physical and @mirror_num needs to calculated using 2506 * @stripe_index. 
2507 */ 2508 ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, 2509 scrub_dev, map->stripes[stripe_index].physical, 2510 stripe_index + 1); 2511 offset = 0; 2512 goto out; 2513 } 2514 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 2515 ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); 2516 offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes); 2517 goto out; 2518 } 2519 2520 /* Only RAID56 goes through the old code */ 2521 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); 2522 ret = 0; 2523 2524 /* Calculate the logical end of the stripe */ 2525 get_raid56_logic_offset(physical_end, stripe_index, 2526 map, &logic_end, NULL); 2527 logic_end += chunk_logical; 2528 2529 /* Initialize @offset in case we need to go to out: label */ 2530 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); 2531 increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 2532 2533 /* 2534 * Due to the rotation, for RAID56 it's better to iterate each stripe 2535 * using their physical offset. 2536 */ 2537 while (physical < physical_end) { 2538 ret = get_raid56_logic_offset(physical, stripe_index, map, 2539 &logical, &stripe_logical); 2540 logical += chunk_logical; 2541 if (ret) { 2542 /* it is parity strip */ 2543 stripe_logical += chunk_logical; 2544 ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, 2545 map, stripe_logical); 2546 spin_lock(&sctx->stat_lock); 2547 sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN, 2548 physical_end); 2549 spin_unlock(&sctx->stat_lock); 2550 if (ret) 2551 goto out; 2552 goto next; 2553 } 2554 2555 /* 2556 * Now we're at a data stripe, scrub each extents in the range. 2557 * 2558 * At this stage, if we ignore the repair part, inside each data 2559 * stripe it is no different than SINGLE profile. 2560 * We can reuse scrub_simple_mirror() here, as the repair part 2561 * is still based on @mirror_num. 
2562 */ 2563 ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, 2564 scrub_dev, physical, 1); 2565 if (ret < 0) 2566 goto out; 2567 next: 2568 logical += increment; 2569 physical += BTRFS_STRIPE_LEN; 2570 spin_lock(&sctx->stat_lock); 2571 sctx->stat.last_physical = physical; 2572 spin_unlock(&sctx->stat_lock); 2573 } 2574 out: 2575 ret2 = flush_scrub_stripes(sctx); 2576 if (!ret) 2577 ret = ret2; 2578 btrfs_release_path(&sctx->extent_path); 2579 btrfs_release_path(&sctx->csum_path); 2580 2581 if (sctx->raid56_data_stripes) { 2582 for (int i = 0; i < nr_data_stripes(map); i++) 2583 release_scrub_stripe(&sctx->raid56_data_stripes[i]); 2584 kfree(sctx->raid56_data_stripes); 2585 sctx->raid56_data_stripes = NULL; 2586 } 2587 2588 if (sctx->is_dev_replace && ret >= 0) { 2589 ret2 = sync_write_pointer_for_zoned(sctx, 2590 chunk_logical + offset, 2591 map->stripes[stripe_index].physical, 2592 physical_end); 2593 if (ret2) 2594 ret = ret2; 2595 } 2596 2597 return ret < 0 ? ret : 0; 2598 } 2599 2600 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 2601 struct btrfs_block_group *bg, 2602 struct btrfs_device *scrub_dev, 2603 u64 dev_offset, 2604 u64 dev_extent_len) 2605 { 2606 struct btrfs_fs_info *fs_info = sctx->fs_info; 2607 struct btrfs_chunk_map *map; 2608 int i; 2609 int ret = 0; 2610 2611 map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); 2612 if (!map) { 2613 /* 2614 * Might have been an unused block group deleted by the cleaner 2615 * kthread or relocation. 
2616 */ 2617 spin_lock(&bg->lock); 2618 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) 2619 ret = -EINVAL; 2620 spin_unlock(&bg->lock); 2621 2622 return ret; 2623 } 2624 if (map->start != bg->start) 2625 goto out; 2626 if (map->chunk_len < dev_extent_len) 2627 goto out; 2628 2629 for (i = 0; i < map->num_stripes; ++i) { 2630 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 2631 map->stripes[i].physical == dev_offset) { 2632 ret = scrub_stripe(sctx, bg, map, scrub_dev, i); 2633 if (ret) 2634 goto out; 2635 } 2636 } 2637 out: 2638 btrfs_free_chunk_map(map); 2639 2640 return ret; 2641 } 2642 2643 static int finish_extent_writes_for_zoned(struct btrfs_root *root, 2644 struct btrfs_block_group *cache) 2645 { 2646 struct btrfs_fs_info *fs_info = cache->fs_info; 2647 2648 if (!btrfs_is_zoned(fs_info)) 2649 return 0; 2650 2651 btrfs_wait_block_group_reservations(cache); 2652 btrfs_wait_nocow_writers(cache); 2653 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); 2654 2655 return btrfs_commit_current_transaction(root); 2656 } 2657 2658 static noinline_for_stack 2659 int scrub_enumerate_chunks(struct scrub_ctx *sctx, 2660 struct btrfs_device *scrub_dev, u64 start, u64 end) 2661 { 2662 struct btrfs_dev_extent *dev_extent = NULL; 2663 BTRFS_PATH_AUTO_FREE(path); 2664 struct btrfs_fs_info *fs_info = sctx->fs_info; 2665 struct btrfs_root *root = fs_info->dev_root; 2666 u64 chunk_offset; 2667 int ret = 0; 2668 int ro_set; 2669 int slot; 2670 struct extent_buffer *l; 2671 struct btrfs_key key; 2672 struct btrfs_key found_key; 2673 struct btrfs_block_group *cache; 2674 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 2675 2676 path = btrfs_alloc_path(); 2677 if (!path) 2678 return -ENOMEM; 2679 2680 path->reada = READA_FORWARD; 2681 path->search_commit_root = true; 2682 path->skip_locking = true; 2683 2684 key.objectid = scrub_dev->devid; 2685 key.type = BTRFS_DEV_EXTENT_KEY; 2686 key.offset = 0ull; 2687 2688 while (1) { 2689 u64 dev_extent_len; 2690 
2691 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2692 if (ret < 0) 2693 break; 2694 if (ret > 0) { 2695 if (path->slots[0] >= 2696 btrfs_header_nritems(path->nodes[0])) { 2697 ret = btrfs_next_leaf(root, path); 2698 if (ret < 0) 2699 break; 2700 if (ret > 0) { 2701 ret = 0; 2702 break; 2703 } 2704 } else { 2705 ret = 0; 2706 } 2707 } 2708 2709 l = path->nodes[0]; 2710 slot = path->slots[0]; 2711 2712 btrfs_item_key_to_cpu(l, &found_key, slot); 2713 2714 if (found_key.objectid != scrub_dev->devid) 2715 break; 2716 2717 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 2718 break; 2719 2720 if (found_key.offset >= end) 2721 break; 2722 2723 if (found_key.offset < key.offset) 2724 break; 2725 2726 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2727 dev_extent_len = btrfs_dev_extent_length(l, dev_extent); 2728 2729 if (found_key.offset + dev_extent_len <= start) 2730 goto skip; 2731 2732 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2733 2734 /* 2735 * get a reference on the corresponding block group to prevent 2736 * the chunk from going away while we scrub it 2737 */ 2738 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2739 2740 /* some chunks are removed but not committed to disk yet, 2741 * continue scrubbing */ 2742 if (!cache) 2743 goto skip; 2744 2745 ASSERT(cache->start <= chunk_offset); 2746 /* 2747 * We are using the commit root to search for device extents, so 2748 * that means we could have found a device extent item from a 2749 * block group that was deleted in the current transaction. The 2750 * logical start offset of the deleted block group, stored at 2751 * @chunk_offset, might be part of the logical address range of 2752 * a new block group (which uses different physical extents). 2753 * In this case btrfs_lookup_block_group() has returned the new 2754 * block group, and its start address is less than @chunk_offset. 
2755 * 2756 * We skip such new block groups, because it's pointless to 2757 * process them, as we won't find their extents because we search 2758 * for them using the commit root of the extent tree. For a device 2759 * replace it's also fine to skip it, we won't miss copying them 2760 * to the target device because we have the write duplication 2761 * setup through the regular write path (by btrfs_map_block()), 2762 * and we have committed a transaction when we started the device 2763 * replace, right after setting up the device replace state. 2764 */ 2765 if (cache->start < chunk_offset) { 2766 btrfs_put_block_group(cache); 2767 goto skip; 2768 } 2769 2770 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { 2771 if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { 2772 btrfs_put_block_group(cache); 2773 goto skip; 2774 } 2775 } 2776 2777 /* 2778 * Make sure that while we are scrubbing the corresponding block 2779 * group doesn't get its logical address and its device extents 2780 * reused for another block group, which can possibly be of a 2781 * different type and different profile. We do this to prevent 2782 * false error detections and crashes due to bogus attempts to 2783 * repair extents. 2784 */ 2785 spin_lock(&cache->lock); 2786 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { 2787 spin_unlock(&cache->lock); 2788 btrfs_put_block_group(cache); 2789 goto skip; 2790 } 2791 btrfs_freeze_block_group(cache); 2792 spin_unlock(&cache->lock); 2793 2794 /* 2795 * we need call btrfs_inc_block_group_ro() with scrubs_paused, 2796 * to avoid deadlock caused by: 2797 * btrfs_inc_block_group_ro() 2798 * -> btrfs_wait_for_commit() 2799 * -> btrfs_commit_transaction() 2800 * -> btrfs_scrub_pause() 2801 */ 2802 scrub_pause_on(fs_info); 2803 2804 /* 2805 * Don't do chunk preallocation for scrub. 2806 * 2807 * This is especially important for SYSTEM bgs, or we can hit 2808 * -EFBIG from btrfs_finish_chunk_alloc() like: 2809 * 1. 
The only SYSTEM bg is marked RO. 2810 * Since SYSTEM bg is small, that's pretty common. 2811 * 2. New SYSTEM bg will be allocated 2812 * Due to regular version will allocate new chunk. 2813 * 3. New SYSTEM bg is empty and will get cleaned up 2814 * Before cleanup really happens, it's marked RO again. 2815 * 4. Empty SYSTEM bg get scrubbed 2816 * We go back to 2. 2817 * 2818 * This can easily boost the amount of SYSTEM chunks if cleaner 2819 * thread can't be triggered fast enough, and use up all space 2820 * of btrfs_super_block::sys_chunk_array 2821 * 2822 * While for dev replace, we need to try our best to mark block 2823 * group RO, to prevent race between: 2824 * - Write duplication 2825 * Contains latest data 2826 * - Scrub copy 2827 * Contains data from commit tree 2828 * 2829 * If target block group is not marked RO, nocow writes can 2830 * be overwritten by scrub copy, causing data corruption. 2831 * So for dev-replace, it's not allowed to continue if a block 2832 * group is not RO. 2833 */ 2834 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); 2835 if (!ret && sctx->is_dev_replace) { 2836 ret = finish_extent_writes_for_zoned(root, cache); 2837 if (ret) { 2838 btrfs_dec_block_group_ro(cache); 2839 scrub_pause_off(fs_info); 2840 btrfs_put_block_group(cache); 2841 break; 2842 } 2843 } 2844 2845 if (ret == 0) { 2846 ro_set = 1; 2847 } else if (ret == -ENOSPC && !sctx->is_dev_replace && 2848 !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { 2849 /* 2850 * btrfs_inc_block_group_ro return -ENOSPC when it 2851 * failed in creating new chunk for metadata. 2852 * It is not a problem for scrub, because 2853 * metadata are always cowed, and our scrub paused 2854 * commit_transactions. 2855 * 2856 * For RAID56 chunks, we have to mark them read-only 2857 * for scrub, as later we would use our own cache 2858 * out of RAID56 realm. 2859 * Thus we want the RAID56 bg to be marked RO to 2860 * prevent RMW from screwing up out cache. 
2861 */ 2862 ro_set = 0; 2863 } else if (ret == -ETXTBSY) { 2864 btrfs_warn(fs_info, 2865 "scrub: skipping scrub of block group %llu due to active swapfile", 2866 cache->start); 2867 scrub_pause_off(fs_info); 2868 ret = 0; 2869 goto skip_unfreeze; 2870 } else { 2871 btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", 2872 ret); 2873 btrfs_unfreeze_block_group(cache); 2874 btrfs_put_block_group(cache); 2875 scrub_pause_off(fs_info); 2876 break; 2877 } 2878 2879 /* 2880 * Now the target block is marked RO, wait for nocow writes to 2881 * finish before dev-replace. 2882 * COW is fine, as COW never overwrites extents in commit tree. 2883 */ 2884 if (sctx->is_dev_replace) { 2885 btrfs_wait_nocow_writers(cache); 2886 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); 2887 } 2888 2889 scrub_pause_off(fs_info); 2890 down_write(&dev_replace->rwsem); 2891 dev_replace->cursor_right = found_key.offset + dev_extent_len; 2892 dev_replace->cursor_left = found_key.offset; 2893 dev_replace->item_needs_writeback = 1; 2894 up_write(&dev_replace->rwsem); 2895 2896 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, 2897 dev_extent_len); 2898 if (sctx->is_dev_replace && 2899 !btrfs_finish_block_group_to_copy(dev_replace->srcdev, 2900 cache, found_key.offset)) 2901 ro_set = 0; 2902 2903 down_write(&dev_replace->rwsem); 2904 dev_replace->cursor_left = dev_replace->cursor_right; 2905 dev_replace->item_needs_writeback = 1; 2906 up_write(&dev_replace->rwsem); 2907 2908 if (ro_set) 2909 btrfs_dec_block_group_ro(cache); 2910 2911 /* 2912 * We might have prevented the cleaner kthread from deleting 2913 * this block group if it was already unused because we raced 2914 * and set it to RO mode first. So add it back to the unused 2915 * list, otherwise it might not ever be deleted unless a manual 2916 * balance is triggered or it becomes used and unused again. 
2917 */ 2918 spin_lock(&cache->lock); 2919 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && 2920 !cache->ro && cache->reserved == 0 && cache->used == 0) { 2921 spin_unlock(&cache->lock); 2922 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) 2923 btrfs_discard_queue_work(&fs_info->discard_ctl, 2924 cache); 2925 else 2926 btrfs_mark_bg_unused(cache); 2927 } else { 2928 spin_unlock(&cache->lock); 2929 } 2930 skip_unfreeze: 2931 btrfs_unfreeze_block_group(cache); 2932 btrfs_put_block_group(cache); 2933 if (ret) 2934 break; 2935 if (unlikely(sctx->is_dev_replace && 2936 atomic64_read(&dev_replace->num_write_errors) > 0)) { 2937 ret = -EIO; 2938 break; 2939 } 2940 if (sctx->stat.malloc_errors > 0) { 2941 ret = -ENOMEM; 2942 break; 2943 } 2944 skip: 2945 key.offset = found_key.offset + dev_extent_len; 2946 btrfs_release_path(path); 2947 } 2948 2949 return ret; 2950 } 2951 2952 static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, 2953 struct page *page, u64 physical, u64 generation) 2954 { 2955 struct btrfs_fs_info *fs_info = sctx->fs_info; 2956 struct btrfs_super_block *sb = page_address(page); 2957 int ret; 2958 2959 ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb, 2960 BTRFS_SUPER_INFO_SIZE, REQ_OP_READ); 2961 if (ret < 0) 2962 return ret; 2963 ret = btrfs_check_super_csum(fs_info, sb); 2964 if (unlikely(ret != 0)) { 2965 btrfs_err_rl(fs_info, 2966 "scrub: super block at physical %llu devid %llu has bad csum", 2967 physical, dev->devid); 2968 return -EIO; 2969 } 2970 if (unlikely(btrfs_super_generation(sb) != generation)) { 2971 btrfs_err_rl(fs_info, 2972 "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", 2973 physical, dev->devid, 2974 btrfs_super_generation(sb), generation); 2975 return -EUCLEAN; 2976 } 2977 2978 return btrfs_validate_super(fs_info, sb, -1); 2979 } 2980 2981 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 2982 struct btrfs_device *scrub_dev) 2983 { 
2984 int i; 2985 u64 bytenr; 2986 u64 gen; 2987 int ret = 0; 2988 struct page *page; 2989 struct btrfs_fs_info *fs_info = sctx->fs_info; 2990 2991 if (BTRFS_FS_ERROR(fs_info)) 2992 return -EROFS; 2993 2994 page = alloc_page(GFP_KERNEL); 2995 if (!page) { 2996 spin_lock(&sctx->stat_lock); 2997 sctx->stat.malloc_errors++; 2998 spin_unlock(&sctx->stat_lock); 2999 return -ENOMEM; 3000 } 3001 3002 /* Seed devices of a new filesystem has their own generation. */ 3003 if (scrub_dev->fs_devices != fs_info->fs_devices) 3004 gen = scrub_dev->generation; 3005 else 3006 gen = btrfs_get_last_trans_committed(fs_info); 3007 3008 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 3009 ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); 3010 if (ret == -ENOENT) 3011 break; 3012 3013 if (ret) { 3014 spin_lock(&sctx->stat_lock); 3015 sctx->stat.super_errors++; 3016 spin_unlock(&sctx->stat_lock); 3017 continue; 3018 } 3019 3020 if (bytenr + BTRFS_SUPER_INFO_SIZE > 3021 scrub_dev->commit_total_bytes) 3022 break; 3023 if (!btrfs_check_super_location(scrub_dev, bytenr)) 3024 continue; 3025 3026 ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); 3027 if (ret) { 3028 spin_lock(&sctx->stat_lock); 3029 sctx->stat.super_errors++; 3030 spin_unlock(&sctx->stat_lock); 3031 } 3032 } 3033 __free_page(page); 3034 return 0; 3035 } 3036 3037 static void scrub_workers_put(struct btrfs_fs_info *fs_info) 3038 { 3039 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, 3040 &fs_info->scrub_lock)) { 3041 struct workqueue_struct *scrub_workers = fs_info->scrub_workers; 3042 3043 fs_info->scrub_workers = NULL; 3044 mutex_unlock(&fs_info->scrub_lock); 3045 3046 if (scrub_workers) 3047 destroy_workqueue(scrub_workers); 3048 } 3049 } 3050 3051 /* 3052 * get a reference count on fs_info->scrub_workers. 
 * start worker if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
{
	struct workqueue_struct *scrub_workers = NULL;
	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;
	int ret = -ENOMEM;

	/* Fast path: workers already exist, just take a reference. */
	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
		return 0;

	/* Allocate outside of scrub_lock to keep the critical section short. */
	scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active);
	if (!scrub_workers)
		return -ENOMEM;

	mutex_lock(&fs_info->scrub_lock);
	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
		ASSERT(fs_info->scrub_workers == NULL);
		fs_info->scrub_workers = scrub_workers;
		refcount_set(&fs_info->scrub_workers_refcnt, 1);
		mutex_unlock(&fs_info->scrub_lock);
		return 0;
	}
	/* Other thread raced in and created the workers for us */
	refcount_inc(&fs_info->scrub_workers_refcnt);
	mutex_unlock(&fs_info->scrub_lock);

	ret = 0;

	/* Ours lost the race, throw away the redundant workqueue. */
	destroy_workqueue(scrub_workers);
	return ret;
}

/*
 * Run a scrub (or the scrub half of a dev-replace) on device @devid for the
 * physical range [@start, @end].
 *
 * Returns 0 on success, -EAGAIN if the fs is closing, -ENODEV/-EROFS/-EIO
 * for device state problems, -EINPROGRESS if a scrub/replace is already
 * running, or the error from the scrub itself. Progress is copied to
 * @progress (if non-NULL) even on failure.
 */
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
		    u64 end, struct btrfs_scrub_progress *progress,
		    bool readonly, bool is_dev_replace)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct scrub_ctx *sctx;
	int ret;
	struct btrfs_device *dev;
	unsigned int nofs_flag;
	bool need_commit = false;

	/* Set the basic fallback @last_physical before we got a sctx. */
	if (progress)
		progress->last_physical = start;

	if (btrfs_fs_closing(fs_info))
		return -EAGAIN;

	/* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);

	/*
	 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
	 * value (max nodesize / min sectorsize), thus nodesize should always
	 * be fine.
	 */
	ASSERT(fs_info->nodesize <=
	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);

	/* Allocate outside of device_list_mutex */
	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);
	sctx->stat.last_physical = start;

	ret = scrub_workers_get(fs_info);
	if (ret)
		goto out_free_ctx;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
		     !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -ENODEV;
		goto out;
	}

	if (!is_dev_replace && !readonly &&
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_err(fs_info,
			  "scrub: devid %llu: filesystem on %s is not writable",
			  devid, btrfs_dev_name(dev));
		ret = -EROFS;
		goto out;
	}

	mutex_lock(&fs_info->scrub_lock);
	if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EIO;
		goto out;
	}

	down_read(&fs_info->dev_replace.rwsem);
	if (dev->scrub_ctx ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		up_read(&fs_info->dev_replace.rwsem);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EINPROGRESS;
		goto out;
	}
	up_read(&fs_info->dev_replace.rwsem);

	sctx->readonly = readonly;
	dev->scrub_ctx = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * checking @scrub_pause_req here, we can avoid
	 * race between committing transaction and
	 * scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * In order to avoid deadlock with reclaim when there is a transaction
	 * trying to pause scrub, make sure we use GFP_NOFS for all the
	 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
	 * invoked by our callees. The pausing request is done when the
	 * transaction commit starts, and it blocks the transaction until scrub
	 * is paused (done at specific points at scrub_stripe() or right above
	 * before incrementing fs_info->scrubs_running).
	 */
	nofs_flag = memalloc_nofs_save();
	if (!is_dev_replace) {
		u64 old_super_errors;

		spin_lock(&sctx->stat_lock);
		old_super_errors = sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);

		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
		/*
		 * by holding device list mutex, we can
		 * kick off writing super in log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		spin_lock(&sctx->stat_lock);
		/*
		 * Super block errors found, but we can not commit transaction
		 * at current context, since btrfs_commit_transaction() needs
		 * to pause the current running scrub (hold by ourselves).
		 */
		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
			need_commit = true;
		spin_unlock(&sctx->stat_lock);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	if (!is_dev_replace)
		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
			   ret ?
			   "not finished" : "finished", devid, ret);

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_workers_put(fs_info);
	scrub_put_ctx(sctx);

	/*
	 * We found some super block errors before, now try to force a
	 * transaction commit, as scrub has finished.
	 */
	if (need_commit) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(fs_info->tree_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_err(fs_info,
	"scrub: failed to start transaction to fix super block errors: %d", ret);
			return ret;
		}
		ret = btrfs_commit_transaction(trans);
		if (ret < 0)
			btrfs_err(fs_info,
	"scrub: failed to commit transaction to fix super block errors: %d", ret);
	}
	return ret;
out:
	scrub_workers_put(fs_info);
out_free_ctx:
	scrub_free_ctx(sctx);

	return ret;
}

/*
 * Pause all running scrubs and wait until every one of them has reached a
 * pause point. Used by transaction commit; paired with btrfs_scrub_continue().
 */
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		/* Drop the lock so scrubbers can make progress to the pause point. */
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}

/* Resume scrubs previously paused by btrfs_scrub_pause(). */
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}

/*
 * Request cancellation of all running scrubs and wait for them to finish.
 * Returns -ENOTCONN when no scrub is running.
 */
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

/*
 * Cancel the scrub running on @dev, if any, and wait until it has detached
 * its context. Returns -ENOTCONN when no scrub is running on the device.
 */
int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct scrub_ctx *sctx;

	mutex_lock(&fs_info->scrub_lock);
	sctx = dev->scrub_ctx;
	if (!sctx) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sctx->cancel_req);
	while (dev->scrub_ctx) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_ctx == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

/*
 * Copy the live progress of the scrub on @devid into @progress.
 * Returns -ENODEV if the device does not exist, -ENOTCONN if no scrub is
 * running on it.
 */
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (dev)
		sctx = dev->scrub_ctx;
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}