1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011, 2012 STRATO. All rights reserved. 4 */ 5 6 #include <linux/blkdev.h> 7 #include <linux/ratelimit.h> 8 #include <linux/sched/mm.h> 9 #include "ctree.h" 10 #include "discard.h" 11 #include "volumes.h" 12 #include "disk-io.h" 13 #include "ordered-data.h" 14 #include "transaction.h" 15 #include "backref.h" 16 #include "extent_io.h" 17 #include "dev-replace.h" 18 #include "raid56.h" 19 #include "block-group.h" 20 #include "zoned.h" 21 #include "fs.h" 22 #include "accessors.h" 23 #include "file-item.h" 24 #include "scrub.h" 25 #include "raid-stripe-tree.h" 26 27 /* 28 * This is only the first step towards a full-features scrub. It reads all 29 * extent and super block and verifies the checksums. In case a bad checksum 30 * is found or the extent cannot be read, good data will be written back if 31 * any can be found. 32 * 33 * Future enhancements: 34 * - In case an unrepairable extent is encountered, track which files are 35 * affected and report them 36 * - track and record media errors, throw out bad devices 37 * - add a mode to also read unallocated space 38 */ 39 40 struct scrub_ctx; 41 42 /* 43 * The following value only influences the performance. 44 * 45 * This determines how many stripes would be submitted in one go, 46 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). 47 */ 48 #define SCRUB_STRIPES_PER_GROUP 8 49 50 /* 51 * How many groups we have for each sctx. 52 * 53 * This would be 8M per device, the same value as the old scrub in-flight bios 54 * size limit. 55 */ 56 #define SCRUB_GROUPS_PER_SCTX 16 57 58 #define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) 59 60 /* 61 * The following value times PAGE_SIZE needs to be large enough to match the 62 * largest node/leaf/sector size that shall be supported. 
63 */ 64 #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) 65 66 /* Represent one sector and its needed info to verify the content. */ 67 struct scrub_sector_verification { 68 union { 69 /* 70 * Csum pointer for data csum verification. Should point to a 71 * sector csum inside scrub_stripe::csums. 72 * 73 * NULL if this data sector has no csum. 74 */ 75 u8 *csum; 76 77 /* 78 * Extra info for metadata verification. All sectors inside a 79 * tree block share the same generation. 80 */ 81 u64 generation; 82 }; 83 }; 84 85 enum scrub_stripe_flags { 86 /* Set when @mirror_num, @dev, @physical and @logical are set. */ 87 SCRUB_STRIPE_FLAG_INITIALIZED, 88 89 /* Set when the read-repair is finished. */ 90 SCRUB_STRIPE_FLAG_REPAIR_DONE, 91 92 /* 93 * Set for data stripes if it's triggered from P/Q stripe. 94 * During such scrub, we should not report errors in data stripes, nor 95 * update the accounting. 96 */ 97 SCRUB_STRIPE_FLAG_NO_REPORT, 98 }; 99 100 /* 101 * We have multiple bitmaps for one scrub_stripe. 102 * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits, 103 * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64). 104 * 105 * So to reduce memory usage for each scrub_stripe, we pack those bitmaps 106 * into a larger one. 107 * 108 * These enum records where the sub-bitmap are inside the larger one. 109 * Each subbitmap starts at scrub_bitmap_nr_##name * nr_sectors bit. 110 */ 111 enum { 112 /* Which blocks are covered by extent items. */ 113 scrub_bitmap_nr_has_extent = 0, 114 115 /* Which blocks are metadata. */ 116 scrub_bitmap_nr_is_metadata, 117 118 /* 119 * Which blocks have errors, including IO, csum, and metadata 120 * errors. 121 * This sub-bitmap is the OR results of the next few error related 122 * sub-bitmaps. 
123 */ 124 scrub_bitmap_nr_error, 125 scrub_bitmap_nr_io_error, 126 scrub_bitmap_nr_csum_error, 127 scrub_bitmap_nr_meta_error, 128 scrub_bitmap_nr_meta_gen_error, 129 scrub_bitmap_nr_last, 130 }; 131 132 #define SCRUB_STRIPE_MAX_FOLIOS (BTRFS_STRIPE_LEN / PAGE_SIZE) 133 134 /* 135 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. 136 */ 137 struct scrub_stripe { 138 struct scrub_ctx *sctx; 139 struct btrfs_block_group *bg; 140 141 struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS]; 142 struct scrub_sector_verification *sectors; 143 144 struct btrfs_device *dev; 145 u64 logical; 146 u64 physical; 147 148 u16 mirror_num; 149 150 /* Should be BTRFS_STRIPE_LEN / sectorsize. */ 151 u16 nr_sectors; 152 153 /* 154 * How many data/meta extents are in this stripe. Only for scrub status 155 * reporting purposes. 156 */ 157 u16 nr_data_extents; 158 u16 nr_meta_extents; 159 160 atomic_t pending_io; 161 wait_queue_head_t io_wait; 162 wait_queue_head_t repair_wait; 163 164 /* 165 * Indicate the states of the stripe. Bits are defined in 166 * scrub_stripe_flags enum. 167 */ 168 unsigned long state; 169 170 /* The large bitmap contains all the sub-bitmaps. */ 171 unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last * 172 (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))]; 173 174 /* 175 * For writeback (repair or replace) error reporting. 176 * This one is protected by a spinlock, thus can not be packed into 177 * the larger bitmap. 178 */ 179 unsigned long write_error_bitmap; 180 181 /* Writeback can be concurrent, thus we need to protect the bitmap. */ 182 spinlock_t write_error_lock; 183 184 /* 185 * Checksum for the whole stripe if this stripe is inside a data block 186 * group. 
187 */ 188 u8 *csums; 189 190 struct work_struct work; 191 }; 192 193 struct scrub_ctx { 194 struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES]; 195 struct scrub_stripe *raid56_data_stripes; 196 struct btrfs_fs_info *fs_info; 197 struct btrfs_path extent_path; 198 struct btrfs_path csum_path; 199 int first_free; 200 int cur_stripe; 201 atomic_t cancel_req; 202 int readonly; 203 204 /* State of IO submission throttling affecting the associated device */ 205 ktime_t throttle_deadline; 206 u64 throttle_sent; 207 208 bool is_dev_replace; 209 u64 write_pointer; 210 211 struct mutex wr_lock; 212 struct btrfs_device *wr_tgtdev; 213 214 /* 215 * statistics 216 */ 217 struct btrfs_scrub_progress stat; 218 spinlock_t stat_lock; 219 220 /* 221 * Use a ref counter to avoid use-after-free issues. Scrub workers 222 * decrement bios_in_flight and workers_pending and then do a wakeup 223 * on the list_wait wait queue. We must ensure the main scrub task 224 * doesn't free the scrub context before or while the workers are 225 * doing the wakeup() call. 
226 */ 227 refcount_t refs; 228 }; 229 230 #define scrub_calc_start_bit(stripe, name, block_nr) \ 231 ({ \ 232 unsigned int __start_bit; \ 233 \ 234 ASSERT(block_nr < stripe->nr_sectors, \ 235 "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \ 236 __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \ 237 __start_bit; \ 238 }) 239 240 #define IMPLEMENT_SCRUB_BITMAP_OPS(name) \ 241 static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \ 242 unsigned int block_nr, \ 243 unsigned int nr_blocks) \ 244 { \ 245 const unsigned int start_bit = scrub_calc_start_bit(stripe, \ 246 name, block_nr); \ 247 \ 248 bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \ 249 } \ 250 static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \ 251 unsigned int block_nr, \ 252 unsigned int nr_blocks) \ 253 { \ 254 const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ 255 block_nr); \ 256 \ 257 bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \ 258 } \ 259 static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \ 260 unsigned int block_nr) \ 261 { \ 262 const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ 263 block_nr); \ 264 \ 265 return test_bit(start_bit, stripe->bitmaps); \ 266 } \ 267 static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \ 268 unsigned int block_nr) \ 269 { \ 270 const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ 271 block_nr); \ 272 \ 273 set_bit(start_bit, stripe->bitmaps); \ 274 } \ 275 static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \ 276 unsigned int block_nr) \ 277 { \ 278 const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ 279 block_nr); \ 280 \ 281 clear_bit(start_bit, stripe->bitmaps); \ 282 } \ 283 static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \ 284 { \ 285 const unsigned int nr_blocks = stripe->nr_sectors; \ 286 \ 287 
ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \ 288 "nr_blocks=%u BITS_PER_LONG=%u", \ 289 nr_blocks, BITS_PER_LONG); \ 290 \ 291 return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \ 292 stripe->nr_sectors); \ 293 } \ 294 static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \ 295 { \ 296 unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ 297 \ 298 return bitmap_empty(&bitmap, stripe->nr_sectors); \ 299 } \ 300 static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \ 301 { \ 302 unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ 303 \ 304 return bitmap_weight(&bitmap, stripe->nr_sectors); \ 305 } 306 IMPLEMENT_SCRUB_BITMAP_OPS(has_extent); 307 IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata); 308 IMPLEMENT_SCRUB_BITMAP_OPS(error); 309 IMPLEMENT_SCRUB_BITMAP_OPS(io_error); 310 IMPLEMENT_SCRUB_BITMAP_OPS(csum_error); 311 IMPLEMENT_SCRUB_BITMAP_OPS(meta_error); 312 IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error); 313 314 struct scrub_warning { 315 struct btrfs_path *path; 316 u64 extent_item_size; 317 const char *errstr; 318 u64 physical; 319 u64 logical; 320 struct btrfs_device *dev; 321 }; 322 323 struct scrub_error_records { 324 /* 325 * Bitmap recording which blocks hit errors (IO/csum/...) during the 326 * initial read. 
327 */ 328 unsigned long init_error_bitmap; 329 330 unsigned int nr_io_errors; 331 unsigned int nr_csum_errors; 332 unsigned int nr_meta_errors; 333 unsigned int nr_meta_gen_errors; 334 }; 335 336 static void release_scrub_stripe(struct scrub_stripe *stripe) 337 { 338 if (!stripe) 339 return; 340 341 for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) { 342 if (stripe->folios[i]) 343 folio_put(stripe->folios[i]); 344 stripe->folios[i] = NULL; 345 } 346 kfree(stripe->sectors); 347 kfree(stripe->csums); 348 stripe->sectors = NULL; 349 stripe->csums = NULL; 350 stripe->sctx = NULL; 351 stripe->state = 0; 352 } 353 354 static int init_scrub_stripe(struct btrfs_fs_info *fs_info, 355 struct scrub_stripe *stripe) 356 { 357 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 358 int ret; 359 360 memset(stripe, 0, sizeof(*stripe)); 361 362 stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; 363 stripe->state = 0; 364 365 init_waitqueue_head(&stripe->io_wait); 366 init_waitqueue_head(&stripe->repair_wait); 367 atomic_set(&stripe->pending_io, 0); 368 spin_lock_init(&stripe->write_error_lock); 369 370 ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS); 371 ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift, 372 fs_info->block_min_order, stripe->folios); 373 if (ret < 0) 374 goto error; 375 376 stripe->sectors = kzalloc_objs(struct scrub_sector_verification, 377 stripe->nr_sectors); 378 if (!stripe->sectors) 379 goto error; 380 381 stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, 382 fs_info->csum_size, GFP_KERNEL); 383 if (!stripe->csums) 384 goto error; 385 return 0; 386 error: 387 release_scrub_stripe(stripe); 388 return -ENOMEM; 389 } 390 391 static void wait_scrub_stripe_io(struct scrub_stripe *stripe) 392 { 393 wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); 394 } 395 396 static void scrub_put_ctx(struct scrub_ctx *sctx); 397 398 static void 
__scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 399 { 400 while (atomic_read(&fs_info->scrub_pause_req)) { 401 mutex_unlock(&fs_info->scrub_lock); 402 wait_event(fs_info->scrub_pause_wait, 403 atomic_read(&fs_info->scrub_pause_req) == 0); 404 mutex_lock(&fs_info->scrub_lock); 405 } 406 } 407 408 static void scrub_pause_on(struct btrfs_fs_info *fs_info) 409 { 410 atomic_inc(&fs_info->scrubs_paused); 411 wake_up(&fs_info->scrub_pause_wait); 412 } 413 414 static void scrub_pause_off(struct btrfs_fs_info *fs_info) 415 { 416 mutex_lock(&fs_info->scrub_lock); 417 __scrub_blocked_if_needed(fs_info); 418 atomic_dec(&fs_info->scrubs_paused); 419 mutex_unlock(&fs_info->scrub_lock); 420 421 wake_up(&fs_info->scrub_pause_wait); 422 } 423 424 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 425 { 426 scrub_pause_on(fs_info); 427 scrub_pause_off(fs_info); 428 } 429 430 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) 431 { 432 int i; 433 434 if (!sctx) 435 return; 436 437 for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) 438 release_scrub_stripe(&sctx->stripes[i]); 439 440 kvfree(sctx); 441 } 442 443 static void scrub_put_ctx(struct scrub_ctx *sctx) 444 { 445 if (refcount_dec_and_test(&sctx->refs)) 446 scrub_free_ctx(sctx); 447 } 448 449 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( 450 struct btrfs_fs_info *fs_info, bool is_dev_replace) 451 { 452 struct scrub_ctx *sctx; 453 int i; 454 455 /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use 456 * kvzalloc(). 
457 */ 458 sctx = kvzalloc_obj(*sctx); 459 if (!sctx) 460 goto nomem; 461 refcount_set(&sctx->refs, 1); 462 sctx->is_dev_replace = is_dev_replace; 463 sctx->fs_info = fs_info; 464 sctx->extent_path.search_commit_root = true; 465 sctx->extent_path.skip_locking = true; 466 sctx->csum_path.search_commit_root = true; 467 sctx->csum_path.skip_locking = true; 468 for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { 469 int ret; 470 471 ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); 472 if (ret < 0) 473 goto nomem; 474 sctx->stripes[i].sctx = sctx; 475 } 476 sctx->first_free = 0; 477 atomic_set(&sctx->cancel_req, 0); 478 479 spin_lock_init(&sctx->stat_lock); 480 sctx->throttle_deadline = 0; 481 482 mutex_init(&sctx->wr_lock); 483 if (is_dev_replace) { 484 WARN_ON(!fs_info->dev_replace.tgtdev); 485 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; 486 } 487 488 return sctx; 489 490 nomem: 491 scrub_free_ctx(sctx); 492 return ERR_PTR(-ENOMEM); 493 } 494 495 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, 496 u64 root, void *warn_ctx) 497 { 498 u32 nlink; 499 int ret; 500 int i; 501 unsigned nofs_flag; 502 struct extent_buffer *eb; 503 struct btrfs_inode_item *inode_item; 504 struct scrub_warning *swarn = warn_ctx; 505 struct btrfs_fs_info *fs_info = swarn->dev->fs_info; 506 struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; 507 struct btrfs_root *local_root; 508 struct btrfs_key key; 509 510 local_root = btrfs_get_fs_root(fs_info, root, true); 511 if (IS_ERR(local_root)) { 512 ret = PTR_ERR(local_root); 513 goto err; 514 } 515 516 /* 517 * this makes the path point to (inum INODE_ITEM ioff) 518 */ 519 key.objectid = inum; 520 key.type = BTRFS_INODE_ITEM_KEY; 521 key.offset = 0; 522 523 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); 524 if (ret) { 525 btrfs_put_root(local_root); 526 btrfs_release_path(swarn->path); 527 goto err; 528 } 529 530 eb = swarn->path->nodes[0]; 531 inode_item = btrfs_item_ptr(eb, 
swarn->path->slots[0], 532 struct btrfs_inode_item); 533 nlink = btrfs_inode_nlink(eb, inode_item); 534 btrfs_release_path(swarn->path); 535 536 /* 537 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub 538 * uses GFP_NOFS in this context, so we keep it consistent but it does 539 * not seem to be strictly necessary. 540 */ 541 nofs_flag = memalloc_nofs_save(); 542 ipath = init_ipath(4096, local_root, swarn->path); 543 memalloc_nofs_restore(nofs_flag); 544 if (IS_ERR(ipath)) { 545 btrfs_put_root(local_root); 546 ret = PTR_ERR(ipath); 547 ipath = NULL; 548 goto err; 549 } 550 ret = paths_from_inode(inum, ipath); 551 552 if (ret < 0) 553 goto err; 554 555 /* 556 * we deliberately ignore the bit ipath might have been too small to 557 * hold all of the paths here 558 */ 559 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 560 btrfs_warn(fs_info, 561 "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", 562 swarn->errstr, swarn->logical, 563 btrfs_dev_name(swarn->dev), 564 swarn->physical, 565 root, inum, offset, 566 fs_info->sectorsize, nlink, 567 (char *)(unsigned long)ipath->fspath->val[i]); 568 569 btrfs_put_root(local_root); 570 return 0; 571 572 err: 573 btrfs_warn(fs_info, 574 "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", 575 swarn->errstr, swarn->logical, 576 btrfs_dev_name(swarn->dev), 577 swarn->physical, 578 root, inum, offset, ret); 579 580 return 0; 581 } 582 583 static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, 584 bool is_super, u64 logical, u64 physical) 585 { 586 struct btrfs_fs_info *fs_info = dev->fs_info; 587 BTRFS_PATH_AUTO_FREE(path); 588 struct btrfs_key found_key; 589 struct extent_buffer *eb; 590 struct btrfs_extent_item *ei; 591 struct scrub_warning swarn; 592 u64 flags = 0; 593 u32 item_size; 594 int ret; 595 596 /* Super block error, no need to search extent 
tree. */ 597 if (is_super) { 598 btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu", 599 errstr, btrfs_dev_name(dev), physical); 600 return; 601 } 602 path = btrfs_alloc_path(); 603 if (!path) 604 return; 605 606 swarn.physical = physical; 607 swarn.logical = logical; 608 swarn.errstr = errstr; 609 swarn.dev = NULL; 610 611 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 612 &flags); 613 if (ret < 0) 614 return; 615 616 swarn.extent_item_size = found_key.offset; 617 618 eb = path->nodes[0]; 619 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 620 item_size = btrfs_item_size(eb, path->slots[0]); 621 622 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 623 unsigned long ptr = 0; 624 u8 ref_level; 625 u64 ref_root; 626 627 while (true) { 628 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, 629 item_size, &ref_root, 630 &ref_level); 631 if (ret < 0) { 632 btrfs_warn(fs_info, 633 "scrub: failed to resolve tree backref for logical %llu: %d", 634 swarn.logical, ret); 635 break; 636 } 637 if (ret > 0) 638 break; 639 btrfs_warn(fs_info, 640 "scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", 641 errstr, swarn.logical, btrfs_dev_name(dev), 642 swarn.physical, (ref_level ? 
"node" : "leaf"), 643 ref_level, ref_root); 644 } 645 btrfs_release_path(path); 646 } else { 647 struct btrfs_backref_walk_ctx ctx = { 0 }; 648 649 btrfs_release_path(path); 650 651 ctx.bytenr = found_key.objectid; 652 ctx.extent_item_pos = swarn.logical - found_key.objectid; 653 ctx.fs_info = fs_info; 654 655 swarn.path = path; 656 swarn.dev = dev; 657 658 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); 659 } 660 } 661 662 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) 663 { 664 int ret = 0; 665 u64 length; 666 667 if (!btrfs_is_zoned(sctx->fs_info)) 668 return 0; 669 670 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) 671 return 0; 672 673 if (sctx->write_pointer < physical) { 674 length = physical - sctx->write_pointer; 675 676 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, 677 sctx->write_pointer, length); 678 if (!ret) 679 sctx->write_pointer = physical; 680 } 681 return ret; 682 } 683 684 static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) 685 { 686 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 687 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 688 u32 offset = (sector_nr << fs_info->sectorsize_bits); 689 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 690 691 /* stripe->folios[] is allocated by us and no highmem is allowed. */ 692 ASSERT(folio); 693 ASSERT(!folio_test_highmem(folio)); 694 return folio_address(folio) + offset_in_folio(folio, offset); 695 } 696 697 static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) 698 { 699 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 700 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 701 u32 offset = (sector_nr << fs_info->sectorsize_bits); 702 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 703 704 /* stripe->folios[] is allocated by us and no highmem is allowed. 
*/ 705 ASSERT(folio); 706 ASSERT(!folio_test_highmem(folio)); 707 /* And the range must be contained inside the folio. */ 708 ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); 709 return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); 710 } 711 712 static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) 713 { 714 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 715 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; 716 const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); 717 void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); 718 struct btrfs_header *header = first_kaddr; 719 struct btrfs_csum_ctx csum; 720 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 721 u8 calculated_csum[BTRFS_CSUM_SIZE]; 722 723 /* 724 * Here we don't have a good way to attach the pages (and subpages) 725 * to a dummy extent buffer, thus we have to directly grab the members 726 * from pages. 
727 */ 728 memcpy(on_disk_csum, header->csum, fs_info->csum_size); 729 730 if (logical != btrfs_stack_header_bytenr(header)) { 731 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 732 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 733 btrfs_warn_rl(fs_info, 734 "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu", 735 logical, stripe->mirror_num, 736 btrfs_stack_header_bytenr(header), logical); 737 return; 738 } 739 if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, 740 BTRFS_FSID_SIZE) != 0) { 741 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 742 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 743 btrfs_warn_rl(fs_info, 744 "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", 745 logical, stripe->mirror_num, 746 header->fsid, fs_info->fs_devices->metadata_uuid); 747 return; 748 } 749 if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, 750 BTRFS_UUID_SIZE) != 0) { 751 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 752 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 753 btrfs_warn_rl(fs_info, 754 "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", 755 logical, stripe->mirror_num, 756 header->chunk_tree_uuid, fs_info->chunk_tree_uuid); 757 return; 758 } 759 760 /* Now check tree block csum. 
*/ 761 btrfs_csum_init(&csum, fs_info->csum_type); 762 btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE, 763 fs_info->sectorsize - BTRFS_CSUM_SIZE); 764 765 for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { 766 btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i), 767 fs_info->sectorsize); 768 } 769 770 btrfs_csum_final(&csum, calculated_csum); 771 if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { 772 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 773 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 774 btrfs_warn_rl(fs_info, 775 "scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT, 776 logical, stripe->mirror_num, 777 BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), 778 BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); 779 return; 780 } 781 if (stripe->sectors[sector_nr].generation != 782 btrfs_stack_header_generation(header)) { 783 scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); 784 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); 785 btrfs_warn_rl(fs_info, 786 "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu", 787 logical, stripe->mirror_num, 788 btrfs_stack_header_generation(header), 789 stripe->sectors[sector_nr].generation); 790 return; 791 } 792 scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree); 793 scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree); 794 scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree); 795 scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree); 796 } 797 798 static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) 799 { 800 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 801 struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; 802 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; 803 phys_addr_t paddr = 
scrub_stripe_get_paddr(stripe, sector_nr); 804 u8 csum_buf[BTRFS_CSUM_SIZE]; 805 int ret; 806 807 ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); 808 809 /* Sector not utilized, skip it. */ 810 if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr)) 811 return; 812 813 /* IO error, no need to check. */ 814 if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) 815 return; 816 817 /* Metadata, verify the full tree block. */ 818 if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { 819 /* 820 * Check if the tree block crosses the stripe boundary. If 821 * crossed the boundary, we cannot verify it but only give a 822 * warning. 823 * 824 * This can only happen on a very old filesystem where chunks 825 * are not ensured to be stripe aligned. 826 */ 827 if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { 828 btrfs_warn_rl(fs_info, 829 "scrub: tree block at %llu crosses stripe boundary %llu", 830 stripe->logical + 831 (sector_nr << fs_info->sectorsize_bits), 832 stripe->logical); 833 return; 834 } 835 scrub_verify_one_metadata(stripe, sector_nr); 836 return; 837 } 838 839 /* 840 * Data is easier, we just verify the data csum (if we have it). For 841 * cases without csum, we have no other choice but to trust it. 842 */ 843 if (!sector->csum) { 844 scrub_bitmap_clear_bit_error(stripe, sector_nr); 845 return; 846 } 847 848 ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum); 849 if (ret < 0) { 850 scrub_bitmap_set_bit_csum_error(stripe, sector_nr); 851 scrub_bitmap_set_bit_error(stripe, sector_nr); 852 } else { 853 scrub_bitmap_clear_bit_csum_error(stripe, sector_nr); 854 scrub_bitmap_clear_bit_error(stripe, sector_nr); 855 } 856 } 857 858 /* Verify specified sectors of a stripe. 
 */
/*
 * Only sectors whose bit is set in @bitmap are visited.  After a metadata
 * sector the iteration skips the remaining sectors of the same tree block,
 * as scrub_verify_one_sector() already verified the block as a whole.
 */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
			sector_nr += sectors_per_tree - 1;
	}
}

/*
 * Find which sector of @stripe @first_bvec points at, by comparing the
 * virtual address of the bvec against the address of every sector.
 *
 * Return the sector index; asserts that a match was found.
 */
static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}

/*
 * Repair read is different to the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 *
 * On IO failure the covered sectors are marked in the io_error and error
 * sub-bitmaps, otherwise only their io_error bits are cleared.  The bio is
 * then released and the waiter is woken once the last pending IO is done.
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	const u32 bio_size = bio_get_size(&bbio->bio);

	ASSERT(sector_nr < stripe->nr_sectors);

	if (bbio->bio.bi_status) {
		scrub_bitmap_set_io_error(stripe, sector_nr,
					  bio_size >> fs_info->sectorsize_bits);
		scrub_bitmap_set_error(stripe, sector_nr,
				       bio_size >> fs_info->sectorsize_bits);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr,
					  bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

/*
 * Return the mirror following @mirror, wrapping from @num_copies back to 1
 * (the expression below never yields 0).
 */
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

/* Append sector @sector_nr of @stripe (one sectorsize worth) to @bbio. */
static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
				 int sector_nr)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	int ret;

	ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize,
			   offset_in_page(kaddr));
	/*
	 * Caller should ensure the bbio has enough size.
	 * And we cannot use __bio_add_page(), which doesn't do any merge.
	 *
	 * Meanwhile for scrub_submit_initial_read() we fully rely on the merge
	 * to create the minimal amount of bio vectors, for fs block size < page
	 * size cases.
	 */
	ASSERT(ret == fs_info->sectorsize);
}

/*
 * Allocate a btrfs_bio for scrub IO at @logical, attached to the btree inode
 * and flagged as a scrub bio.
 *
 * NOTE(review): the result of btrfs_bio_alloc() is used without a NULL check;
 * presumably that allocation cannot fail - confirm against its definition.
 */
static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info,
					  unsigned int nr_vecs, blk_opf_t opf,
					  u64 logical,
					  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode),
			       logical, end_io, private);
	bbio->is_scrub = true;
	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	return bbio;
}

/*
 * Re-read every sector currently marked in the error sub-bitmap of @stripe
 * from mirror @mirror.
 *
 * Adjacent failed sectors are merged into one bio until it reaches
 * @blocksize bytes; a gap in the error bitmap or a full bio triggers
 * submission.  When @wait is true each bio is waited for right after it is
 * submitted, otherwise the caller has to wait for the pending IO.
 */
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
	ASSERT(atomic_read(&stripe->pending_io) == 0,
	       "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io));

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		/* The current sector cannot be merged, submit the bio. */
		if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
			     bbio->bio.bi_iter.bi_size >= blocksize)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bbio(bbio, mirror);
			if (wait)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}

		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
						stripe->logical + (i << fs_info->sectorsize_bits),
						scrub_repair_read_endio, stripe);

		scrub_bio_add_sector(bbio, stripe, i);
	}
	/* Submit whatever is left over from the last iteration. */
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
		if (wait)
			wait_scrub_stripe_io(stripe);
	}
}

static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
				       struct scrub_stripe *stripe,
				       const struct scrub_error_records *errors)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_device *dev = NULL;
	const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
	const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
	u64 physical = 0;
	int nr_data_sectors = 0;
	int nr_meta_sectors = 0;
	int nr_nodatacsum_sectors = 0;
	int nr_repaired_sectors = 0;
	int sector_nr;

	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
		return;

	/*
	 * Init needed infos for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
	 * thus no need for dev/physical, error reporting still needs dev and physical.
	 */
	if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
		u64 mapped_len = fs_info->sectorsize;
		struct btrfs_io_context *bioc = NULL;
		int stripe_index = stripe->mirror_num - 1;
		int ret;

		/* For scrub, our mirror_num should always start at 1. */
		ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				      stripe->logical, &mapped_len, &bioc,
				      NULL, NULL);
		/*
		 * If we failed, dev will be NULL, and later detailed reports
		 * will just be skipped.
		 */
		if (ret < 0)
			goto skip;
		physical = bioc->stripes[stripe_index].physical;
		dev = bioc->stripes[stripe_index].dev;
		btrfs_put_bioc(bioc);
	}

skip:
	for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
		bool repaired = false;

		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
			nr_meta_sectors++;
		} else {
			nr_data_sectors++;
			if (!stripe->sectors[sector_nr].csum)
				nr_nodatacsum_sectors++;
		}

		/* Bad after the initial read but good now: it got repaired. */
		if (test_bit(sector_nr, &errors->init_error_bitmap) &&
		    !test_bit(sector_nr, &error_bitmap)) {
			nr_repaired_sectors++;
			repaired = true;
		}

		/* Good sector from the beginning, nothing need to be done. */
		if (!test_bit(sector_nr, &errors->init_error_bitmap))
			continue;

		/*
		 * Report error for the corrupted sectors.  If repaired, just
		 * output the message of repaired message.
		 */
		if (repaired) {
			if (dev) {
				btrfs_err_rl(fs_info,
		"scrub: fixed up error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
			} else {
				btrfs_err_rl(fs_info,
			   "scrub: fixed up error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
			}
			continue;
		}

		/* The remaining are all for unrepaired. */
		if (dev) {
			btrfs_err_rl(fs_info,
"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
					    stripe->logical, btrfs_dev_name(dev),
					    physical);
		} else {
			btrfs_err_rl(fs_info,
	  "scrub: unable to fixup (regular) error at logical %llu on mirror %u",
					    stripe->logical, stripe->mirror_num);
		}

		/* The detailed warnings share one ratelimit state and need @dev. */
		if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("i/o error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("checksum error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("header error", dev, false,
						     stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("generation error", dev, false,
						     stripe->logical, physical);
	}

	/* Update the device stats. */
	for (int i = 0; i < errors->nr_io_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
	for (int i = 0; i < errors->nr_csum_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
	/* Generation mismatch error is based on each metadata, not each block.
*/ 1117 for (int i = 0; i < errors->nr_meta_gen_errors; 1118 i += (fs_info->nodesize >> fs_info->sectorsize_bits)) 1119 btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); 1120 1121 spin_lock(&sctx->stat_lock); 1122 sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; 1123 sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; 1124 sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; 1125 sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; 1126 sctx->stat.no_csum += nr_nodatacsum_sectors; 1127 sctx->stat.read_errors += errors->nr_io_errors; 1128 sctx->stat.csum_errors += errors->nr_csum_errors; 1129 sctx->stat.verify_errors += errors->nr_meta_errors + 1130 errors->nr_meta_gen_errors; 1131 sctx->stat.uncorrectable_errors += 1132 bitmap_weight(&error_bitmap, stripe->nr_sectors); 1133 sctx->stat.corrected_errors += nr_repaired_sectors; 1134 spin_unlock(&sctx->stat_lock); 1135 } 1136 1137 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, 1138 unsigned long write_bitmap, bool dev_replace); 1139 1140 /* 1141 * The main entrance for all read related scrub work, including: 1142 * 1143 * - Wait for the initial read to finish 1144 * - Verify and locate any bad sectors 1145 * - Go through the remaining mirrors and try to read as large blocksize as 1146 * possible 1147 * - Go through all mirrors (including the failed mirror) sector-by-sector 1148 * - Submit writeback for repaired sectors 1149 * 1150 * Writeback for dev-replace does not happen here, it needs extra 1151 * synchronization for zoned devices. 
1152 */ 1153 static void scrub_stripe_read_repair_worker(struct work_struct *work) 1154 { 1155 struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); 1156 struct scrub_ctx *sctx = stripe->sctx; 1157 struct btrfs_fs_info *fs_info = sctx->fs_info; 1158 struct scrub_error_records errors = { 0 }; 1159 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, 1160 stripe->bg->length); 1161 unsigned long repaired; 1162 unsigned long error; 1163 int mirror; 1164 int i; 1165 1166 ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); 1167 1168 wait_scrub_stripe_io(stripe); 1169 scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); 1170 /* Save the initial failed bitmap for later repair and report usage. */ 1171 errors.init_error_bitmap = scrub_bitmap_read_error(stripe); 1172 errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe); 1173 errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe); 1174 errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe); 1175 errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe); 1176 1177 if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors)) 1178 goto out; 1179 1180 /* 1181 * Try all remaining mirrors. 1182 * 1183 * Here we still try to read as large block as possible, as this is 1184 * faster and we have extra safety nets to rely on. 1185 */ 1186 for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); 1187 mirror != stripe->mirror_num; 1188 mirror = calc_next_mirror(mirror, num_copies)) { 1189 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 1190 1191 scrub_stripe_submit_repair_read(stripe, mirror, 1192 BTRFS_STRIPE_LEN, false); 1193 wait_scrub_stripe_io(stripe); 1194 scrub_verify_one_stripe(stripe, old_error_bitmap); 1195 if (scrub_bitmap_empty_error(stripe)) 1196 goto out; 1197 } 1198 1199 /* 1200 * Last safety net, try re-checking all mirrors, including the failed 1201 * one, sector-by-sector. 
1202 * 1203 * As if one sector failed the drive's internal csum, the whole read 1204 * containing the offending sector would be marked as error. 1205 * Thus here we do sector-by-sector read. 1206 * 1207 * This can be slow, thus we only try it as the last resort. 1208 */ 1209 1210 for (i = 0, mirror = stripe->mirror_num; 1211 i < num_copies; 1212 i++, mirror = calc_next_mirror(mirror, num_copies)) { 1213 const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); 1214 1215 scrub_stripe_submit_repair_read(stripe, mirror, 1216 fs_info->sectorsize, true); 1217 wait_scrub_stripe_io(stripe); 1218 scrub_verify_one_stripe(stripe, old_error_bitmap); 1219 if (scrub_bitmap_empty_error(stripe)) 1220 goto out; 1221 } 1222 out: 1223 error = scrub_bitmap_read_error(stripe); 1224 /* 1225 * Submit the repaired sectors. For zoned case, we cannot do repair 1226 * in-place, but queue the bg to be relocated. 1227 */ 1228 bitmap_andnot(&repaired, &errors.init_error_bitmap, &error, 1229 stripe->nr_sectors); 1230 if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { 1231 if (btrfs_is_zoned(fs_info)) { 1232 btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); 1233 } else { 1234 scrub_write_sectors(sctx, stripe, repaired, false); 1235 wait_scrub_stripe_io(stripe); 1236 } 1237 } 1238 1239 scrub_stripe_report_errors(sctx, stripe, &errors); 1240 set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); 1241 wake_up(&stripe->repair_wait); 1242 } 1243 1244 static void scrub_read_endio(struct btrfs_bio *bbio) 1245 { 1246 struct scrub_stripe *stripe = bbio->private; 1247 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1248 int num_sectors; 1249 const u32 bio_size = bio_get_size(&bbio->bio); 1250 1251 ASSERT(sector_nr < stripe->nr_sectors); 1252 num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; 1253 1254 if (bbio->bio.bi_status) { 1255 scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors); 1256 
scrub_bitmap_set_error(stripe, sector_nr, num_sectors); 1257 } else { 1258 scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors); 1259 } 1260 bio_put(&bbio->bio); 1261 if (atomic_dec_and_test(&stripe->pending_io)) { 1262 wake_up(&stripe->io_wait); 1263 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); 1264 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); 1265 } 1266 } 1267 1268 static void scrub_write_endio(struct btrfs_bio *bbio) 1269 { 1270 struct scrub_stripe *stripe = bbio->private; 1271 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1272 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1273 const u32 bio_size = bio_get_size(&bbio->bio); 1274 1275 if (bbio->bio.bi_status) { 1276 unsigned long flags; 1277 1278 spin_lock_irqsave(&stripe->write_error_lock, flags); 1279 bitmap_set(&stripe->write_error_bitmap, sector_nr, 1280 bio_size >> fs_info->sectorsize_bits); 1281 spin_unlock_irqrestore(&stripe->write_error_lock, flags); 1282 for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) 1283 btrfs_dev_stat_inc_and_print(stripe->dev, 1284 BTRFS_DEV_STAT_WRITE_ERRS); 1285 } 1286 bio_put(&bbio->bio); 1287 1288 if (atomic_dec_and_test(&stripe->pending_io)) 1289 wake_up(&stripe->io_wait); 1290 } 1291 1292 static void scrub_submit_write_bio(struct scrub_ctx *sctx, 1293 struct scrub_stripe *stripe, 1294 struct btrfs_bio *bbio, bool dev_replace) 1295 { 1296 struct btrfs_fs_info *fs_info = sctx->fs_info; 1297 u32 bio_len = bbio->bio.bi_iter.bi_size; 1298 u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) - 1299 stripe->logical; 1300 1301 fill_writer_pointer_gap(sctx, stripe->physical + bio_off); 1302 atomic_inc(&stripe->pending_io); 1303 btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); 1304 if (!btrfs_is_zoned(fs_info)) 1305 return; 1306 /* 1307 * For zoned writeback, queue depth must be 1, thus we must wait for 1308 * the write to finish before the next write. 
1309 */ 1310 wait_scrub_stripe_io(stripe); 1311 1312 /* 1313 * And also need to update the write pointer if write finished 1314 * successfully. 1315 */ 1316 if (!test_bit(bio_off >> fs_info->sectorsize_bits, 1317 &stripe->write_error_bitmap)) 1318 sctx->write_pointer += bio_len; 1319 } 1320 1321 /* 1322 * Submit the write bio(s) for the sectors specified by @write_bitmap. 1323 * 1324 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: 1325 * 1326 * - Only needs logical bytenr and mirror_num 1327 * Just like the scrub read path 1328 * 1329 * - Would only result in writes to the specified mirror 1330 * Unlike the regular writeback path, which would write back to all stripes 1331 * 1332 * - Handle dev-replace and read-repair writeback differently 1333 */ 1334 static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, 1335 unsigned long write_bitmap, bool dev_replace) 1336 { 1337 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1338 struct btrfs_bio *bbio = NULL; 1339 int sector_nr; 1340 1341 for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { 1342 /* We should only writeback sectors covered by an extent. */ 1343 ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr)); 1344 1345 /* Cannot merge with previous sector, submit the current one. */ 1346 if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { 1347 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); 1348 bbio = NULL; 1349 } 1350 if (!bbio) 1351 bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE, 1352 stripe->logical + (sector_nr << fs_info->sectorsize_bits), 1353 scrub_write_endio, stripe); 1354 scrub_bio_add_sector(bbio, stripe, sector_nr); 1355 } 1356 if (bbio) 1357 scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); 1358 } 1359 1360 /* 1361 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 1362 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. 
1363 */ 1364 static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, 1365 unsigned int bio_size) 1366 { 1367 const int time_slice = 1000; 1368 s64 delta; 1369 ktime_t now; 1370 u32 div; 1371 u64 bwlimit; 1372 1373 bwlimit = READ_ONCE(device->scrub_speed_max); 1374 if (bwlimit == 0) 1375 return; 1376 1377 /* 1378 * Slice is divided into intervals when the IO is submitted, adjust by 1379 * bwlimit and maximum of 64 intervals. 1380 */ 1381 div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); 1382 1383 /* Start new epoch, set deadline */ 1384 now = ktime_get(); 1385 if (sctx->throttle_deadline == 0) { 1386 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); 1387 sctx->throttle_sent = 0; 1388 } 1389 1390 /* Still in the time to send? */ 1391 if (ktime_before(now, sctx->throttle_deadline)) { 1392 /* If current bio is within the limit, send it */ 1393 sctx->throttle_sent += bio_size; 1394 if (sctx->throttle_sent <= div_u64(bwlimit, div)) 1395 return; 1396 1397 /* We're over the limit, sleep until the rest of the slice */ 1398 delta = ktime_ms_delta(sctx->throttle_deadline, now); 1399 } else { 1400 /* New request after deadline, start new epoch */ 1401 delta = 0; 1402 } 1403 1404 if (delta) { 1405 long timeout; 1406 1407 timeout = div_u64(delta * HZ, 1000); 1408 schedule_timeout_interruptible(timeout); 1409 } 1410 1411 /* Next call will start the deadline period */ 1412 sctx->throttle_deadline = 0; 1413 } 1414 1415 /* 1416 * Given a physical address, this will calculate it's 1417 * logical offset. if this is a parity stripe, it will return 1418 * the most left data stripe's logical offset. 1419 * 1420 * return 0 if it is a data stripe, 1 means parity stripe. 
1421 */ 1422 static int get_raid56_logic_offset(u64 physical, int num, 1423 struct btrfs_chunk_map *map, u64 *offset, 1424 u64 *stripe_start) 1425 { 1426 int i; 1427 int j = 0; 1428 u64 last_offset; 1429 const int data_stripes = nr_data_stripes(map); 1430 1431 last_offset = (physical - map->stripes[num].physical) * data_stripes; 1432 if (stripe_start) 1433 *stripe_start = last_offset; 1434 1435 *offset = last_offset; 1436 for (i = 0; i < data_stripes; i++) { 1437 u32 stripe_nr; 1438 u32 stripe_index; 1439 u32 rot; 1440 1441 *offset = last_offset + btrfs_stripe_nr_to_offset(i); 1442 1443 stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; 1444 1445 /* Work out the disk rotation on this stripe-set */ 1446 rot = stripe_nr % map->num_stripes; 1447 /* calculate which stripe this data locates */ 1448 rot += i; 1449 stripe_index = rot % map->num_stripes; 1450 if (stripe_index == num) 1451 return 0; 1452 if (stripe_index < num) 1453 j++; 1454 } 1455 *offset = last_offset + btrfs_stripe_nr_to_offset(j); 1456 return 1; 1457 } 1458 1459 /* 1460 * Return 0 if the extent item range covers any byte of the range. 1461 * Return <0 if the extent item is before @search_start. 1462 * Return >0 if the extent item is after @start_start + @search_len. 
1463 */ 1464 static int compare_extent_item_range(struct btrfs_path *path, 1465 u64 search_start, u64 search_len) 1466 { 1467 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; 1468 u64 len; 1469 struct btrfs_key key; 1470 1471 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1472 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || 1473 key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type); 1474 if (key.type == BTRFS_METADATA_ITEM_KEY) 1475 len = fs_info->nodesize; 1476 else 1477 len = key.offset; 1478 1479 if (key.objectid + len <= search_start) 1480 return -1; 1481 if (key.objectid >= search_start + search_len) 1482 return 1; 1483 return 0; 1484 } 1485 1486 /* 1487 * Locate one extent item which covers any byte in range 1488 * [@search_start, @search_start + @search_length) 1489 * 1490 * If the path is not initialized, we will initialize the search by doing 1491 * a btrfs_search_slot(). 1492 * If the path is already initialized, we will use the path as the initial 1493 * slot, to avoid duplicated btrfs_search_slot() calls. 1494 * 1495 * NOTE: If an extent item starts before @search_start, we will still 1496 * return the extent item. This is for data extent crossing stripe boundary. 1497 * 1498 * Return 0 if we found such extent item, and @path will point to the extent item. 1499 * Return >0 if no such extent item can be found, and @path will be released. 1500 * Return <0 if hit fatal error, and @path will be released. 
1501 */ 1502 static int find_first_extent_item(struct btrfs_root *extent_root, 1503 struct btrfs_path *path, 1504 u64 search_start, u64 search_len) 1505 { 1506 struct btrfs_fs_info *fs_info = extent_root->fs_info; 1507 struct btrfs_key key; 1508 int ret; 1509 1510 /* Continue using the existing path */ 1511 if (path->nodes[0]) 1512 goto search_forward; 1513 1514 key.objectid = search_start; 1515 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1516 key.type = BTRFS_METADATA_ITEM_KEY; 1517 else 1518 key.type = BTRFS_EXTENT_ITEM_KEY; 1519 key.offset = (u64)-1; 1520 1521 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 1522 if (ret < 0) 1523 return ret; 1524 if (unlikely(ret == 0)) { 1525 /* 1526 * Key with offset -1 found, there would have to exist an extent 1527 * item with such offset, but this is out of the valid range. 1528 */ 1529 btrfs_release_path(path); 1530 return -EUCLEAN; 1531 } 1532 1533 /* 1534 * Here we intentionally pass 0 as @min_objectid, as there could be 1535 * an extent item starting before @search_start. 1536 */ 1537 ret = btrfs_previous_extent_item(extent_root, path, 0); 1538 if (ret < 0) 1539 return ret; 1540 /* 1541 * No matter whether we have found an extent item, the next loop will 1542 * properly do every check on the key. 1543 */ 1544 search_forward: 1545 while (true) { 1546 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1547 if (key.objectid >= search_start + search_len) 1548 break; 1549 if (key.type != BTRFS_METADATA_ITEM_KEY && 1550 key.type != BTRFS_EXTENT_ITEM_KEY) 1551 goto next; 1552 1553 ret = compare_extent_item_range(path, search_start, search_len); 1554 if (ret == 0) 1555 return ret; 1556 if (ret > 0) 1557 break; 1558 next: 1559 ret = btrfs_next_item(extent_root, path); 1560 if (ret) { 1561 /* Either no more items or a fatal error. 
*/ 1562 btrfs_release_path(path); 1563 return ret; 1564 } 1565 } 1566 btrfs_release_path(path); 1567 return 1; 1568 } 1569 1570 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, 1571 u64 *size_ret, u64 *flags_ret, u64 *generation_ret) 1572 { 1573 struct btrfs_key key; 1574 struct btrfs_extent_item *ei; 1575 1576 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1577 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || 1578 key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type); 1579 *extent_start_ret = key.objectid; 1580 if (key.type == BTRFS_METADATA_ITEM_KEY) 1581 *size_ret = path->nodes[0]->fs_info->nodesize; 1582 else 1583 *size_ret = key.offset; 1584 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); 1585 *flags_ret = btrfs_extent_flags(path->nodes[0], ei); 1586 *generation_ret = btrfs_extent_generation(path->nodes[0], ei); 1587 } 1588 1589 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, 1590 u64 physical, u64 physical_end) 1591 { 1592 struct btrfs_fs_info *fs_info = sctx->fs_info; 1593 int ret = 0; 1594 1595 if (!btrfs_is_zoned(fs_info)) 1596 return 0; 1597 1598 mutex_lock(&sctx->wr_lock); 1599 if (sctx->write_pointer < physical_end) { 1600 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, 1601 physical, 1602 sctx->write_pointer); 1603 if (ret) 1604 btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); 1605 } 1606 mutex_unlock(&sctx->wr_lock); 1607 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); 1608 1609 return ret; 1610 } 1611 1612 static void fill_one_extent_info(struct btrfs_fs_info *fs_info, 1613 struct scrub_stripe *stripe, 1614 u64 extent_start, u64 extent_len, 1615 u64 extent_flags, u64 extent_gen) 1616 { 1617 for (u64 cur_logical = max(stripe->logical, extent_start); 1618 cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, 1619 extent_start + extent_len); 1620 cur_logical += fs_info->sectorsize) { 1621 const int nr_sector = 
(cur_logical - stripe->logical) >> 1622 fs_info->sectorsize_bits; 1623 struct scrub_sector_verification *sector = 1624 &stripe->sectors[nr_sector]; 1625 1626 scrub_bitmap_set_bit_has_extent(stripe, nr_sector); 1627 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1628 scrub_bitmap_set_bit_is_metadata(stripe, nr_sector); 1629 sector->generation = extent_gen; 1630 } 1631 } 1632 } 1633 1634 static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) 1635 { 1636 ASSERT(stripe->nr_sectors); 1637 bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors); 1638 } 1639 1640 /* 1641 * Locate one stripe which has at least one extent in its range. 1642 * 1643 * Return 0 if found such stripe, and store its info into @stripe. 1644 * Return >0 if there is no such stripe in the specified range. 1645 * Return <0 for error. 1646 */ 1647 static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, 1648 struct btrfs_path *extent_path, 1649 struct btrfs_path *csum_path, 1650 struct btrfs_device *dev, u64 physical, 1651 int mirror_num, u64 logical_start, 1652 u32 logical_len, 1653 struct scrub_stripe *stripe) 1654 { 1655 struct btrfs_fs_info *fs_info = bg->fs_info; 1656 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); 1657 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); 1658 const u64 logical_end = logical_start + logical_len; 1659 u64 cur_logical = logical_start; 1660 u64 stripe_end; 1661 u64 extent_start; 1662 u64 extent_len; 1663 u64 extent_flags; 1664 u64 extent_gen; 1665 int ret; 1666 1667 if (unlikely(!extent_root || !csum_root)) { 1668 btrfs_err(fs_info, "scrub: no valid extent or csum root found"); 1669 return -EUCLEAN; 1670 } 1671 memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * 1672 stripe->nr_sectors); 1673 scrub_stripe_reset_bitmaps(stripe); 1674 1675 /* The range must be inside the bg. 
*/ 1676 ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg), 1677 "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", 1678 bg->start, logical_start, logical_end, btrfs_block_group_end(bg)); 1679 1680 ret = find_first_extent_item(extent_root, extent_path, logical_start, 1681 logical_len); 1682 /* Either error or not found. */ 1683 if (ret) 1684 return ret; 1685 get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, 1686 &extent_gen); 1687 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1688 stripe->nr_meta_extents++; 1689 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) 1690 stripe->nr_data_extents++; 1691 cur_logical = max(extent_start, cur_logical); 1692 1693 /* 1694 * Round down to stripe boundary. 1695 * 1696 * The extra calculation against bg->start is to handle block groups 1697 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. 1698 */ 1699 stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + 1700 bg->start; 1701 stripe->physical = physical + stripe->logical - logical_start; 1702 stripe->dev = dev; 1703 stripe->bg = bg; 1704 stripe->mirror_num = mirror_num; 1705 stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; 1706 1707 /* Fill the first extent info into stripe->sectors[] array. */ 1708 fill_one_extent_info(fs_info, stripe, extent_start, extent_len, 1709 extent_flags, extent_gen); 1710 cur_logical = extent_start + extent_len; 1711 1712 /* Fill the extent info for the remaining sectors. 
*/ 1713 while (cur_logical <= stripe_end) { 1714 ret = find_first_extent_item(extent_root, extent_path, cur_logical, 1715 stripe_end - cur_logical + 1); 1716 if (ret < 0) 1717 return ret; 1718 if (ret > 0) { 1719 ret = 0; 1720 break; 1721 } 1722 get_extent_info(extent_path, &extent_start, &extent_len, 1723 &extent_flags, &extent_gen); 1724 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1725 stripe->nr_meta_extents++; 1726 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) 1727 stripe->nr_data_extents++; 1728 fill_one_extent_info(fs_info, stripe, extent_start, extent_len, 1729 extent_flags, extent_gen); 1730 cur_logical = extent_start + extent_len; 1731 } 1732 1733 /* Now fill the data csum. */ 1734 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { 1735 int sector_nr; 1736 unsigned long csum_bitmap = 0; 1737 1738 /* Csum space should have already been allocated. */ 1739 ASSERT(stripe->csums); 1740 1741 /* 1742 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN 1743 * should contain at most 16 sectors. 
1744 */ 1745 ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 1746 1747 ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, 1748 stripe->logical, stripe_end, 1749 stripe->csums, &csum_bitmap); 1750 if (ret < 0) 1751 return ret; 1752 if (ret > 0) 1753 ret = 0; 1754 1755 for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { 1756 stripe->sectors[sector_nr].csum = stripe->csums + 1757 sector_nr * fs_info->csum_size; 1758 } 1759 } 1760 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 1761 1762 return ret; 1763 } 1764 1765 static void scrub_reset_stripe(struct scrub_stripe *stripe) 1766 { 1767 scrub_stripe_reset_bitmaps(stripe); 1768 1769 stripe->nr_meta_extents = 0; 1770 stripe->nr_data_extents = 0; 1771 stripe->state = 0; 1772 1773 for (int i = 0; i < stripe->nr_sectors; i++) { 1774 stripe->sectors[i].csum = NULL; 1775 stripe->sectors[i].generation = 0; 1776 } 1777 } 1778 1779 static u32 stripe_length(const struct scrub_stripe *stripe) 1780 { 1781 ASSERT(stripe->bg); 1782 1783 return min(BTRFS_STRIPE_LEN, 1784 stripe->bg->start + stripe->bg->length - stripe->logical); 1785 } 1786 1787 static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) 1788 { 1789 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1790 struct btrfs_bio *bbio = NULL; 1791 unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; 1792 const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); 1793 u64 stripe_len = BTRFS_STRIPE_LEN; 1794 int mirror = stripe->mirror_num; 1795 int i; 1796 1797 atomic_inc(&stripe->pending_io); 1798 1799 for_each_set_bit(i, &has_extent, stripe->nr_sectors) { 1800 /* We're beyond the chunk boundary, no need to read anymore. */ 1801 if (i >= nr_sectors) 1802 break; 1803 1804 /* The current sector cannot be merged, submit the bio. 
*/ 1805 if (bbio && 1806 ((i > 0 && !test_bit(i - 1, &has_extent)) || 1807 bbio->bio.bi_iter.bi_size >= stripe_len)) { 1808 ASSERT(bbio->bio.bi_iter.bi_size); 1809 atomic_inc(&stripe->pending_io); 1810 btrfs_submit_bbio(bbio, mirror); 1811 bbio = NULL; 1812 } 1813 1814 if (!bbio) { 1815 struct btrfs_io_stripe io_stripe = {}; 1816 struct btrfs_io_context *bioc = NULL; 1817 const u64 logical = stripe->logical + 1818 (i << fs_info->sectorsize_bits); 1819 int ret; 1820 1821 io_stripe.rst_search_commit_root = true; 1822 stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; 1823 /* 1824 * For RST cases, we need to manually split the bbio to 1825 * follow the RST boundary. 1826 */ 1827 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 1828 &stripe_len, &bioc, &io_stripe, &mirror); 1829 btrfs_put_bioc(bioc); 1830 if (ret < 0) { 1831 if (ret != -ENODATA) { 1832 /* 1833 * Earlier btrfs_get_raid_extent_offset() 1834 * returned -ENODATA, which means there's 1835 * no entry for the corresponding range 1836 * in the stripe tree. But if it's in 1837 * the extent tree, then it's a preallocated 1838 * extent and not an error. 
*/
					scrub_bitmap_set_bit_io_error(stripe, i);
					scrub_bitmap_set_bit_error(stripe, i);
				}
				continue;
			}

			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
						logical, scrub_read_endio, stripe);
		}

		scrub_bio_add_sector(bbio, stripe, i);
	}

	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
	}

	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}

/*
 * Submit the initial read for @stripe.
 *
 * When btrfs_need_stripe_tree_update() is true for the stripe's block group
 * the read is delegated to scrub_submit_extent_sector_read().  Otherwise a
 * single read bio is built from every sector of the stripe
 * (stripe_length() >> sectorsize_bits of them), @stripe->pending_io is bumped
 * and the bio is submitted with scrub_read_endio() as the end_io handler.
 *
 * The stripe must already have SCRUB_STRIPE_FLAG_INITIALIZED set, a block
 * group attached and a mirror number > 0.
 */
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
				      struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_bio *bbio;
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
	int mirror = stripe->mirror_num;

	ASSERT(stripe->bg);
	ASSERT(stripe->mirror_num > 0);
	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));

	if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
		scrub_submit_extent_sector_read(stripe);
		return;
	}

	bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ,
				stripe->logical, scrub_read_endio, stripe);
	/* Read the whole range inside the chunk boundary. */
	for (unsigned int cur = 0; cur < nr_sectors; cur++)
		scrub_bio_add_sector(bbio, stripe, cur);
	atomic_inc(&stripe->pending_io);

	/*
	 * For dev-replace, either user asks to avoid the source dev, or
	 * the device is missing, we try the next mirror instead.
	 */
	if (sctx->is_dev_replace &&
	    (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
	     !stripe->dev->bdev)) {
		int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
						  stripe->bg->length);

		mirror = calc_next_mirror(mirror, num_copies);
	}
	btrfs_submit_bbio(bbio, mirror);
}

/*
 * Check whether @stripe still has an error on a metadata sector.
 *
 * Walks the set bits of the stripe's error bitmap; on the first one that is
 * also marked as metadata an error message is logged and true is returned.
 * Returns false when no errored sector is metadata.
 */
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
	const unsigned long error = scrub_bitmap_read_error(stripe);
	int i;

	for_each_set_bit(i, &error, stripe->nr_sectors) {
		if (scrub_bitmap_test_bit_is_metadata(stripe, i)) {
			struct btrfs_fs_info *fs_info = stripe->bg->fs_info;

			btrfs_err(fs_info,
		    "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
				  stripe->logical,
				  stripe->logical + (i << fs_info->sectorsize_bits));
			return true;
		}
	}
	return false;
}

/*
 * Submit the initial reads for @nr_stripes consecutive slots of
 * @sctx->stripes, starting at @first_slot.
 *
 * The submission is throttled through scrub_throttle_dev_io() (using the
 * device of slot 0 and the byte size of @nr_stripes stripes) and is wrapped in
 * one block plug.  Every stripe in the range must already be initialized.
 */
static void submit_initial_group_read(struct scrub_ctx *sctx,
				      unsigned int first_slot,
				      unsigned int nr_stripes)
{
	struct blk_plug plug;

	ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
	ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);

	scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
			      btrfs_stripe_nr_to_offset(nr_stripes));
	blk_start_plug(&plug);
	for (int i = 0; i < nr_stripes; i++) {
		struct scrub_stripe *stripe = &sctx->stripes[first_slot + i];

		/* Those stripes should be initialized. */
		ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
		scrub_submit_initial_read(sctx, stripe);
	}
	blk_finish_plug(&plug);
}

/*
 * Flush every stripe currently queued in @sctx->stripes.
 *
 * Steps, in order:
 * - submit the trailing, not yet submitted partial group (if any);
 * - wait until every queued stripe has SCRUB_STRIPE_FLAG_REPAIR_DONE set;
 * - for dev-replace only: bail out with -EIO if any stripe still has a
 *   metadata error, otherwise write out the sectors that have an extent and
 *   no error through scrub_write_sectors();
 * - wait for the stripe IO, update stat.last_physical and reset each stripe.
 *
 * @sctx->cur_stripe is reset to 0 on every path that had stripes queued.  Note
 * that the -EIO path jumps straight to that reset, skipping the final
 * wait/reset loop.
 *
 * Return 0 on success (or when nothing was queued), -EIO as described above.
 */
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_stripe *stripe;
	const int nr_stripes = sctx->cur_stripe;
	int ret = 0;

	if (!nr_stripes)
		return 0;

	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

	/* Submit the stripes which are populated but not submitted. */
	if (nr_stripes % SCRUB_STRIPES_PER_GROUP) {
		const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP);

		submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot);
	}

	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_event(stripe->repair_wait,
			   test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
	}

	/* Submit for dev-replace. */
	if (sctx->is_dev_replace) {
		/*
		 * For dev-replace, if we know there is something wrong with
		 * metadata, we should immediately abort.
		 */
		for (int i = 0; i < nr_stripes; i++) {
			if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) {
				ret = -EIO;
				goto out;
			}
		}
		for (int i = 0; i < nr_stripes; i++) {
			unsigned long good;
			unsigned long has_extent;
			unsigned long error;

			stripe = &sctx->stripes[i];

			ASSERT(stripe->dev == fs_info->dev_replace.srcdev);

			has_extent = scrub_bitmap_read_has_extent(stripe);
			error = scrub_bitmap_read_error(stripe);
			/* good = sectors covered by an extent and free of errors. */
			bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors);
			scrub_write_sectors(sctx, stripe, good, true);
		}
	}

	/* Wait for the above writebacks to finish. */
	for (int i = 0; i < nr_stripes; i++) {
		stripe = &sctx->stripes[i];

		wait_scrub_stripe_io(stripe);
		spin_lock(&sctx->stat_lock);
		sctx->stat.last_physical = stripe->physical + stripe_length(stripe);
		spin_unlock(&sctx->stat_lock);
		scrub_reset_stripe(stripe);
	}
out:
	sctx->cur_stripe = 0;
	return ret;
}

/* bio end_io callback: signal the completion stored in @bio->bi_private. */
static void raid56_scrub_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

/*
 * Fill the next free slot of @sctx->stripes with the first stripe found in
 * the given logical range and queue it.
 *
 * On success the logical start of the found stripe is stored in
 * @found_logical_ret (which must not be NULL).  Once a full group of
 * SCRUB_STRIPES_PER_GROUP slots is filled the group is submitted, and once
 * all SCRUB_TOTAL_STRIPES slots are used everything is flushed.
 *
 * Return 0 if a stripe was queued (or the result of flush_scrub_stripes()
 * when the last slot was used), otherwise the non-zero result of
 * scrub_find_fill_first_stripe() (>0 for no more extents, <0 for error).
 */
static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
			      struct btrfs_device *dev, int mirror_num,
			      u64 logical, u32 length, u64 physical,
			      u64 *found_logical_ret)
{
	struct scrub_stripe *stripe;
	int ret;

	/*
	 * There should always be one slot left, as caller filling the last
	 * slot should flush them all.
	 */
	ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);

	/* @found_logical_ret must be specified. */
	ASSERT(found_logical_ret);

	stripe = &sctx->stripes[sctx->cur_stripe];
	scrub_reset_stripe(stripe);
	ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
					   &sctx->csum_path, dev, physical,
					   mirror_num, logical, length, stripe);
	/* Either >0 as no more extents or <0 for error. */
	if (ret)
		return ret;
	*found_logical_ret = stripe->logical;
	sctx->cur_stripe++;

	/* We filled one group, submit it. */
	if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) {
		const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP;

		submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP);
	}

	/* Last slot used, flush them all. */
	if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES)
		return flush_scrub_stripes(sctx);
	return 0;
}

/*
 * Return 0 if we should not cancel the scrub.
2065 * Return <0 if we need to cancel the scrub, returned value will 2066 * indicate the reason: 2067 * - -ECANCELED - Being explicitly canceled through ioctl. 2068 * - -EINTR - Being interrupted by signal or fs/process freezing. 2069 */ 2070 static int should_cancel_scrub(const struct scrub_ctx *sctx) 2071 { 2072 struct btrfs_fs_info *fs_info = sctx->fs_info; 2073 2074 if (atomic_read(&fs_info->scrub_cancel_req) || 2075 atomic_read(&sctx->cancel_req)) 2076 return -ECANCELED; 2077 2078 /* 2079 * The user (e.g. fsfreeze command) or power management (PM) 2080 * suspend/hibernate can freeze the fs. And PM suspend/hibernate will 2081 * also freeze all user processes. 2082 * 2083 * A user process can only be frozen when it is in user space, thus we 2084 * have to cancel the run so that the process can return to the user 2085 * space. 2086 * 2087 * Furthermore we have to check both filesystem and process freezing, 2088 * as PM can be configured to freeze the filesystems before processes. 2089 * 2090 * If we only check fs freezing, then suspend without fs freezing 2091 * will timeout, as the process is still in kernel space. 2092 * 2093 * If we only check process freezing, then suspend with fs freezing 2094 * will timeout, as the running scrub will prevent the fs from being frozen. 
2095 */ 2096 if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || 2097 freezing(current) || signal_pending(current)) 2098 return -EINTR; 2099 return 0; 2100 } 2101 2102 static int scrub_raid56_cached_parity(struct scrub_ctx *sctx, 2103 struct btrfs_device *scrub_dev, 2104 struct btrfs_chunk_map *map, 2105 u64 full_stripe_start, 2106 unsigned long *extent_bitmap) 2107 { 2108 DECLARE_COMPLETION_ONSTACK(io_done); 2109 struct btrfs_fs_info *fs_info = sctx->fs_info; 2110 struct btrfs_io_context *bioc = NULL; 2111 struct btrfs_raid_bio *rbio; 2112 struct bio bio; 2113 const int data_stripes = nr_data_stripes(map); 2114 u64 length = btrfs_stripe_nr_to_offset(data_stripes); 2115 int ret; 2116 2117 bio_init(&bio, NULL, NULL, 0, REQ_OP_READ); 2118 bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; 2119 bio.bi_private = &io_done; 2120 bio.bi_end_io = raid56_scrub_wait_endio; 2121 2122 btrfs_bio_counter_inc_blocked(fs_info); 2123 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, 2124 &length, &bioc, NULL, NULL); 2125 if (ret < 0) 2126 goto out; 2127 /* For RAID56 write there must be an @bioc allocated. */ 2128 ASSERT(bioc); 2129 rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap, 2130 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 2131 btrfs_put_bioc(bioc); 2132 if (!rbio) { 2133 ret = -ENOMEM; 2134 goto out; 2135 } 2136 /* Use the recovered stripes as cache to avoid read them from disk again. 
*/ 2137 for (int i = 0; i < data_stripes; i++) { 2138 struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; 2139 2140 raid56_parity_cache_data_folios(rbio, stripe->folios, 2141 full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); 2142 } 2143 raid56_parity_submit_scrub_rbio(rbio); 2144 wait_for_completion_io(&io_done); 2145 ret = blk_status_to_errno(bio.bi_status); 2146 out: 2147 btrfs_bio_counter_dec(fs_info); 2148 bio_uninit(&bio); 2149 return ret; 2150 } 2151 2152 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, 2153 struct btrfs_device *scrub_dev, 2154 struct btrfs_block_group *bg, 2155 struct btrfs_chunk_map *map, 2156 u64 full_stripe_start) 2157 { 2158 struct btrfs_fs_info *fs_info = sctx->fs_info; 2159 BTRFS_PATH_AUTO_RELEASE(extent_path); 2160 BTRFS_PATH_AUTO_RELEASE(csum_path); 2161 struct scrub_stripe *stripe; 2162 bool all_empty = true; 2163 const int data_stripes = nr_data_stripes(map); 2164 unsigned long extent_bitmap = 0; 2165 int ret; 2166 2167 ASSERT(sctx->raid56_data_stripes); 2168 2169 ret = should_cancel_scrub(sctx); 2170 if (ret < 0) 2171 return ret; 2172 2173 if (atomic_read(&fs_info->scrub_pause_req)) 2174 scrub_blocked_if_needed(fs_info); 2175 2176 spin_lock(&bg->lock); 2177 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2178 spin_unlock(&bg->lock); 2179 return 0; 2180 } 2181 spin_unlock(&bg->lock); 2182 2183 /* 2184 * For data stripe search, we cannot reuse the same extent/csum paths, 2185 * as the data stripe bytenr may be smaller than previous extent. Thus 2186 * we have to use our own extent/csum paths. 
2187 */ 2188 extent_path.search_commit_root = true; 2189 extent_path.skip_locking = true; 2190 csum_path.search_commit_root = true; 2191 csum_path.skip_locking = true; 2192 2193 for (int i = 0; i < data_stripes; i++) { 2194 int stripe_index; 2195 int rot; 2196 u64 physical; 2197 2198 stripe = &sctx->raid56_data_stripes[i]; 2199 rot = div_u64(full_stripe_start - bg->start, 2200 data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; 2201 stripe_index = (i + rot) % map->num_stripes; 2202 physical = map->stripes[stripe_index].physical + 2203 btrfs_stripe_nr_to_offset(rot); 2204 2205 scrub_reset_stripe(stripe); 2206 set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); 2207 ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, 2208 map->stripes[stripe_index].dev, physical, 1, 2209 full_stripe_start + btrfs_stripe_nr_to_offset(i), 2210 BTRFS_STRIPE_LEN, stripe); 2211 if (ret < 0) 2212 return ret; 2213 /* 2214 * No extent in this data stripe, need to manually mark them 2215 * initialized to make later read submission happy. 2216 */ 2217 if (ret > 0) { 2218 stripe->logical = full_stripe_start + 2219 btrfs_stripe_nr_to_offset(i); 2220 stripe->dev = map->stripes[stripe_index].dev; 2221 stripe->mirror_num = 1; 2222 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 2223 } 2224 } 2225 2226 /* Check if all data stripes are empty. */ 2227 for (int i = 0; i < data_stripes; i++) { 2228 stripe = &sctx->raid56_data_stripes[i]; 2229 if (!scrub_bitmap_empty_has_extent(stripe)) { 2230 all_empty = false; 2231 break; 2232 } 2233 } 2234 if (all_empty) 2235 return 0; 2236 2237 for (int i = 0; i < data_stripes; i++) { 2238 stripe = &sctx->raid56_data_stripes[i]; 2239 scrub_submit_initial_read(sctx, stripe); 2240 } 2241 for (int i = 0; i < data_stripes; i++) { 2242 stripe = &sctx->raid56_data_stripes[i]; 2243 2244 wait_event(stripe->repair_wait, 2245 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); 2246 } 2247 /* For now, no zoned support for RAID56. 
*/ 2248 ASSERT(!btrfs_is_zoned(sctx->fs_info)); 2249 2250 /* 2251 * Now all data stripes are properly verified. Check if we have any 2252 * unrepaired, if so abort immediately or we could further corrupt the 2253 * P/Q stripes. 2254 * 2255 * During the loop, also populate extent_bitmap. 2256 */ 2257 for (int i = 0; i < data_stripes; i++) { 2258 unsigned long error; 2259 unsigned long has_extent; 2260 2261 stripe = &sctx->raid56_data_stripes[i]; 2262 2263 error = scrub_bitmap_read_error(stripe); 2264 has_extent = scrub_bitmap_read_has_extent(stripe); 2265 2266 /* 2267 * We should only check the errors where there is an extent. 2268 * As we may hit an empty data stripe while it's missing. 2269 */ 2270 bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); 2271 if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { 2272 btrfs_err(fs_info, 2273 "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", 2274 full_stripe_start, i, stripe->nr_sectors, 2275 &error); 2276 return ret; 2277 } 2278 bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, 2279 stripe->nr_sectors); 2280 } 2281 2282 /* Now we can check and regenerate the P/Q stripe. */ 2283 return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, 2284 &extent_bitmap); 2285 } 2286 2287 /* 2288 * Scrub one range which can only has simple mirror based profile. 2289 * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in 2290 * RAID0/RAID10). 2291 * 2292 * Since we may need to handle a subset of block group, we need @logical_start 2293 * and @logical_length parameter. 
2294 */ 2295 static int scrub_simple_mirror(struct scrub_ctx *sctx, 2296 struct btrfs_block_group *bg, 2297 u64 logical_start, u64 logical_length, 2298 struct btrfs_device *device, 2299 u64 physical, int mirror_num) 2300 { 2301 struct btrfs_fs_info *fs_info = sctx->fs_info; 2302 const u64 logical_end = logical_start + logical_length; 2303 u64 cur_logical = logical_start; 2304 int ret = 0; 2305 2306 /* The range must be inside the bg */ 2307 ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg)); 2308 2309 /* Go through each extent items inside the logical range */ 2310 while (cur_logical < logical_end) { 2311 u64 found_logical = U64_MAX; 2312 u64 cur_physical = physical + cur_logical - logical_start; 2313 2314 ret = should_cancel_scrub(sctx); 2315 if (ret < 0) 2316 break; 2317 2318 if (atomic_read(&fs_info->scrub_pause_req)) 2319 scrub_blocked_if_needed(fs_info); 2320 2321 spin_lock(&bg->lock); 2322 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2323 spin_unlock(&bg->lock); 2324 ret = 0; 2325 break; 2326 } 2327 spin_unlock(&bg->lock); 2328 2329 ret = queue_scrub_stripe(sctx, bg, device, mirror_num, 2330 cur_logical, logical_end - cur_logical, 2331 cur_physical, &found_logical); 2332 if (ret > 0) { 2333 /* No more extent, just update the accounting */ 2334 spin_lock(&sctx->stat_lock); 2335 sctx->stat.last_physical = physical + logical_length; 2336 spin_unlock(&sctx->stat_lock); 2337 ret = 0; 2338 break; 2339 } 2340 if (ret < 0) 2341 break; 2342 2343 /* queue_scrub_stripe() returned 0, @found_logical must be updated. 
*/ 2344 ASSERT(found_logical != U64_MAX); 2345 cur_logical = found_logical + BTRFS_STRIPE_LEN; 2346 2347 /* Don't hold CPU for too long time */ 2348 cond_resched(); 2349 } 2350 return ret; 2351 } 2352 2353 /* Calculate the full stripe length for simple stripe based profiles */ 2354 static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) 2355 { 2356 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2357 BTRFS_BLOCK_GROUP_RAID10)); 2358 2359 return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); 2360 } 2361 2362 /* Get the logical bytenr for the stripe */ 2363 static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, 2364 struct btrfs_block_group *bg, 2365 int stripe_index) 2366 { 2367 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2368 BTRFS_BLOCK_GROUP_RAID10)); 2369 ASSERT(stripe_index < map->num_stripes); 2370 2371 /* 2372 * (stripe_index / sub_stripes) gives how many data stripes we need to 2373 * skip. 2374 */ 2375 return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + 2376 bg->start; 2377 } 2378 2379 /* Get the mirror number for the stripe */ 2380 static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) 2381 { 2382 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2383 BTRFS_BLOCK_GROUP_RAID10)); 2384 ASSERT(stripe_index < map->num_stripes); 2385 2386 /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... 
*/ 2387 return stripe_index % map->sub_stripes + 1; 2388 } 2389 2390 static int scrub_simple_stripe(struct scrub_ctx *sctx, 2391 struct btrfs_block_group *bg, 2392 struct btrfs_chunk_map *map, 2393 struct btrfs_device *device, 2394 int stripe_index) 2395 { 2396 const u64 logical_increment = simple_stripe_full_stripe_len(map); 2397 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); 2398 const u64 orig_physical = map->stripes[stripe_index].physical; 2399 const u64 end = btrfs_block_group_end(bg); 2400 const int mirror_num = simple_stripe_mirror_num(map, stripe_index); 2401 u64 cur_logical = orig_logical; 2402 u64 cur_physical = orig_physical; 2403 int ret = 0; 2404 2405 while (cur_logical < end) { 2406 /* 2407 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is 2408 * just RAID1, so we can reuse scrub_simple_mirror() to scrub 2409 * this stripe. 2410 */ 2411 ret = scrub_simple_mirror(sctx, bg, cur_logical, 2412 BTRFS_STRIPE_LEN, device, cur_physical, 2413 mirror_num); 2414 if (ret) 2415 return ret; 2416 /* Skip to next stripe which belongs to the target device */ 2417 cur_logical += logical_increment; 2418 /* For physical offset, we just go to next stripe */ 2419 cur_physical += BTRFS_STRIPE_LEN; 2420 } 2421 return ret; 2422 } 2423 2424 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2425 struct btrfs_block_group *bg, 2426 struct btrfs_chunk_map *map, 2427 struct btrfs_device *scrub_dev, 2428 int stripe_index) 2429 { 2430 struct btrfs_fs_info *fs_info = sctx->fs_info; 2431 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; 2432 const u64 chunk_logical = bg->start; 2433 int ret; 2434 int ret2; 2435 u64 physical = map->stripes[stripe_index].physical; 2436 const u64 dev_stripe_len = btrfs_calc_stripe_length(map); 2437 const u64 physical_end = physical + dev_stripe_len; 2438 u64 logical; 2439 u64 logic_end; 2440 /* The logical increment after finishing one stripe */ 2441 u64 increment; 2442 /* Offset inside 
the chunk */ 2443 u64 offset; 2444 u64 stripe_logical; 2445 2446 /* Extent_path should be released by now. */ 2447 ASSERT(sctx->extent_path.nodes[0] == NULL); 2448 2449 scrub_blocked_if_needed(fs_info); 2450 2451 if (sctx->is_dev_replace && 2452 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { 2453 mutex_lock(&sctx->wr_lock); 2454 sctx->write_pointer = physical; 2455 mutex_unlock(&sctx->wr_lock); 2456 } 2457 2458 /* Prepare the extra data stripes used by RAID56. */ 2459 if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { 2460 ASSERT(sctx->raid56_data_stripes == NULL); 2461 2462 sctx->raid56_data_stripes = kzalloc_objs(struct scrub_stripe, 2463 nr_data_stripes(map)); 2464 if (!sctx->raid56_data_stripes) { 2465 ret = -ENOMEM; 2466 goto out; 2467 } 2468 for (int i = 0; i < nr_data_stripes(map); i++) { 2469 ret = init_scrub_stripe(fs_info, 2470 &sctx->raid56_data_stripes[i]); 2471 if (ret < 0) 2472 goto out; 2473 sctx->raid56_data_stripes[i].bg = bg; 2474 sctx->raid56_data_stripes[i].sctx = sctx; 2475 } 2476 } 2477 /* 2478 * There used to be a big double loop to handle all profiles using the 2479 * same routine, which grows larger and more gross over time. 2480 * 2481 * So here we handle each profile differently, so simpler profiles 2482 * have simpler scrubbing function. 2483 */ 2484 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | 2485 BTRFS_BLOCK_GROUP_RAID56_MASK))) { 2486 /* 2487 * Above check rules out all complex profile, the remaining 2488 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple 2489 * mirrored duplication without stripe. 2490 * 2491 * Only @physical and @mirror_num needs to calculated using 2492 * @stripe_index. 
2493 */ 2494 ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, 2495 scrub_dev, map->stripes[stripe_index].physical, 2496 stripe_index + 1); 2497 offset = 0; 2498 goto out; 2499 } 2500 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 2501 ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); 2502 offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes); 2503 goto out; 2504 } 2505 2506 /* Only RAID56 goes through the old code */ 2507 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); 2508 ret = 0; 2509 2510 /* Calculate the logical end of the stripe */ 2511 get_raid56_logic_offset(physical_end, stripe_index, 2512 map, &logic_end, NULL); 2513 logic_end += chunk_logical; 2514 2515 /* Initialize @offset in case we need to go to out: label */ 2516 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); 2517 increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 2518 2519 /* 2520 * Due to the rotation, for RAID56 it's better to iterate each stripe 2521 * using their physical offset. 2522 */ 2523 while (physical < physical_end) { 2524 ret = get_raid56_logic_offset(physical, stripe_index, map, 2525 &logical, &stripe_logical); 2526 logical += chunk_logical; 2527 if (ret) { 2528 /* it is parity strip */ 2529 stripe_logical += chunk_logical; 2530 ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, 2531 map, stripe_logical); 2532 spin_lock(&sctx->stat_lock); 2533 sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN, 2534 physical_end); 2535 spin_unlock(&sctx->stat_lock); 2536 if (ret) 2537 goto out; 2538 goto next; 2539 } 2540 2541 /* 2542 * Now we're at a data stripe, scrub each extents in the range. 2543 * 2544 * At this stage, if we ignore the repair part, inside each data 2545 * stripe it is no different than SINGLE profile. 2546 * We can reuse scrub_simple_mirror() here, as the repair part 2547 * is still based on @mirror_num. 
2548 */ 2549 ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, 2550 scrub_dev, physical, 1); 2551 if (ret < 0) 2552 goto out; 2553 next: 2554 logical += increment; 2555 physical += BTRFS_STRIPE_LEN; 2556 spin_lock(&sctx->stat_lock); 2557 sctx->stat.last_physical = physical; 2558 spin_unlock(&sctx->stat_lock); 2559 } 2560 out: 2561 ret2 = flush_scrub_stripes(sctx); 2562 if (!ret) 2563 ret = ret2; 2564 btrfs_release_path(&sctx->extent_path); 2565 btrfs_release_path(&sctx->csum_path); 2566 2567 if (sctx->raid56_data_stripes) { 2568 for (int i = 0; i < nr_data_stripes(map); i++) 2569 release_scrub_stripe(&sctx->raid56_data_stripes[i]); 2570 kfree(sctx->raid56_data_stripes); 2571 sctx->raid56_data_stripes = NULL; 2572 } 2573 2574 if (sctx->is_dev_replace && ret >= 0) { 2575 ret2 = sync_write_pointer_for_zoned(sctx, 2576 chunk_logical + offset, 2577 map->stripes[stripe_index].physical, 2578 physical_end); 2579 if (ret2) 2580 ret = ret2; 2581 } 2582 2583 return ret < 0 ? ret : 0; 2584 } 2585 2586 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 2587 struct btrfs_block_group *bg, 2588 struct btrfs_device *scrub_dev, 2589 u64 dev_offset, 2590 u64 dev_extent_len) 2591 { 2592 struct btrfs_fs_info *fs_info = sctx->fs_info; 2593 struct btrfs_chunk_map *map; 2594 int i; 2595 int ret = 0; 2596 2597 map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); 2598 if (!map) { 2599 /* 2600 * Might have been an unused block group deleted by the cleaner 2601 * kthread or relocation. 
2602 */ 2603 spin_lock(&bg->lock); 2604 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) 2605 ret = -EINVAL; 2606 spin_unlock(&bg->lock); 2607 2608 return ret; 2609 } 2610 if (map->start != bg->start) 2611 goto out; 2612 if (map->chunk_len < dev_extent_len) 2613 goto out; 2614 2615 for (i = 0; i < map->num_stripes; ++i) { 2616 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 2617 map->stripes[i].physical == dev_offset) { 2618 ret = scrub_stripe(sctx, bg, map, scrub_dev, i); 2619 if (ret) 2620 goto out; 2621 } 2622 } 2623 out: 2624 btrfs_free_chunk_map(map); 2625 2626 return ret; 2627 } 2628 2629 static int finish_extent_writes_for_zoned(struct btrfs_root *root, 2630 struct btrfs_block_group *cache) 2631 { 2632 struct btrfs_fs_info *fs_info = cache->fs_info; 2633 2634 if (!btrfs_is_zoned(fs_info)) 2635 return 0; 2636 2637 btrfs_wait_block_group_reservations(cache); 2638 btrfs_wait_nocow_writers(cache); 2639 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); 2640 2641 return btrfs_commit_current_transaction(root); 2642 } 2643 2644 static noinline_for_stack 2645 int scrub_enumerate_chunks(struct scrub_ctx *sctx, 2646 struct btrfs_device *scrub_dev, u64 start, u64 end) 2647 { 2648 struct btrfs_dev_extent *dev_extent = NULL; 2649 BTRFS_PATH_AUTO_FREE(path); 2650 struct btrfs_fs_info *fs_info = sctx->fs_info; 2651 struct btrfs_root *root = fs_info->dev_root; 2652 u64 chunk_offset; 2653 int ret = 0; 2654 int ro_set; 2655 int slot; 2656 struct extent_buffer *l; 2657 struct btrfs_key key; 2658 struct btrfs_key found_key; 2659 struct btrfs_block_group *cache; 2660 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 2661 2662 path = btrfs_alloc_path(); 2663 if (!path) 2664 return -ENOMEM; 2665 2666 path->reada = READA_FORWARD; 2667 path->search_commit_root = true; 2668 path->skip_locking = true; 2669 2670 key.objectid = scrub_dev->devid; 2671 key.type = BTRFS_DEV_EXTENT_KEY; 2672 key.offset = 0ull; 2673 2674 while (1) { 2675 u64 dev_extent_len; 2676 
2677 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2678 if (ret < 0) 2679 break; 2680 if (ret > 0) { 2681 if (path->slots[0] >= 2682 btrfs_header_nritems(path->nodes[0])) { 2683 ret = btrfs_next_leaf(root, path); 2684 if (ret < 0) 2685 break; 2686 if (ret > 0) { 2687 ret = 0; 2688 break; 2689 } 2690 } else { 2691 ret = 0; 2692 } 2693 } 2694 2695 l = path->nodes[0]; 2696 slot = path->slots[0]; 2697 2698 btrfs_item_key_to_cpu(l, &found_key, slot); 2699 2700 if (found_key.objectid != scrub_dev->devid) 2701 break; 2702 2703 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 2704 break; 2705 2706 if (found_key.offset >= end) 2707 break; 2708 2709 if (found_key.offset < key.offset) 2710 break; 2711 2712 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2713 dev_extent_len = btrfs_dev_extent_length(l, dev_extent); 2714 2715 if (found_key.offset + dev_extent_len <= start) 2716 goto skip; 2717 2718 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2719 2720 /* 2721 * get a reference on the corresponding block group to prevent 2722 * the chunk from going away while we scrub it 2723 */ 2724 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2725 2726 /* some chunks are removed but not committed to disk yet, 2727 * continue scrubbing */ 2728 if (!cache) 2729 goto skip; 2730 2731 ASSERT(cache->start <= chunk_offset); 2732 /* 2733 * We are using the commit root to search for device extents, so 2734 * that means we could have found a device extent item from a 2735 * block group that was deleted in the current transaction. The 2736 * logical start offset of the deleted block group, stored at 2737 * @chunk_offset, might be part of the logical address range of 2738 * a new block group (which uses different physical extents). 2739 * In this case btrfs_lookup_block_group() has returned the new 2740 * block group, and its start address is less than @chunk_offset. 
2741 * 2742 * We skip such new block groups, because it's pointless to 2743 * process them, as we won't find their extents because we search 2744 * for them using the commit root of the extent tree. For a device 2745 * replace it's also fine to skip it, we won't miss copying them 2746 * to the target device because we have the write duplication 2747 * setup through the regular write path (by btrfs_map_block()), 2748 * and we have committed a transaction when we started the device 2749 * replace, right after setting up the device replace state. 2750 */ 2751 if (cache->start < chunk_offset) { 2752 btrfs_put_block_group(cache); 2753 goto skip; 2754 } 2755 2756 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { 2757 if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { 2758 btrfs_put_block_group(cache); 2759 goto skip; 2760 } 2761 } 2762 2763 /* 2764 * Make sure that while we are scrubbing the corresponding block 2765 * group doesn't get its logical address and its device extents 2766 * reused for another block group, which can possibly be of a 2767 * different type and different profile. We do this to prevent 2768 * false error detections and crashes due to bogus attempts to 2769 * repair extents. 2770 */ 2771 spin_lock(&cache->lock); 2772 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { 2773 spin_unlock(&cache->lock); 2774 btrfs_put_block_group(cache); 2775 goto skip; 2776 } 2777 btrfs_freeze_block_group(cache); 2778 spin_unlock(&cache->lock); 2779 2780 /* 2781 * we need call btrfs_inc_block_group_ro() with scrubs_paused, 2782 * to avoid deadlock caused by: 2783 * btrfs_inc_block_group_ro() 2784 * -> btrfs_wait_for_commit() 2785 * -> btrfs_commit_transaction() 2786 * -> btrfs_scrub_pause() 2787 */ 2788 scrub_pause_on(fs_info); 2789 2790 /* 2791 * Don't do chunk preallocation for scrub. 2792 * 2793 * This is especially important for SYSTEM bgs, or we can hit 2794 * -EFBIG from btrfs_finish_chunk_alloc() like: 2795 * 1. 
The only SYSTEM bg is marked RO. 2796 * Since SYSTEM bg is small, that's pretty common. 2797 * 2. New SYSTEM bg will be allocated 2798 * Due to regular version will allocate new chunk. 2799 * 3. New SYSTEM bg is empty and will get cleaned up 2800 * Before cleanup really happens, it's marked RO again. 2801 * 4. Empty SYSTEM bg get scrubbed 2802 * We go back to 2. 2803 * 2804 * This can easily boost the amount of SYSTEM chunks if cleaner 2805 * thread can't be triggered fast enough, and use up all space 2806 * of btrfs_super_block::sys_chunk_array 2807 * 2808 * While for dev replace, we need to try our best to mark block 2809 * group RO, to prevent race between: 2810 * - Write duplication 2811 * Contains latest data 2812 * - Scrub copy 2813 * Contains data from commit tree 2814 * 2815 * If target block group is not marked RO, nocow writes can 2816 * be overwritten by scrub copy, causing data corruption. 2817 * So for dev-replace, it's not allowed to continue if a block 2818 * group is not RO. 2819 */ 2820 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); 2821 if (!ret && sctx->is_dev_replace) { 2822 ret = finish_extent_writes_for_zoned(root, cache); 2823 if (ret) { 2824 btrfs_dec_block_group_ro(cache); 2825 scrub_pause_off(fs_info); 2826 btrfs_put_block_group(cache); 2827 break; 2828 } 2829 } 2830 2831 if (ret == 0) { 2832 ro_set = 1; 2833 } else if (ret == -ENOSPC && !sctx->is_dev_replace && 2834 !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { 2835 /* 2836 * btrfs_inc_block_group_ro return -ENOSPC when it 2837 * failed in creating new chunk for metadata. 2838 * It is not a problem for scrub, because 2839 * metadata are always cowed, and our scrub paused 2840 * commit_transactions. 2841 * 2842 * For RAID56 chunks, we have to mark them read-only 2843 * for scrub, as later we would use our own cache 2844 * out of RAID56 realm. 2845 * Thus we want the RAID56 bg to be marked RO to 2846 * prevent RMW from screwing up out cache. 
2847 */ 2848 ro_set = 0; 2849 } else if (ret == -ETXTBSY) { 2850 btrfs_warn(fs_info, 2851 "scrub: skipping scrub of block group %llu due to active swapfile", 2852 cache->start); 2853 scrub_pause_off(fs_info); 2854 ret = 0; 2855 goto skip_unfreeze; 2856 } else { 2857 btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", 2858 ret); 2859 btrfs_unfreeze_block_group(cache); 2860 btrfs_put_block_group(cache); 2861 scrub_pause_off(fs_info); 2862 break; 2863 } 2864 2865 /* 2866 * Now the target block is marked RO, wait for nocow writes to 2867 * finish before dev-replace. 2868 * COW is fine, as COW never overwrites extents in commit tree. 2869 */ 2870 if (sctx->is_dev_replace) { 2871 btrfs_wait_nocow_writers(cache); 2872 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); 2873 } 2874 2875 scrub_pause_off(fs_info); 2876 down_write(&dev_replace->rwsem); 2877 dev_replace->cursor_right = found_key.offset + dev_extent_len; 2878 dev_replace->cursor_left = found_key.offset; 2879 dev_replace->item_needs_writeback = 1; 2880 up_write(&dev_replace->rwsem); 2881 2882 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, 2883 dev_extent_len); 2884 if (sctx->is_dev_replace && 2885 !btrfs_finish_block_group_to_copy(dev_replace->srcdev, 2886 cache, found_key.offset)) 2887 ro_set = 0; 2888 2889 down_write(&dev_replace->rwsem); 2890 dev_replace->cursor_left = dev_replace->cursor_right; 2891 dev_replace->item_needs_writeback = 1; 2892 up_write(&dev_replace->rwsem); 2893 2894 if (ro_set) 2895 btrfs_dec_block_group_ro(cache); 2896 2897 /* 2898 * We might have prevented the cleaner kthread from deleting 2899 * this block group if it was already unused because we raced 2900 * and set it to RO mode first. So add it back to the unused 2901 * list, otherwise it might not ever be deleted unless a manual 2902 * balance is triggered or it becomes used and unused again. 
2903 */ 2904 spin_lock(&cache->lock); 2905 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && 2906 !cache->ro && cache->reserved == 0 && cache->used == 0) { 2907 spin_unlock(&cache->lock); 2908 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) 2909 btrfs_discard_queue_work(&fs_info->discard_ctl, 2910 cache); 2911 else 2912 btrfs_mark_bg_unused(cache); 2913 } else { 2914 spin_unlock(&cache->lock); 2915 } 2916 skip_unfreeze: 2917 btrfs_unfreeze_block_group(cache); 2918 btrfs_put_block_group(cache); 2919 if (ret) 2920 break; 2921 if (unlikely(sctx->is_dev_replace && 2922 atomic64_read(&dev_replace->num_write_errors) > 0)) { 2923 ret = -EIO; 2924 break; 2925 } 2926 if (sctx->stat.malloc_errors > 0) { 2927 ret = -ENOMEM; 2928 break; 2929 } 2930 skip: 2931 key.offset = found_key.offset + dev_extent_len; 2932 btrfs_release_path(path); 2933 } 2934 2935 return ret; 2936 } 2937 2938 static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, 2939 struct page *page, u64 physical, u64 generation) 2940 { 2941 struct btrfs_fs_info *fs_info = sctx->fs_info; 2942 struct btrfs_super_block *sb = page_address(page); 2943 int ret; 2944 2945 ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb, 2946 BTRFS_SUPER_INFO_SIZE, REQ_OP_READ); 2947 if (ret < 0) 2948 return ret; 2949 ret = btrfs_check_super_csum(fs_info, sb); 2950 if (unlikely(ret != 0)) { 2951 btrfs_err_rl(fs_info, 2952 "scrub: super block at physical %llu devid %llu has bad csum", 2953 physical, dev->devid); 2954 return -EIO; 2955 } 2956 if (unlikely(btrfs_super_generation(sb) != generation)) { 2957 btrfs_err_rl(fs_info, 2958 "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", 2959 physical, dev->devid, 2960 btrfs_super_generation(sb), generation); 2961 return -EUCLEAN; 2962 } 2963 2964 return btrfs_validate_super(fs_info, sb, -1); 2965 } 2966 2967 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 2968 struct btrfs_device *scrub_dev) 2969 { 
2970 int i; 2971 u64 bytenr; 2972 u64 gen; 2973 int ret = 0; 2974 struct page *page; 2975 struct btrfs_fs_info *fs_info = sctx->fs_info; 2976 2977 if (unlikely(BTRFS_FS_ERROR(fs_info))) 2978 return -EROFS; 2979 2980 page = alloc_page(GFP_KERNEL); 2981 if (!page) { 2982 spin_lock(&sctx->stat_lock); 2983 sctx->stat.malloc_errors++; 2984 spin_unlock(&sctx->stat_lock); 2985 return -ENOMEM; 2986 } 2987 2988 /* Seed devices of a new filesystem has their own generation. */ 2989 if (scrub_dev->fs_devices != fs_info->fs_devices) 2990 gen = scrub_dev->generation; 2991 else 2992 gen = btrfs_get_last_trans_committed(fs_info); 2993 2994 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2995 ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); 2996 if (ret == -ENOENT) 2997 break; 2998 2999 if (ret) { 3000 spin_lock(&sctx->stat_lock); 3001 sctx->stat.super_errors++; 3002 spin_unlock(&sctx->stat_lock); 3003 continue; 3004 } 3005 3006 if (bytenr + BTRFS_SUPER_INFO_SIZE > 3007 scrub_dev->commit_total_bytes) 3008 break; 3009 if (!btrfs_check_super_location(scrub_dev, bytenr)) 3010 continue; 3011 3012 ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); 3013 if (ret) { 3014 spin_lock(&sctx->stat_lock); 3015 sctx->stat.super_errors++; 3016 spin_unlock(&sctx->stat_lock); 3017 } 3018 } 3019 __free_page(page); 3020 return 0; 3021 } 3022 3023 static void scrub_workers_put(struct btrfs_fs_info *fs_info) 3024 { 3025 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, 3026 &fs_info->scrub_lock)) { 3027 struct workqueue_struct *scrub_workers = fs_info->scrub_workers; 3028 3029 fs_info->scrub_workers = NULL; 3030 mutex_unlock(&fs_info->scrub_lock); 3031 3032 if (scrub_workers) 3033 destroy_workqueue(scrub_workers); 3034 } 3035 } 3036 3037 /* 3038 * get a reference count on fs_info->scrub_workers. 
start worker if necessary 3039 */ 3040 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) 3041 { 3042 struct workqueue_struct *scrub_workers = NULL; 3043 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 3044 int max_active = fs_info->thread_pool_size; 3045 int ret = -ENOMEM; 3046 3047 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) 3048 return 0; 3049 3050 scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); 3051 if (!scrub_workers) 3052 return -ENOMEM; 3053 3054 mutex_lock(&fs_info->scrub_lock); 3055 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { 3056 ASSERT(fs_info->scrub_workers == NULL); 3057 fs_info->scrub_workers = scrub_workers; 3058 refcount_set(&fs_info->scrub_workers_refcnt, 1); 3059 mutex_unlock(&fs_info->scrub_lock); 3060 return 0; 3061 } 3062 /* Other thread raced in and created the workers for us */ 3063 refcount_inc(&fs_info->scrub_workers_refcnt); 3064 mutex_unlock(&fs_info->scrub_lock); 3065 3066 ret = 0; 3067 3068 destroy_workqueue(scrub_workers); 3069 return ret; 3070 } 3071 3072 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3073 u64 end, struct btrfs_scrub_progress *progress, 3074 bool readonly, bool is_dev_replace) 3075 { 3076 struct btrfs_dev_lookup_args args = { .devid = devid }; 3077 struct scrub_ctx *sctx; 3078 int ret; 3079 struct btrfs_device *dev; 3080 unsigned int nofs_flag; 3081 bool need_commit = false; 3082 3083 /* Set the basic fallback @last_physical before we got a sctx. */ 3084 if (progress) 3085 progress->last_physical = start; 3086 3087 if (btrfs_fs_closing(fs_info)) 3088 return -EAGAIN; 3089 3090 /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ 3091 ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); 3092 3093 /* 3094 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible 3095 * value (max nodesize / min sectorsize), thus nodesize should always 3096 * be fine. 
3097 */ 3098 ASSERT(fs_info->nodesize <= 3099 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); 3100 3101 /* Allocate outside of device_list_mutex */ 3102 sctx = scrub_setup_ctx(fs_info, is_dev_replace); 3103 if (IS_ERR(sctx)) 3104 return PTR_ERR(sctx); 3105 sctx->stat.last_physical = start; 3106 3107 ret = scrub_workers_get(fs_info); 3108 if (ret) 3109 goto out_free_ctx; 3110 3111 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3112 dev = btrfs_find_device(fs_info->fs_devices, &args); 3113 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && 3114 !is_dev_replace)) { 3115 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3116 ret = -ENODEV; 3117 goto out; 3118 } 3119 3120 if (!is_dev_replace && !readonly && 3121 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 3122 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3123 btrfs_err(fs_info, 3124 "scrub: devid %llu: filesystem on %s is not writable", 3125 devid, btrfs_dev_name(dev)); 3126 ret = -EROFS; 3127 goto out; 3128 } 3129 3130 mutex_lock(&fs_info->scrub_lock); 3131 if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 3132 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) { 3133 mutex_unlock(&fs_info->scrub_lock); 3134 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3135 ret = -EIO; 3136 goto out; 3137 } 3138 3139 down_read(&fs_info->dev_replace.rwsem); 3140 if (dev->scrub_ctx || 3141 (!is_dev_replace && 3142 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3143 up_read(&fs_info->dev_replace.rwsem); 3144 mutex_unlock(&fs_info->scrub_lock); 3145 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3146 ret = -EINPROGRESS; 3147 goto out; 3148 } 3149 up_read(&fs_info->dev_replace.rwsem); 3150 3151 sctx->readonly = readonly; 3152 dev->scrub_ctx = sctx; 3153 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3154 3155 /* 3156 * checking @scrub_pause_req here, we can avoid 3157 * race between committing transaction and 
scrubbing. 3158 */ 3159 __scrub_blocked_if_needed(fs_info); 3160 atomic_inc(&fs_info->scrubs_running); 3161 mutex_unlock(&fs_info->scrub_lock); 3162 3163 /* 3164 * In order to avoid deadlock with reclaim when there is a transaction 3165 * trying to pause scrub, make sure we use GFP_NOFS for all the 3166 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() 3167 * invoked by our callees. The pausing request is done when the 3168 * transaction commit starts, and it blocks the transaction until scrub 3169 * is paused (done at specific points at scrub_stripe() or right above 3170 * before incrementing fs_info->scrubs_running). 3171 */ 3172 nofs_flag = memalloc_nofs_save(); 3173 if (!is_dev_replace) { 3174 u64 old_super_errors; 3175 3176 spin_lock(&sctx->stat_lock); 3177 old_super_errors = sctx->stat.super_errors; 3178 spin_unlock(&sctx->stat_lock); 3179 3180 btrfs_info(fs_info, "scrub: started on devid %llu", devid); 3181 /* 3182 * by holding device list mutex, we can 3183 * kick off writing super in log tree sync. 3184 */ 3185 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3186 ret = scrub_supers(sctx, dev); 3187 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3188 3189 spin_lock(&sctx->stat_lock); 3190 /* 3191 * Super block errors found, but we can not commit transaction 3192 * at current context, since btrfs_commit_transaction() needs 3193 * to pause the current running scrub (hold by ourselves). 3194 */ 3195 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) 3196 need_commit = true; 3197 spin_unlock(&sctx->stat_lock); 3198 } 3199 3200 if (!ret) 3201 ret = scrub_enumerate_chunks(sctx, dev, start, end); 3202 memalloc_nofs_restore(nofs_flag); 3203 3204 atomic_dec(&fs_info->scrubs_running); 3205 wake_up(&fs_info->scrub_pause_wait); 3206 3207 if (progress) 3208 memcpy(progress, &sctx->stat, sizeof(*progress)); 3209 3210 if (!is_dev_replace) 3211 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", 3212 ret ? 
"not finished" : "finished", devid, ret); 3213 3214 mutex_lock(&fs_info->scrub_lock); 3215 dev->scrub_ctx = NULL; 3216 mutex_unlock(&fs_info->scrub_lock); 3217 3218 scrub_workers_put(fs_info); 3219 scrub_put_ctx(sctx); 3220 3221 /* 3222 * We found some super block errors before, now try to force a 3223 * transaction commit, as scrub has finished. 3224 */ 3225 if (need_commit) { 3226 struct btrfs_trans_handle *trans; 3227 3228 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3229 if (IS_ERR(trans)) { 3230 ret = PTR_ERR(trans); 3231 btrfs_err(fs_info, 3232 "scrub: failed to start transaction to fix super block errors: %d", ret); 3233 return ret; 3234 } 3235 ret = btrfs_commit_transaction(trans); 3236 if (ret < 0) 3237 btrfs_err(fs_info, 3238 "scrub: failed to commit transaction to fix super block errors: %d", ret); 3239 } 3240 return ret; 3241 out: 3242 scrub_workers_put(fs_info); 3243 out_free_ctx: 3244 scrub_free_ctx(sctx); 3245 3246 return ret; 3247 } 3248 3249 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) 3250 { 3251 mutex_lock(&fs_info->scrub_lock); 3252 atomic_inc(&fs_info->scrub_pause_req); 3253 while (atomic_read(&fs_info->scrubs_paused) != 3254 atomic_read(&fs_info->scrubs_running)) { 3255 mutex_unlock(&fs_info->scrub_lock); 3256 wait_event(fs_info->scrub_pause_wait, 3257 atomic_read(&fs_info->scrubs_paused) == 3258 atomic_read(&fs_info->scrubs_running)); 3259 mutex_lock(&fs_info->scrub_lock); 3260 } 3261 mutex_unlock(&fs_info->scrub_lock); 3262 } 3263 3264 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) 3265 { 3266 atomic_dec(&fs_info->scrub_pause_req); 3267 wake_up(&fs_info->scrub_pause_wait); 3268 } 3269 3270 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 3271 { 3272 mutex_lock(&fs_info->scrub_lock); 3273 if (!atomic_read(&fs_info->scrubs_running)) { 3274 mutex_unlock(&fs_info->scrub_lock); 3275 return -ENOTCONN; 3276 } 3277 3278 atomic_inc(&fs_info->scrub_cancel_req); 3279 while (atomic_read(&fs_info->scrubs_running)) { 
3280 mutex_unlock(&fs_info->scrub_lock); 3281 wait_event(fs_info->scrub_pause_wait, 3282 atomic_read(&fs_info->scrubs_running) == 0); 3283 mutex_lock(&fs_info->scrub_lock); 3284 } 3285 atomic_dec(&fs_info->scrub_cancel_req); 3286 mutex_unlock(&fs_info->scrub_lock); 3287 3288 return 0; 3289 } 3290 3291 int btrfs_scrub_cancel_dev(struct btrfs_device *dev) 3292 { 3293 struct btrfs_fs_info *fs_info = dev->fs_info; 3294 struct scrub_ctx *sctx; 3295 3296 mutex_lock(&fs_info->scrub_lock); 3297 sctx = dev->scrub_ctx; 3298 if (!sctx) { 3299 mutex_unlock(&fs_info->scrub_lock); 3300 return -ENOTCONN; 3301 } 3302 atomic_inc(&sctx->cancel_req); 3303 while (dev->scrub_ctx) { 3304 mutex_unlock(&fs_info->scrub_lock); 3305 wait_event(fs_info->scrub_pause_wait, 3306 dev->scrub_ctx == NULL); 3307 mutex_lock(&fs_info->scrub_lock); 3308 } 3309 mutex_unlock(&fs_info->scrub_lock); 3310 3311 return 0; 3312 } 3313 3314 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, 3315 struct btrfs_scrub_progress *progress) 3316 { 3317 struct btrfs_dev_lookup_args args = { .devid = devid }; 3318 struct btrfs_device *dev; 3319 struct scrub_ctx *sctx = NULL; 3320 3321 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3322 dev = btrfs_find_device(fs_info->fs_devices, &args); 3323 if (dev) 3324 sctx = dev->scrub_ctx; 3325 if (sctx) 3326 memcpy(progress, &sctx->stat, sizeof(*progress)); 3327 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3328 3329 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; 3330 } 3331