// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
#include "raid-stripe-tree.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_ctx;

/*
 * The following value only influences performance.
 *
 * This determines how many stripes would be submitted in one go,
 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
 */
#define SCRUB_STRIPES_PER_GROUP		8

/*
 * How many groups we have for each sctx.
 *
 * This would be 8M per device, the same value as the old scrub in-flight bios
 * size limit.
 */
#define SCRUB_GROUPS_PER_SCTX		16

#define SCRUB_TOTAL_STRIPES		(SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
	union {
		/*
		 * Csum pointer for data csum verification. Should point to a
		 * sector csum inside scrub_stripe::csums.
		 *
		 * NULL if this data sector has no csum.
		 */
		u8 *csum;

		/*
		 * Extra info for metadata verification. All sectors inside a
		 * tree block share the same generation.
		 */
		u64 generation;
	};
};

enum scrub_stripe_flags {
	/* Set when @mirror_num, @dev, @physical and @logical are set. */
	SCRUB_STRIPE_FLAG_INITIALIZED,

	/* Set when the read-repair is finished. */
	SCRUB_STRIPE_FLAG_REPAIR_DONE,

	/*
	 * Set for data stripes if it's triggered from P/Q stripe.
	 * During such scrub, we should not report errors in data stripes, nor
	 * update the accounting.
	 */
	SCRUB_STRIPE_FLAG_NO_REPORT,
};

/*
 * We have multiple bitmaps for one scrub_stripe.
 * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
 * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64).
 *
 * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
 * into a larger one.
 *
 * This enum records where each sub-bitmap is inside the larger one.
 * Each sub-bitmap starts at scrub_bitmap_nr_##name * nr_sectors bit.
 */
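/*
 * For example (assuming a 64KiB BTRFS_STRIPE_LEN and 4KiB blocks, i.e.
 * nr_sectors == 16): bits [0, 16) hold the has_extent sub-bitmap, bits
 * [16, 32) hold is_metadata, bits [32, 48) hold error, and so on, one
 * nr_sectors sized window per entry below.
 */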
enum {
	/* Which blocks are covered by extent items. */
	scrub_bitmap_nr_has_extent = 0,

	/* Which blocks are metadata. */
	scrub_bitmap_nr_is_metadata,

	/*
	 * Which blocks have errors, including IO, csum, and metadata
	 * errors.
	 * This sub-bitmap is the OR result of the next few error related
	 * sub-bitmaps.
	 */
	scrub_bitmap_nr_error,
	scrub_bitmap_nr_io_error,
	scrub_bitmap_nr_csum_error,
	scrub_bitmap_nr_meta_error,
	scrub_bitmap_nr_meta_gen_error,
	scrub_bitmap_nr_last,
};

#define SCRUB_STRIPE_MAX_FOLIOS		(BTRFS_STRIPE_LEN / PAGE_SIZE)

/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 */
struct scrub_stripe {
	struct scrub_ctx *sctx;
	struct btrfs_block_group *bg;

	struct folio *folios[SCRUB_STRIPE_MAX_FOLIOS];
	struct scrub_sector_verification *sectors;

	struct btrfs_device *dev;
	u64 logical;
	u64 physical;

	u16 mirror_num;

	/* Should be BTRFS_STRIPE_LEN / sectorsize. */
	u16 nr_sectors;

	/*
	 * How many data/meta extents are in this stripe. Only for scrub status
	 * reporting purposes.
	 */
	u16 nr_data_extents;
	u16 nr_meta_extents;

	atomic_t pending_io;
	wait_queue_head_t io_wait;
	wait_queue_head_t repair_wait;

	/*
	 * Indicate the states of the stripe. Bits are defined in
	 * scrub_stripe_flags enum.
	 */
	unsigned long state;

	/* The large bitmap contains all the sub-bitmaps. */
	unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
					    (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];

	/*
	 * For writeback (repair or replace) error reporting.
	 * This one is protected by a spinlock, thus can not be packed into
	 * the larger bitmap.
	 */
	unsigned long write_error_bitmap;

	/* Writeback can be concurrent, thus we need to protect the bitmap. */
	spinlock_t write_error_lock;

	/*
	 * Checksum for the whole stripe if this stripe is inside a data block
	 * group.
	 */
	u8 *csums;

	struct work_struct work;
};

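/*
 * Context for one running scrub or dev-replace pass (presumably one per
 * scrubbed device): it owns the pool of in-flight stripes, the cached
 * extent/csum paths and the accumulated statistics.
 */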
struct scrub_ctx {
	struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES];
	struct scrub_stripe *raid56_data_stripes;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path extent_path;
	struct btrfs_path csum_path;
	int first_free;
	int cur_stripe;
	atomic_t cancel_req;
	int readonly;

	/* State of IO submission throttling affecting the associated device */
	ktime_t throttle_deadline;
	u64 throttle_sent;

	bool is_dev_replace;
	u64 write_pointer;

	struct mutex wr_lock;
	struct btrfs_device *wr_tgtdev;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t refs;
};

#define scrub_calc_start_bit(stripe, name, block_nr)			\
({									\
	unsigned int __start_bit;					\
									\
	ASSERT(block_nr < stripe->nr_sectors,				\
	       "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
	__start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
	__start_bit;							\
})

#define IMPLEMENT_SCRUB_BITMAP_OPS(name)				\
static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \
					   unsigned int block_nr,	\
					   unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe,	\
							    name, block_nr); \
									\
	bitmap_set(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
					     unsigned int block_nr,	\
					     unsigned int nr_blocks)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	bitmap_clear(stripe->bitmaps, start_bit, nr_blocks);		\
}									\
static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
						unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	return test_bit(start_bit, stripe->bitmaps);			\
}									\
static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
					       unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	set_bit(start_bit, stripe->bitmaps);				\
}									\
static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
						 unsigned int block_nr)	\
{									\
	const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
							    block_nr);	\
									\
	clear_bit(start_bit, stripe->bitmaps);				\
}									\
static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
{									\
	const unsigned int nr_blocks = stripe->nr_sectors;		\
									\
	ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG,		\
	       "nr_blocks=%u BITS_PER_LONG=%u",				\
	       nr_blocks, BITS_PER_LONG);				\
									\
	return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
			   stripe->nr_sectors);				\
}									\
static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_empty(&bitmap, stripe->nr_sectors);		\
}									\
static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
{									\
	unsigned long bitmap = scrub_bitmap_read_##name(stripe);	\
									\
	return bitmap_weight(&bitmap, stripe->nr_sectors);		\
}
IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
IMPLEMENT_SCRUB_BITMAP_OPS(error);
IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);

struct scrub_warning {
	struct btrfs_path *path;
	u64 extent_item_size;
	const char *errstr;
	u64 physical;
	u64 logical;
	struct btrfs_device *dev;
};

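/*
 * Errors captured right after the initial read of a stripe, so that later
 * repair and reporting can tell the original state from the repaired one.
 */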
struct scrub_error_records {
	/*
	 * Bitmap recording which blocks hit errors (IO/csum/...) during the
	 * initial read.
	 */
	unsigned long init_error_bitmap;

	unsigned int nr_io_errors;
	unsigned int nr_csum_errors;
	unsigned int nr_meta_errors;
	unsigned int nr_meta_gen_errors;
};

static void release_scrub_stripe(struct scrub_stripe *stripe)
{
	if (!stripe)
		return;

	for (int i = 0; i < SCRUB_STRIPE_MAX_FOLIOS; i++) {
		if (stripe->folios[i])
			folio_put(stripe->folios[i]);
		stripe->folios[i] = NULL;
	}
	kfree(stripe->sectors);
	kfree(stripe->csums);
	stripe->sectors = NULL;
	stripe->csums = NULL;
	stripe->sctx = NULL;
	stripe->state = 0;
}

static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
			     struct scrub_stripe *stripe)
{
	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
	int ret;

	memset(stripe, 0, sizeof(*stripe));

	stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	stripe->state = 0;

	init_waitqueue_head(&stripe->io_wait);
	init_waitqueue_head(&stripe->repair_wait);
	atomic_set(&stripe->pending_io, 0);
	spin_lock_init(&stripe->write_error_lock);

	ASSERT(BTRFS_STRIPE_LEN >> min_folio_shift <= SCRUB_STRIPE_MAX_FOLIOS);
	ret = btrfs_alloc_folio_array(BTRFS_STRIPE_LEN >> min_folio_shift,
				      fs_info->block_min_order, stripe->folios);
	if (ret < 0)
		goto error;

	stripe->sectors = kcalloc(stripe->nr_sectors,
				  sizeof(struct scrub_sector_verification),
				  GFP_KERNEL);
	if (!stripe->sectors)
		goto error;

	stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
				fs_info->csum_size, GFP_KERNEL);
	if (!stripe->csums)
		goto error;
	return 0;
error:
	release_scrub_stripe(stripe);
	return -ENOMEM;
}

static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
	wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}

static void scrub_put_ctx(struct scrub_ctx *sctx);

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++)
		release_scrub_stripe(&sctx->stripes[i]);

	kvfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

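/*
 * Allocate and initialize a scrub context, including its pool of
 * SCRUB_TOTAL_STRIPES stripes. The context is freed through scrub_put_ctx()
 * once the last reference is dropped.
 */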
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, bool is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;

	/*
	 * Since sctx has inline 128 stripes, it can go beyond 64K easily.
	 * Use kvzalloc().
	 */
	sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->fs_info = fs_info;
	sctx->extent_path.search_commit_root = true;
	sctx->extent_path.skip_locking = true;
	sctx->csum_path.search_commit_root = true;
	sctx->csum_path.skip_locking = true;
	for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
		int ret;

		ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
		if (ret < 0)
			goto nomem;
		sctx->stripes[i].sctx = sctx;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->stat_lock);
	sctx->throttle_deadline = 0;

	mutex_init(&sctx->wr_lock);
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
				     u64 root, void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * We deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here.
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
			   swarn->errstr, swarn->logical,
			   btrfs_dev_name(swarn->dev),
			   swarn->physical,
			   root, inum, offset,
			   fs_info->sectorsize, nlink,
			   (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	return 0;

err:
	btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
		   swarn->errstr, swarn->logical,
		   btrfs_dev_name(swarn->dev),
		   swarn->physical,
		   root, inum, offset, ret);

	return 0;
}

static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
				       bool is_super, u64 logical, u64 physical)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	u64 flags = 0;
	u32 item_size;
	int ret;

	/* Super block error, no need to search extent tree. */
	if (is_super) {
		btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu",
			   errstr, btrfs_dev_name(dev), physical);
		return;
	}
	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = physical;
	swarn.logical = logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		return;

	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		unsigned long ptr = 0;
		u8 ref_level;
		u64 ref_root;

		while (true) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret < 0) {
				btrfs_warn(fs_info,
				"scrub: failed to resolve tree backref for logical %llu: %d",
					   swarn.logical, ret);
				break;
			}
			if (ret > 0)
				break;
			btrfs_warn(fs_info,
"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				   errstr, swarn.logical, btrfs_dev_name(dev),
				   swarn.physical, (ref_level ? "node" : "leaf"),
				   ref_level, ref_root);
		}
		btrfs_release_path(path);
	} else {
		struct btrfs_backref_walk_ctx ctx = { 0 };

		btrfs_release_path(path);

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = swarn.logical - found_key.objectid;
		ctx.fs_info = fs_info;

		swarn.path = path;
		swarn.dev = dev;

		iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
	}
}

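/*
 * On a zoned dev-replace target the writes must stay sequential, so zero
 * out any gap between the current write pointer and @physical before the
 * next write is issued.
 */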
"node" : "leaf"), 645 ref_level, ref_root); 646 } 647 btrfs_release_path(path); 648 } else { 649 struct btrfs_backref_walk_ctx ctx = { 0 }; 650 651 btrfs_release_path(path); 652 653 ctx.bytenr = found_key.objectid; 654 ctx.extent_item_pos = swarn.logical - found_key.objectid; 655 ctx.fs_info = fs_info; 656 657 swarn.path = path; 658 swarn.dev = dev; 659 660 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); 661 } 662 } 663 664 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) 665 { 666 int ret = 0; 667 u64 length; 668 669 if (!btrfs_is_zoned(sctx->fs_info)) 670 return 0; 671 672 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) 673 return 0; 674 675 if (sctx->write_pointer < physical) { 676 length = physical - sctx->write_pointer; 677 678 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, 679 sctx->write_pointer, length); 680 if (!ret) 681 sctx->write_pointer = physical; 682 } 683 return ret; 684 } 685 686 static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) 687 { 688 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 689 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 690 u32 offset = (sector_nr << fs_info->sectorsize_bits); 691 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 692 693 /* stripe->folios[] is allocated by us and no highmem is allowed. */ 694 ASSERT(folio); 695 ASSERT(!folio_test_highmem(folio)); 696 return folio_address(folio) + offset_in_folio(folio, offset); 697 } 698 699 static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int sector_nr) 700 { 701 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 702 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 703 u32 offset = (sector_nr << fs_info->sectorsize_bits); 704 const struct folio *folio = stripe->folios[offset >> min_folio_shift]; 705 706 /* stripe->folios[] is allocated by us and no highmem is allowed. */ 707 ASSERT(folio); 708 ASSERT(!folio_test_highmem(folio)); 709 /* And the range must be contained inside the folio. */ 710 ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); 711 return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); 712 } 713 714 static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) 715 { 716 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 717 const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; 718 const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); 719 void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); 720 struct btrfs_header *header = first_kaddr; 721 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 722 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 723 u8 calculated_csum[BTRFS_CSUM_SIZE]; 724 725 /* 726 * Here we don't have a good way to attach the pages (and subpages) 727 * to a dummy extent buffer, thus we have to directly grab the members 728 * from pages. 
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
	void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	struct btrfs_header *header = first_kaddr;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	u8 calculated_csum[BTRFS_CSUM_SIZE];

	/*
	 * Here we don't have a good way to attach the pages (and subpages)
	 * to a dummy extent buffer, thus we have to directly grab the members
	 * from pages.
	 */
	memcpy(on_disk_csum, header->csum, fs_info->csum_size);

	if (logical != btrfs_stack_header_bytenr(header)) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_bytenr(header), logical);
		return;
	}
	if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
		   BTRFS_FSID_SIZE) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->fsid, fs_info->fs_devices->fsid);
		return;
	}
	if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
			      logical, stripe->mirror_num,
			      header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
		return;
	}

	/* Now check tree block csum. */
	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE,
			    fs_info->sectorsize - BTRFS_CSUM_SIZE);

	for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
		crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i),
				    fs_info->sectorsize);
	}

	crypto_shash_final(shash, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
		scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT,
			      logical, stripe->mirror_num,
			      BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
			      BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
		return;
	}
	if (stripe->sectors[sector_nr].generation !=
	    btrfs_stack_header_generation(header)) {
		scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
		scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
		btrfs_warn_rl(fs_info,
		"scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
			      logical, stripe->mirror_num,
			      btrfs_stack_header_generation(header),
			      stripe->sectors[sector_nr].generation);
		return;
	}
	scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
	scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
}

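/*
 * Verify one sector of the stripe: a metadata sector triggers verification
 * of the whole tree block starting there, a data sector is checked against
 * its csum (if it has one).
 */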
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	phys_addr_t paddr = scrub_stripe_get_paddr(stripe, sector_nr);
	u8 csum_buf[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

	/* Sector not utilized, skip it. */
	if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
		return;

	/* IO error, no need to check. */
	if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
		return;

	/* Metadata, verify the full tree block. */
	if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
		/*
		 * Check if the tree block crosses the stripe boundary. If it
		 * crosses the boundary, we cannot verify it but only give a
		 * warning.
		 *
		 * This can only happen on a very old filesystem where chunks
		 * are not ensured to be stripe aligned.
		 */
		if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
			btrfs_warn_rl(fs_info,
			"scrub: tree block at %llu crosses stripe boundary %llu",
				      stripe->logical +
				      (sector_nr << fs_info->sectorsize_bits),
				      stripe->logical);
			return;
		}
		scrub_verify_one_metadata(stripe, sector_nr);
		return;
	}

	/*
	 * Data is easier, we just verify the data csum (if we have it). For
	 * cases without csum, we have no other choice but to trust it.
	 */
	if (!sector->csum) {
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
		return;
	}

	ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, sector->csum);
	if (ret < 0) {
		scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_set_bit_error(stripe, sector_nr);
	} else {
		scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
		scrub_bitmap_clear_bit_error(stripe, sector_nr);
	}
}

/* Verify specified sectors of a stripe. */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
	int sector_nr;

	for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
		scrub_verify_one_sector(stripe, sector_nr);
		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
			sector_nr += sectors_per_tree - 1;
	}
}

static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
	int i;

	for (i = 0; i < stripe->nr_sectors; i++) {
		if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
			break;
	}
	ASSERT(i < stripe->nr_sectors);
	return i;
}

/*
 * Repair read is different from the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		scrub_bitmap_set_io_error(stripe, sector_nr,
					  bio_size >> fs_info->sectorsize_bits);
		scrub_bitmap_set_error(stripe, sector_nr,
				       bio_size >> fs_info->sectorsize_bits);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr,
					    bio_size >> fs_info->sectorsize_bits);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

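/*
 * Return the next mirror number, wrapping around after @num_copies, e.g.
 * with num_copies == 3 a stripe read from mirror 2 gets retried from
 * mirrors 3 and then 1.
 */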
static int calc_next_mirror(int mirror, int num_copies)
{
	ASSERT(mirror <= num_copies);
	return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}

static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
				 int sector_nr)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
	int ret;

	ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize,
			   offset_in_page(kaddr));
	/*
	 * Caller should ensure the bbio has enough size.
	 * And we cannot use __bio_add_page(), which doesn't do any merge.
	 *
	 * Meanwhile for scrub_submit_initial_read() we fully rely on the merge
	 * to create the minimal amount of bio vectors, for fs block size < page
	 * size cases.
	 */
	ASSERT(ret == fs_info->sectorsize);
}

static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info,
					  unsigned int nr_vecs, blk_opf_t opf,
					  u64 logical,
					  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode),
			       logical, end_io, private);
	bbio->is_scrub = true;
	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	return bbio;
}

static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
					    int mirror, int blocksize, bool wait)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
	ASSERT(atomic_read(&stripe->pending_io) == 0,
	       "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io));

	for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
		/* The current sector cannot be merged, submit the bio. */
		if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
			     bbio->bio.bi_iter.bi_size >= blocksize)) {
			ASSERT(bbio->bio.bi_iter.bi_size);
			atomic_inc(&stripe->pending_io);
			btrfs_submit_bbio(bbio, mirror);
			if (wait)
				wait_scrub_stripe_io(stripe);
			bbio = NULL;
		}

		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ,
					stripe->logical + (i << fs_info->sectorsize_bits),
					scrub_repair_read_endio, stripe);

		scrub_bio_add_sector(bbio, stripe, i);
	}
	if (bbio) {
		ASSERT(bbio->bio.bi_iter.bi_size);
		atomic_inc(&stripe->pending_io);
		btrfs_submit_bbio(bbio, mirror);
		if (wait)
			wait_scrub_stripe_io(stripe);
	}
}

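/*
 * Report the errors found on @stripe and update both the device error
 * stats and the scrub counters in sctx->stat. @errors holds the state
 * captured right after the initial read, so repaired sectors can be told
 * apart from the ones that stayed corrupted.
 */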
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
				       struct scrub_stripe *stripe,
				       const struct scrub_error_records *errors)
{
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_device *dev = NULL;
	const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
	const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
	u64 physical = 0;
	int nr_data_sectors = 0;
	int nr_meta_sectors = 0;
	int nr_nodatacsum_sectors = 0;
	int nr_repaired_sectors = 0;
	int sector_nr;

	if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
		return;

	/*
	 * Init needed infos for error reporting.
	 *
	 * Although our scrub_stripe infrastructure is mostly based on
	 * btrfs_submit_bio() and thus has no need for dev/physical, error
	 * reporting still needs dev and physical.
	 */
	if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
		u64 mapped_len = fs_info->sectorsize;
		struct btrfs_io_context *bioc = NULL;
		int stripe_index = stripe->mirror_num - 1;
		int ret;

		/* For scrub, our mirror_num should always start at 1. */
		ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);
		ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				      stripe->logical, &mapped_len, &bioc,
				      NULL, NULL);
		/*
		 * If we failed, dev will be NULL, and later detailed reports
		 * will just be skipped.
		 */
		if (ret < 0)
			goto skip;
		physical = bioc->stripes[stripe_index].physical;
		dev = bioc->stripes[stripe_index].dev;
		btrfs_put_bioc(bioc);
	}

skip:
	for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
		bool repaired = false;

		if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
			nr_meta_sectors++;
		} else {
			nr_data_sectors++;
			if (!stripe->sectors[sector_nr].csum)
				nr_nodatacsum_sectors++;
		}

		if (test_bit(sector_nr, &errors->init_error_bitmap) &&
		    !test_bit(sector_nr, &error_bitmap)) {
			nr_repaired_sectors++;
			repaired = true;
		}

		/* Good sector from the beginning, nothing needs to be done. */
		if (!test_bit(sector_nr, &errors->init_error_bitmap))
			continue;

		/*
		 * Report an error for the corrupted sectors. If repaired,
		 * just output a message saying it has been fixed up.
		 */
		if (repaired) {
			if (dev) {
				btrfs_err_rl(fs_info,
			"scrub: fixed up error at logical %llu on dev %s physical %llu",
					     stripe->logical, btrfs_dev_name(dev),
					     physical);
			} else {
				btrfs_err_rl(fs_info,
			"scrub: fixed up error at logical %llu on mirror %u",
					     stripe->logical, stripe->mirror_num);
			}
			continue;
		}

		/* The remaining cases are all unrepaired errors. */
		if (dev) {
			btrfs_err_rl(fs_info,
	"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
				     stripe->logical, btrfs_dev_name(dev),
				     physical);
		} else {
			btrfs_err_rl(fs_info,
	"scrub: unable to fixup (regular) error at logical %llu on mirror %u",
				     stripe->logical, stripe->mirror_num);
		}

		if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("i/o error", dev, false,
							   stripe->logical, physical);
		if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("checksum error", dev, false,
							   stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("header error", dev, false,
							   stripe->logical, physical);
		if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
			if (__ratelimit(&rs) && dev)
				scrub_print_common_warning("generation error", dev, false,
							   stripe->logical, physical);
	}

	/* Update the device stats. */
	for (int i = 0; i < errors->nr_io_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
	for (int i = 0; i < errors->nr_csum_errors; i++)
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
	/* Generation mismatch errors are accounted per metadata block, not per sector. */
	for (int i = 0; i < errors->nr_meta_gen_errors;
	     i += (fs_info->nodesize >> fs_info->sectorsize_bits))
		btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);

	spin_lock(&sctx->stat_lock);
	sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
	sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
	sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
	sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
	sctx->stat.no_csum += nr_nodatacsum_sectors;
	sctx->stat.read_errors += errors->nr_io_errors;
	sctx->stat.csum_errors += errors->nr_csum_errors;
	sctx->stat.verify_errors += errors->nr_meta_errors +
				    errors->nr_meta_gen_errors;
	sctx->stat.uncorrectable_errors +=
		bitmap_weight(&error_bitmap, stripe->nr_sectors);
	sctx->stat.corrected_errors += nr_repaired_sectors;
	spin_unlock(&sctx->stat_lock);
}

static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace);

/*
 * The main entrance for all read related scrub work, including:
 *
 * - Wait for the initial read to finish
 * - Verify and locate any bad sectors
 * - Go through the remaining mirrors and try to read in as large a blocksize
 *   as possible
 * - Go through all mirrors (including the failed mirror) sector-by-sector
 * - Submit writeback for repaired sectors
 *
 * Writeback for dev-replace does not happen here, it needs extra
 * synchronization for zoned devices.
 */
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
	struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
	struct scrub_ctx *sctx = stripe->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct scrub_error_records errors = { 0 };
	int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
					  stripe->bg->length);
	unsigned long repaired;
	unsigned long error;
	int mirror;
	int i;

	ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num);

	wait_scrub_stripe_io(stripe);
	scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe));
	/* Save the initial failed bitmap for later repair and report usage. */
	errors.init_error_bitmap = scrub_bitmap_read_error(stripe);
	errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe);
	errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe);
	errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe);
	errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe);

	if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors))
		goto out;

	/*
	 * Try all remaining mirrors.
	 *
	 * Here we still try to read as large a block as possible, as this is
	 * faster and we have extra safety nets to rely on.
	 */
	for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
	     mirror != stripe->mirror_num;
	     mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);

		scrub_stripe_submit_repair_read(stripe, mirror,
						BTRFS_STRIPE_LEN, false);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (scrub_bitmap_empty_error(stripe))
			goto out;
	}

	/*
	 * Last safety net, try re-checking all mirrors, including the failed
	 * one, sector-by-sector.
	 *
	 * If one sector fails the drive's internal csum, the whole read
	 * containing the offending sector would be marked as an error.
	 * Thus here we do a sector-by-sector read.
	 *
	 * This can be slow, thus we only try it as the last resort.
	 */

	for (i = 0, mirror = stripe->mirror_num;
	     i < num_copies;
	     i++, mirror = calc_next_mirror(mirror, num_copies)) {
		const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);

		scrub_stripe_submit_repair_read(stripe, mirror,
						fs_info->sectorsize, true);
		wait_scrub_stripe_io(stripe);
		scrub_verify_one_stripe(stripe, old_error_bitmap);
		if (scrub_bitmap_empty_error(stripe))
			goto out;
	}
out:
	error = scrub_bitmap_read_error(stripe);
	/*
	 * Submit the repaired sectors. For zoned case, we cannot do repair
	 * in-place, but queue the bg to be relocated.
	 */
	bitmap_andnot(&repaired, &errors.init_error_bitmap, &error,
		      stripe->nr_sectors);
	if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
		if (btrfs_is_zoned(fs_info)) {
			btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
		} else {
			scrub_write_sectors(sctx, stripe, repaired, false);
			wait_scrub_stripe_io(stripe);
		}
	}

	scrub_stripe_report_errors(sctx, stripe, &errors);
	set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
	wake_up(&stripe->repair_wait);
}

static void scrub_read_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	int num_sectors;
	u32 bio_size = 0;
	int i;

	ASSERT(sector_nr < stripe->nr_sectors);
	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;
	num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;

	if (bbio->bio.bi_status) {
		scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors);
		scrub_bitmap_set_error(stripe, sector_nr, num_sectors);
	} else {
		scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors);
	}
	bio_put(&bbio->bio);
	if (atomic_dec_and_test(&stripe->pending_io)) {
		wake_up(&stripe->io_wait);
		INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
		queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
	}
}

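/*
 * Endio for repair/replace writes. Failed sectors are recorded in
 * stripe->write_error_bitmap (under write_error_lock, as writebacks can
 * finish concurrently) and accounted as device write errors.
 */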
static void scrub_write_endio(struct btrfs_bio *bbio)
{
	struct scrub_stripe *stripe = bbio->private;
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct bio_vec *bvec;
	int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
	u32 bio_size = 0;
	int i;

	bio_for_each_bvec_all(bvec, &bbio->bio, i)
		bio_size += bvec->bv_len;

	if (bbio->bio.bi_status) {
		unsigned long flags;

		spin_lock_irqsave(&stripe->write_error_lock, flags);
		bitmap_set(&stripe->write_error_bitmap, sector_nr,
			   bio_size >> fs_info->sectorsize_bits);
		spin_unlock_irqrestore(&stripe->write_error_lock, flags);
		for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
			btrfs_dev_stat_inc_and_print(stripe->dev,
						     BTRFS_DEV_STAT_WRITE_ERRS);
	}
	bio_put(&bbio->bio);

	if (atomic_dec_and_test(&stripe->pending_io))
		wake_up(&stripe->io_wait);
}

static void scrub_submit_write_bio(struct scrub_ctx *sctx,
				   struct scrub_stripe *stripe,
				   struct btrfs_bio *bbio, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u32 bio_len = bbio->bio.bi_iter.bi_size;
	u32 bio_off = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
		      stripe->logical;

	fill_writer_pointer_gap(sctx, stripe->physical + bio_off);
	atomic_inc(&stripe->pending_io);
	btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
	if (!btrfs_is_zoned(fs_info))
		return;
	/*
	 * For zoned writeback, queue depth must be 1, thus we must wait for
	 * the write to finish before the next write.
	 */
	wait_scrub_stripe_io(stripe);

	/*
	 * We also need to update the write pointer if the write finished
	 * successfully.
	 */
	if (!test_bit(bio_off >> fs_info->sectorsize_bits,
		      &stripe->write_error_bitmap))
		sctx->write_pointer += bio_len;
}

/*
 * Submit the write bio(s) for the sectors specified by @write_bitmap.
 *
 * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
 *
 * - Only needs logical bytenr and mirror_num
 *   Just like the scrub read path
 *
 * - Would only result in writes to the specified mirror
 *   Unlike the regular writeback path, which would write back to all stripes
 *
 * - Handle dev-replace and read-repair writeback differently
 */
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
				unsigned long write_bitmap, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
	struct btrfs_bio *bbio = NULL;
	int sector_nr;

	for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
		/* We should only writeback sectors covered by an extent. */
		ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr));

		/* Cannot merge with previous sector, submit the current one. */
		if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
			scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
			bbio = NULL;
		}
		if (!bbio)
			bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE,
					stripe->logical + (sector_nr << fs_info->sectorsize_bits),
					scrub_write_endio, stripe);
		scrub_bio_add_sector(bbio, stripe, sector_nr);
	}
	if (bbio)
		scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
}

/*
 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
 */
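/*
 * For example (illustrative numbers only): with scrub_speed_max set to
 * 100MiB/s, div becomes clamp(100M / 16M, 1, 64) = 6, so each epoch lasts
 * about 1000 / 6 = 166ms and allows roughly bwlimit / 6 (about 17MiB) of
 * submitted IO before the submitter sleeps until the deadline.
 */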
static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
				  unsigned int bio_size)
{
	const int time_slice = 1000;
	s64 delta;
	ktime_t now;
	u32 div;
	u64 bwlimit;

	bwlimit = READ_ONCE(device->scrub_speed_max);
	if (bwlimit == 0)
		return;

	/*
	 * Slice is divided into intervals when the IO is submitted, adjust by
	 * bwlimit and maximum of 64 intervals.
	 */
	div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64);

	/* Start new epoch, set deadline */
	now = ktime_get();
	if (sctx->throttle_deadline == 0) {
		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
		sctx->throttle_sent = 0;
	}

	/* Still in the time to send? */
	if (ktime_before(now, sctx->throttle_deadline)) {
		/* If current bio is within the limit, send it */
		sctx->throttle_sent += bio_size;
		if (sctx->throttle_sent <= div_u64(bwlimit, div))
			return;

		/* We're over the limit, sleep until the rest of the slice */
		delta = ktime_ms_delta(sctx->throttle_deadline, now);
	} else {
		/* New request after deadline, start new epoch */
		delta = 0;
	}

	if (delta) {
		long timeout;

		timeout = div_u64(delta * HZ, 1000);
		schedule_timeout_interruptible(timeout);
	}

	/* Next call will start the deadline period */
	sctx->throttle_deadline = 0;
}

/*
 * Given a physical address, calculate its logical offset.
 * If this is a parity stripe, return the leftmost data stripe's logical
 * offset.
 *
 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct btrfs_chunk_map *map, u64 *offset,
				   u64 *stripe_start)
{
	int i;
	int j = 0;
	u64 last_offset;
	const int data_stripes = nr_data_stripes(map);

	last_offset = (physical - map->stripes[num].physical) * data_stripes;
	if (stripe_start)
		*stripe_start = last_offset;

	*offset = last_offset;
	for (i = 0; i < data_stripes; i++) {
		u32 stripe_nr;
		u32 stripe_index;
		u32 rot;

		*offset = last_offset + btrfs_stripe_nr_to_offset(i);

		stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

		/* Work out the disk rotation on this stripe-set */
		rot = stripe_nr % map->num_stripes;
		/* calculate which stripe this data locates */
		rot += i;
		stripe_index = rot % map->num_stripes;
		if (stripe_index == num)
			return 0;
		if (stripe_index < num)
			j++;
	}
	*offset = last_offset + btrfs_stripe_nr_to_offset(j);
	return 1;
}

/*
 * Return 0 if the extent item range covers any byte of the range.
 * Return <0 if the extent item is before @search_start.
 * Return >0 if the extent item is after @search_start + @search_len.
 */
static int compare_extent_item_range(struct btrfs_path *path,
				     u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
	u64 len;
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
	       key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type);
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		len = fs_info->nodesize;
	else
		len = key.offset;

	if (key.objectid + len <= search_start)
		return -1;
	if (key.objectid >= search_start + search_len)
		return 1;
	return 0;
}

/*
 * Locate one extent item which covers any byte in range
 * [@search_start, @search_start + @search_length)
 *
 * If the path is not initialized, we will initialize the search by doing
 * a btrfs_search_slot().
 * If the path is already initialized, we will use the path as the initial
 * slot, to avoid duplicated btrfs_search_slot() calls.
 *
 * NOTE: If an extent item starts before @search_start, we will still
 * return the extent item. This is for data extent crossing stripe boundary.
 *
 * Return 0 if we found such extent item, and @path will point to the extent item.
 * Return >0 if no such extent item can be found, and @path will be released.
 * Return <0 if hit fatal error, and @path will be released.
 */
static int find_first_extent_item(struct btrfs_root *extent_root,
				  struct btrfs_path *path,
				  u64 search_start, u64 search_len)
{
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
	struct btrfs_key key;
	int ret;

	/* Continue using the existing path */
	if (path->nodes[0])
		goto search_forward;

	key.objectid = search_start;
	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (unlikely(ret == 0)) {
		/*
		 * Key with offset -1 found, there would have to exist an extent
		 * item with such offset, but this is out of the valid range.
		 */
		btrfs_release_path(path);
		return -EUCLEAN;
	}

	/*
	 * Here we intentionally pass 0 as @min_objectid, as there could be
	 * an extent item starting before @search_start.
	 */
	ret = btrfs_previous_extent_item(extent_root, path, 0);
	if (ret < 0)
		return ret;
	/*
	 * No matter whether we have found an extent item, the next loop will
	 * properly do every check on the key.
	 */
search_forward:
	while (true) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid >= search_start + search_len)
			break;
		if (key.type != BTRFS_METADATA_ITEM_KEY &&
		    key.type != BTRFS_EXTENT_ITEM_KEY)
			goto next;

		ret = compare_extent_item_range(path, search_start, search_len);
		if (ret == 0)
			return ret;
		if (ret > 0)
			break;
next:
		ret = btrfs_next_item(extent_root, path);
		if (ret) {
			/* Either no more items or a fatal error. */
			btrfs_release_path(path);
			return ret;
		}
	}
	btrfs_release_path(path);
	return 1;
}

static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
			    u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
	       key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type);
	*extent_start_ret = key.objectid;
	if (key.type == BTRFS_METADATA_ITEM_KEY)
		*size_ret = path->nodes[0]->fs_info->nodesize;
	else
		*size_ret = key.offset;
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
	*flags_ret = btrfs_extent_flags(path->nodes[0], ei);
	*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
}

static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
					u64 physical, u64 physical_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	int ret = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	mutex_lock(&sctx->wr_lock);
	if (sctx->write_pointer < physical_end) {
		ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
						    physical,
						    sctx->write_pointer);
		if (ret)
			btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
	}
	mutex_unlock(&sctx->wr_lock);
	btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

	return ret;
}

static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
				 struct scrub_stripe *stripe,
				 u64 extent_start, u64 extent_len,
				 u64 extent_flags, u64 extent_gen)
{
	for (u64 cur_logical = max(stripe->logical, extent_start);
	     cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
			       extent_start + extent_len);
	     cur_logical += fs_info->sectorsize) {
		const int nr_sector = (cur_logical - stripe->logical) >>
				      fs_info->sectorsize_bits;
		struct scrub_sector_verification *sector =
						&stripe->sectors[nr_sector];

		scrub_bitmap_set_bit_has_extent(stripe, nr_sector);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			scrub_bitmap_set_bit_is_metadata(stripe, nr_sector);
			sector->generation = extent_gen;
		}
	}
}

static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
	ASSERT(stripe->nr_sectors);
	bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors);
}

/*
 * Locate one stripe which has at least one extent in its range.
 *
 * Return 0 if found such stripe, and store its info into @stripe.
 * Return >0 if there is no such stripe in the specified range.
 * Return <0 for error.
 */
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
					struct btrfs_path *extent_path,
					struct btrfs_path *csum_path,
					struct btrfs_device *dev, u64 physical,
					int mirror_num, u64 logical_start,
					u32 logical_len,
					struct scrub_stripe *stripe)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
	const u64 logical_end = logical_start + logical_len;
	u64 cur_logical = logical_start;
	u64 stripe_end;
	u64 extent_start;
	u64 extent_len;
	u64 extent_flags;
	u64 extent_gen;
	int ret;

	if (unlikely(!extent_root || !csum_root)) {
		btrfs_err(fs_info, "scrub: no valid extent or csum root found");
		return -EUCLEAN;
	}
	memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
				   stripe->nr_sectors);
	scrub_stripe_reset_bitmaps(stripe);

	/* The range must be inside the bg. */
	ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length,
	       "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu",
	       bg->start, logical_start, logical_end, bg->start + bg->length);

	ret = find_first_extent_item(extent_root, extent_path, logical_start,
				     logical_len);
	/* Either error or not found. */
	if (ret)
		goto out;
	get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
			&extent_gen);
	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		stripe->nr_meta_extents++;
	if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
		stripe->nr_data_extents++;
	cur_logical = max(extent_start, cur_logical);

	/*
	 * Round down to stripe boundary.
	 *
	 * The extra calculation against bg->start is to handle block groups
	 * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
	 */
	stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
			  bg->start;
	stripe->physical = physical + stripe->logical - logical_start;
	stripe->dev = dev;
	stripe->bg = bg;
	stripe->mirror_num = mirror_num;
	stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;

	/* Fill the first extent info into stripe->sectors[] array. */
	fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
			     extent_flags, extent_gen);
	cur_logical = extent_start + extent_len;

	/* Fill the extent info for the remaining sectors. */
	while (cur_logical <= stripe_end) {
		ret = find_first_extent_item(extent_root, extent_path, cur_logical,
					     stripe_end - cur_logical + 1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
		get_extent_info(extent_path, &extent_start, &extent_len,
				&extent_flags, &extent_gen);
		if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
			stripe->nr_meta_extents++;
		if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
			stripe->nr_data_extents++;
		fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
				     extent_flags, extent_gen);
		cur_logical = extent_start + extent_len;
	}

	/* Now fill the data csum. */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
		int sector_nr;
		unsigned long csum_bitmap = 0;

		/* Csum space should have already been allocated. */
*/ 1756 ASSERT(stripe->csums); 1757 1758 /* 1759 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN 1760 * should contain at most 16 sectors. 1761 */ 1762 ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 1763 1764 ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, 1765 stripe->logical, stripe_end, 1766 stripe->csums, &csum_bitmap); 1767 if (ret < 0) 1768 goto out; 1769 if (ret > 0) 1770 ret = 0; 1771 1772 for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { 1773 stripe->sectors[sector_nr].csum = stripe->csums + 1774 sector_nr * fs_info->csum_size; 1775 } 1776 } 1777 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 1778 out: 1779 return ret; 1780 } 1781 1782 static void scrub_reset_stripe(struct scrub_stripe *stripe) 1783 { 1784 scrub_stripe_reset_bitmaps(stripe); 1785 1786 stripe->nr_meta_extents = 0; 1787 stripe->nr_data_extents = 0; 1788 stripe->state = 0; 1789 1790 for (int i = 0; i < stripe->nr_sectors; i++) { 1791 stripe->sectors[i].csum = NULL; 1792 stripe->sectors[i].generation = 0; 1793 } 1794 } 1795 1796 static u32 stripe_length(const struct scrub_stripe *stripe) 1797 { 1798 ASSERT(stripe->bg); 1799 1800 return min(BTRFS_STRIPE_LEN, 1801 stripe->bg->start + stripe->bg->length - stripe->logical); 1802 } 1803 1804 static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) 1805 { 1806 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1807 struct btrfs_bio *bbio = NULL; 1808 unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; 1809 const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); 1810 u64 stripe_len = BTRFS_STRIPE_LEN; 1811 int mirror = stripe->mirror_num; 1812 int i; 1813 1814 atomic_inc(&stripe->pending_io); 1815 1816 for_each_set_bit(i, &has_extent, stripe->nr_sectors) { 1817 /* We're beyond the chunk boundary, no need to read anymore. */ 1818 if (i >= nr_sectors) 1819 break; 1820 1821 /* The current sector cannot be merged, submit the bio. */ 1822 if (bbio && 1823 ((i > 0 && !test_bit(i - 1, &has_extent)) || 1824 bbio->bio.bi_iter.bi_size >= stripe_len)) { 1825 ASSERT(bbio->bio.bi_iter.bi_size); 1826 atomic_inc(&stripe->pending_io); 1827 btrfs_submit_bbio(bbio, mirror); 1828 bbio = NULL; 1829 } 1830 1831 if (!bbio) { 1832 struct btrfs_io_stripe io_stripe = {}; 1833 struct btrfs_io_context *bioc = NULL; 1834 const u64 logical = stripe->logical + 1835 (i << fs_info->sectorsize_bits); 1836 int ret; 1837 1838 io_stripe.rst_search_commit_root = true; 1839 stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; 1840 /* 1841 * For RST cases, we need to manually split the bbio to 1842 * follow the RST boundary. 1843 */ 1844 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 1845 &stripe_len, &bioc, &io_stripe, &mirror); 1846 btrfs_put_bioc(bioc); 1847 if (ret < 0) { 1848 if (ret != -ENODATA) { 1849 /* 1850 * Earlier btrfs_get_raid_extent_offset() 1851 * returned -ENODATA, which means there's 1852 * no entry for the corresponding range 1853 * in the stripe tree. But if it's in 1854 * the extent tree, then it's a preallocated 1855 * extent and not an error. 
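 *
 * In short, the handling around this check follows this table (a
 * summary of the surrounding code, not new behaviour):
 *
 *	ret == -ENODATA     -> hole in the stripe tree, preallocated
 *	                       extent, silently skip this sector
 *	ret < 0 (any other) -> real failure, set the io_error and
 *	                       error bits for this sector
 *	ret == 0            -> mapping found, add the sector to a bbio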
1856 */ 1857 scrub_bitmap_set_bit_io_error(stripe, i); 1858 scrub_bitmap_set_bit_error(stripe, i); 1859 } 1860 continue; 1861 } 1862 1863 bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, 1864 logical, scrub_read_endio, stripe); 1865 } 1866 1867 scrub_bio_add_sector(bbio, stripe, i); 1868 } 1869 1870 if (bbio) { 1871 ASSERT(bbio->bio.bi_iter.bi_size); 1872 atomic_inc(&stripe->pending_io); 1873 btrfs_submit_bbio(bbio, mirror); 1874 } 1875 1876 if (atomic_dec_and_test(&stripe->pending_io)) { 1877 wake_up(&stripe->io_wait); 1878 INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); 1879 queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); 1880 } 1881 } 1882 1883 static void scrub_submit_initial_read(struct scrub_ctx *sctx, 1884 struct scrub_stripe *stripe) 1885 { 1886 struct btrfs_fs_info *fs_info = sctx->fs_info; 1887 struct btrfs_bio *bbio; 1888 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 1889 unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; 1890 int mirror = stripe->mirror_num; 1891 1892 ASSERT(stripe->bg); 1893 ASSERT(stripe->mirror_num > 0); 1894 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); 1895 1896 if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { 1897 scrub_submit_extent_sector_read(stripe); 1898 return; 1899 } 1900 1901 bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, 1902 stripe->logical, scrub_read_endio, stripe); 1903 /* Read the whole range inside the chunk boundary. */ 1904 for (unsigned int cur = 0; cur < nr_sectors; cur++) 1905 scrub_bio_add_sector(bbio, stripe, cur); 1906 atomic_inc(&stripe->pending_io); 1907 1908 /* 1909 * For dev-replace, either user asks to avoid the source dev, or 1910 * the device is missing, we try the next mirror instead. 1911 */ 1912 if (sctx->is_dev_replace && 1913 (fs_info->dev_replace.cont_reading_from_srcdev_mode == 1914 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || 1915 !stripe->dev->bdev)) { 1916 int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, 1917 stripe->bg->length); 1918 1919 mirror = calc_next_mirror(mirror, num_copies); 1920 } 1921 btrfs_submit_bbio(bbio, mirror); 1922 } 1923 1924 static bool stripe_has_metadata_error(struct scrub_stripe *stripe) 1925 { 1926 const unsigned long error = scrub_bitmap_read_error(stripe); 1927 int i; 1928 1929 for_each_set_bit(i, &error, stripe->nr_sectors) { 1930 if (scrub_bitmap_test_bit_is_metadata(stripe, i)) { 1931 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1932 1933 btrfs_err(fs_info, 1934 "scrub: stripe %llu has unrepaired metadata sector at logical %llu", 1935 stripe->logical, 1936 stripe->logical + (i << fs_info->sectorsize_bits)); 1937 return true; 1938 } 1939 } 1940 return false; 1941 } 1942 1943 static void submit_initial_group_read(struct scrub_ctx *sctx, 1944 unsigned int first_slot, 1945 unsigned int nr_stripes) 1946 { 1947 struct blk_plug plug; 1948 1949 ASSERT(first_slot < SCRUB_TOTAL_STRIPES); 1950 ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); 1951 1952 scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, 1953 btrfs_stripe_nr_to_offset(nr_stripes)); 1954 blk_start_plug(&plug); 1955 for (int i = 0; i < nr_stripes; i++) { 1956 struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; 1957 1958 /* Those stripes should be initialized. 
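 *
 * (They were filled one by one by queue_scrub_stripe() before this
 * group submission was triggered, so every slot in [first_slot,
 * first_slot + nr_stripes) holds a populated stripe.  For a full group
 * that is SCRUB_STRIPES_PER_GROUP * BTRFS_STRIPE_LEN = 8 * 64K = 512K
 * of IO, which is what scrub_throttle_dev_io() above was charged for.)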
*/ 1959 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); 1960 scrub_submit_initial_read(sctx, stripe); 1961 } 1962 blk_finish_plug(&plug); 1963 } 1964 1965 static int flush_scrub_stripes(struct scrub_ctx *sctx) 1966 { 1967 struct btrfs_fs_info *fs_info = sctx->fs_info; 1968 struct scrub_stripe *stripe; 1969 const int nr_stripes = sctx->cur_stripe; 1970 int ret = 0; 1971 1972 if (!nr_stripes) 1973 return 0; 1974 1975 ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); 1976 1977 /* Submit the stripes which are populated but not submitted. */ 1978 if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { 1979 const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); 1980 1981 submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); 1982 } 1983 1984 for (int i = 0; i < nr_stripes; i++) { 1985 stripe = &sctx->stripes[i]; 1986 1987 wait_event(stripe->repair_wait, 1988 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); 1989 } 1990 1991 /* Submit for dev-replace. */ 1992 if (sctx->is_dev_replace) { 1993 /* 1994 * For dev-replace, if we know there is something wrong with 1995 * metadata, we should immediately abort. 1996 */ 1997 for (int i = 0; i < nr_stripes; i++) { 1998 if (unlikely(stripe_has_metadata_error(&sctx->stripes[i]))) { 1999 ret = -EIO; 2000 goto out; 2001 } 2002 } 2003 for (int i = 0; i < nr_stripes; i++) { 2004 unsigned long good; 2005 unsigned long has_extent; 2006 unsigned long error; 2007 2008 stripe = &sctx->stripes[i]; 2009 2010 ASSERT(stripe->dev == fs_info->dev_replace.srcdev); 2011 2012 has_extent = scrub_bitmap_read_has_extent(stripe); 2013 error = scrub_bitmap_read_error(stripe); 2014 bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors); 2015 scrub_write_sectors(sctx, stripe, good, true); 2016 } 2017 } 2018 2019 /* Wait for the above writebacks to finish. */ 2020 for (int i = 0; i < nr_stripes; i++) { 2021 stripe = &sctx->stripes[i]; 2022 2023 wait_scrub_stripe_io(stripe); 2024 spin_lock(&sctx->stat_lock); 2025 sctx->stat.last_physical = stripe->physical + stripe_length(stripe); 2026 spin_unlock(&sctx->stat_lock); 2027 scrub_reset_stripe(stripe); 2028 } 2029 out: 2030 sctx->cur_stripe = 0; 2031 return ret; 2032 } 2033 2034 static void raid56_scrub_wait_endio(struct bio *bio) 2035 { 2036 complete(bio->bi_private); 2037 } 2038 2039 static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, 2040 struct btrfs_device *dev, int mirror_num, 2041 u64 logical, u32 length, u64 physical, 2042 u64 *found_logical_ret) 2043 { 2044 struct scrub_stripe *stripe; 2045 int ret; 2046 2047 /* 2048 * There should always be one slot left, as caller filling the last 2049 * slot should flush them all. 2050 */ 2051 ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); 2052 2053 /* @found_logical_ret must be specified. */ 2054 ASSERT(found_logical_ret); 2055 2056 stripe = &sctx->stripes[sctx->cur_stripe]; 2057 scrub_reset_stripe(stripe); 2058 ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, 2059 &sctx->csum_path, dev, physical, 2060 mirror_num, logical, length, stripe); 2061 /* Either >0 as no more extents or <0 for error. */ 2062 if (ret) 2063 return ret; 2064 *found_logical_ret = stripe->logical; 2065 sctx->cur_stripe++; 2066 2067 /* We filled one group, submit it. 
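 *
 * Worked example with the current constants (SCRUB_STRIPES_PER_GROUP is
 * 8, SCRUB_TOTAL_STRIPES is 128): once the 8th slot is filled,
 * cur_stripe is 8, first_slot is 0 and slots 0-7 are submitted as one
 * group; after the 16th slot, slots 8-15 go out, and so on.  Only when
 * all 128 slots are used does the check further below call
 * flush_scrub_stripes(), which waits for everything and resets
 * cur_stripe to 0.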
*/ 2068 if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { 2069 const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; 2070 2071 submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); 2072 } 2073 2074 /* Last slot used, flush them all. */ 2075 if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) 2076 return flush_scrub_stripes(sctx); 2077 return 0; 2078 } 2079 2080 /* 2081 * Return 0 if we should not cancel the scrub. 2082 * Return <0 if we need to cancel the scrub, returned value will 2083 * indicate the reason: 2084 * - -ECANCELED - Being explicitly canceled through ioctl. 2085 * - -EINTR - Being interrupted by signal or fs/process freezing. 2086 */ 2087 static int should_cancel_scrub(const struct scrub_ctx *sctx) 2088 { 2089 struct btrfs_fs_info *fs_info = sctx->fs_info; 2090 2091 if (atomic_read(&fs_info->scrub_cancel_req) || 2092 atomic_read(&sctx->cancel_req)) 2093 return -ECANCELED; 2094 2095 /* 2096 * The user (e.g. fsfreeze command) or power management (PM) 2097 * suspend/hibernate can freeze the fs. And PM suspend/hibernate will 2098 * also freeze all user processes. 2099 * 2100 * A user process can only be frozen when it is in user space, thus we 2101 * have to cancel the run so that the process can return to the user 2102 * space. 2103 * 2104 * Furthermore we have to check both filesystem and process freezing, 2105 * as PM can be configured to freeze the filesystems before processes. 2106 * 2107 * If we only check fs freezing, then suspend without fs freezing 2108 * will timeout, as the process is still in kernel space. 2109 * 2110 * If we only check process freezing, then suspend with fs freezing 2111 * will timeout, as the running scrub will prevent the fs from being frozen. 2112 */ 2113 if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || 2114 freezing(current) || signal_pending(current)) 2115 return -EINTR; 2116 return 0; 2117 } 2118 2119 static int scrub_raid56_cached_parity(struct scrub_ctx *sctx, 2120 struct btrfs_device *scrub_dev, 2121 struct btrfs_chunk_map *map, 2122 u64 full_stripe_start, 2123 unsigned long *extent_bitmap) 2124 { 2125 DECLARE_COMPLETION_ONSTACK(io_done); 2126 struct btrfs_fs_info *fs_info = sctx->fs_info; 2127 struct btrfs_io_context *bioc = NULL; 2128 struct btrfs_raid_bio *rbio; 2129 struct bio bio; 2130 const int data_stripes = nr_data_stripes(map); 2131 u64 length = btrfs_stripe_nr_to_offset(data_stripes); 2132 int ret; 2133 2134 bio_init(&bio, NULL, NULL, 0, REQ_OP_READ); 2135 bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; 2136 bio.bi_private = &io_done; 2137 bio.bi_end_io = raid56_scrub_wait_endio; 2138 2139 btrfs_bio_counter_inc_blocked(fs_info); 2140 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, 2141 &length, &bioc, NULL, NULL); 2142 if (ret < 0) 2143 goto out; 2144 /* For RAID56 write there must be an @bioc allocated. */ 2145 ASSERT(bioc); 2146 rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap, 2147 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 2148 btrfs_put_bioc(bioc); 2149 if (!rbio) { 2150 ret = -ENOMEM; 2151 goto out; 2152 } 2153 /* Use the recovered stripes as cache to avoid read them from disk again. 
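 *
 * For example (the numbers are illustrative only): on a 3-device RAID5
 * chunk there are 2 data stripes, so the loop below caches the folios
 * of data stripe 0 at @full_stripe_start and data stripe 1 at
 * @full_stripe_start + BTRFS_STRIPE_LEN, which is exactly the data the
 * parity rebuild would otherwise have to re-read from disk.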
*/ 2154 for (int i = 0; i < data_stripes; i++) { 2155 struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; 2156 2157 raid56_parity_cache_data_folios(rbio, stripe->folios, 2158 full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); 2159 } 2160 raid56_parity_submit_scrub_rbio(rbio); 2161 wait_for_completion_io(&io_done); 2162 ret = blk_status_to_errno(bio.bi_status); 2163 out: 2164 btrfs_bio_counter_dec(fs_info); 2165 bio_uninit(&bio); 2166 return ret; 2167 } 2168 2169 static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, 2170 struct btrfs_device *scrub_dev, 2171 struct btrfs_block_group *bg, 2172 struct btrfs_chunk_map *map, 2173 u64 full_stripe_start) 2174 { 2175 struct btrfs_fs_info *fs_info = sctx->fs_info; 2176 struct btrfs_path extent_path = { 0 }; 2177 struct btrfs_path csum_path = { 0 }; 2178 struct scrub_stripe *stripe; 2179 bool all_empty = true; 2180 const int data_stripes = nr_data_stripes(map); 2181 unsigned long extent_bitmap = 0; 2182 int ret; 2183 2184 ASSERT(sctx->raid56_data_stripes); 2185 2186 ret = should_cancel_scrub(sctx); 2187 if (ret < 0) 2188 return ret; 2189 2190 if (atomic_read(&fs_info->scrub_pause_req)) 2191 scrub_blocked_if_needed(fs_info); 2192 2193 spin_lock(&bg->lock); 2194 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2195 spin_unlock(&bg->lock); 2196 return 0; 2197 } 2198 spin_unlock(&bg->lock); 2199 2200 /* 2201 * For data stripe search, we cannot reuse the same extent/csum paths, 2202 * as the data stripe bytenr may be smaller than previous extent. Thus 2203 * we have to use our own extent/csum paths. 2204 */ 2205 extent_path.search_commit_root = true; 2206 extent_path.skip_locking = true; 2207 csum_path.search_commit_root = true; 2208 csum_path.skip_locking = true; 2209 2210 for (int i = 0; i < data_stripes; i++) { 2211 int stripe_index; 2212 int rot; 2213 u64 physical; 2214 2215 stripe = &sctx->raid56_data_stripes[i]; 2216 rot = div_u64(full_stripe_start - bg->start, 2217 data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; 2218 stripe_index = (i + rot) % map->num_stripes; 2219 physical = map->stripes[stripe_index].physical + 2220 btrfs_stripe_nr_to_offset(rot); 2221 2222 scrub_reset_stripe(stripe); 2223 set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); 2224 ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, 2225 map->stripes[stripe_index].dev, physical, 1, 2226 full_stripe_start + btrfs_stripe_nr_to_offset(i), 2227 BTRFS_STRIPE_LEN, stripe); 2228 if (ret < 0) 2229 goto out; 2230 /* 2231 * No extent in this data stripe, need to manually mark them 2232 * initialized to make later read submission happy. 2233 */ 2234 if (ret > 0) { 2235 stripe->logical = full_stripe_start + 2236 btrfs_stripe_nr_to_offset(i); 2237 stripe->dev = map->stripes[stripe_index].dev; 2238 stripe->mirror_num = 1; 2239 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 2240 } 2241 } 2242 2243 /* Check if all data stripes are empty. 
*/ 2244 for (int i = 0; i < data_stripes; i++) { 2245 stripe = &sctx->raid56_data_stripes[i]; 2246 if (!scrub_bitmap_empty_has_extent(stripe)) { 2247 all_empty = false; 2248 break; 2249 } 2250 } 2251 if (all_empty) { 2252 ret = 0; 2253 goto out; 2254 } 2255 2256 for (int i = 0; i < data_stripes; i++) { 2257 stripe = &sctx->raid56_data_stripes[i]; 2258 scrub_submit_initial_read(sctx, stripe); 2259 } 2260 for (int i = 0; i < data_stripes; i++) { 2261 stripe = &sctx->raid56_data_stripes[i]; 2262 2263 wait_event(stripe->repair_wait, 2264 test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); 2265 } 2266 /* For now, no zoned support for RAID56. */ 2267 ASSERT(!btrfs_is_zoned(sctx->fs_info)); 2268 2269 /* 2270 * Now all data stripes are properly verified. Check if we have any 2271 * unrepaired sectors; if so, abort immediately or we could further corrupt the 2272 * P/Q stripes. 2273 * 2274 * During the loop, also populate extent_bitmap. 2275 */ 2276 for (int i = 0; i < data_stripes; i++) { 2277 unsigned long error; 2278 unsigned long has_extent; 2279 2280 stripe = &sctx->raid56_data_stripes[i]; 2281 2282 error = scrub_bitmap_read_error(stripe); 2283 has_extent = scrub_bitmap_read_has_extent(stripe); 2284 2285 /* 2286 * We should only check the errors where there is an extent, 2287 * as we may hit an empty data stripe while its device is missing. 2288 */ 2289 bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); 2290 if (unlikely(!bitmap_empty(&error, stripe->nr_sectors))) { 2291 btrfs_err(fs_info, 2292 "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", 2293 full_stripe_start, i, stripe->nr_sectors, 2294 &error); 2295 ret = -EIO; 2296 goto out; 2297 } 2298 bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, 2299 stripe->nr_sectors); 2300 } 2301 2302 /* Now we can check and regenerate the P/Q stripe. */ 2303 ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, 2304 &extent_bitmap); 2305 out: 2306 btrfs_release_path(&extent_path); 2307 btrfs_release_path(&csum_path); 2308 return ret; 2309 } 2310 2311 /* 2312 * Scrub one range which can only have a simple mirror based profile. 2313 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in 2314 * RAID0/RAID10). 2315 * 2316 * Since we may need to handle a subset of a block group, we need the @logical_start 2317 * and @logical_length parameters.
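 *
 * Two illustrative calls (the values and the index @i are made up for
 * the example):
 *
 *	// Whole SINGLE/DUP/RAID1* block group, one call per device stripe:
 *	scrub_simple_mirror(sctx, bg, bg->start, bg->length,
 *			    scrub_dev, map->stripes[i].physical, i + 1);
 *
 *	// One 64K data stripe of a RAID0/RAID10 chunk:
 *	scrub_simple_mirror(sctx, bg, cur_logical, BTRFS_STRIPE_LEN,
 *			    device, cur_physical, mirror_num);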
2318 */ 2319 static int scrub_simple_mirror(struct scrub_ctx *sctx, 2320 struct btrfs_block_group *bg, 2321 u64 logical_start, u64 logical_length, 2322 struct btrfs_device *device, 2323 u64 physical, int mirror_num) 2324 { 2325 struct btrfs_fs_info *fs_info = sctx->fs_info; 2326 const u64 logical_end = logical_start + logical_length; 2327 u64 cur_logical = logical_start; 2328 int ret = 0; 2329 2330 /* The range must be inside the bg */ 2331 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); 2332 2333 /* Go through each extent items inside the logical range */ 2334 while (cur_logical < logical_end) { 2335 u64 found_logical = U64_MAX; 2336 u64 cur_physical = physical + cur_logical - logical_start; 2337 2338 ret = should_cancel_scrub(sctx); 2339 if (ret < 0) 2340 break; 2341 2342 if (atomic_read(&fs_info->scrub_pause_req)) 2343 scrub_blocked_if_needed(fs_info); 2344 2345 spin_lock(&bg->lock); 2346 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { 2347 spin_unlock(&bg->lock); 2348 ret = 0; 2349 break; 2350 } 2351 spin_unlock(&bg->lock); 2352 2353 ret = queue_scrub_stripe(sctx, bg, device, mirror_num, 2354 cur_logical, logical_end - cur_logical, 2355 cur_physical, &found_logical); 2356 if (ret > 0) { 2357 /* No more extent, just update the accounting */ 2358 spin_lock(&sctx->stat_lock); 2359 sctx->stat.last_physical = physical + logical_length; 2360 spin_unlock(&sctx->stat_lock); 2361 ret = 0; 2362 break; 2363 } 2364 if (ret < 0) 2365 break; 2366 2367 /* queue_scrub_stripe() returned 0, @found_logical must be updated. */ 2368 ASSERT(found_logical != U64_MAX); 2369 cur_logical = found_logical + BTRFS_STRIPE_LEN; 2370 2371 /* Don't hold CPU for too long time */ 2372 cond_resched(); 2373 } 2374 return ret; 2375 } 2376 2377 /* Calculate the full stripe length for simple stripe based profiles */ 2378 static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) 2379 { 2380 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2381 BTRFS_BLOCK_GROUP_RAID10)); 2382 2383 return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); 2384 } 2385 2386 /* Get the logical bytenr for the stripe */ 2387 static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, 2388 struct btrfs_block_group *bg, 2389 int stripe_index) 2390 { 2391 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2392 BTRFS_BLOCK_GROUP_RAID10)); 2393 ASSERT(stripe_index < map->num_stripes); 2394 2395 /* 2396 * (stripe_index / sub_stripes) gives how many data stripes we need to 2397 * skip. 2398 */ 2399 return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + 2400 bg->start; 2401 } 2402 2403 /* Get the mirror number for the stripe */ 2404 static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) 2405 { 2406 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2407 BTRFS_BLOCK_GROUP_RAID10)); 2408 ASSERT(stripe_index < map->num_stripes); 2409 2410 /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... 
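 *
 * Worked example (assuming a 4-device RAID10 chunk, so sub_stripes is 2):
 * stripe_index 0, 1, 2, 3 gives stripe_index % sub_stripes of 0, 1, 0, 1,
 * so the returned mirror numbers are 1, 2, 1, 2.  For RAID0 sub_stripes
 * is 1 and the result is always 1.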
*/ 2411 return stripe_index % map->sub_stripes + 1; 2412 } 2413 2414 static int scrub_simple_stripe(struct scrub_ctx *sctx, 2415 struct btrfs_block_group *bg, 2416 struct btrfs_chunk_map *map, 2417 struct btrfs_device *device, 2418 int stripe_index) 2419 { 2420 const u64 logical_increment = simple_stripe_full_stripe_len(map); 2421 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); 2422 const u64 orig_physical = map->stripes[stripe_index].physical; 2423 const int mirror_num = simple_stripe_mirror_num(map, stripe_index); 2424 u64 cur_logical = orig_logical; 2425 u64 cur_physical = orig_physical; 2426 int ret = 0; 2427 2428 while (cur_logical < bg->start + bg->length) { 2429 /* 2430 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is 2431 * just RAID1, so we can reuse scrub_simple_mirror() to scrub 2432 * this stripe. 2433 */ 2434 ret = scrub_simple_mirror(sctx, bg, cur_logical, 2435 BTRFS_STRIPE_LEN, device, cur_physical, 2436 mirror_num); 2437 if (ret) 2438 return ret; 2439 /* Skip to next stripe which belongs to the target device */ 2440 cur_logical += logical_increment; 2441 /* For physical offset, we just go to next stripe */ 2442 cur_physical += BTRFS_STRIPE_LEN; 2443 } 2444 return ret; 2445 } 2446 2447 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2448 struct btrfs_block_group *bg, 2449 struct btrfs_chunk_map *map, 2450 struct btrfs_device *scrub_dev, 2451 int stripe_index) 2452 { 2453 struct btrfs_fs_info *fs_info = sctx->fs_info; 2454 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; 2455 const u64 chunk_logical = bg->start; 2456 int ret; 2457 int ret2; 2458 u64 physical = map->stripes[stripe_index].physical; 2459 const u64 dev_stripe_len = btrfs_calc_stripe_length(map); 2460 const u64 physical_end = physical + dev_stripe_len; 2461 u64 logical; 2462 u64 logic_end; 2463 /* The logical increment after finishing one stripe */ 2464 u64 increment; 2465 /* Offset inside the chunk */ 2466 u64 offset; 2467 u64 stripe_logical; 2468 2469 /* Extent_path should be released by now. */ 2470 ASSERT(sctx->extent_path.nodes[0] == NULL); 2471 2472 scrub_blocked_if_needed(fs_info); 2473 2474 if (sctx->is_dev_replace && 2475 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { 2476 mutex_lock(&sctx->wr_lock); 2477 sctx->write_pointer = physical; 2478 mutex_unlock(&sctx->wr_lock); 2479 } 2480 2481 /* Prepare the extra data stripes used by RAID56. */ 2482 if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { 2483 ASSERT(sctx->raid56_data_stripes == NULL); 2484 2485 sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map), 2486 sizeof(struct scrub_stripe), 2487 GFP_KERNEL); 2488 if (!sctx->raid56_data_stripes) { 2489 ret = -ENOMEM; 2490 goto out; 2491 } 2492 for (int i = 0; i < nr_data_stripes(map); i++) { 2493 ret = init_scrub_stripe(fs_info, 2494 &sctx->raid56_data_stripes[i]); 2495 if (ret < 0) 2496 goto out; 2497 sctx->raid56_data_stripes[i].bg = bg; 2498 sctx->raid56_data_stripes[i].sctx = sctx; 2499 } 2500 } 2501 /* 2502 * There used to be a big double loop to handle all profiles using the 2503 * same routine, which grows larger and more gross over time. 2504 * 2505 * So here we handle each profile differently, so simpler profiles 2506 * have simpler scrubbing function. 
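 *
 * The resulting dispatch, summarised (this mirrors the checks below, it
 * is not a separate table in the code):
 *
 *	SINGLE / DUP / RAID1 / RAID1C*  -> scrub_simple_mirror()
 *	RAID0 / RAID10                  -> scrub_simple_stripe()
 *	RAID5 / RAID6                   -> rotation-aware loop below,
 *	                                   using scrub_raid56_parity_stripe()
 *	                                   for the P/Q stripes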
2507 */ 2508 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | 2509 BTRFS_BLOCK_GROUP_RAID56_MASK))) { 2510 /* 2511 * Above check rules out all complex profile, the remaining 2512 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple 2513 * mirrored duplication without stripe. 2514 * 2515 * Only @physical and @mirror_num needs to calculated using 2516 * @stripe_index. 2517 */ 2518 ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, 2519 scrub_dev, map->stripes[stripe_index].physical, 2520 stripe_index + 1); 2521 offset = 0; 2522 goto out; 2523 } 2524 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 2525 ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); 2526 offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes); 2527 goto out; 2528 } 2529 2530 /* Only RAID56 goes through the old code */ 2531 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); 2532 ret = 0; 2533 2534 /* Calculate the logical end of the stripe */ 2535 get_raid56_logic_offset(physical_end, stripe_index, 2536 map, &logic_end, NULL); 2537 logic_end += chunk_logical; 2538 2539 /* Initialize @offset in case we need to go to out: label */ 2540 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); 2541 increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 2542 2543 /* 2544 * Due to the rotation, for RAID56 it's better to iterate each stripe 2545 * using their physical offset. 2546 */ 2547 while (physical < physical_end) { 2548 ret = get_raid56_logic_offset(physical, stripe_index, map, 2549 &logical, &stripe_logical); 2550 logical += chunk_logical; 2551 if (ret) { 2552 /* it is parity strip */ 2553 stripe_logical += chunk_logical; 2554 ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, 2555 map, stripe_logical); 2556 spin_lock(&sctx->stat_lock); 2557 sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN, 2558 physical_end); 2559 spin_unlock(&sctx->stat_lock); 2560 if (ret) 2561 goto out; 2562 goto next; 2563 } 2564 2565 /* 2566 * Now we're at a data stripe, scrub each extents in the range. 2567 * 2568 * At this stage, if we ignore the repair part, inside each data 2569 * stripe it is no different than SINGLE profile. 2570 * We can reuse scrub_simple_mirror() here, as the repair part 2571 * is still based on @mirror_num. 2572 */ 2573 ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, 2574 scrub_dev, physical, 1); 2575 if (ret < 0) 2576 goto out; 2577 next: 2578 logical += increment; 2579 physical += BTRFS_STRIPE_LEN; 2580 spin_lock(&sctx->stat_lock); 2581 sctx->stat.last_physical = physical; 2582 spin_unlock(&sctx->stat_lock); 2583 } 2584 out: 2585 ret2 = flush_scrub_stripes(sctx); 2586 if (!ret) 2587 ret = ret2; 2588 btrfs_release_path(&sctx->extent_path); 2589 btrfs_release_path(&sctx->csum_path); 2590 2591 if (sctx->raid56_data_stripes) { 2592 for (int i = 0; i < nr_data_stripes(map); i++) 2593 release_scrub_stripe(&sctx->raid56_data_stripes[i]); 2594 kfree(sctx->raid56_data_stripes); 2595 sctx->raid56_data_stripes = NULL; 2596 } 2597 2598 if (sctx->is_dev_replace && ret >= 0) { 2599 ret2 = sync_write_pointer_for_zoned(sctx, 2600 chunk_logical + offset, 2601 map->stripes[stripe_index].physical, 2602 physical_end); 2603 if (ret2) 2604 ret = ret2; 2605 } 2606 2607 return ret < 0 ? 
ret : 0; 2608 } 2609 2610 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 2611 struct btrfs_block_group *bg, 2612 struct btrfs_device *scrub_dev, 2613 u64 dev_offset, 2614 u64 dev_extent_len) 2615 { 2616 struct btrfs_fs_info *fs_info = sctx->fs_info; 2617 struct btrfs_chunk_map *map; 2618 int i; 2619 int ret = 0; 2620 2621 map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); 2622 if (!map) { 2623 /* 2624 * Might have been an unused block group deleted by the cleaner 2625 * kthread or relocation. 2626 */ 2627 spin_lock(&bg->lock); 2628 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) 2629 ret = -EINVAL; 2630 spin_unlock(&bg->lock); 2631 2632 return ret; 2633 } 2634 if (map->start != bg->start) 2635 goto out; 2636 if (map->chunk_len < dev_extent_len) 2637 goto out; 2638 2639 for (i = 0; i < map->num_stripes; ++i) { 2640 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 2641 map->stripes[i].physical == dev_offset) { 2642 ret = scrub_stripe(sctx, bg, map, scrub_dev, i); 2643 if (ret) 2644 goto out; 2645 } 2646 } 2647 out: 2648 btrfs_free_chunk_map(map); 2649 2650 return ret; 2651 } 2652 2653 static int finish_extent_writes_for_zoned(struct btrfs_root *root, 2654 struct btrfs_block_group *cache) 2655 { 2656 struct btrfs_fs_info *fs_info = cache->fs_info; 2657 2658 if (!btrfs_is_zoned(fs_info)) 2659 return 0; 2660 2661 btrfs_wait_block_group_reservations(cache); 2662 btrfs_wait_nocow_writers(cache); 2663 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); 2664 2665 return btrfs_commit_current_transaction(root); 2666 } 2667 2668 static noinline_for_stack 2669 int scrub_enumerate_chunks(struct scrub_ctx *sctx, 2670 struct btrfs_device *scrub_dev, u64 start, u64 end) 2671 { 2672 struct btrfs_dev_extent *dev_extent = NULL; 2673 BTRFS_PATH_AUTO_FREE(path); 2674 struct btrfs_fs_info *fs_info = sctx->fs_info; 2675 struct btrfs_root *root = fs_info->dev_root; 2676 u64 chunk_offset; 2677 int ret = 0; 2678 int ro_set; 2679 int slot; 2680 struct extent_buffer *l; 2681 struct btrfs_key key; 2682 struct btrfs_key found_key; 2683 struct btrfs_block_group *cache; 2684 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 2685 2686 path = btrfs_alloc_path(); 2687 if (!path) 2688 return -ENOMEM; 2689 2690 path->reada = READA_FORWARD; 2691 path->search_commit_root = true; 2692 path->skip_locking = true; 2693 2694 key.objectid = scrub_dev->devid; 2695 key.type = BTRFS_DEV_EXTENT_KEY; 2696 key.offset = 0ull; 2697 2698 while (1) { 2699 u64 dev_extent_len; 2700 2701 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2702 if (ret < 0) 2703 break; 2704 if (ret > 0) { 2705 if (path->slots[0] >= 2706 btrfs_header_nritems(path->nodes[0])) { 2707 ret = btrfs_next_leaf(root, path); 2708 if (ret < 0) 2709 break; 2710 if (ret > 0) { 2711 ret = 0; 2712 break; 2713 } 2714 } else { 2715 ret = 0; 2716 } 2717 } 2718 2719 l = path->nodes[0]; 2720 slot = path->slots[0]; 2721 2722 btrfs_item_key_to_cpu(l, &found_key, slot); 2723 2724 if (found_key.objectid != scrub_dev->devid) 2725 break; 2726 2727 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 2728 break; 2729 2730 if (found_key.offset >= end) 2731 break; 2732 2733 if (found_key.offset < key.offset) 2734 break; 2735 2736 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2737 dev_extent_len = btrfs_dev_extent_length(l, dev_extent); 2738 2739 if (found_key.offset + dev_extent_len <= start) 2740 goto skip; 2741 2742 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2743 2744 /* 2745 * get a reference on the 
corresponding block group to prevent 2746 * the chunk from going away while we scrub it 2747 */ 2748 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2749 2750 /* some chunks are removed but not committed to disk yet, 2751 * continue scrubbing */ 2752 if (!cache) 2753 goto skip; 2754 2755 ASSERT(cache->start <= chunk_offset); 2756 /* 2757 * We are using the commit root to search for device extents, so 2758 * that means we could have found a device extent item from a 2759 * block group that was deleted in the current transaction. The 2760 * logical start offset of the deleted block group, stored at 2761 * @chunk_offset, might be part of the logical address range of 2762 * a new block group (which uses different physical extents). 2763 * In this case btrfs_lookup_block_group() has returned the new 2764 * block group, and its start address is less than @chunk_offset. 2765 * 2766 * We skip such new block groups, because it's pointless to 2767 * process them, as we won't find their extents because we search 2768 * for them using the commit root of the extent tree. For a device 2769 * replace it's also fine to skip it, we won't miss copying them 2770 * to the target device because we have the write duplication 2771 * setup through the regular write path (by btrfs_map_block()), 2772 * and we have committed a transaction when we started the device 2773 * replace, right after setting up the device replace state. 2774 */ 2775 if (cache->start < chunk_offset) { 2776 btrfs_put_block_group(cache); 2777 goto skip; 2778 } 2779 2780 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { 2781 if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { 2782 btrfs_put_block_group(cache); 2783 goto skip; 2784 } 2785 } 2786 2787 /* 2788 * Make sure that while we are scrubbing the corresponding block 2789 * group doesn't get its logical address and its device extents 2790 * reused for another block group, which can possibly be of a 2791 * different type and different profile. We do this to prevent 2792 * false error detections and crashes due to bogus attempts to 2793 * repair extents. 2794 */ 2795 spin_lock(&cache->lock); 2796 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { 2797 spin_unlock(&cache->lock); 2798 btrfs_put_block_group(cache); 2799 goto skip; 2800 } 2801 btrfs_freeze_block_group(cache); 2802 spin_unlock(&cache->lock); 2803 2804 /* 2805 * we need call btrfs_inc_block_group_ro() with scrubs_paused, 2806 * to avoid deadlock caused by: 2807 * btrfs_inc_block_group_ro() 2808 * -> btrfs_wait_for_commit() 2809 * -> btrfs_commit_transaction() 2810 * -> btrfs_scrub_pause() 2811 */ 2812 scrub_pause_on(fs_info); 2813 2814 /* 2815 * Don't do chunk preallocation for scrub. 2816 * 2817 * This is especially important for SYSTEM bgs, or we can hit 2818 * -EFBIG from btrfs_finish_chunk_alloc() like: 2819 * 1. The only SYSTEM bg is marked RO. 2820 * Since SYSTEM bg is small, that's pretty common. 2821 * 2. New SYSTEM bg will be allocated 2822 * Due to regular version will allocate new chunk. 2823 * 3. New SYSTEM bg is empty and will get cleaned up 2824 * Before cleanup really happens, it's marked RO again. 2825 * 4. Empty SYSTEM bg get scrubbed 2826 * We go back to 2. 
2827 * 2828 * This can easily boost the amount of SYSTEM chunks if cleaner 2829 * thread can't be triggered fast enough, and use up all space 2830 * of btrfs_super_block::sys_chunk_array 2831 * 2832 * While for dev replace, we need to try our best to mark block 2833 * group RO, to prevent race between: 2834 * - Write duplication 2835 * Contains latest data 2836 * - Scrub copy 2837 * Contains data from commit tree 2838 * 2839 * If target block group is not marked RO, nocow writes can 2840 * be overwritten by scrub copy, causing data corruption. 2841 * So for dev-replace, it's not allowed to continue if a block 2842 * group is not RO. 2843 */ 2844 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); 2845 if (!ret && sctx->is_dev_replace) { 2846 ret = finish_extent_writes_for_zoned(root, cache); 2847 if (ret) { 2848 btrfs_dec_block_group_ro(cache); 2849 scrub_pause_off(fs_info); 2850 btrfs_put_block_group(cache); 2851 break; 2852 } 2853 } 2854 2855 if (ret == 0) { 2856 ro_set = 1; 2857 } else if (ret == -ENOSPC && !sctx->is_dev_replace && 2858 !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { 2859 /* 2860 * btrfs_inc_block_group_ro return -ENOSPC when it 2861 * failed in creating new chunk for metadata. 2862 * It is not a problem for scrub, because 2863 * metadata are always cowed, and our scrub paused 2864 * commit_transactions. 2865 * 2866 * For RAID56 chunks, we have to mark them read-only 2867 * for scrub, as later we would use our own cache 2868 * out of RAID56 realm. 2869 * Thus we want the RAID56 bg to be marked RO to 2870 * prevent RMW from screwing up out cache. 2871 */ 2872 ro_set = 0; 2873 } else if (ret == -ETXTBSY) { 2874 btrfs_warn(fs_info, 2875 "scrub: skipping scrub of block group %llu due to active swapfile", 2876 cache->start); 2877 scrub_pause_off(fs_info); 2878 ret = 0; 2879 goto skip_unfreeze; 2880 } else { 2881 btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", 2882 ret); 2883 btrfs_unfreeze_block_group(cache); 2884 btrfs_put_block_group(cache); 2885 scrub_pause_off(fs_info); 2886 break; 2887 } 2888 2889 /* 2890 * Now the target block is marked RO, wait for nocow writes to 2891 * finish before dev-replace. 2892 * COW is fine, as COW never overwrites extents in commit tree. 2893 */ 2894 if (sctx->is_dev_replace) { 2895 btrfs_wait_nocow_writers(cache); 2896 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache); 2897 } 2898 2899 scrub_pause_off(fs_info); 2900 down_write(&dev_replace->rwsem); 2901 dev_replace->cursor_right = found_key.offset + dev_extent_len; 2902 dev_replace->cursor_left = found_key.offset; 2903 dev_replace->item_needs_writeback = 1; 2904 up_write(&dev_replace->rwsem); 2905 2906 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, 2907 dev_extent_len); 2908 if (sctx->is_dev_replace && 2909 !btrfs_finish_block_group_to_copy(dev_replace->srcdev, 2910 cache, found_key.offset)) 2911 ro_set = 0; 2912 2913 down_write(&dev_replace->rwsem); 2914 dev_replace->cursor_left = dev_replace->cursor_right; 2915 dev_replace->item_needs_writeback = 1; 2916 up_write(&dev_replace->rwsem); 2917 2918 if (ro_set) 2919 btrfs_dec_block_group_ro(cache); 2920 2921 /* 2922 * We might have prevented the cleaner kthread from deleting 2923 * this block group if it was already unused because we raced 2924 * and set it to RO mode first. So add it back to the unused 2925 * list, otherwise it might not ever be deleted unless a manual 2926 * balance is triggered or it becomes used and unused again. 
2927 */ 2928 spin_lock(&cache->lock); 2929 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && 2930 !cache->ro && cache->reserved == 0 && cache->used == 0) { 2931 spin_unlock(&cache->lock); 2932 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) 2933 btrfs_discard_queue_work(&fs_info->discard_ctl, 2934 cache); 2935 else 2936 btrfs_mark_bg_unused(cache); 2937 } else { 2938 spin_unlock(&cache->lock); 2939 } 2940 skip_unfreeze: 2941 btrfs_unfreeze_block_group(cache); 2942 btrfs_put_block_group(cache); 2943 if (ret) 2944 break; 2945 if (unlikely(sctx->is_dev_replace && 2946 atomic64_read(&dev_replace->num_write_errors) > 0)) { 2947 ret = -EIO; 2948 break; 2949 } 2950 if (sctx->stat.malloc_errors > 0) { 2951 ret = -ENOMEM; 2952 break; 2953 } 2954 skip: 2955 key.offset = found_key.offset + dev_extent_len; 2956 btrfs_release_path(path); 2957 } 2958 2959 return ret; 2960 } 2961 2962 static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, 2963 struct page *page, u64 physical, u64 generation) 2964 { 2965 struct btrfs_fs_info *fs_info = sctx->fs_info; 2966 struct btrfs_super_block *sb = page_address(page); 2967 int ret; 2968 2969 ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb, 2970 BTRFS_SUPER_INFO_SIZE, REQ_OP_READ); 2971 if (ret < 0) 2972 return ret; 2973 ret = btrfs_check_super_csum(fs_info, sb); 2974 if (unlikely(ret != 0)) { 2975 btrfs_err_rl(fs_info, 2976 "scrub: super block at physical %llu devid %llu has bad csum", 2977 physical, dev->devid); 2978 return -EIO; 2979 } 2980 if (unlikely(btrfs_super_generation(sb) != generation)) { 2981 btrfs_err_rl(fs_info, 2982 "scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", 2983 physical, dev->devid, 2984 btrfs_super_generation(sb), generation); 2985 return -EUCLEAN; 2986 } 2987 2988 return btrfs_validate_super(fs_info, sb, -1); 2989 } 2990 2991 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 2992 struct btrfs_device *scrub_dev) 2993 { 2994 int i; 2995 u64 bytenr; 2996 u64 gen; 2997 int ret = 0; 2998 struct page *page; 2999 struct btrfs_fs_info *fs_info = sctx->fs_info; 3000 3001 if (BTRFS_FS_ERROR(fs_info)) 3002 return -EROFS; 3003 3004 page = alloc_page(GFP_KERNEL); 3005 if (!page) { 3006 spin_lock(&sctx->stat_lock); 3007 sctx->stat.malloc_errors++; 3008 spin_unlock(&sctx->stat_lock); 3009 return -ENOMEM; 3010 } 3011 3012 /* Seed devices of a new filesystem has their own generation. 
*/ 3013 if (scrub_dev->fs_devices != fs_info->fs_devices) 3014 gen = scrub_dev->generation; 3015 else 3016 gen = btrfs_get_last_trans_committed(fs_info); 3017 3018 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 3019 ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); 3020 if (ret == -ENOENT) 3021 break; 3022 3023 if (ret) { 3024 spin_lock(&sctx->stat_lock); 3025 sctx->stat.super_errors++; 3026 spin_unlock(&sctx->stat_lock); 3027 continue; 3028 } 3029 3030 if (bytenr + BTRFS_SUPER_INFO_SIZE > 3031 scrub_dev->commit_total_bytes) 3032 break; 3033 if (!btrfs_check_super_location(scrub_dev, bytenr)) 3034 continue; 3035 3036 ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); 3037 if (ret) { 3038 spin_lock(&sctx->stat_lock); 3039 sctx->stat.super_errors++; 3040 spin_unlock(&sctx->stat_lock); 3041 } 3042 } 3043 __free_page(page); 3044 return 0; 3045 } 3046 3047 static void scrub_workers_put(struct btrfs_fs_info *fs_info) 3048 { 3049 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, 3050 &fs_info->scrub_lock)) { 3051 struct workqueue_struct *scrub_workers = fs_info->scrub_workers; 3052 3053 fs_info->scrub_workers = NULL; 3054 mutex_unlock(&fs_info->scrub_lock); 3055 3056 if (scrub_workers) 3057 destroy_workqueue(scrub_workers); 3058 } 3059 } 3060 3061 /* 3062 * get a reference count on fs_info->scrub_workers. start worker if necessary 3063 */ 3064 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) 3065 { 3066 struct workqueue_struct *scrub_workers = NULL; 3067 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 3068 int max_active = fs_info->thread_pool_size; 3069 int ret = -ENOMEM; 3070 3071 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) 3072 return 0; 3073 3074 scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); 3075 if (!scrub_workers) 3076 return -ENOMEM; 3077 3078 mutex_lock(&fs_info->scrub_lock); 3079 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { 3080 ASSERT(fs_info->scrub_workers == NULL); 3081 fs_info->scrub_workers = scrub_workers; 3082 refcount_set(&fs_info->scrub_workers_refcnt, 1); 3083 mutex_unlock(&fs_info->scrub_lock); 3084 return 0; 3085 } 3086 /* Other thread raced in and created the workers for us */ 3087 refcount_inc(&fs_info->scrub_workers_refcnt); 3088 mutex_unlock(&fs_info->scrub_lock); 3089 3090 ret = 0; 3091 3092 destroy_workqueue(scrub_workers); 3093 return ret; 3094 } 3095 3096 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3097 u64 end, struct btrfs_scrub_progress *progress, 3098 bool readonly, bool is_dev_replace) 3099 { 3100 struct btrfs_dev_lookup_args args = { .devid = devid }; 3101 struct scrub_ctx *sctx; 3102 int ret; 3103 struct btrfs_device *dev; 3104 unsigned int nofs_flag; 3105 bool need_commit = false; 3106 3107 /* Set the basic fallback @last_physical before we got a sctx. */ 3108 if (progress) 3109 progress->last_physical = start; 3110 3111 if (btrfs_fs_closing(fs_info)) 3112 return -EAGAIN; 3113 3114 /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ 3115 ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); 3116 3117 /* 3118 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible 3119 * value (max nodesize / min sectorsize), thus nodesize should always 3120 * be fine. 
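 *
 * Worked example for the assertion below, assuming a 4K sector size:
 * SCRUB_MAX_SECTORS_PER_BLOCK is 64K / 4K = 16, and 16 << 12 = 64K,
 * which is the maximum allowed nodesize, so the check can only trip if
 * the earlier nodesize/sectorsize validation was somehow bypassed.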
3121 */ 3122 ASSERT(fs_info->nodesize <= 3123 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); 3124 3125 /* Allocate outside of device_list_mutex */ 3126 sctx = scrub_setup_ctx(fs_info, is_dev_replace); 3127 if (IS_ERR(sctx)) 3128 return PTR_ERR(sctx); 3129 sctx->stat.last_physical = start; 3130 3131 ret = scrub_workers_get(fs_info); 3132 if (ret) 3133 goto out_free_ctx; 3134 3135 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3136 dev = btrfs_find_device(fs_info->fs_devices, &args); 3137 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && 3138 !is_dev_replace)) { 3139 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3140 ret = -ENODEV; 3141 goto out; 3142 } 3143 3144 if (!is_dev_replace && !readonly && 3145 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 3146 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3147 btrfs_err(fs_info, 3148 "scrub: devid %llu: filesystem on %s is not writable", 3149 devid, btrfs_dev_name(dev)); 3150 ret = -EROFS; 3151 goto out; 3152 } 3153 3154 mutex_lock(&fs_info->scrub_lock); 3155 if (unlikely(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 3156 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state))) { 3157 mutex_unlock(&fs_info->scrub_lock); 3158 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3159 ret = -EIO; 3160 goto out; 3161 } 3162 3163 down_read(&fs_info->dev_replace.rwsem); 3164 if (dev->scrub_ctx || 3165 (!is_dev_replace && 3166 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3167 up_read(&fs_info->dev_replace.rwsem); 3168 mutex_unlock(&fs_info->scrub_lock); 3169 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3170 ret = -EINPROGRESS; 3171 goto out; 3172 } 3173 up_read(&fs_info->dev_replace.rwsem); 3174 3175 sctx->readonly = readonly; 3176 dev->scrub_ctx = sctx; 3177 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3178 3179 /* 3180 * checking @scrub_pause_req here, we can avoid 3181 * race between committing transaction and scrubbing. 3182 */ 3183 __scrub_blocked_if_needed(fs_info); 3184 atomic_inc(&fs_info->scrubs_running); 3185 mutex_unlock(&fs_info->scrub_lock); 3186 3187 /* 3188 * In order to avoid deadlock with reclaim when there is a transaction 3189 * trying to pause scrub, make sure we use GFP_NOFS for all the 3190 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() 3191 * invoked by our callees. The pausing request is done when the 3192 * transaction commit starts, and it blocks the transaction until scrub 3193 * is paused (done at specific points at scrub_stripe() or right above 3194 * before incrementing fs_info->scrubs_running). 3195 */ 3196 nofs_flag = memalloc_nofs_save(); 3197 if (!is_dev_replace) { 3198 u64 old_super_errors; 3199 3200 spin_lock(&sctx->stat_lock); 3201 old_super_errors = sctx->stat.super_errors; 3202 spin_unlock(&sctx->stat_lock); 3203 3204 btrfs_info(fs_info, "scrub: started on devid %llu", devid); 3205 /* 3206 * by holding device list mutex, we can 3207 * kick off writing super in log tree sync. 3208 */ 3209 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3210 ret = scrub_supers(sctx, dev); 3211 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3212 3213 spin_lock(&sctx->stat_lock); 3214 /* 3215 * Super block errors found, but we can not commit transaction 3216 * at current context, since btrfs_commit_transaction() needs 3217 * to pause the current running scrub (hold by ourselves). 
3218 */ 3219 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) 3220 need_commit = true; 3221 spin_unlock(&sctx->stat_lock); 3222 } 3223 3224 if (!ret) 3225 ret = scrub_enumerate_chunks(sctx, dev, start, end); 3226 memalloc_nofs_restore(nofs_flag); 3227 3228 atomic_dec(&fs_info->scrubs_running); 3229 wake_up(&fs_info->scrub_pause_wait); 3230 3231 if (progress) 3232 memcpy(progress, &sctx->stat, sizeof(*progress)); 3233 3234 if (!is_dev_replace) 3235 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", 3236 ret ? "not finished" : "finished", devid, ret); 3237 3238 mutex_lock(&fs_info->scrub_lock); 3239 dev->scrub_ctx = NULL; 3240 mutex_unlock(&fs_info->scrub_lock); 3241 3242 scrub_workers_put(fs_info); 3243 scrub_put_ctx(sctx); 3244 3245 /* 3246 * We found some super block errors before, now try to force a 3247 * transaction commit, as scrub has finished. 3248 */ 3249 if (need_commit) { 3250 struct btrfs_trans_handle *trans; 3251 3252 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3253 if (IS_ERR(trans)) { 3254 ret = PTR_ERR(trans); 3255 btrfs_err(fs_info, 3256 "scrub: failed to start transaction to fix super block errors: %d", ret); 3257 return ret; 3258 } 3259 ret = btrfs_commit_transaction(trans); 3260 if (ret < 0) 3261 btrfs_err(fs_info, 3262 "scrub: failed to commit transaction to fix super block errors: %d", ret); 3263 } 3264 return ret; 3265 out: 3266 scrub_workers_put(fs_info); 3267 out_free_ctx: 3268 scrub_free_ctx(sctx); 3269 3270 return ret; 3271 } 3272 3273 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) 3274 { 3275 mutex_lock(&fs_info->scrub_lock); 3276 atomic_inc(&fs_info->scrub_pause_req); 3277 while (atomic_read(&fs_info->scrubs_paused) != 3278 atomic_read(&fs_info->scrubs_running)) { 3279 mutex_unlock(&fs_info->scrub_lock); 3280 wait_event(fs_info->scrub_pause_wait, 3281 atomic_read(&fs_info->scrubs_paused) == 3282 atomic_read(&fs_info->scrubs_running)); 3283 mutex_lock(&fs_info->scrub_lock); 3284 } 3285 mutex_unlock(&fs_info->scrub_lock); 3286 } 3287 3288 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) 3289 { 3290 atomic_dec(&fs_info->scrub_pause_req); 3291 wake_up(&fs_info->scrub_pause_wait); 3292 } 3293 3294 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 3295 { 3296 mutex_lock(&fs_info->scrub_lock); 3297 if (!atomic_read(&fs_info->scrubs_running)) { 3298 mutex_unlock(&fs_info->scrub_lock); 3299 return -ENOTCONN; 3300 } 3301 3302 atomic_inc(&fs_info->scrub_cancel_req); 3303 while (atomic_read(&fs_info->scrubs_running)) { 3304 mutex_unlock(&fs_info->scrub_lock); 3305 wait_event(fs_info->scrub_pause_wait, 3306 atomic_read(&fs_info->scrubs_running) == 0); 3307 mutex_lock(&fs_info->scrub_lock); 3308 } 3309 atomic_dec(&fs_info->scrub_cancel_req); 3310 mutex_unlock(&fs_info->scrub_lock); 3311 3312 return 0; 3313 } 3314 3315 int btrfs_scrub_cancel_dev(struct btrfs_device *dev) 3316 { 3317 struct btrfs_fs_info *fs_info = dev->fs_info; 3318 struct scrub_ctx *sctx; 3319 3320 mutex_lock(&fs_info->scrub_lock); 3321 sctx = dev->scrub_ctx; 3322 if (!sctx) { 3323 mutex_unlock(&fs_info->scrub_lock); 3324 return -ENOTCONN; 3325 } 3326 atomic_inc(&sctx->cancel_req); 3327 while (dev->scrub_ctx) { 3328 mutex_unlock(&fs_info->scrub_lock); 3329 wait_event(fs_info->scrub_pause_wait, 3330 dev->scrub_ctx == NULL); 3331 mutex_lock(&fs_info->scrub_lock); 3332 } 3333 mutex_unlock(&fs_info->scrub_lock); 3334 3335 return 0; 3336 } 3337 3338 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, 3339 struct btrfs_scrub_progress 
*progress) 3340 { 3341 struct btrfs_dev_lookup_args args = { .devid = devid }; 3342 struct btrfs_device *dev; 3343 struct scrub_ctx *sctx = NULL; 3344 3345 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3346 dev = btrfs_find_device(fs_info->fs_devices, &args); 3347 if (dev) 3348 sctx = dev->scrub_ctx; 3349 if (sctx) 3350 memcpy(progress, &sctx->stat, sizeof(*progress)); 3351 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3352 3353 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; 3354 } 3355