/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "check-integrity.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - In case of a read error on files with nodatasum, map the file and read
 *    the extent to trigger a writeback of the good copy
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_bio;
struct scrub_page;
struct scrub_dev;
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_checksum(struct btrfs_work *work);
static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer);
static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer);
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
static void scrub_fixup_end_io(struct bio *bio, int err);
static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page);
static void scrub_fixup(struct scrub_bio *sbio, int ix);

#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */

struct scrub_page {
	u64			flags;	/* extent flags */
	u64			generation;
	int			mirror_num;
	int			have_csum;
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
	u64			count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	atomic_t		fixup_cnt;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};

struct scrub_fixup_nodatasum {
	struct scrub_dev	*sdev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};
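
/*
 * context passed through the backref walk when a corrupted extent is
 * reported; filled in scrub_print_warning() and consumed by
 * scrub_print_warning_inode()
 */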
struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	char			*scratch_buf;
	char			*msg_buf;
	const char		*errstr;
	sector_t		sector;
	u64			logical;
	struct btrfs_device	*dev;
	int			msg_bufsize;
	int			scratch_bufsize;
};

static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static void scrub_free_bio(struct bio *bio)
{
	int i;
	struct page *last_page = NULL;

	if (!bio)
		return;

	for (i = 0; i < bio->bi_vcnt; ++i) {
		if (bio->bi_io_vec[i].bv_page == last_page)
			continue;
		last_page = bio->bi_io_vec[i].bv_page;
		__free_page(last_page);
	}
	bio_put(bio);
}

static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;

	if (!sdev)
		return;

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];

		if (!sbio)
			break;

		scrub_free_bio(sbio->bio);
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}

static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->count = 0;
		sbio->work.func = scrub_checksum;

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->curr = -1;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->fixup_cnt, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	ret = inode_item_info(inum, 0, local_root, swarn->path);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);
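
	/*
	 * collect all file system paths that lead to this inode so the
	 * warning below can name the affected files
	 */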
	ipath = init_ipath(4096, local_root, swarn->path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, swarn->dev->name,
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			(char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	return 0;

err:
	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, swarn->dev->name,
		(unsigned long long)swarn->sector, root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
				int ix)
{
	struct btrfs_device *dev = sbio->sdev->dev;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	u32 item_size;
	int ret;
	u64 ref_root;
	u8 ref_level;
	unsigned long ptr = 0;
	const int bufsize = 4096;
	u64 extent_item_pos;

	path = btrfs_alloc_path();

	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
	swarn.logical = sbio->logical + ix * PAGE_SIZE;
	swarn.errstr = errstr;
	swarn.dev = dev;
	swarn.msg_bufsize = bufsize;
	swarn.scratch_bufsize = bufsize;

	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
		goto out;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);
	btrfs_release_path(path);

	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
						      &ref_root, &ref_level);
			printk(KERN_WARNING "%s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical, dev->name,
				(unsigned long long)swarn.sector,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
	} else {
		swarn.path = path;
		iterate_extent_inodes(fs_info, path, found_key.objectid,
					extent_item_pos,
					scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
	kfree(swarn.scratch_buf);
	kfree(swarn.msg_buf);
}

static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
	if (IS_ERR(local_root))
		return PTR_ERR(local_root);

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		struct btrfs_mapping_tree *map_tree;
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the meantime, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory, then.
			 */
			ret = -EIO;
			goto out;
		}
		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
					fixup->logical, page,
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
						btrfs_get_extent,
						fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
					end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
						EXTENT_DAMAGED, GFP_NOFS);
	}

out:
	if (page)
		put_page(page);
	if (inode)
		iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes
		 * belonging to this extent. so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}

static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	int ret;
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_dev *sdev;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	sdev = fixup->sdev;
	fs_info = fixup->root->fs_info;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sdev->stat_lock);
		++sdev->stat.malloc_errors;
		spin_unlock(&sdev->stat_lock);
		uncorrectable = 1;
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans)) {
		uncorrectable = 1;
		goto out;
	}

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
						path, scrub_fixup_readpage,
						fixup);
	if (ret < 0) {
		uncorrectable = 1;
		goto out;
	}
	WARN_ON(ret != 1);

	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

out:
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans, fixup->root);
	if (uncorrectable) {
		spin_lock(&sdev->stat_lock);
		++sdev->stat.uncorrectable_errors;
		spin_unlock(&sdev->stat_lock);
		printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
					"(nodatasum) error at logical %llu\n",
					fixup->logical);
	}

	btrfs_free_path(path);
	kfree(fixup);

	/* see caller why we're pretending to be paused in the scrub counters */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sdev->fixup_cnt);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sdev->list_wait);
}

/*
 * scrub_recheck_error gets called when either verification of the page
 * failed or the bio failed to read, e.g. with EIO.
 * In the latter case, recheck_error gets called for every page in the bio,
 * even though only one may be bad
 */
static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (sbio->err) {
		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
			if (scrub_fixup_check(sbio, ix) == 0)
				return 0;
		}
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sbio, ix);
	} else {
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sbio, ix);
	}

	spin_lock(&sdev->stat_lock);
	++sdev->stat.read_errors;
	spin_unlock(&sdev->stat_lock);

	scrub_fixup(sbio, ix);
	return 1;
}

static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
{
	int ret = 1;
	struct page *page;
	void *buffer;
	u64 flags = sbio->spag[ix].flags;

	page = sbio->bio->bi_io_vec[ix].bv_page;
	buffer = kmap_atomic(page, KM_USER0);
	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		ret = scrub_checksum_data(sbio->sdev,
					  sbio->spag + ix, buffer);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		ret = scrub_checksum_tree_block(sbio->sdev,
						sbio->spag + ix,
						sbio->logical + ix * PAGE_SIZE,
						buffer);
	} else {
		WARN_ON(1);
	}
	kunmap_atomic(buffer, KM_USER0);

	return ret;
}

static void scrub_fixup_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}

static void scrub_fixup(struct scrub_bio *sbio, int ix)
{
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct btrfs_bio *bbio = NULL;
	struct scrub_fixup_nodatasum *fixup;
	u64 logical = sbio->logical + ix * PAGE_SIZE;
	u64 length;
	int i;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
	    (sbio->spag[ix].have_csum == 0)) {
		fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
		if (!fixup)
			goto uncorrectable;
		fixup->sdev = sdev;
		fixup->logical = logical;
		fixup->root = fs_info->extent_root;
		fixup->mirror_num = sbio->spag[ix].mirror_num;
		/*
		 * increment scrubs_running to prevent cancel requests from
		 * completing as long as a fixup worker is running. we must
		 * also increment scrubs_paused to prevent deadlocking on pause
		 * requests used for transaction commits (as the worker uses a
		 * transaction context). it is safe to regard the fixup worker
		 * as paused for all practical matters. effectively, we only
		 * avoid cancellation requests from completing.
		 */
		mutex_lock(&fs_info->scrub_lock);
		atomic_inc(&fs_info->scrubs_running);
		atomic_inc(&fs_info->scrubs_paused);
		mutex_unlock(&fs_info->scrub_lock);
		atomic_inc(&sdev->fixup_cnt);
		fixup->work.func = scrub_fixup_nodatasum;
		btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
		return;
	}

	length = PAGE_SIZE;
	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
			      &bbio, 0);
	if (ret || !bbio || length < PAGE_SIZE) {
		printk(KERN_ERR
		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
		       (unsigned long long)logical);
		WARN_ON(1);
		kfree(bbio);
		return;
	}

	if (bbio->num_stripes == 1)
		/* there aren't any replicas */
		goto uncorrectable;

	/*
	 * first find a good copy
	 */
	for (i = 0; i < bbio->num_stripes; ++i) {
		if (i + 1 == sbio->spag[ix].mirror_num)
			continue;

		if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
				   bbio->stripes[i].physical >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, this is not a good copy */
			continue;
		}

		if (scrub_fixup_check(sbio, ix) == 0)
			break;
	}
	if (i == bbio->num_stripes)
		goto uncorrectable;

	if (!sdev->readonly) {
		/*
		 * bi_io_vec[ix].bv_page now contains good data, write it back
		 */
		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				   (sbio->physical + ix * PAGE_SIZE) >> 9,
				   sbio->bio->bi_io_vec[ix].bv_page)) {
			/* I/O-error, writeback failed, give up */
			goto uncorrectable;
		}
	}

	kfree(bbio);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

	printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
			   (unsigned long long)logical);
	return;

uncorrectable:
	kfree(bbio);
	spin_lock(&sdev->stat_lock);
	++sdev->stat.uncorrectable_errors;
	spin_unlock(&sdev->stat_lock);

	printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
				"logical %llu\n", (unsigned long long)logical);
}

static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	struct bio *bio = NULL;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = scrub_fixup_end_io;
	bio->bi_private = &complete;
	btrfsic_submit_bio(rw, bio);

	/* this will also unplug the queue */
	wait_for_completion(&complete);

	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}

static void scrub_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;

	sbio->err = err;
	sbio->bio = bio;

	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
}

static void scrub_checksum(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_dev *sdev = sbio->sdev;
	struct page *page;
	void *buffer;
	int i;
	u64 flags;
	u64 logical;
	int ret;

	if (sbio->err) {
		ret = 0;
		for (i = 0; i < sbio->count; ++i)
			ret |= scrub_recheck_error(sbio, i);
		if (!ret) {
			spin_lock(&sdev->stat_lock);
			++sdev->stat.unverified_errors;
			spin_unlock(&sdev->stat_lock);
		}
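
		/*
		 * the lower layers may have modified the bio and its pages
		 * while it failed; put the flags, the index and the per-page
		 * vectors back into a clean state before the bio is released
		 * at the out label
		 */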
		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
		sbio->bio->bi_phys_segments = 0;
		sbio->bio->bi_idx = 0;

		for (i = 0; i < sbio->count; i++) {
			struct bio_vec *bi;
			bi = &sbio->bio->bi_io_vec[i];
			bi->bv_offset = 0;
			bi->bv_len = PAGE_SIZE;
		}
		goto out;
	}
	for (i = 0; i < sbio->count; ++i) {
		page = sbio->bio->bi_io_vec[i].bv_page;
		buffer = kmap_atomic(page, KM_USER0);
		flags = sbio->spag[i].flags;
		logical = sbio->logical + i * PAGE_SIZE;
		ret = 0;
		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
							logical, buffer);
		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
			BUG_ON(i);
			(void)scrub_checksum_super(sbio, buffer);
		} else {
			WARN_ON(1);
		}
		kunmap_atomic(buffer, KM_USER0);
		if (ret) {
			ret = scrub_recheck_error(sbio, i);
			if (!ret) {
				spin_lock(&sdev->stat_lock);
				++sdev->stat.unverified_errors;
				spin_unlock(&sdev->stat_lock);
			}
		}
	}

out:
	scrub_free_bio(sbio->bio);
	sbio->bio = NULL;
	spin_lock(&sdev->list_lock);
	sbio->next_free = sdev->first_free;
	sdev->first_free = sbio->index;
	spin_unlock(&sdev->list_lock);
	atomic_dec(&sdev->in_flight);
	wake_up(&sdev->list_wait);
}

static int scrub_checksum_data(struct scrub_dev *sdev,
			       struct scrub_page *spag, void *buffer)
{
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	struct btrfs_root *root = sdev->dev->dev_root;

	if (!spag->have_csum)
		return 0;

	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, spag->csum, sdev->csum_size))
		fail = 1;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.data_extents_scrubbed;
	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
	if (fail)
		++sdev->stat.csum_errors;
	spin_unlock(&sdev->stat_lock);

	return fail;
}

static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				     struct scrub_page *spag, u64 logical,
				     void *buffer)
{
	struct btrfs_header *h;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	h = (struct btrfs_header *)buffer;

	if (logical != le64_to_cpu(h->bytenr))
		++fail;

	if (spag->generation != le64_to_cpu(h->generation))
		++fail;

	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, h->csum, sdev->csum_size))
		++crc_fail;

	spin_lock(&sdev->stat_lock);
	++sdev->stat.tree_extents_scrubbed;
	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
	if (crc_fail)
		++sdev->stat.csum_errors;
	if (fail)
		++sdev->stat.verify_errors;
	spin_unlock(&sdev->stat_lock);

	return fail || crc_fail;
}
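
/*
 * verify one superblock copy: bytenr, generation, fsid and the checksum
 * are checked; a bad copy is only counted in the stats, the superblock
 * gets rewritten with the next transaction commit anyway
 */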
static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
{
	struct btrfs_super_block *s;
	u64 logical;
	struct scrub_dev *sdev = sbio->sdev;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	int fail = 0;

	s = (struct btrfs_super_block *)buffer;
	logical = sbio->logical;

	if (logical != le64_to_cpu(s->bytenr))
		++fail;

	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
		++fail;

	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
		++fail;

	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
			      PAGE_SIZE - BTRFS_CSUM_SIZE);
	btrfs_csum_final(crc, csum);
	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
		++fail;

	if (fail) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sdev->stat_lock);
		++sdev->stat.super_errors;
		spin_unlock(&sdev->stat_lock);
	}

	return fail;
}

static int scrub_submit(struct scrub_dev *sdev)
{
	struct scrub_bio *sbio;

	if (sdev->curr == -1)
		return 0;

	sbio = sdev->bios[sdev->curr];
	sbio->err = 0;
	sdev->curr = -1;
	atomic_inc(&sdev->in_flight);

	btrfsic_submit_bio(READ, sbio->bio);

	return 0;
}

static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
		      u64 physical, u64 flags, u64 gen, int mirror_num,
		      u8 *csum, int force)
{
	struct scrub_bio *sbio;
	struct page *page;
	int ret;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sdev->curr == -1) {
		spin_lock(&sdev->list_lock);
		sdev->curr = sdev->first_free;
		if (sdev->curr != -1) {
			sdev->first_free = sdev->bios[sdev->curr]->next_free;
			sdev->bios[sdev->curr]->next_free = -1;
			sdev->bios[sdev->curr]->count = 0;
			spin_unlock(&sdev->list_lock);
		} else {
			spin_unlock(&sdev->list_lock);
			wait_event(sdev->list_wait, sdev->first_free != -1);
		}
	}
	sbio = sdev->bios[sdev->curr];
	if (sbio->count == 0) {
		struct bio *bio;

		sbio->physical = physical;
		sbio->logical = logical;
		bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
		if (!bio)
			return -ENOMEM;

		bio->bi_private = sbio;
		bio->bi_end_io = scrub_bio_end_io;
		bio->bi_bdev = sdev->dev->bdev;
		bio->bi_sector = sbio->physical >> 9;
		sbio->err = 0;
		sbio->bio = bio;
	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
		ret = scrub_submit(sdev);
		if (ret)
			return ret;
		goto again;
	}
	sbio->spag[sbio->count].flags = flags;
	sbio->spag[sbio->count].generation = gen;
	sbio->spag[sbio->count].have_csum = 0;
	sbio->spag[sbio->count].mirror_num = mirror_num;

	page = alloc_page(GFP_NOFS);
	if (!page)
		return -ENOMEM;

	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
	if (!ret) {
		__free_page(page);
		ret = scrub_submit(sdev);
		if (ret)
			return ret;
		goto again;
	}

	if (csum) {
		sbio->spag[sbio->count].have_csum = 1;
		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
	}
	++sbio->count;
	if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
		int ret;

		ret = scrub_submit(sdev);
		if (ret)
			return ret;
	}

	return 0;
}
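
/*
 * look up the checksum for the data sector at 'logical' in the list that
 * scrub_stripe() filled via btrfs_lookup_csums_range(); sums that end at
 * or before 'logical' are no longer needed and are dropped from the list
 */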
static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
			   u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	int ret = 0;
	unsigned long i;
	unsigned long num_sectors;
	u32 sectorsize = sdev->dev->dev_root->sectorsize;

	while (!list_empty(&sdev->csum_list)) {
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sdev->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	num_sectors = sum->len / sectorsize;
	for (i = 0; i < num_sectors; ++i) {
		if (sum->sums[i].bytenr == logical) {
			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
			ret = 1;
			break;
		}
	}
	if (ret && i == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return ret;
}

/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
			u64 physical, u64 flags, u64 gen, int mirror_num)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];

	while (len) {
		u64 l = min_t(u64, len, PAGE_SIZE);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sdev, logical, l, csum);
			if (have_csum == 0)
				++sdev->stat.no_csum;
		}
		ret = scrub_page(sdev, logical, l, physical, flags, gen,
				 mirror_num, have_csum ? csum : NULL, 0);
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}

static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
	struct map_lookup *map, int num, u64 base, u64 length)
{
	struct btrfs_path *path;
	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	struct blk_plug plug;
	u64 flags;
	int ret;
	int slot;
	int i;
	u64 nstripes;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 physical;
	u64 logical;
	u64 generation;
	int mirror_num;
	struct reada_control *reada1;
	struct reada_control *reada2;
	struct btrfs_key key_start;
	struct btrfs_key key_end;

	u64 increment = map->stripe_len;
	u64 offset;

	nstripes = length;
	offset = 0;
	do_div(nstripes, map->stripe_len);
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
		mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
		increment = map->stripe_len;
		mirror_num = num % map->num_stripes + 1;
	} else {
		increment = map->stripe_len;
		mirror_num = 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->search_commit_root = 1;
	path->skip_locking = 1;
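
	/*
	 * the extent tree is searched in its commit root (see the path flags
	 * above), so the walk below needs neither tree locks nor a running
	 * transaction
	 */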

	/*
	 * trigger the readahead for extent tree and csum tree and wait for
	 * completion. During readahead, the scrub is officially paused
	 * to not hold off transaction commits
	 */
	logical = base + offset;

	wait_event(sdev->list_wait,
		   atomic_read(&sdev->in_flight) == 0);
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	/* FIXME it might be better to start readahead at commit root */
	key_start.objectid = logical;
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = (u64)0;
	key_end.objectid = base + offset + nstripes * increment;
	key_end.type = BTRFS_EXTENT_ITEM_KEY;
	key_end.offset = (u64)0;
	reada1 = btrfs_reada_add(root, &key_start, &key_end);

	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_start.type = BTRFS_EXTENT_CSUM_KEY;
	key_start.offset = logical;
	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_end.type = BTRFS_EXTENT_CSUM_KEY;
	key_end.offset = base + offset + nstripes * increment;
	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);

	if (!IS_ERR(reada1))
		btrfs_reada_wait(reada1);
	if (!IS_ERR(reada2))
		btrfs_reada_wait(reada2);

	mutex_lock(&fs_info->scrub_lock);
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	wake_up(&fs_info->scrub_pause_wait);

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up being about 1MB
	 */
	blk_start_plug(&plug);

	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
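		/*
		 * for each stripe: honour cancel and pause requests first,
		 * then collect the data csums for the stripe and walk all
		 * extent items that intersect it
		 */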
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
				   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
		}

		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
			if (ret > 0) {
				/* there's no smaller item, so stick with the
				 * larger one */
				btrfs_release_path(path);
				ret = btrfs_search_slot(NULL, root, &key,
							path, 0, 0);
				if (ret < 0)
					goto out;
			}
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	blk_finish_plug(&plug);
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}

static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
	u64 dev_offset)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}

static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length, found_key.offset);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	/*
	 * ret can still be 1 from search_slot or next_leaf,
	 * that's not an error
	 */
	return ret < 0 ? ret : 0;
}

static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
			break;

		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}

/*
 * get a reference count on fs_info->scrub_workers. start worker if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0) {
		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
			   fs_info->thread_pool_size, &fs_info->generic_worker);
		fs_info->scrub_workers.idle_thresh = 4;
		ret = btrfs_start_workers(&fs_info->scrub_workers);
		if (ret)
			goto out;
	}
	++fs_info->scrub_workers_refcnt;
out:
	mutex_unlock(&fs_info->scrub_lock);

	return ret;
}

static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}


int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (btrfs_fs_closing(root->fs_info))
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->sectorsize != PAGE_SIZE ||
	    root->sectorsize != root->leafsize ||
	    root->sectorsize != root->nodesize) {
		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}

int btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
	return 0;
}

int btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
	return 0;
}

int btrfs_scrub_cancel(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}

int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sdev = dev->scrub_device;
	if (sdev)
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
}