// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};

static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);

static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_sectors);
	kfree(rbio->stripe_sectors);
	kfree(rbio->finish_pointers);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(rbio->stripe_sectors[i].uptodate);
			continue;
		}

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

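/*
 * Return true if every sector inside the page at @page_nr of stripe_pages[]
 * is uptodate.
 */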
static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripes_pages[] got modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}

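/*
 * Move the page at @page_nr from @src to @dest, freeing any page already
 * there, and mark every sector of that page uptodate in @dest.
 */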
static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = (page_nr << PAGE_SHIFT) >>
			      rbio->bioc->fs_info->sectorsize_bits;

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We need to read the full stripe from the drive, check and repair
	 * the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}

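/*
 * Return the index into the stripe_sectors[]/bio_sectors[] arrays for the
 * sector at @sector_nr inside stripe @stripe_nr.
 */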
static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}

static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

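/*
 * Walk the bi_next chain starting at @cur and complete every bio on it with
 * status @err.
 */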
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio for
	 * this bio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:		The raid bio
 * @stripe_nr:		Stripe number, valid range [0, real_stripe)
 * @sector_nr:		Sector number inside the stripe,
 *			valid range [0, stripe_nsectors)
 * @bio_list_only:	Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				    GFP_NOFS);
	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				       GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
	    !rbio->finish_pointers || !rbio->error_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);

	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages, 0);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/* Update faila and failb. */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev) {
		int found_errors;

		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
			rbio->error_bitmap);

		/* Check if we have reached tolerance early. */
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 NULL, NULL);
		if (found_errors > rbio->bioc->max_errors)
			return -EIO;
		return 0;
	}

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
	bio->bi_private = rbio;

	__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}

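/*
 * Populate bio_sectors[] for every sector covered by @bio, based on the
 * bio's logical offset inside the full stripe.
 */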
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;

	bio_for_each_segment(bvec, bio, iter) {
		u32 bvec_offset;

		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
		     bvec_offset += sectorsize, offset += sectorsize) {
			int index = offset / sectorsize;
			struct sector_ptr *sector = &rbio->bio_sectors[index];

			sector->page = bvec.bv_page;
			sector->pgoff = bvec.bv_offset + bvec_offset;
			ASSERT(sector->pgoff < PAGE_SIZE);
		}
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock(&rbio->bio_list_lock);
}

static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}

static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list)))
		bio_put(bio);
}

/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
	void **pointers = rbio->finish_pointers;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct sector_ptr *sector;
	int stripe;
	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;

	/* First collect one sector from each data stripe */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
		pointers[stripe] = kmap_local_page(sector->page) +
				   sector->pgoff;
	}

	/* Then add the parity stripe */
	sector = rbio_pstripe_sector(rbio, sectornr);
	sector->uptodate = 1;
	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;

	if (has_qstripe) {
		/*
		 * RAID6, add the qstripe and call the library function
		 * to fill in our p/q
		 */
		sector = rbio_qstripe_sector(rbio, sectornr);
		sector->uptodate = 1;
		pointers[stripe++] = kmap_local_page(sector->page) +
				     sector->pgoff;

		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
					pointers);
	} else {
		/* raid5 */
		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
	}
	for (stripe = stripe - 1; stripe >= 0; stripe--)
		kunmap_local(pointers[stripe]);
}

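/*
 * Assemble the write bios for a full stripe: every data sector covered by
 * dbitmap (taken from the bio list when possible) plus the P/Q sectors, and
 * a second copy aimed at the dev-replace target when one is running.
 */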
static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
				   struct bio_list *bio_list)
{
	/* The total sector number inside the full stripe. */
	int total_sector_nr;
	int sectornr;
	int stripe;
	int ret;

	ASSERT(bio_list_size(bio_list) == 0);

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
	 */
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/*
	 * Start assembly.  Make bios for everything from the higher layers (the
	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	if (likely(!rbio->bioc->replace_nr_stripes))
		return 0;

	/*
	 * Make a copy for the replace target device.
	 *
	 * Thus the source stripe number (in replace_stripe_src) should be valid.
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/*
		 * For RAID56, there is only one device that can be replaced,
		 * and replace_stripe_src[0] indicates the stripe number we
		 * need to copy from.
		 */
		if (stripe != rbio->bioc->replace_stripe_src) {
			/*
			 * We can skip the whole stripe completely, note
			 * total_sector_nr will be increased by one anyway.
			 */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	return 0;
error:
	bio_list_put(bio_list);
	return -EIO;
}

static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;
	int total_nr_sector = offset >> fs_info->sectorsize_bits;

	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);

	bitmap_set(rbio->error_bitmap, total_nr_sector,
		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);

	/*
	 * Special handling for raid56_alloc_missing_rbio() used by
	 * scrub/replace.  Unlike call path in raid56_parity_recover(), they
	 * pass an empty bio here.  Thus we have to find out the missing device
	 * and mark the stripe error instead.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bool found_missing = false;
		int stripe_nr;

		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
				found_missing = true;
				bitmap_set(rbio->error_bitmap,
					   stripe_nr * rbio->stripe_nsectors,
					   rbio->stripe_nsectors);
			}
		}
		ASSERT(found_missing);
	}
}

/*
 * For subpage case, we can no longer set page Up-to-date directly for
 * stripe_pages[], thus we need to locate the sector.
 */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
{
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		if (sector->page == page && sector->pgoff == pgoff)
			return sector;
	}
	return NULL;
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct sector_ptr *sector;
		int pgoff;

		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
		     pgoff += sectorsize) {
			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
			ASSERT(sector);
			if (sector)
				sector->uptodate = 1;
		}
	}
}

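/*
 * Return the total sector number inside the full stripe that the first bvec
 * of @bio maps to, by searching both stripe_sectors[] and bio_sectors[].
 */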
static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct bio_vec *bv = bio_first_bvec_all(bio);
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector;

		sector = &rbio->stripe_sectors[i];
		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
			break;
		sector = &rbio->bio_sectors[i];
		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
			break;
	}
	ASSERT(i < rbio->nr_sectors);
	return i;
}

static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	u32 bio_size = 0;
	struct bio_vec *bvec;
	int i;

	bio_for_each_bvec_all(bvec, bio, i)
		bio_size += bvec->bv_len;

	/*
	 * Since we can have multiple bios touching the error_bitmap, we cannot
	 * call bitmap_set() without protection.
	 *
	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
	 */
	for (i = total_sector_nr; i < total_sector_nr +
	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
		set_bit(i, rbio->error_bitmap);
}

/* Verify the data sectors at read time. */
static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
				    struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	/* No data csum for the whole stripe, no need to verify. */
	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return;

	/* P/Q stripes, they have no data csum to verify against. */
	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
		return;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		int bv_offset;

		for (bv_offset = bvec->bv_offset;
		     bv_offset < bvec->bv_offset + bvec->bv_len;
		     bv_offset += fs_info->sectorsize, total_sector_nr++) {
			u8 csum_buf[BTRFS_CSUM_SIZE];
			u8 *expected_csum = rbio->csum_buf +
					    total_sector_nr * fs_info->csum_size;
			int ret;

			/* No csum for this sector, skip to the next sector. */
			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
				continue;

			ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
						      bv_offset, csum_buf, expected_csum);
			if (ret < 0)
				set_bit(total_sector_nr, rbio->error_bitmap);
		}
	}
}

static void raid_wait_read_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status) {
		rbio_update_error_bitmap(rbio, bio);
	} else {
		set_bio_pages_uptodate(rbio, bio);
		verify_bio_data_sectors(rbio, bio);
	}

	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

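/*
 * Submit every read bio on @bio_list and wait until all of them have
 * completed.
 */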
static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
				      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_read_end_io;

		if (trace_raid56_read_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
}

static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	list_sort(NULL, &plug->rbio_list, plug_cmp);

	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			/* We have a full stripe, queue it down. */
			start_async_work(cur, rmw_rbio_work);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				free_raid_bio(cur);
				continue;
			}
			start_async_work(last, rmw_rbio_work);
		}
		last = cur;
	}
	if (last)
		start_async_work(last, rmw_rbio_work);
	kfree(plug);
}

/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
	const u32 orig_len = orig_bio->bi_iter.bi_size;
	const u32 sectorsize = fs_info->sectorsize;
	u64 cur_logical;

	ASSERT(orig_logical >= full_stripe_start &&
	       orig_logical + orig_len <= full_stripe_start +
	       rbio->nr_data * BTRFS_STRIPE_LEN);

	bio_list_add(&rbio->bio_list, orig_bio);
	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;

	/* Update the dbitmap. */
	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
	     cur_logical += sectorsize) {
		int bit = ((u32)(cur_logical - full_stripe_start) >>
			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;

		set_bit(bit, &rbio->dbitmap);
	}
}

/*
 * our main entry point for writes from the rest of the FS.
 */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
		return;
	}
	rbio->operation = BTRFS_RBIO_WRITE;
	rbio_add_bio(rbio, bio);

	/*
	 * Don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (!rbio_is_full(rbio)) {
		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
		if (cb) {
			plug = container_of(cb, struct btrfs_plug_cb, cb);
			if (!plug->info) {
				plug->info = fs_info;
				INIT_LIST_HEAD(&plug->rbio_list);
			}
			list_add_tail(&rbio->plug_list, &plug->rbio_list);
			return;
		}
	}

	/*
	 * Either we don't have any existing plug, or we're doing a full stripe,
	 * queue the rmw work now.
	 */
	start_async_work(rbio, rmw_rbio_work);
}

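/*
 * Verify the data sector at @stripe_nr/@sector_nr against its csum, if we
 * have one.  Return 0 if the csum matches or cannot be checked, < 0 on
 * mismatch.
 */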
static int verify_one_sector(struct btrfs_raid_bio *rbio,
			     int stripe_nr, int sector_nr)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct sector_ptr *sector;
	u8 csum_buf[BTRFS_CSUM_SIZE];
	u8 *csum_expected;
	int ret;

	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return 0;

	/* No way to verify P/Q as they are not covered by data csum. */
	if (stripe_nr >= rbio->nr_data)
		return 0;
	/*
	 * If we're rebuilding a read, we have to use pages from the
	 * bio list if possible.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
	} else {
		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
	}

	ASSERT(sector->page);

	csum_expected = rbio->csum_buf +
			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
			fs_info->csum_size;
	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
				      csum_buf, csum_expected);
	return ret;
}

/*
 * Recover a vertical stripe specified by @sector_nr.
 * @*pointers are the pre-allocated pointers by the caller, so we don't
 * need to allocate/free the pointers again and again.
 */
static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
			    void **pointers, void **unmap_array)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct sector_ptr *sector;
	const u32 sectorsize = fs_info->sectorsize;
	int found_errors;
	int faila;
	int failb;
	int stripe_nr;
	int ret = 0;

	/*
	 * Now we just use bitmap to mark the horizontal stripes in
	 * which we have data when doing parity scrub.
	 */
	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
	    !test_bit(sector_nr, &rbio->dbitmap))
		return 0;

	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
						 &failb);
	/*
	 * No errors in the vertical stripe, skip it.  Can happen for recovery
	 * where only part of a stripe failed the csum check.
	 */
	if (!found_errors)
		return 0;

	if (found_errors > rbio->bioc->max_errors)
		return -EIO;

	/*
	 * Setup our array of pointers with sectors from each stripe
	 *
	 * NOTE: store a duplicate array of pointers to preserve the
	 * pointer order.
	 */
	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		/*
		 * If we're rebuilding a read, we have to use pages from the
		 * bio list if possible.
		 */
		if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
		} else {
			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
		}
		ASSERT(sector->page);
		pointers[stripe_nr] = kmap_local_page(sector->page) +
				      sector->pgoff;
		unmap_array[stripe_nr] = pointers[stripe_nr];
	}

	/* All raid6 handling here */
	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
		/* Single failure, rebuild from parity raid5 style */
		if (failb < 0) {
			if (faila == rbio->nr_data)
				/*
				 * Just the P stripe has failed, without
				 * a bad data or Q stripe.
				 * We have nothing to do, just skip the
				 * recovery for this stripe.
				 */
				goto cleanup;
			/*
			 * a single failure in raid6 is rebuilt
			 * in the pstripe code below
			 */
			goto pstripe;
		}

		/*
		 * If the q stripe is failed, do a pstripe reconstruction from
		 * the xors.
		 * If both the q stripe and the P stripe are failed, we're
		 * here due to a crc mismatch and we can't give them the
		 * data they want.
		 */
		if (failb == rbio->real_stripes - 1) {
			if (faila == rbio->real_stripes - 2)
				/*
				 * Only P and Q are corrupted.
				 * We only care about data stripes recovery,
				 * can skip this vertical stripe.
				 */
				goto cleanup;
			/*
			 * Otherwise we have one bad data stripe and
			 * a good P stripe.  raid5!
			 */
			goto pstripe;
		}

		if (failb == rbio->real_stripes - 2) {
			raid6_datap_recov(rbio->real_stripes, sectorsize,
					  faila, pointers);
		} else {
			raid6_2data_recov(rbio->real_stripes, sectorsize,
					  faila, failb, pointers);
		}
	} else {
		void *p;

		/* Rebuild from P stripe here (raid5 or raid6). */
		ASSERT(failb == -1);
pstripe:
		/* Copy parity block into failed block to start with */
		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);

		/* Rearrange the pointer array */
		p = pointers[faila];
		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
		     stripe_nr++)
			pointers[stripe_nr] = pointers[stripe_nr + 1];
		pointers[rbio->nr_data - 1] = p;

		/* Xor in the rest */
		run_xor(pointers, rbio->nr_data - 1, sectorsize);

	}

	/*
	 * No matter if this is a RMW or recovery, we should have all
	 * failed sectors repaired in the vertical stripe, thus they are now
	 * uptodate.
	 * Especially if we determine to cache the rbio, we need to
	 * have at least all data sectors uptodate.
	 *
	 * If possible, also check if the repaired sector matches its data
	 * checksum.
	 */
	if (faila >= 0) {
		ret = verify_one_sector(rbio, faila, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, faila, sector_nr);
		sector->uptodate = 1;
	}
	if (failb >= 0) {
		ret = verify_one_sector(rbio, failb, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, failb, sector_nr);
		sector->uptodate = 1;
	}

cleanup:
	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
		kunmap_local(unmap_array[stripe_nr]);
	return ret;
}

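/*
 * Rebuild every vertical stripe that has errors recorded in the error
 * bitmap.
 */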
static int recover_sectors(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sectornr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores copy of pointers that does not get reordered
	 * during reconstruction so that kunmap_local works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		spin_lock(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
		if (ret < 0)
			break;
	}

out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}

static void recover_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Either we're doing recovery for a read failure or a degraded write,
	 * the caller should have set the error bitmap correctly.
	 */
	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));

	/* For recovery, we need to read all sectors including P/Q. */
	ret = alloc_rbio_pages(rbio);
	if (ret < 0)
		goto out;

	index_rbio_pages(rbio);

	/*
	 * Read everything that hasn't failed.  However this time we will
static void recover_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Either we're recovering from a read failure or a degraded write;
	 * the caller should have set the error bitmap correctly.
	 */
	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));

	/* For recovery, we need to read all sectors including P/Q. */
	ret = alloc_rbio_pages(rbio);
	if (ret < 0)
		goto out;

	index_rbio_pages(rbio);

	/*
	 * Read everything that hasn't failed. However, this time we do not
	 * trust any cached sector: it may contain stale data that the higher
	 * layer is not reading.
	 *
	 * So in the recovery path we always re-read everything.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/*
		 * Skip ranges that have an error. A range can be marked as
		 * an error (for a csum mismatch), or it can belong to a
		 * missing device.
		 */
		if (!rbio->bioc->stripes[stripe].dev->bdev ||
		    test_bit(total_sector_nr, rbio->error_bitmap)) {
			/*
			 * Also set the error bit for a missing device, which
			 * may not have its error bit set yet.
			 */
			set_bit(total_sector_nr, rbio->error_bitmap);
			continue;
		}

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret < 0) {
			bio_list_put(&bio_list);
			goto out;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	ret = recover_sectors(rbio);
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
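
/*
 * Work handlers for recovery.
 *
 * recover_rbio_work() still has to take the full stripe lock and only runs
 * the recovery if it got the lock; otherwise the rbio has been queued behind
 * the current lock holder and will be started later.
 * recover_rbio_work_locked() is used when the lock is already held.
 */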
static void recover_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (!lock_stripe_add(rbio))
		recover_rbio(rbio);
}

static void recover_rbio_work_locked(struct work_struct *work)
{
	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
}

static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
{
	bool found = false;
	int sector_nr;

	/*
	 * This is for RAID6 extra recovery tries, thus the mirror number
	 * should be larger than 2.
	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
	 * RAID5 methods.
	 */
	ASSERT(mirror_num > 2);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;
		int faila;
		int failb;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		/* This vertical stripe doesn't have errors. */
		if (!found_errors)
			continue;

		/*
		 * If we found errors, there should be only one error marked
		 * by the previous set_rbio_range_error().
		 */
		ASSERT(found_errors == 1);
		found = true;

		/* Now select another stripe to mark as error. */
		failb = rbio->real_stripes - (mirror_num - 1);
		if (failb <= faila)
			failb--;

		/* Set the extra bit in the error bitmap. */
		if (failb >= 0)
			set_bit(failb * rbio->stripe_nsectors + sector_nr,
				rbio->error_bitmap);
	}

	/* We should have found at least one vertical stripe with an error. */
	ASSERT(found);
}

/*
 * The main entry point for reads from the higher layers. This is really only
 * called when the normal read path had a failure, so we assume the bio they
 * send down corresponds to a failed part of the drive.
 */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
		return;
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	rbio_add_bio(rbio, bio);

	set_rbio_range_error(rbio, bio);

	/*
	 * Loop retry:
	 * for 'mirror_num == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2)
		set_rbio_raid6_extra_error(rbio, mirror_num);

	start_async_work(rbio, recover_rbio_work);
}

static void fill_data_csums(struct btrfs_raid_bio *rbio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
						       rbio->bioc->full_stripe_logical);
	const u64 start = rbio->bioc->full_stripe_logical;
	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
			fs_info->sectorsize_bits;
	int ret;

	/* The rbio should not have its csum buffer initialized. */
	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);

	/*
	 * Skip the csum search if:
	 *
	 * - The rbio doesn't belong to data block groups
	 *   Then we are doing IO for tree blocks, no need to search csums.
	 *
	 * - The rbio belongs to mixed block groups
	 *   This is to avoid a deadlock: we're already holding the full
	 *   stripe lock, so if we trigger a metadata read that itself needs
	 *   raid56 recovery, we will deadlock.
	 */
	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
		return;

	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
				 fs_info->csum_size, GFP_NOFS);
	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
					  GFP_NOFS);
	if (!rbio->csum_buf || !rbio->csum_bitmap) {
		ret = -ENOMEM;
		goto error;
	}

	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
					rbio->csum_buf, rbio->csum_bitmap);
	if (ret < 0)
		goto error;
	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
		goto no_csum;
	return;

error:
	/*
	 * We failed to allocate memory or to grab the csums; this is not
	 * fatal and we can still continue. But we'd better warn users that
	 * RMW is no longer safe for this particular sub-stripe write.
	 */
	btrfs_warn_rl(fs_info,
"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
		      rbio->bioc->full_stripe_logical, ret);
no_csum:
	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;
}
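
/*
 * For a sub-stripe write, read the whole full stripe (data and P/Q), wait for
 * the IO, and repair any sector that is missing or fails its csum check, so
 * that the following RMW works on correct data.
 */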
static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Fill the data csums we need for data verification. We need to fill
	 * the csum_bitmap/csum_buf first, as our endio function will try to
	 * verify the data sectors.
	 */
	fill_data_csums(rbio);

	/*
	 * Build a list of bios to read all sectors (including data and P/Q).
	 *
	 * This is needed for the later csum verification and recovery.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 stripe, sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	/*
	 * We may or may not have any corrupted sectors (including missing
	 * devices and csum mismatches), just let recover_sectors() handle
	 * them all.
	 */
	submit_read_wait_bio_list(rbio, &bio_list);
	return recover_sectors(rbio);
}

static void raid_wait_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;

	if (err)
		rbio_update_error_bitmap(rbio, bio);
	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

static void submit_write_bios(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_write_end_io;

		if (trace_raid56_write_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
}

/*
 * Determine if we need to read any sector from the disk.
 * Should only be used in the RMW path, to skip fully cached rbios.
 */
static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	int i;

	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		/*
		 * We have a sector which either has no page or is not
		 * uptodate, thus this rbio cannot be a cached one, as a
		 * cached rbio must have all its data sectors present and
		 * uptodate.
		 */
		if (!sector->page || !sector->uptodate)
			return true;
	}
	return false;
}
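
/*
 * The read-modify-write main path: read in any data sectors that are missing
 * for a sub-stripe write, regenerate P/Q for every vertical stripe and write
 * the changed data plus parity back to disk.
 */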
static void rmw_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list;
	int sectornr;
	int ret = 0;

	/*
	 * Allocate the pages for parity first, as P/Q pages will always be
	 * needed for both full-stripe and sub-stripe writes.
	 */
	ret = alloc_rbio_parity_pages(rbio);
	if (ret < 0)
		goto out;

	/*
	 * For a full stripe write, or when every data sector is already
	 * cached, we can go to the write path immediately.
	 */
	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
		/*
		 * We're doing a sub-stripe write, so we also need all data
		 * stripes to do the full RMW.
		 */
		ret = alloc_rbio_data_pages(rbio);
		if (ret < 0)
			goto out;

		index_rbio_pages(rbio);

		ret = rmw_read_wait_recover(rbio);
		if (ret < 0)
			goto out;
	}

	/*
	 * At this stage we're not allowed to add any new bios to the
	 * bio list any more; anyone else that wants to change this stripe
	 * needs to do their own RMW.
	 */
	spin_lock(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock(&rbio->bio_list_lock);

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	index_rbio_pages(rbio);

	/*
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon. If they do use it again,
	 * hopefully they will send another full bio.
	 */
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
		generate_pq_vertical(rbio, sectornr);

	bio_list_init(&bio_list);
	ret = rmw_assemble_write_bios(rbio, &bio_list);
	if (ret < 0)
		goto out;

	/* We should have at least one bio assembled. */
	ASSERT(bio_list_size(&bio_list));
	submit_write_bios(rbio, &bio_list);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

	/* We may have picked up more errors than our tolerance allows. */
	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
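
/*
 * Work handlers for RMW, mirroring the recovery pair above: the plain variant
 * first has to win the full stripe lock via lock_stripe_add(), while the
 * _locked variant runs with the lock already held.
 */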
static void rmw_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (lock_stripe_add(rbio) == 0)
		rmw_rbio(rbio);
}

static void rmw_rbio_work_locked(struct work_struct *work)
{
	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need to make sure that all the pages added to the scrub/replace
 * raid bio are correct and will not change during the scrub/replace, i.e.
 * those pages only hold metadata or file data protected by a checksum.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
	return rbio;
}

/*
 * We only scrub the parity for which we have correct data on the same
 * horizontal stripe, so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct page *page;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;

		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;
		if (rbio->stripe_pages[index])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[index] = page;
	}
	index_stripe_sectors(rbio);
	return 0;
}
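
/*
 * Recompute the parity for every vertical stripe marked in dbitmap, compare
 * it with the on-disk copy of the stripe being scrubbed and queue writes only
 * for the sectors that mismatch. If a replace is running for the scrubbed
 * device, the repaired parity is also written to the replace target.
 */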
static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = &rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/*
	 * If replace is running and our P/Q stripe is being replaced, we
	 * need to duplicate the final write to the replace target.
	 */
	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
		is_replace = 1;
		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
	}

	/*
	 * The higher layers (scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	p_sector.page = alloc_page(GFP_NOFS);
	if (!p_sector.page)
		return -ENOMEM;
	p_sector.pgoff = 0;
	p_sector.uptodate = 1;

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_sector.page = alloc_page(GFP_NOFS);
		if (!q_sector.page) {
			__free_page(p_sector.page);
			p_sector.page = NULL;
			return -ENOMEM;
		}
		q_sector.pgoff = 0;
		q_sector.uptodate = 1;
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
	}

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_sector.page);

	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;
		void *parity;

		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}

		/* Check the scrubbed parity and repair it */
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_page(sector->page) + sector->pgoff;
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* Parity is correct, no need to write it back */
			bitmap_clear(&rbio->dbitmap, sectornr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_sector.page);
	p_sector.page = NULL;
	if (q_sector.page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_sector.page);
		q_sector.page = NULL;
	}

	/*
	 * Time to start writing. Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our P/Q. Ignore
	 * everything else.
	 */
	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	/*
	 * Replace is running and our parity stripe needs to be duplicated to
	 * the target device. Check we have a valid source stripe number.
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);
	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	submit_write_bios(rbio, &bio_list);
	return 0;

cleanup:
	bio_list_put(&bio_list);
	return ret;
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}
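
/*
 * Repair the sectors that failed during the scrub read. Unlike the regular
 * recovery path we can not use the parity that is being scrubbed to rebuild
 * data, which lowers the number of failures we can tolerate by one.
 */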
static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sector_nr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores a copy of the pointers that does not get
	 * reordered during reconstruction, so that kunmap_local() works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int dfail = 0, failp = -1;
		int faila;
		int failb;
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			goto out;
		}
		if (found_errors == 0)
			continue;

		/* We should have at least one error here. */
		ASSERT(faila >= 0 || failb >= 0);

		if (is_data_stripe(rbio, faila))
			dfail++;
		else if (is_parity_stripe(faila))
			failp = faila;

		if (is_data_stripe(rbio, failb))
			dfail++;
		else if (is_parity_stripe(failb))
			failp = failb;
		/*
		 * Because we can not use the parity being scrubbed to repair
		 * data, our repair capability is reduced by one. (In the
		 * RAID5 case we can not repair anything.)
		 */
		if (dfail > rbio->bioc->max_errors - 1) {
			ret = -EIO;
			goto out;
		}
		/*
		 * If all data is good and only the parity is bad, just repair
		 * the parity; there is no need to recover data stripes.
		 */
		if (dfail == 0)
			continue;

		/*
		 * At this point we have one corrupted data stripe and one
		 * corrupted parity on RAID6. If the corrupted parity is the
		 * one being scrubbed, we can luckily use the other parity to
		 * repair the data; otherwise we can not repair the data
		 * stripe.
		 */
		if (failp != rbio->scrubp) {
			ret = -EIO;
			goto out;
		}

		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
		if (ret < 0)
			goto out;
	}
out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}
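
/*
 * Read every sector of the vertical stripes covered by dbitmap that is not
 * already present in the bio list or cached as uptodate, then wait for the
 * reads to finish.
 */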
static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/* Build a list of bios to read all the missing parts. */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/* No data in the vertical stripe, no need to read. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk. If sector_in_rbio() finds a sector
		 * in the bio list we don't need to read it off the stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate sector. If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	return 0;
}

static void scrub_rbio(struct btrfs_raid_bio *rbio)
{
	int sector_nr;
	int ret;

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto out;

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	ret = scrub_assemble_read_bios(rbio);
	if (ret < 0)
		goto out;

	/* We may have some failures, recover the failed sectors first. */
	ret = recover_scrub_rbio(rbio);
	if (ret < 0)
		goto out;

	/*
	 * Every sector is now properly prepared, so we can finish the scrub
	 * and write back the good content.
	 */
	ret = finish_parity_scrub(rbio);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
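
/*
 * Scrub work handler, always run with the full stripe lock held:
 * raid56_parity_submit_scrub_rbio() only queues it directly when it gets the
 * lock, otherwise the rbio waits in the stripe hash until the current holder
 * releases the lock.
 */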
static void scrub_rbio_work_locked(struct work_struct *work)
{
	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_rbio_work_locked);
}

/*
 * This is for scrub call sites where we already have correct data contents.
 * This allows us to avoid reading data stripes again.
 *
 * Unfortunately we have to copy the pages here rather than reusing them,
 * because the rbio has its own page management for its cache.
 */
void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
				    struct page **data_pages, u64 data_logical)
{
	const u64 offset_in_full_stripe = data_logical -
					  rbio->bioc->full_stripe_logical;
	const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int ret;

	/*
	 * If we hit ENOMEM here but the allocation later at
	 * raid56_parity_submit_scrub_rbio() time succeeds, we just do the
	 * extra read, which is not a big deal.
	 *
	 * If we hit ENOMEM again at raid56_parity_submit_scrub_rbio() time,
	 * the bio will get a proper error set.
	 */
	ret = alloc_rbio_data_pages(rbio);
	if (ret < 0)
		return;

	/* data_logical must be at stripe boundary and inside the full stripe. */
	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));

	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
		struct page *dst = rbio->stripe_pages[page_nr + page_index];
		struct page *src = data_pages[page_nr];

		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
		for (int sector_nr = sectors_per_page * page_index;
		     sector_nr < sectors_per_page * (page_index + 1);
		     sector_nr++)
			rbio->stripe_sectors[sector_nr].uptodate = true;
	}
}