1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/raid/pq.h> 12 #include <linux/hash.h> 13 #include <linux/list_sort.h> 14 #include <linux/raid/xor.h> 15 #include <linux/mm.h> 16 #include "misc.h" 17 #include "ctree.h" 18 #include "disk-io.h" 19 #include "volumes.h" 20 #include "raid56.h" 21 #include "async-thread.h" 22 23 /* set when additional merges to this rbio are not allowed */ 24 #define RBIO_RMW_LOCKED_BIT 1 25 26 /* 27 * set when this rbio is sitting in the hash, but it is just a cache 28 * of past RMW 29 */ 30 #define RBIO_CACHE_BIT 2 31 32 /* 33 * set when it is safe to trust the stripe_pages for caching 34 */ 35 #define RBIO_CACHE_READY_BIT 3 36 37 #define RBIO_CACHE_SIZE 1024 38 39 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 40 41 /* Used by the raid56 code to lock stripes for read/modify/write */ 42 struct btrfs_stripe_hash { 43 struct list_head hash_list; 44 spinlock_t lock; 45 }; 46 47 /* Used by the raid56 code to lock stripes for read/modify/write */ 48 struct btrfs_stripe_hash_table { 49 struct list_head stripe_cache; 50 spinlock_t cache_lock; 51 int cache_size; 52 struct btrfs_stripe_hash table[]; 53 }; 54 55 /* 56 * A bvec like structure to present a sector inside a page. 57 * 58 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. 59 */ 60 struct sector_ptr { 61 struct page *page; 62 unsigned int pgoff:24; 63 unsigned int uptodate:8; 64 }; 65 66 enum btrfs_rbio_ops { 67 BTRFS_RBIO_WRITE, 68 BTRFS_RBIO_READ_REBUILD, 69 BTRFS_RBIO_PARITY_SCRUB, 70 BTRFS_RBIO_REBUILD_MISSING, 71 }; 72 73 struct btrfs_raid_bio { 74 struct btrfs_io_context *bioc; 75 76 /* while we're doing rmw on a stripe 77 * we put it into a hash table so we can 78 * lock the stripe and merge more rbios 79 * into it. 80 */ 81 struct list_head hash_list; 82 83 /* 84 * LRU list for the stripe cache 85 */ 86 struct list_head stripe_cache; 87 88 /* 89 * for scheduling work in the helper threads 90 */ 91 struct work_struct work; 92 93 /* 94 * bio list and bio_list_lock are used 95 * to add more bios into the stripe 96 * in hopes of avoiding the full rmw 97 */ 98 struct bio_list bio_list; 99 spinlock_t bio_list_lock; 100 101 /* also protected by the bio_list_lock, the 102 * plug list is used by the plugging code 103 * to collect partial bios while plugged. 
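	 *
	 * For example (assuming the common 64KiB stripe_len): on a 4-device
	 * raid5 chunk there are 3 data stripes, so a full stripe covers
	 * 3 * 64KiB = 192KiB of data (see rbio_is_full()). Writes smaller
	 * than that sit on this bio_list in the hope that later writes merge
	 * in, complete the stripe and let us skip the read side of rmw.
	 *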
	 * The stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* Size of each individual stripe on disk */
	u32 stripe_len;

	/* How many pages there are for the full stripe including P/Q */
	u16 nr_pages;

	/* How many sectors there are for the full stripe including P/Q */
	u16 nr_sectors;

	/* Number of data stripes (no p/q) */
	u8 nr_data;

	/* Number of all stripes (including P/Q) */
	u8 real_stripes;

	/* How many pages there are for each stripe */
	u8 stripe_npages;

	/* How many sectors there are for each stripe */
	u8 stripe_nsectors;

	/* First bad stripe, -1 means no corruption */
	s8 faila;

	/* Second bad stripe (for RAID6 use) */
	s8 failb;

	/* Stripe number that we're scrubbing */
	u8 scrubp;

	/*
	 * size of all the bios in the bio_list. This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	refcount_t refs;

	atomic_t stripes_pending;

	atomic_t error;

	/*
	 * These are arrays of pointers. We allocate the rbio big enough
	 * to hold them all and set up their locations when the rbio is
	 * allocated.
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/* Pointers to the sectors in the bio_list, for faster lookup */
	struct sector_ptr *bio_sectors;

	/*
	 * For subpage support, we need to map each sector to above
	 * stripe_pages.
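	 *
	 * A worked example (hypothetical sizes): with 64KiB pages and a 4KiB
	 * sectorsize each stripe page holds 16 sectors, so stripe_sectors[i]
	 * points into stripe_pages[i / 16] at pgoff (i % 16) * 4096; see
	 * index_stripe_sectors() for the generic calculation.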
184 */ 185 struct sector_ptr *stripe_sectors; 186 187 /* Bitmap to record which horizontal stripe has data */ 188 unsigned long *dbitmap; 189 190 /* allocated with real_stripes-many pointers for finish_*() calls */ 191 void **finish_pointers; 192 193 /* Allocated with stripe_nsectors-many bits for finish_*() calls */ 194 unsigned long *finish_pbitmap; 195 }; 196 197 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 198 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 199 static void rmw_work(struct work_struct *work); 200 static void read_rebuild_work(struct work_struct *work); 201 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 202 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 203 static void __free_raid_bio(struct btrfs_raid_bio *rbio); 204 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 205 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 206 207 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 208 int need_check); 209 static void scrub_parity_work(struct work_struct *work); 210 211 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) 212 { 213 INIT_WORK(&rbio->work, work_func); 214 queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); 215 } 216 217 /* 218 * the stripe hash table is used for locking, and to collect 219 * bios in hopes of making a full stripe 220 */ 221 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 222 { 223 struct btrfs_stripe_hash_table *table; 224 struct btrfs_stripe_hash_table *x; 225 struct btrfs_stripe_hash *cur; 226 struct btrfs_stripe_hash *h; 227 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 228 int i; 229 230 if (info->stripe_hash_table) 231 return 0; 232 233 /* 234 * The table is large, starting with order 4 and can go as high as 235 * order 7 in case lock debugging is turned on. 236 * 237 * Try harder to allocate and fallback to vmalloc to lower the chance 238 * of a failing mount. 239 */ 240 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); 241 if (!table) 242 return -ENOMEM; 243 244 spin_lock_init(&table->cache_lock); 245 INIT_LIST_HEAD(&table->stripe_cache); 246 247 h = table->table; 248 249 for (i = 0; i < num_entries; i++) { 250 cur = h + i; 251 INIT_LIST_HEAD(&cur->hash_list); 252 spin_lock_init(&cur->lock); 253 } 254 255 x = cmpxchg(&info->stripe_hash_table, NULL, table); 256 kvfree(x); 257 return 0; 258 } 259 260 /* 261 * caching an rbio means to copy anything from the 262 * bio_sectors array into the stripe_pages array. We 263 * use the page uptodate bit in the stripe cache array 264 * to indicate if it has valid data 265 * 266 * once the caching is done, we set the cache ready 267 * bit. 
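 *
 * The payoff is in lock_stripe_add() / steal_rbio(): a later rbio for the
 * same full stripe can take these uptodate pages instead of re-reading the
 * data stripes from disk, roughly:
 *
 *	steal_rbio(cached_rbio, new_rbio);	// reuses the uptodate pages
 *
 * (steal_rbio() itself checks RBIO_CACHE_READY_BIT before moving anything.)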
268 */ 269 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 270 { 271 int i; 272 int ret; 273 274 ret = alloc_rbio_pages(rbio); 275 if (ret) 276 return; 277 278 for (i = 0; i < rbio->nr_sectors; i++) { 279 /* Some range not covered by bio (partial write), skip it */ 280 if (!rbio->bio_sectors[i].page) 281 continue; 282 283 ASSERT(rbio->stripe_sectors[i].page); 284 memcpy_page(rbio->stripe_sectors[i].page, 285 rbio->stripe_sectors[i].pgoff, 286 rbio->bio_sectors[i].page, 287 rbio->bio_sectors[i].pgoff, 288 rbio->bioc->fs_info->sectorsize); 289 rbio->stripe_sectors[i].uptodate = 1; 290 } 291 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 292 } 293 294 /* 295 * we hash on the first logical address of the stripe 296 */ 297 static int rbio_bucket(struct btrfs_raid_bio *rbio) 298 { 299 u64 num = rbio->bioc->raid_map[0]; 300 301 /* 302 * we shift down quite a bit. We're using byte 303 * addressing, and most of the lower bits are zeros. 304 * This tends to upset hash_64, and it consistently 305 * returns just one or two different values. 306 * 307 * shifting off the lower bits fixes things. 308 */ 309 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 310 } 311 312 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, 313 unsigned int page_nr) 314 { 315 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 316 const u32 sectors_per_page = PAGE_SIZE / sectorsize; 317 int i; 318 319 ASSERT(page_nr < rbio->nr_pages); 320 321 for (i = sectors_per_page * page_nr; 322 i < sectors_per_page * page_nr + sectors_per_page; 323 i++) { 324 if (!rbio->stripe_sectors[i].uptodate) 325 return false; 326 } 327 return true; 328 } 329 330 /* 331 * Update the stripe_sectors[] array to use correct page and pgoff 332 * 333 * Should be called every time any page pointer in stripes_pages[] got modified. 334 */ 335 static void index_stripe_sectors(struct btrfs_raid_bio *rbio) 336 { 337 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 338 u32 offset; 339 int i; 340 341 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { 342 int page_index = offset >> PAGE_SHIFT; 343 344 ASSERT(page_index < rbio->nr_pages); 345 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; 346 rbio->stripe_sectors[i].pgoff = offset_in_page(offset); 347 } 348 } 349 350 /* 351 * Stealing an rbio means taking all the uptodate pages from the stripe array 352 * in the source rbio and putting them into the destination rbio. 353 * 354 * This will also update the involved stripe_sectors[] which are referring to 355 * the old pages. 356 */ 357 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 358 { 359 int i; 360 struct page *s; 361 struct page *d; 362 363 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 364 return; 365 366 for (i = 0; i < dest->nr_pages; i++) { 367 s = src->stripe_pages[i]; 368 if (!s || !full_page_sectors_uptodate(src, i)) 369 continue; 370 371 d = dest->stripe_pages[i]; 372 if (d) 373 __free_page(d); 374 375 dest->stripe_pages[i] = s; 376 src->stripe_pages[i] = NULL; 377 } 378 index_stripe_sectors(dest); 379 index_stripe_sectors(src); 380 } 381 382 /* 383 * merging means we take the bio_list from the victim and 384 * splice it into the destination. The victim should 385 * be discarded afterwards. 
386 * 387 * must be called with dest->rbio_list_lock held 388 */ 389 static void merge_rbio(struct btrfs_raid_bio *dest, 390 struct btrfs_raid_bio *victim) 391 { 392 bio_list_merge(&dest->bio_list, &victim->bio_list); 393 dest->bio_list_bytes += victim->bio_list_bytes; 394 dest->generic_bio_cnt += victim->generic_bio_cnt; 395 bio_list_init(&victim->bio_list); 396 } 397 398 /* 399 * used to prune items that are in the cache. The caller 400 * must hold the hash table lock. 401 */ 402 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 403 { 404 int bucket = rbio_bucket(rbio); 405 struct btrfs_stripe_hash_table *table; 406 struct btrfs_stripe_hash *h; 407 int freeit = 0; 408 409 /* 410 * check the bit again under the hash table lock. 411 */ 412 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 413 return; 414 415 table = rbio->bioc->fs_info->stripe_hash_table; 416 h = table->table + bucket; 417 418 /* hold the lock for the bucket because we may be 419 * removing it from the hash table 420 */ 421 spin_lock(&h->lock); 422 423 /* 424 * hold the lock for the bio list because we need 425 * to make sure the bio list is empty 426 */ 427 spin_lock(&rbio->bio_list_lock); 428 429 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 430 list_del_init(&rbio->stripe_cache); 431 table->cache_size -= 1; 432 freeit = 1; 433 434 /* if the bio list isn't empty, this rbio is 435 * still involved in an IO. We take it out 436 * of the cache list, and drop the ref that 437 * was held for the list. 438 * 439 * If the bio_list was empty, we also remove 440 * the rbio from the hash_table, and drop 441 * the corresponding ref 442 */ 443 if (bio_list_empty(&rbio->bio_list)) { 444 if (!list_empty(&rbio->hash_list)) { 445 list_del_init(&rbio->hash_list); 446 refcount_dec(&rbio->refs); 447 BUG_ON(!list_empty(&rbio->plug_list)); 448 } 449 } 450 } 451 452 spin_unlock(&rbio->bio_list_lock); 453 spin_unlock(&h->lock); 454 455 if (freeit) 456 __free_raid_bio(rbio); 457 } 458 459 /* 460 * prune a given rbio from the cache 461 */ 462 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 463 { 464 struct btrfs_stripe_hash_table *table; 465 unsigned long flags; 466 467 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 468 return; 469 470 table = rbio->bioc->fs_info->stripe_hash_table; 471 472 spin_lock_irqsave(&table->cache_lock, flags); 473 __remove_rbio_from_cache(rbio); 474 spin_unlock_irqrestore(&table->cache_lock, flags); 475 } 476 477 /* 478 * remove everything in the cache 479 */ 480 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 481 { 482 struct btrfs_stripe_hash_table *table; 483 unsigned long flags; 484 struct btrfs_raid_bio *rbio; 485 486 table = info->stripe_hash_table; 487 488 spin_lock_irqsave(&table->cache_lock, flags); 489 while (!list_empty(&table->stripe_cache)) { 490 rbio = list_entry(table->stripe_cache.next, 491 struct btrfs_raid_bio, 492 stripe_cache); 493 __remove_rbio_from_cache(rbio); 494 } 495 spin_unlock_irqrestore(&table->cache_lock, flags); 496 } 497 498 /* 499 * remove all cached entries and free the hash table 500 * used by unmount 501 */ 502 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 503 { 504 if (!info->stripe_hash_table) 505 return; 506 btrfs_clear_rbio_cache(info); 507 kvfree(info->stripe_hash_table); 508 info->stripe_hash_table = NULL; 509 } 510 511 /* 512 * insert an rbio into the stripe cache. 
It 513 * must have already been prepared by calling 514 * cache_rbio_pages 515 * 516 * If this rbio was already cached, it gets 517 * moved to the front of the lru. 518 * 519 * If the size of the rbio cache is too big, we 520 * prune an item. 521 */ 522 static void cache_rbio(struct btrfs_raid_bio *rbio) 523 { 524 struct btrfs_stripe_hash_table *table; 525 unsigned long flags; 526 527 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 528 return; 529 530 table = rbio->bioc->fs_info->stripe_hash_table; 531 532 spin_lock_irqsave(&table->cache_lock, flags); 533 spin_lock(&rbio->bio_list_lock); 534 535 /* bump our ref if we were not in the list before */ 536 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 537 refcount_inc(&rbio->refs); 538 539 if (!list_empty(&rbio->stripe_cache)){ 540 list_move(&rbio->stripe_cache, &table->stripe_cache); 541 } else { 542 list_add(&rbio->stripe_cache, &table->stripe_cache); 543 table->cache_size += 1; 544 } 545 546 spin_unlock(&rbio->bio_list_lock); 547 548 if (table->cache_size > RBIO_CACHE_SIZE) { 549 struct btrfs_raid_bio *found; 550 551 found = list_entry(table->stripe_cache.prev, 552 struct btrfs_raid_bio, 553 stripe_cache); 554 555 if (found != rbio) 556 __remove_rbio_from_cache(found); 557 } 558 559 spin_unlock_irqrestore(&table->cache_lock, flags); 560 } 561 562 /* 563 * helper function to run the xor_blocks api. It is only 564 * able to do MAX_XOR_BLOCKS at a time, so we need to 565 * loop through. 566 */ 567 static void run_xor(void **pages, int src_cnt, ssize_t len) 568 { 569 int src_off = 0; 570 int xor_src_cnt = 0; 571 void *dest = pages[src_cnt]; 572 573 while(src_cnt > 0) { 574 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 575 xor_blocks(xor_src_cnt, len, dest, pages + src_off); 576 577 src_cnt -= xor_src_cnt; 578 src_off += xor_src_cnt; 579 } 580 } 581 582 /* 583 * Returns true if the bio list inside this rbio covers an entire stripe (no 584 * rmw required). 585 */ 586 static int rbio_is_full(struct btrfs_raid_bio *rbio) 587 { 588 unsigned long flags; 589 unsigned long size = rbio->bio_list_bytes; 590 int ret = 1; 591 592 spin_lock_irqsave(&rbio->bio_list_lock, flags); 593 if (size != rbio->nr_data * rbio->stripe_len) 594 ret = 0; 595 BUG_ON(size > rbio->nr_data * rbio->stripe_len); 596 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 597 598 return ret; 599 } 600 601 /* 602 * returns 1 if it is safe to merge two rbios together. 603 * The merging is safe if the two rbios correspond to 604 * the same stripe and if they are both going in the same 605 * direction (read vs write), and if neither one is 606 * locked for final IO 607 * 608 * The caller is responsible for locking such that 609 * rmw_locked is safe to test 610 */ 611 static int rbio_can_merge(struct btrfs_raid_bio *last, 612 struct btrfs_raid_bio *cur) 613 { 614 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 615 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 616 return 0; 617 618 /* 619 * we can't merge with cached rbios, since the 620 * idea is that when we merge the destination 621 * rbio is going to run our IO for us. We can 622 * steal from cached rbios though, other functions 623 * handle that. 624 */ 625 if (test_bit(RBIO_CACHE_BIT, &last->flags) || 626 test_bit(RBIO_CACHE_BIT, &cur->flags)) 627 return 0; 628 629 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) 630 return 0; 631 632 /* we can't merge with different operations */ 633 if (last->operation != cur->operation) 634 return 0; 635 /* 636 * We've need read the full stripe from the drive. 
637 * check and repair the parity and write the new results. 638 * 639 * We're not allowed to add any new bios to the 640 * bio list here, anyone else that wants to 641 * change this stripe needs to do their own rmw. 642 */ 643 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 644 return 0; 645 646 if (last->operation == BTRFS_RBIO_REBUILD_MISSING) 647 return 0; 648 649 if (last->operation == BTRFS_RBIO_READ_REBUILD) { 650 int fa = last->faila; 651 int fb = last->failb; 652 int cur_fa = cur->faila; 653 int cur_fb = cur->failb; 654 655 if (last->faila >= last->failb) { 656 fa = last->failb; 657 fb = last->faila; 658 } 659 660 if (cur->faila >= cur->failb) { 661 cur_fa = cur->failb; 662 cur_fb = cur->faila; 663 } 664 665 if (fa != cur_fa || fb != cur_fb) 666 return 0; 667 } 668 return 1; 669 } 670 671 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, 672 unsigned int stripe_nr, 673 unsigned int sector_nr) 674 { 675 ASSERT(stripe_nr < rbio->real_stripes); 676 ASSERT(sector_nr < rbio->stripe_nsectors); 677 678 return stripe_nr * rbio->stripe_nsectors + sector_nr; 679 } 680 681 /* Return a sector from rbio->stripe_sectors, not from the bio list */ 682 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, 683 unsigned int stripe_nr, 684 unsigned int sector_nr) 685 { 686 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, 687 sector_nr)]; 688 } 689 690 /* Grab a sector inside P stripe */ 691 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, 692 unsigned int sector_nr) 693 { 694 return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); 695 } 696 697 /* Grab a sector inside Q stripe, return NULL if not RAID6 */ 698 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, 699 unsigned int sector_nr) 700 { 701 if (rbio->nr_data + 1 == rbio->real_stripes) 702 return NULL; 703 return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); 704 } 705 706 /* 707 * The first stripe in the table for a logical address 708 * has the lock. rbios are added in one of three ways: 709 * 710 * 1) Nobody has the stripe locked yet. The rbio is given 711 * the lock and 0 is returned. The caller must start the IO 712 * themselves. 713 * 714 * 2) Someone has the stripe locked, but we're able to merge 715 * with the lock owner. The rbio is freed and the IO will 716 * start automatically along with the existing rbio. 1 is returned. 717 * 718 * 3) Someone has the stripe locked, but we're not able to merge. 719 * The rbio is added to the lock owner's plug list, or merged into 720 * an rbio already on the plug list. When the lock owner unlocks, 721 * the next rbio on the list is run and the IO is started automatically. 722 * 1 is returned 723 * 724 * If we return 0, the caller still owns the rbio and must continue with 725 * IO submission. If we return 1, the caller must assume the rbio has 726 * already been freed. 
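 *
 * A minimal caller-side sketch (this mirrors full_stripe_write() below,
 * all names are from this file):
 *
 *	ret = lock_stripe_add(rbio);
 *	if (ret == 0)
 *		finish_rmw(rbio);	// we got the lock, submit the IO
 *	// ret == 1: merged or queued on the plug list, don't touch rbio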
727 */ 728 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 729 { 730 struct btrfs_stripe_hash *h; 731 struct btrfs_raid_bio *cur; 732 struct btrfs_raid_bio *pending; 733 unsigned long flags; 734 struct btrfs_raid_bio *freeit = NULL; 735 struct btrfs_raid_bio *cache_drop = NULL; 736 int ret = 0; 737 738 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); 739 740 spin_lock_irqsave(&h->lock, flags); 741 list_for_each_entry(cur, &h->hash_list, hash_list) { 742 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) 743 continue; 744 745 spin_lock(&cur->bio_list_lock); 746 747 /* Can we steal this cached rbio's pages? */ 748 if (bio_list_empty(&cur->bio_list) && 749 list_empty(&cur->plug_list) && 750 test_bit(RBIO_CACHE_BIT, &cur->flags) && 751 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 752 list_del_init(&cur->hash_list); 753 refcount_dec(&cur->refs); 754 755 steal_rbio(cur, rbio); 756 cache_drop = cur; 757 spin_unlock(&cur->bio_list_lock); 758 759 goto lockit; 760 } 761 762 /* Can we merge into the lock owner? */ 763 if (rbio_can_merge(cur, rbio)) { 764 merge_rbio(cur, rbio); 765 spin_unlock(&cur->bio_list_lock); 766 freeit = rbio; 767 ret = 1; 768 goto out; 769 } 770 771 772 /* 773 * We couldn't merge with the running rbio, see if we can merge 774 * with the pending ones. We don't have to check for rmw_locked 775 * because there is no way they are inside finish_rmw right now 776 */ 777 list_for_each_entry(pending, &cur->plug_list, plug_list) { 778 if (rbio_can_merge(pending, rbio)) { 779 merge_rbio(pending, rbio); 780 spin_unlock(&cur->bio_list_lock); 781 freeit = rbio; 782 ret = 1; 783 goto out; 784 } 785 } 786 787 /* 788 * No merging, put us on the tail of the plug list, our rbio 789 * will be started with the currently running rbio unlocks 790 */ 791 list_add_tail(&rbio->plug_list, &cur->plug_list); 792 spin_unlock(&cur->bio_list_lock); 793 ret = 1; 794 goto out; 795 } 796 lockit: 797 refcount_inc(&rbio->refs); 798 list_add(&rbio->hash_list, &h->hash_list); 799 out: 800 spin_unlock_irqrestore(&h->lock, flags); 801 if (cache_drop) 802 remove_rbio_from_cache(cache_drop); 803 if (freeit) 804 __free_raid_bio(freeit); 805 return ret; 806 } 807 808 /* 809 * called as rmw or parity rebuild is completed. If the plug list has more 810 * rbios waiting for this stripe, the next one on the list will be started 811 */ 812 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 813 { 814 int bucket; 815 struct btrfs_stripe_hash *h; 816 unsigned long flags; 817 int keep_cache = 0; 818 819 bucket = rbio_bucket(rbio); 820 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; 821 822 if (list_empty(&rbio->plug_list)) 823 cache_rbio(rbio); 824 825 spin_lock_irqsave(&h->lock, flags); 826 spin_lock(&rbio->bio_list_lock); 827 828 if (!list_empty(&rbio->hash_list)) { 829 /* 830 * if we're still cached and there is no other IO 831 * to perform, just leave this rbio here for others 832 * to steal from later 833 */ 834 if (list_empty(&rbio->plug_list) && 835 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 836 keep_cache = 1; 837 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 838 BUG_ON(!bio_list_empty(&rbio->bio_list)); 839 goto done; 840 } 841 842 list_del_init(&rbio->hash_list); 843 refcount_dec(&rbio->refs); 844 845 /* 846 * we use the plug list to hold all the rbios 847 * waiting for the chance to lock this stripe. 848 * hand the lock over to one of them. 
849 */ 850 if (!list_empty(&rbio->plug_list)) { 851 struct btrfs_raid_bio *next; 852 struct list_head *head = rbio->plug_list.next; 853 854 next = list_entry(head, struct btrfs_raid_bio, 855 plug_list); 856 857 list_del_init(&rbio->plug_list); 858 859 list_add(&next->hash_list, &h->hash_list); 860 refcount_inc(&next->refs); 861 spin_unlock(&rbio->bio_list_lock); 862 spin_unlock_irqrestore(&h->lock, flags); 863 864 if (next->operation == BTRFS_RBIO_READ_REBUILD) 865 start_async_work(next, read_rebuild_work); 866 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { 867 steal_rbio(rbio, next); 868 start_async_work(next, read_rebuild_work); 869 } else if (next->operation == BTRFS_RBIO_WRITE) { 870 steal_rbio(rbio, next); 871 start_async_work(next, rmw_work); 872 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 873 steal_rbio(rbio, next); 874 start_async_work(next, scrub_parity_work); 875 } 876 877 goto done_nolock; 878 } 879 } 880 done: 881 spin_unlock(&rbio->bio_list_lock); 882 spin_unlock_irqrestore(&h->lock, flags); 883 884 done_nolock: 885 if (!keep_cache) 886 remove_rbio_from_cache(rbio); 887 } 888 889 static void __free_raid_bio(struct btrfs_raid_bio *rbio) 890 { 891 int i; 892 893 if (!refcount_dec_and_test(&rbio->refs)) 894 return; 895 896 WARN_ON(!list_empty(&rbio->stripe_cache)); 897 WARN_ON(!list_empty(&rbio->hash_list)); 898 WARN_ON(!bio_list_empty(&rbio->bio_list)); 899 900 for (i = 0; i < rbio->nr_pages; i++) { 901 if (rbio->stripe_pages[i]) { 902 __free_page(rbio->stripe_pages[i]); 903 rbio->stripe_pages[i] = NULL; 904 } 905 } 906 907 btrfs_put_bioc(rbio->bioc); 908 kfree(rbio); 909 } 910 911 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 912 { 913 struct bio *next; 914 915 while (cur) { 916 next = cur->bi_next; 917 cur->bi_next = NULL; 918 cur->bi_status = err; 919 bio_endio(cur); 920 cur = next; 921 } 922 } 923 924 /* 925 * this frees the rbio and runs through all the bios in the 926 * bio_list and calls end_io on them 927 */ 928 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) 929 { 930 struct bio *cur = bio_list_get(&rbio->bio_list); 931 struct bio *extra; 932 933 if (rbio->generic_bio_cnt) 934 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); 935 936 /* 937 * At this moment, rbio->bio_list is empty, however since rbio does not 938 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 939 * hash list, rbio may be merged with others so that rbio->bio_list 940 * becomes non-empty. 941 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 942 * more and we can call bio_endio() on all queued bios. 943 */ 944 unlock_stripe(rbio); 945 extra = bio_list_get(&rbio->bio_list); 946 __free_raid_bio(rbio); 947 948 rbio_endio_bio_list(cur, err); 949 if (extra) 950 rbio_endio_bio_list(extra, err); 951 } 952 953 /* 954 * end io function used by finish_rmw. When we finally 955 * get here, we've written a full stripe 956 */ 957 static void raid_write_end_io(struct bio *bio) 958 { 959 struct btrfs_raid_bio *rbio = bio->bi_private; 960 blk_status_t err = bio->bi_status; 961 int max_errors; 962 963 if (err) 964 fail_bio_stripe(rbio, bio); 965 966 bio_put(bio); 967 968 if (!atomic_dec_and_test(&rbio->stripes_pending)) 969 return; 970 971 err = BLK_STS_OK; 972 973 /* OK, we have read all the stripes we need to. */ 974 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
		     0 : rbio->bioc->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/**
 * Get a sector pointer specified by its @stripe_nr and @sector_nr
 *
 * @rbio:          The raid bio
 * @stripe_nr:     Stripe number, valid range [0, real_stripes)
 * @sector_nr:     Sector number inside the stripe,
 *                 valid range [0, stripe_nsectors)
 * @bio_list_only: Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock_irq(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock_irq(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock_irq(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}

/*
 * Allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc,
					 u32 stripe_len)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	void *p;

	ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_sectors) * num_sectors +
		       sizeof(*rbio->stripe_sectors) * num_sectors +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
		       sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bioc = bioc;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
	 * we allocated past the end
of the rbio. 1074 */ 1075 p = rbio + 1; 1076 #define CONSUME_ALLOC(ptr, count) do { \ 1077 ptr = p; \ 1078 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ 1079 } while (0) 1080 CONSUME_ALLOC(rbio->stripe_pages, num_pages); 1081 CONSUME_ALLOC(rbio->bio_sectors, num_sectors); 1082 CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); 1083 CONSUME_ALLOC(rbio->finish_pointers, real_stripes); 1084 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors)); 1085 CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors)); 1086 #undef CONSUME_ALLOC 1087 1088 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) 1089 nr_data = real_stripes - 1; 1090 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) 1091 nr_data = real_stripes - 2; 1092 else 1093 BUG(); 1094 1095 rbio->nr_data = nr_data; 1096 return rbio; 1097 } 1098 1099 /* allocate pages for all the stripes in the bio, including parity */ 1100 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1101 { 1102 int ret; 1103 1104 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); 1105 if (ret < 0) 1106 return ret; 1107 /* Mapping all sectors */ 1108 index_stripe_sectors(rbio); 1109 return 0; 1110 } 1111 1112 /* only allocate pages for p/q stripes */ 1113 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1114 { 1115 const int data_pages = rbio->nr_data * rbio->stripe_npages; 1116 int ret; 1117 1118 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, 1119 rbio->stripe_pages + data_pages); 1120 if (ret < 0) 1121 return ret; 1122 1123 index_stripe_sectors(rbio); 1124 return 0; 1125 } 1126 1127 /* 1128 * Add a single sector @sector into our list of bios for IO. 1129 * 1130 * Return 0 if everything went well. 1131 * Return <0 for error. 1132 */ 1133 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, 1134 struct bio_list *bio_list, 1135 struct sector_ptr *sector, 1136 unsigned int stripe_nr, 1137 unsigned int sector_nr, 1138 unsigned long bio_max_len, 1139 unsigned int opf) 1140 { 1141 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1142 struct bio *last = bio_list->tail; 1143 int ret; 1144 struct bio *bio; 1145 struct btrfs_io_stripe *stripe; 1146 u64 disk_start; 1147 1148 /* 1149 * Note: here stripe_nr has taken device replace into consideration, 1150 * thus it can be larger than rbio->real_stripe. 1151 * So here we check against bioc->num_stripes, not rbio->real_stripes. 
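	 *
	 * A hypothetical example: a raid5 chunk on 3 devices with one of them
	 * being replaced has bioc->num_stripes == 4 and bioc->num_tgtdevs == 1,
	 * so real_stripes == 3 (see alloc_rbio()), yet finish_rmw() can pass
	 * stripe_nr == 3 (taken from bioc->tgtdev_map[]) when it writes the
	 * copy destined for the replacement target.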
1152 */ 1153 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); 1154 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); 1155 ASSERT(sector->page); 1156 1157 stripe = &rbio->bioc->stripes[stripe_nr]; 1158 disk_start = stripe->physical + sector_nr * sectorsize; 1159 1160 /* if the device is missing, just fail this stripe */ 1161 if (!stripe->dev->bdev) 1162 return fail_rbio_index(rbio, stripe_nr); 1163 1164 /* see if we can add this page onto our existing bio */ 1165 if (last) { 1166 u64 last_end = last->bi_iter.bi_sector << 9; 1167 last_end += last->bi_iter.bi_size; 1168 1169 /* 1170 * we can't merge these if they are from different 1171 * devices or if they are not contiguous 1172 */ 1173 if (last_end == disk_start && !last->bi_status && 1174 last->bi_bdev == stripe->dev->bdev) { 1175 ret = bio_add_page(last, sector->page, sectorsize, 1176 sector->pgoff); 1177 if (ret == sectorsize) 1178 return 0; 1179 } 1180 } 1181 1182 /* put a new bio on the list */ 1183 bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL), 1184 opf, GFP_NOFS); 1185 bio->bi_iter.bi_sector = disk_start >> 9; 1186 bio->bi_private = rbio; 1187 1188 bio_add_page(bio, sector->page, sectorsize, sector->pgoff); 1189 bio_list_add(bio_list, bio); 1190 return 0; 1191 } 1192 1193 /* 1194 * while we're doing the read/modify/write cycle, we could 1195 * have errors in reading pages off the disk. This checks 1196 * for errors and if we're not able to read the page it'll 1197 * trigger parity reconstruction. The rmw will be finished 1198 * after we've reconstructed the failed stripes 1199 */ 1200 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1201 { 1202 if (rbio->faila >= 0 || rbio->failb >= 0) { 1203 BUG_ON(rbio->faila == rbio->real_stripes - 1); 1204 __raid56_parity_recover(rbio); 1205 } else { 1206 finish_rmw(rbio); 1207 } 1208 } 1209 1210 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) 1211 { 1212 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1213 struct bio_vec bvec; 1214 struct bvec_iter iter; 1215 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1216 rbio->bioc->raid_map[0]; 1217 1218 if (bio_flagged(bio, BIO_CLONED)) 1219 bio->bi_iter = btrfs_bio(bio)->iter; 1220 1221 bio_for_each_segment(bvec, bio, iter) { 1222 u32 bvec_offset; 1223 1224 for (bvec_offset = 0; bvec_offset < bvec.bv_len; 1225 bvec_offset += sectorsize, offset += sectorsize) { 1226 int index = offset / sectorsize; 1227 struct sector_ptr *sector = &rbio->bio_sectors[index]; 1228 1229 sector->page = bvec.bv_page; 1230 sector->pgoff = bvec.bv_offset + bvec_offset; 1231 ASSERT(sector->pgoff < PAGE_SIZE); 1232 } 1233 } 1234 } 1235 1236 /* 1237 * helper function to walk our bio list and populate the bio_pages array with 1238 * the result. This seems expensive, but it is faster than constantly 1239 * searching through the bio list as we setup the IO in finish_rmw or stripe 1240 * reconstruction. 1241 * 1242 * This must be called before you trust the answers from page_in_rbio 1243 */ 1244 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1245 { 1246 struct bio *bio; 1247 1248 spin_lock_irq(&rbio->bio_list_lock); 1249 bio_list_for_each(bio, &rbio->bio_list) 1250 index_one_bio(rbio, bio); 1251 1252 spin_unlock_irq(&rbio->bio_list_lock); 1253 } 1254 1255 /* 1256 * this is called from one of two situations. We either 1257 * have a full stripe from the higher layers, or we've read all 1258 * the missing bits off disk. 
1259 * 1260 * This will calculate the parity and then send down any 1261 * changed blocks. 1262 */ 1263 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1264 { 1265 struct btrfs_io_context *bioc = rbio->bioc; 1266 const u32 sectorsize = bioc->fs_info->sectorsize; 1267 void **pointers = rbio->finish_pointers; 1268 int nr_data = rbio->nr_data; 1269 int stripe; 1270 int sectornr; 1271 bool has_qstripe; 1272 struct bio_list bio_list; 1273 struct bio *bio; 1274 int ret; 1275 1276 bio_list_init(&bio_list); 1277 1278 if (rbio->real_stripes - rbio->nr_data == 1) 1279 has_qstripe = false; 1280 else if (rbio->real_stripes - rbio->nr_data == 2) 1281 has_qstripe = true; 1282 else 1283 BUG(); 1284 1285 /* at this point we either have a full stripe, 1286 * or we've read the full stripe from the drive. 1287 * recalculate the parity and write the new results. 1288 * 1289 * We're not allowed to add any new bios to the 1290 * bio list here, anyone else that wants to 1291 * change this stripe needs to do their own rmw. 1292 */ 1293 spin_lock_irq(&rbio->bio_list_lock); 1294 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1295 spin_unlock_irq(&rbio->bio_list_lock); 1296 1297 atomic_set(&rbio->error, 0); 1298 1299 /* 1300 * now that we've set rmw_locked, run through the 1301 * bio list one last time and map the page pointers 1302 * 1303 * We don't cache full rbios because we're assuming 1304 * the higher layers are unlikely to use this area of 1305 * the disk again soon. If they do use it again, 1306 * hopefully they will send another full bio. 1307 */ 1308 index_rbio_pages(rbio); 1309 if (!rbio_is_full(rbio)) 1310 cache_rbio_pages(rbio); 1311 else 1312 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1313 1314 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1315 struct sector_ptr *sector; 1316 1317 /* First collect one sector from each data stripe */ 1318 for (stripe = 0; stripe < nr_data; stripe++) { 1319 sector = sector_in_rbio(rbio, stripe, sectornr, 0); 1320 pointers[stripe] = kmap_local_page(sector->page) + 1321 sector->pgoff; 1322 } 1323 1324 /* Then add the parity stripe */ 1325 sector = rbio_pstripe_sector(rbio, sectornr); 1326 sector->uptodate = 1; 1327 pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; 1328 1329 if (has_qstripe) { 1330 /* 1331 * RAID6, add the qstripe and call the library function 1332 * to fill in our p/q 1333 */ 1334 sector = rbio_qstripe_sector(rbio, sectornr); 1335 sector->uptodate = 1; 1336 pointers[stripe++] = kmap_local_page(sector->page) + 1337 sector->pgoff; 1338 1339 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 1340 pointers); 1341 } else { 1342 /* raid5 */ 1343 memcpy(pointers[nr_data], pointers[0], sectorsize); 1344 run_xor(pointers + 1, nr_data - 1, sectorsize); 1345 } 1346 for (stripe = stripe - 1; stripe >= 0; stripe--) 1347 kunmap_local(pointers[stripe]); 1348 } 1349 1350 /* 1351 * time to start writing. Make bios for everything from the 1352 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1353 * everything else. 
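	 *
	 * Per sector, the parity generated above is (conceptually):
	 *
	 *	P = D0 ^ D1 ^ ... ^ D(nr_data - 1)	(raid5: memcpy + run_xor)
	 *	P, Q = raid6_call.gen_syndrome(...)	(raid6)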
1354 */ 1355 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1356 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1357 struct sector_ptr *sector; 1358 1359 if (stripe < rbio->nr_data) { 1360 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1361 if (!sector) 1362 continue; 1363 } else { 1364 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1365 } 1366 1367 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 1368 sectornr, rbio->stripe_len, 1369 REQ_OP_WRITE); 1370 if (ret) 1371 goto cleanup; 1372 } 1373 } 1374 1375 if (likely(!bioc->num_tgtdevs)) 1376 goto write_data; 1377 1378 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1379 if (!bioc->tgtdev_map[stripe]) 1380 continue; 1381 1382 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1383 struct sector_ptr *sector; 1384 1385 if (stripe < rbio->nr_data) { 1386 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1387 if (!sector) 1388 continue; 1389 } else { 1390 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1391 } 1392 1393 ret = rbio_add_io_sector(rbio, &bio_list, sector, 1394 rbio->bioc->tgtdev_map[stripe], 1395 sectornr, rbio->stripe_len, 1396 REQ_OP_WRITE); 1397 if (ret) 1398 goto cleanup; 1399 } 1400 } 1401 1402 write_data: 1403 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1404 BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1405 1406 while ((bio = bio_list_pop(&bio_list))) { 1407 bio->bi_end_io = raid_write_end_io; 1408 1409 submit_bio(bio); 1410 } 1411 return; 1412 1413 cleanup: 1414 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1415 1416 while ((bio = bio_list_pop(&bio_list))) 1417 bio_put(bio); 1418 } 1419 1420 /* 1421 * helper to find the stripe number for a given bio. Used to figure out which 1422 * stripe has failed. This expects the bio to correspond to a physical disk, 1423 * so it looks up based on physical sector numbers. 1424 */ 1425 static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1426 struct bio *bio) 1427 { 1428 u64 physical = bio->bi_iter.bi_sector; 1429 int i; 1430 struct btrfs_io_stripe *stripe; 1431 1432 physical <<= 9; 1433 1434 for (i = 0; i < rbio->bioc->num_stripes; i++) { 1435 stripe = &rbio->bioc->stripes[i]; 1436 if (in_range(physical, stripe->physical, rbio->stripe_len) && 1437 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { 1438 return i; 1439 } 1440 } 1441 return -1; 1442 } 1443 1444 /* 1445 * helper to find the stripe number for a given 1446 * bio (before mapping). Used to figure out which stripe has 1447 * failed. This looks up based on logical block numbers. 
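 *
 * For example (hypothetical layout, 64KiB stripe_len): if raid_map[] is
 * { 0, 64K, 128K } for the data stripes, a bio starting at logical 70K
 * falls inside [64K, 128K) and therefore maps to stripe 1.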
1448 */ 1449 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1450 struct bio *bio) 1451 { 1452 u64 logical = bio->bi_iter.bi_sector << 9; 1453 int i; 1454 1455 for (i = 0; i < rbio->nr_data; i++) { 1456 u64 stripe_start = rbio->bioc->raid_map[i]; 1457 1458 if (in_range(logical, stripe_start, rbio->stripe_len)) 1459 return i; 1460 } 1461 return -1; 1462 } 1463 1464 /* 1465 * returns -EIO if we had too many failures 1466 */ 1467 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1468 { 1469 unsigned long flags; 1470 int ret = 0; 1471 1472 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1473 1474 /* we already know this stripe is bad, move on */ 1475 if (rbio->faila == failed || rbio->failb == failed) 1476 goto out; 1477 1478 if (rbio->faila == -1) { 1479 /* first failure on this rbio */ 1480 rbio->faila = failed; 1481 atomic_inc(&rbio->error); 1482 } else if (rbio->failb == -1) { 1483 /* second failure on this rbio */ 1484 rbio->failb = failed; 1485 atomic_inc(&rbio->error); 1486 } else { 1487 ret = -EIO; 1488 } 1489 out: 1490 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1491 1492 return ret; 1493 } 1494 1495 /* 1496 * helper to fail a stripe based on a physical disk 1497 * bio. 1498 */ 1499 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1500 struct bio *bio) 1501 { 1502 int failed = find_bio_stripe(rbio, bio); 1503 1504 if (failed < 0) 1505 return -EIO; 1506 1507 return fail_rbio_index(rbio, failed); 1508 } 1509 1510 /* 1511 * For subpage case, we can no longer set page Uptodate directly for 1512 * stripe_pages[], thus we need to locate the sector. 1513 */ 1514 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 1515 struct page *page, 1516 unsigned int pgoff) 1517 { 1518 int i; 1519 1520 for (i = 0; i < rbio->nr_sectors; i++) { 1521 struct sector_ptr *sector = &rbio->stripe_sectors[i]; 1522 1523 if (sector->page == page && sector->pgoff == pgoff) 1524 return sector; 1525 } 1526 return NULL; 1527 } 1528 1529 /* 1530 * this sets each page in the bio uptodate. It should only be used on private 1531 * rbio pages, nothing that comes in from the higher layers 1532 */ 1533 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 1534 { 1535 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1536 struct bio_vec *bvec; 1537 struct bvec_iter_all iter_all; 1538 1539 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1540 1541 bio_for_each_segment_all(bvec, bio, iter_all) { 1542 struct sector_ptr *sector; 1543 int pgoff; 1544 1545 for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; 1546 pgoff += sectorsize) { 1547 sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); 1548 ASSERT(sector); 1549 if (sector) 1550 sector->uptodate = 1; 1551 } 1552 } 1553 } 1554 1555 /* 1556 * end io for the read phase of the rmw cycle. All the bios here are physical 1557 * stripe bios we've read from the disk so we can recalculate the parity of the 1558 * stripe. 
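 *
 * For orientation, the read phase of rmw runs roughly as:
 *
 *	raid56_rmw_stripe()		  queue reads for the missing sectors
 *	  -> raid_rmw_end_io()		  per-bio completion (this function)
 *	    -> validate_rbio_for_rmw()	  once stripes_pending hits zero
 *	      -> finish_rmw() or __raid56_parity_recover()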
1559 * 1560 * This will usually kick off finish_rmw once all the bios are read in, but it 1561 * may trigger parity reconstruction if we had any errors along the way 1562 */ 1563 static void raid_rmw_end_io(struct bio *bio) 1564 { 1565 struct btrfs_raid_bio *rbio = bio->bi_private; 1566 1567 if (bio->bi_status) 1568 fail_bio_stripe(rbio, bio); 1569 else 1570 set_bio_pages_uptodate(rbio, bio); 1571 1572 bio_put(bio); 1573 1574 if (!atomic_dec_and_test(&rbio->stripes_pending)) 1575 return; 1576 1577 if (atomic_read(&rbio->error) > rbio->bioc->max_errors) 1578 goto cleanup; 1579 1580 /* 1581 * this will normally call finish_rmw to start our write 1582 * but if there are any failed stripes we'll reconstruct 1583 * from parity first 1584 */ 1585 validate_rbio_for_rmw(rbio); 1586 return; 1587 1588 cleanup: 1589 1590 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1591 } 1592 1593 /* 1594 * the stripe must be locked by the caller. It will 1595 * unlock after all the writes are done 1596 */ 1597 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1598 { 1599 int bios_to_read = 0; 1600 struct bio_list bio_list; 1601 int ret; 1602 int sectornr; 1603 int stripe; 1604 struct bio *bio; 1605 1606 bio_list_init(&bio_list); 1607 1608 ret = alloc_rbio_pages(rbio); 1609 if (ret) 1610 goto cleanup; 1611 1612 index_rbio_pages(rbio); 1613 1614 atomic_set(&rbio->error, 0); 1615 /* 1616 * build a list of bios to read all the missing parts of this 1617 * stripe 1618 */ 1619 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1620 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1621 struct sector_ptr *sector; 1622 1623 /* 1624 * We want to find all the sectors missing from the 1625 * rbio and read them from the disk. If * sector_in_rbio() 1626 * finds a page in the bio list we don't need to read 1627 * it off the stripe. 1628 */ 1629 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1630 if (sector) 1631 continue; 1632 1633 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1634 /* 1635 * The bio cache may have handed us an uptodate page. 1636 * If so, be happy and use it. 1637 */ 1638 if (sector->uptodate) 1639 continue; 1640 1641 ret = rbio_add_io_sector(rbio, &bio_list, sector, 1642 stripe, sectornr, rbio->stripe_len, 1643 REQ_OP_READ); 1644 if (ret) 1645 goto cleanup; 1646 } 1647 } 1648 1649 bios_to_read = bio_list_size(&bio_list); 1650 if (!bios_to_read) { 1651 /* 1652 * this can happen if others have merged with 1653 * us, it means there is nothing left to read. 1654 * But if there are missing devices it may not be 1655 * safe to do the full stripe write yet. 1656 */ 1657 goto finish; 1658 } 1659 1660 /* 1661 * The bioc may be freed once we submit the last bio. Make sure not to 1662 * touch it after that. 1663 */ 1664 atomic_set(&rbio->stripes_pending, bios_to_read); 1665 while ((bio = bio_list_pop(&bio_list))) { 1666 bio->bi_end_io = raid_rmw_end_io; 1667 1668 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 1669 1670 submit_bio(bio); 1671 } 1672 /* the actual write will happen once the reads are done */ 1673 return 0; 1674 1675 cleanup: 1676 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1677 1678 while ((bio = bio_list_pop(&bio_list))) 1679 bio_put(bio); 1680 1681 return -EIO; 1682 1683 finish: 1684 validate_rbio_for_rmw(rbio); 1685 return 0; 1686 } 1687 1688 /* 1689 * if the upper layers pass in a full stripe, we thank them by only allocating 1690 * enough pages to hold the parity, and sending it all down quickly. 
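 *
 * "Full stripe" here means bio_list_bytes == nr_data * stripe_len (see
 * rbio_is_full()). In that case every data sector already sits in the bio
 * list, so alloc_rbio_parity_pages() only allocates the
 * nr_pages - nr_data * stripe_npages pages needed for P (and Q).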
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		start_async_work(rbio, rmw_work);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe. So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list. When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct work_struct work;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void run_plug(struct btrfs_plug_cb *plug)
{
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
1780 */ 1781 list_sort(NULL, &plug->rbio_list, plug_cmp); 1782 while (!list_empty(&plug->rbio_list)) { 1783 cur = list_entry(plug->rbio_list.next, 1784 struct btrfs_raid_bio, plug_list); 1785 list_del_init(&cur->plug_list); 1786 1787 if (rbio_is_full(cur)) { 1788 int ret; 1789 1790 /* we have a full stripe, send it down */ 1791 ret = full_stripe_write(cur); 1792 BUG_ON(ret); 1793 continue; 1794 } 1795 if (last) { 1796 if (rbio_can_merge(last, cur)) { 1797 merge_rbio(last, cur); 1798 __free_raid_bio(cur); 1799 continue; 1800 1801 } 1802 __raid56_parity_write(last); 1803 } 1804 last = cur; 1805 } 1806 if (last) { 1807 __raid56_parity_write(last); 1808 } 1809 kfree(plug); 1810 } 1811 1812 /* 1813 * if the unplug comes from schedule, we have to push the 1814 * work off to a helper thread 1815 */ 1816 static void unplug_work(struct work_struct *work) 1817 { 1818 struct btrfs_plug_cb *plug; 1819 plug = container_of(work, struct btrfs_plug_cb, work); 1820 run_plug(plug); 1821 } 1822 1823 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1824 { 1825 struct btrfs_plug_cb *plug; 1826 plug = container_of(cb, struct btrfs_plug_cb, cb); 1827 1828 if (from_schedule) { 1829 INIT_WORK(&plug->work, unplug_work); 1830 queue_work(plug->info->rmw_workers, &plug->work); 1831 return; 1832 } 1833 run_plug(plug); 1834 } 1835 1836 /* 1837 * our main entry point for writes from the rest of the FS. 1838 */ 1839 int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) 1840 { 1841 struct btrfs_fs_info *fs_info = bioc->fs_info; 1842 struct btrfs_raid_bio *rbio; 1843 struct btrfs_plug_cb *plug = NULL; 1844 struct blk_plug_cb *cb; 1845 int ret; 1846 1847 rbio = alloc_rbio(fs_info, bioc, stripe_len); 1848 if (IS_ERR(rbio)) { 1849 btrfs_put_bioc(bioc); 1850 return PTR_ERR(rbio); 1851 } 1852 bio_list_add(&rbio->bio_list, bio); 1853 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1854 rbio->operation = BTRFS_RBIO_WRITE; 1855 1856 btrfs_bio_counter_inc_noblocked(fs_info); 1857 rbio->generic_bio_cnt = 1; 1858 1859 /* 1860 * don't plug on full rbios, just get them out the door 1861 * as quickly as we can 1862 */ 1863 if (rbio_is_full(rbio)) { 1864 ret = full_stripe_write(rbio); 1865 if (ret) 1866 btrfs_bio_counter_dec(fs_info); 1867 return ret; 1868 } 1869 1870 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 1871 if (cb) { 1872 plug = container_of(cb, struct btrfs_plug_cb, cb); 1873 if (!plug->info) { 1874 plug->info = fs_info; 1875 INIT_LIST_HEAD(&plug->rbio_list); 1876 } 1877 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1878 ret = 0; 1879 } else { 1880 ret = __raid56_parity_write(rbio); 1881 if (ret) 1882 btrfs_bio_counter_dec(fs_info); 1883 } 1884 return ret; 1885 } 1886 1887 /* 1888 * all parity reconstruction happens here. We've read in everything 1889 * we can find from the drives and this does the heavy lifting of 1890 * sorting the good from the bad. 
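 *
 * A rough summary of the cases handled below (the code has the exact
 * conditions):
 *
 *	one data stripe bad (raid5, or raid6 with a single failure):
 *		D = P ^ (xor of the surviving data)	- the pstripe path
 *	raid6, one data stripe and Q bad:	data rebuilt from P as above
 *	raid6, one data stripe and P bad:	raid6_datap_recov()
 *	raid6, two data stripes bad:		raid6_2data_recov()
 *	P bad on its own, or both P and Q bad:	BLK_STS_IOERR, no data
 *						rebuild is attempted here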
1891 */ 1892 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1893 { 1894 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1895 int sectornr, stripe; 1896 void **pointers; 1897 void **unmap_array; 1898 int faila = -1, failb = -1; 1899 blk_status_t err; 1900 int i; 1901 1902 /* 1903 * This array stores the pointer for each sector, thus it has the extra 1904 * pgoff value added from each sector 1905 */ 1906 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1907 if (!pointers) { 1908 err = BLK_STS_RESOURCE; 1909 goto cleanup_io; 1910 } 1911 1912 /* 1913 * Store copy of pointers that does not get reordered during 1914 * reconstruction so that kunmap_local works. 1915 */ 1916 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1917 if (!unmap_array) { 1918 err = BLK_STS_RESOURCE; 1919 goto cleanup_pointers; 1920 } 1921 1922 faila = rbio->faila; 1923 failb = rbio->failb; 1924 1925 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1926 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1927 spin_lock_irq(&rbio->bio_list_lock); 1928 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1929 spin_unlock_irq(&rbio->bio_list_lock); 1930 } 1931 1932 index_rbio_pages(rbio); 1933 1934 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1935 struct sector_ptr *sector; 1936 1937 /* 1938 * Now we just use bitmap to mark the horizontal stripes in 1939 * which we have data when doing parity scrub. 1940 */ 1941 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1942 !test_bit(sectornr, rbio->dbitmap)) 1943 continue; 1944 1945 /* 1946 * Setup our array of pointers with sectors from each stripe 1947 * 1948 * NOTE: store a duplicate array of pointers to preserve the 1949 * pointer order 1950 */ 1951 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1952 /* 1953 * If we're rebuilding a read, we have to use 1954 * pages from the bio list 1955 */ 1956 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1957 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1958 (stripe == faila || stripe == failb)) { 1959 sector = sector_in_rbio(rbio, stripe, sectornr, 0); 1960 } else { 1961 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1962 } 1963 ASSERT(sector->page); 1964 pointers[stripe] = kmap_local_page(sector->page) + 1965 sector->pgoff; 1966 unmap_array[stripe] = pointers[stripe]; 1967 } 1968 1969 /* All raid6 handling here */ 1970 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1971 /* Single failure, rebuild from parity raid5 style */ 1972 if (failb < 0) { 1973 if (faila == rbio->nr_data) { 1974 /* 1975 * Just the P stripe has failed, without 1976 * a bad data or Q stripe. 1977 * TODO, we should redo the xor here. 1978 */ 1979 err = BLK_STS_IOERR; 1980 goto cleanup; 1981 } 1982 /* 1983 * a single failure in raid6 is rebuilt 1984 * in the pstripe code below 1985 */ 1986 goto pstripe; 1987 } 1988 1989 /* make sure our ps and qs are in order */ 1990 if (faila > failb) 1991 swap(faila, failb); 1992 1993 /* if the q stripe is failed, do a pstripe reconstruction 1994 * from the xors. 1995 * If both the q stripe and the P stripe are failed, we're 1996 * here due to a crc mismatch and we can't give them the 1997 * data they want 1998 */ 1999 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { 2000 if (rbio->bioc->raid_map[faila] == 2001 RAID5_P_STRIPE) { 2002 err = BLK_STS_IOERR; 2003 goto cleanup; 2004 } 2005 /* 2006 * otherwise we have one bad data stripe and 2007 * a good P stripe. raid5! 
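			 *
			 * (What the pstripe path below does, in short: copy P
			 *  into the failed slot, rotate the pointer array so
			 *  the remaining nr_data - 1 sources are contiguous,
			 *  then xor them together, i.e.
			 *  D_faila = P ^ D_other1 ^ D_other2 ^ ...)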
			/*
			 * If the q stripe failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe failed, we're
			 * here due to a crc mismatch and we can't give them
			 * the data they want.
			 */
			if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->bioc->raid_map[faila] ==
				    RAID5_P_STRIPE) {
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->real_stripes,
						  sectorsize, faila, pointers);
			} else {
				raid6_2data_recov(rbio->real_stripes,
						  sectorsize, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, sectorsize);
		}

		/*
		 * if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (rbio->operation == BTRFS_RBIO_WRITE) {
			for (i = 0; i < rbio->stripe_nsectors; i++) {
				if (faila != -1) {
					sector = rbio_stripe_sector(rbio, faila, i);
					sector->uptodate = 1;
				}
				if (failb != -1) {
					sector = rbio_stripe_sector(rbio, failb, i);
					sector->uptodate = 1;
				}
			}
		}
		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
			kunmap_local(unmap_array[stripe]);
	}

	err = BLK_STS_OK;
cleanup:
	kfree(unmap_array);
cleanup_pointers:
	kfree(pointers);

cleanup_io:
	/*
	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
	 * valid rbio which is consistent with ondisk content, thus such a
	 * valid rbio can be cached to avoid further disk reads.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		/*
		 * - In case of two failures, where rbio->failb != -1:
		 *
		 *   Do not cache this rbio since the above read reconstruction
		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
		 *   changed the content of some stripes so that it no longer
		 *   matches the on-disk content; a later write/recover could
		 *   then steal stripe_pages from this rbio and end up with
		 *   corruptions or rebuild failures.
		 *
		 * - In case of single failure, where rbio->failb == -1:
		 *
		 *   Cache this rbio iff the above read reconstruction is
		 *   executed without problems.
		 */
		if (err == BLK_STS_OK && rbio->failb < 0)
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err);
	} else if (err == BLK_STS_OK) {
		rbio->faila = -1;
		rbio->failb = -1;

		if (rbio->operation == BTRFS_RBIO_WRITE)
			finish_rmw(rbio);
		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
			finish_parity_scrub(rbio, 0);
		else
			BUG();
	} else {
		rbio_orig_end_io(rbio, err);
	}
}
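
/*
 * Worked example of the pstripe path above (numbers are hypothetical): with
 * nr_data = 3 and faila = 1, the parity block is first copied into
 * pointers[1], the array is then rotated so the block being rebuilt sits in
 * the last data slot, and run_xor() xors data stripes 0 and 2 into it.
 * Because P = D0 ^ D1 ^ D2, the result P ^ D0 ^ D2 is exactly the missing D1.
 */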

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(rbio, bio);
	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int sectornr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);

	/*
	 * read everything that hasn't failed. Thanks to the
	 * stripe cache, it is possible that some or all of these
	 * pages are going to be uptodate.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->error);
			continue;
		}

		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
			struct sector_ptr *sector;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
			if (sector->uptodate)
				continue;

			ret = rbio_add_io_sector(rbio, &bio_list, sector,
						 stripe, sectornr,
						 rbio->stripe_len, REQ_OP_READ);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
			__raid_recover_end_io(rbio);
			return 0;
		} else {
			goto cleanup;
		}
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid_recover_end_io;

		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}

	return 0;

cleanup:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;
}
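
/*
 * Error budget, by way of example: bioc->max_errors is the number of missing
 * or failed stripes the profile can absorb (normally 1 for RAID5, 2 for
 * RAID6).  Each stripe skipped above because it matches faila/failb bumps
 * rbio->error, so a RAID6 rbio with both a data stripe and the P stripe gone
 * still reads the survivors, while a third failure trips the
 * "error > max_errors" check and the rbio is failed with an IO error.
 */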

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			  u32 stripe_len, int mirror_num, int generic_io)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int ret;

	if (generic_io) {
		ASSERT(bioc->mirror_num == mirror_num);
		btrfs_bio(bio)->mirror_num = mirror_num;
	}

	rbio = alloc_rbio(fs_info, bioc, stripe_len);
	if (IS_ERR(rbio)) {
		if (generic_io)
			btrfs_put_bioc(bioc);
		return PTR_ERR(rbio);
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
			   __func__, bio->bi_iter.bi_sector << 9,
			   (u64)bio->bi_iter.bi_size, bioc->map_type);
		if (generic_io)
			btrfs_put_bioc(bioc);
		kfree(rbio);
		return -EIO;
	}

	if (generic_io) {
		btrfs_bio_counter_inc_noblocked(fs_info);
		rbio->generic_bio_cnt = 1;
	} else {
		btrfs_get_bioc(bioc);
	}

	/*
	 * Loop retry:
	 * for 'mirror_num == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2) {
		/*
		 * 'mirror_num == 3' is to fail the p stripe and
		 * reconstruct from the q stripe.  'mirror_num > 3' is to
		 * fail a data stripe and reconstruct from p+q stripe.
		 */
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
			rbio->failb--;
	}

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with
	 * any errors it hits.  We don't want to return
	 * its error value up the stack because our caller
	 * will end up calling bio_endio with any nonzero
	 * return
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * our rbio has been added to the list of
	 * rbios that will be handled after the
	 * current lock owner is done
	 */
	return 0;
}

static void rmw_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}
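
/*
 * Retry mirrors, illustrated on a hypothetical 4-device RAID6 (two data
 * stripes plus P and Q, so real_stripes == 4): in raid56_parity_recover()
 * mirror_num == 3 gives failb = 4 - 2 = 2, the P stripe, forcing
 * reconstruction from Q, while mirror_num == 4 gives failb = 1, deliberately
 * failing a data stripe so both parities get exercised.  The failb--
 * adjustment skips over the stripe already recorded in faila so that every
 * mirror_num selects a distinct second failure.
 */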

/*
 * The following code is used to scrub/replace the parity stripe
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need to make sure all the pages that are added into the
 * scrub/replace raid bio are correct and will not be changed during the
 * scrub/replace, i.e. those pages only hold metadata or file data protected
 * by a checksum.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				u32 stripe_len, struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc, stripe_len);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio look similar to the other rbio types
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	/*
	 * We have already increased bio_counter when getting bioc, record it
	 * so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
			    unsigned int pgoff, u64 logical)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int stripe_offset;
	int index;

	ASSERT(logical >= rbio->bioc->raid_map[0]);
	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
				       rbio->stripe_len * rbio->nr_data);
	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
	index = stripe_offset / sectorsize;
	rbio->bio_sectors[index].page = page;
	rbio->bio_sectors[index].pgoff = pgoff;
}
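
/*
 * Index math in raid56_add_scrub_pages(), with made-up numbers: if
 * raid_map[0] is the logical start of the full stripe, sectorsize is 4K and
 * @logical sits 12K into the data portion, then stripe_offset is 12K and the
 * page lands in bio_sectors[3].  The array is indexed purely by logical
 * offset, which is why callers only ever pass addresses inside the data
 * stripes.
 */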

/*
 * We only scrub the parity of the horizontal stripes where we have correct
 * data, so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int stripe;
	int sectornr;

	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			struct page *page;
			int index = (stripe * rbio->stripe_nsectors + sectornr) *
				    sectorsize >> PAGE_SHIFT;

			if (rbio->stripe_pages[index])
				continue;

			page = alloc_page(GFP_NOFS);
			if (!page)
				return -ENOMEM;
			rbio->stripe_pages[index] = page;
		}
	}
	index_stripe_sectors(rbio);
	return 0;
}

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
	}
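
	/*
	 * Note on the replace case above: when the device holding the
	 * scrubbed parity has a dev-replace target, pbitmap keeps a private
	 * copy of dbitmap.  The check loop below clears dbitmap bits for
	 * parity sectors that were already correct, while pbitmap still has
	 * every originally requested sector set, so the replace target ends
	 * up receiving a full copy of the scrubbed parity range.
	 */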
	/*
	 * The higher layers (the scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	p_sector.page = alloc_page(GFP_NOFS);
	if (!p_sector.page)
		goto cleanup;
	p_sector.pgoff = 0;
	p_sector.uptodate = 1;

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_sector.page = alloc_page(GFP_NOFS);
		if (!q_sector.page) {
			__free_page(p_sector.page);
			p_sector.page = NULL;
			goto cleanup;
		}
		q_sector.pgoff = 0;
		q_sector.uptodate = 1;
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
	}

	atomic_set(&rbio->error, 0);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_sector.page);

	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;
		void *parity;

		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}

		/* Check scrubbing parity and repair it */
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_page(sector->page) + sector->pgoff;
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* Parity is right, needn't writeback */
			bitmap_clear(rbio->dbitmap, sectornr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_sector.page);
	p_sector.page = NULL;
	if (q_sector.page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_sector.page);
		q_sector.page = NULL;
	}
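
	/*
	 * At this point dbitmap only has bits set for parity sectors that
	 * were found to mismatch.  For example, if just sector 3 of the
	 * scrubbed stripe was wrong, only bit 3 survives and a single
	 * sectorsize write is issued below; if everything matched and there
	 * is no replace target, submit_write finds an empty bio list and the
	 * rbio completes with BLK_STS_OK without touching the disk.
	 */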

writeback:
	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
					 sectornr, rbio->stripe_len, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 bioc->tgtdev_map[rbio->scrubp],
					 sectornr, rbio->stripe_len, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, BLK_STS_OK);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid_write_end_io;

		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk.  This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction.  The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * Because we can not use the parity being scrubbed to repair
		 * the data, the number of errors we can tolerate here is
		 * reduced (in the case of RAID5 we can not repair anything).
		 */
		if (dfail > rbio->bioc->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data is good and only the parity is wrong, just
		 * repair the parity.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * At this point we have one corrupted data stripe and one
		 * corrupted parity on RAID6.  If the corrupted parity is the
		 * one being scrubbed, we can use the other parity to repair
		 * the data; otherwise the data stripe can not be repaired.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
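
/*
 * Rough decision table for the checks above (RAID6, max_errors == 2):
 * dfail == 0 -> the data is intact and only the parity needs rewriting, so
 * finish_parity_scrub() runs without a prior rebuild; dfail == 1 -> a
 * rebuild is only attempted when the second failed stripe is the parity
 * being scrubbed (failp == scrubp), every other combination is treated as
 * unrecoverable by this path; dfail > 1 -> beyond the error budget, the
 * rbio is failed outright.
 */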

/*
 * end io for the read phase of the scrub cycle.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate the
 * parity of the stripe.
 *
 * This will usually kick off finish_parity_scrub once all the bios are read
 * in, but it may trigger parity reconstruction if we had any errors along
 * the way
 */
static void raid56_parity_scrub_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	/*
	 * this will normally call finish_parity_scrub to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_parity_scrub(rbio);
}

static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int sectornr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
			struct sector_ptr *sector;
			/*
			 * We want to find all the sectors missing from the
			 * rbio and read them from the disk.  If
			 * sector_in_rbio() finds a sector in the bio list we
			 * don't need to read it off the stripe.
			 */
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (sector)
				continue;

			sector = rbio_stripe_sector(rbio, stripe, sectornr);
			/*
			 * The bio cache may have handed us an uptodate sector.
			 * If so, be happy and use it.
			 */
			if (sector->uptodate)
				continue;

			ret = rbio_add_io_sector(rbio, &bio_list, sector,
						 stripe, sectornr,
						 rbio->stripe_len, REQ_OP_READ);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_parity_scrub_end_io;

		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}

static void scrub_parity_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_parity_work);
}
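
/*
 * Typical call sequence for the scrub interface above, sketched from the
 * caller's (the scrub code's) point of view:
 *
 *	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, stripe_len,
 *					      scrub_dev, dbitmap, nsectors);
 *	for each sector holding verified data or metadata:
 *		raid56_add_scrub_pages(rbio, page, pgoff, logical);
 *	raid56_parity_submit_scrub_rbio(rbio);
 *
 * Only the sectors marked in dbitmap take part in the parity check, which
 * is what keeps alloc_rbio_essential_pages() cheap.
 */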

/*
 * The following code is used for dev replace of a missing RAID 5/6 device.
 */

struct btrfs_raid_bio *
raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
			  u64 length)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc, length);
	if (IS_ERR(rbio))
		return NULL;

	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the rebuild rbio look similar to the other rbio types
	 */
	ASSERT(!bio->bi_iter.bi_size);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		kfree(rbio);
		return NULL;
	}

	/*
	 * When we get bioc, we have already increased bio_counter, record it
	 * so we can free it at rbio_orig_end_io()
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, read_rebuild_work);
}
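
/*
 * Rough use of the missing-device interface above: the dev-replace path
 * allocates the rbio with raid56_alloc_missing_rbio(), feeds it the sectors
 * it already has via raid56_add_scrub_pages(), and then
 * raid56_submit_missing_rbio() queues the rebuild through the same
 * read_rebuild_work path used for ordinary read repair.
 */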