// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct work_struct *work);
static void read_rebuild_work(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct work_struct *work);

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page)
			continue;

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripe_pages[] got modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}

static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !full_page_sectors_uptodate(src, i))
			continue;

		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We've read the full stripe from the drive, and will check and
	 * repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}

static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, read_rebuild_work);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, read_rebuild_work);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	kfree(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this bio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have read all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bioc->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/**
 * Get a sector pointer specified by its @stripe_nr and @sector_nr
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
 * @sector_nr:          Sector number inside the stripe,
 *                      valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock_irq(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock_irq(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock_irq(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that this
 * does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;
	void *p;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_sectors) * num_sectors +
		       sizeof(*rbio->stripe_sectors) * num_sectors +
		       sizeof(*rbio->finish_pointers) * real_stripes,
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
	 * we allocated past the end of the rbio.
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
	CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
#undef CONSUME_ALLOC

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);

	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> 9;
	bio->bi_private = rbio;

	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/* Map the sectors covered by @bio into rbio->bio_sectors[], indexed by the
 * logical offset inside the full stripe.
 */
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->raid_map[0];

	bio_for_each_segment(bvec, bio, iter) {
		u32 bvec_offset;

		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
		     bvec_offset += sectorsize, offset += sectorsize) {
			int index = offset / sectorsize;
			struct sector_ptr *sector = &rbio->bio_sectors[index];

			sector->page = bvec.bv_page;
			sector->pgoff = bvec.bv_offset + bvec_offset;
			ASSERT(sector->pgoff < PAGE_SIZE);
		}
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock_irq(&rbio->bio_list_lock);
}

static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}

/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	int nr_data = rbio->nr_data;
	/* The total sector number inside the full stripe. */
	int total_sector_nr;
	int stripe;
	/* Sector number inside a stripe. */
	int sectornr;
	bool has_qstripe;
	struct bio_list bio_list;
	struct bio *bio;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		struct sector_ptr *sector;

		/* First collect one sector from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		/* Then add the parity stripe */
		sector = rbio_pstripe_sector(rbio, sectornr);
		sector->uptodate = 1;
		pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;

		if (has_qstripe) {
			/*
			 * RAID6, add the qstripe and call the library function
			 * to fill in our p/q
			 */
			sector = rbio_qstripe_sector(rbio, sectornr);
			sector->uptodate = 1;
			pointers[stripe++] = kmap_local_page(sector->page) +
					     sector->pgoff;

			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}
		for (stripe = stripe - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	/*
	 * Start writing.  Make bios for everything from the higher layers (the
	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (likely(!bioc->num_tgtdevs))
		goto write_data;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		if (!bioc->tgtdev_map[stripe]) {
			/*
			 * We can skip the whole stripe completely, note
			 * total_sector_nr will be increased by one anyway.
			 */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->bioc->tgtdev_map[stripe],
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

write_data:
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid_write_end_io;

		if (trace_raid56_write_stripe_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write_stripe(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	int i;
	struct btrfs_io_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bioc->num_stripes; i++) {
		stripe = &rbio->bioc->stripes[i];
		if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_iter.bi_sector << 9;
	int i;

	for (i = 0; i < rbio->nr_data; i++) {
		u64 stripe_start = rbio->bioc->raid_map[i];

		if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
			return i;
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}

/*
 * For subpage case, we can no longer set page Uptodate directly for
 * stripe_pages[], thus we need to locate the sector.
 */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
{
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		if (sector->page == page && sector->pgoff == pgoff)
			return sector;
	}
	return NULL;
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct sector_ptr *sector;
		int pgoff;

		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
		     pgoff += sectorsize) {
			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
			ASSERT(sector);
			if (sector)
				sector->uptodate = 1;
		}
	}
}

static void raid56_bio_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(rbio, bio);

	bio_put(bio);

	if (atomic_dec_and_test(&rbio->stripes_pending))
		queue_work(rbio->bioc->fs_info->endio_raid56_workers,
			   &rbio->end_io_work);
}

/*
 * End io handler for the read phase of the RMW cycle.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate the
 * parity of the stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid56_rmw_end_io_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, end_io_work);

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
		return;
	}

	/*
	 * This will normally call finish_rmw to start our write but if there
	 * are any failed stripes we'll reconstruct from parity first.
	 */
	validate_rbio_for_rmw(rbio);
}

/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
	int ret;
	int total_sector_nr;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->error, 0);
	/* Build a list of bios to read all the missing data sectors. */
	for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk.  If sector_in_rbio() finds a page
		 * in the bio list we don't need to read it off the stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate page.  If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 stripe, sectornr, REQ_OP_READ);
		if (ret)
			goto cleanup;
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio.  Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_bio_end_io;

		if (trace_raid56_read_partial_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read_partial(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);
	return 0;
}

/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		start_async_work(rbio, rmw_work);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct work_struct work;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void run_plug(struct btrfs_plug_cb *plug)
{
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
	 */
	list_sort(NULL, &plug->rbio_list, plug_cmp);
	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			int ret;

			/* we have a full stripe, send it down */
			ret = full_stripe_write(cur);
			BUG_ON(ret);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				__free_raid_bio(cur);
				continue;
			}
			__raid56_parity_write(last);
		}
		last = cur;
	}
	if (last)
		__raid56_parity_write(last);
	kfree(plug);
}

/*
 * if the unplug comes from schedule, we have to push the
 * work off to a helper thread
 */
static void unplug_work(struct work_struct *work)
{
	struct btrfs_plug_cb *plug;

	plug = container_of(work, struct btrfs_plug_cb, work);
	run_plug(plug);
}

static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug;

	plug = container_of(cb, struct btrfs_plug_cb, cb);

	if (from_schedule) {
		INIT_WORK(&plug->work, unplug_work);
		queue_work(plug->info->rmw_workers, &plug->work);
		return;
	}
	run_plug(plug);
}

/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 full_stripe_start = rbio->bioc->raid_map[0];
	const u32 orig_len = orig_bio->bi_iter.bi_size;
	const u32 sectorsize = fs_info->sectorsize;
	u64 cur_logical;

	ASSERT(orig_logical >= full_stripe_start &&
	       orig_logical + orig_len <= full_stripe_start +
	       rbio->nr_data * BTRFS_STRIPE_LEN);

	bio_list_add(&rbio->bio_list, orig_bio);
	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;

	/* Update the dbitmap. */
	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
	     cur_logical += sectorsize) {
		int bit = ((u32)(cur_logical - full_stripe_start) >>
			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;

		set_bit(bit, &rbio->dbitmap);
	}
}

/*
 * our main entry point for writes from the rest of the FS.
 */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;
	int ret = 0;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		ret = PTR_ERR(rbio);
		goto fail;
	}
	rbio->operation = BTRFS_RBIO_WRITE;
	rbio_add_bio(rbio, bio);

	/*
	 * don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (rbio_is_full(rbio)) {
		ret = full_stripe_write(rbio);
		if (ret)
			goto fail;
		return;
	}

	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
	if (cb) {
		plug = container_of(cb, struct btrfs_plug_cb, cb);
		if (!plug->info) {
			plug->info = fs_info;
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
	} else {
		ret = __raid56_parity_write(rbio);
		if (ret)
			goto fail;
	}

	return;

fail:
	bio->bi_status = errno_to_blk_status(ret);
	bio_endio(bio);
}

/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int sectornr, stripe;
	void **pointers;
	void **unmap_array;
	int faila = -1, failb = -1;
	blk_status_t err;
	int i;

	/*
	 * This array stores the pointer for each sector, thus it has the extra
	 * pgoff value added from each sector
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers) {
		err = BLK_STS_RESOURCE;
		goto cleanup_io;
	}

	/*
	 * Store copy of pointers that does not get reordered during
	 * reconstruction so that kunmap_local works.
	 */
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!unmap_array) {
		err = BLK_STS_RESOURCE;
		goto cleanup_pointers;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		struct sector_ptr *sector;

		/*
		 * Now we just use bitmap to mark the horizontal stripes in
		 * which we have data when doing parity scrub.
		 */
		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
		    !test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * Setup our array of pointers with sectors from each stripe
		 *
		 * NOTE: store a duplicate array of pointers to preserve the
		 * pointer order
		 */
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * If we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
			    (stripe == faila || stripe == failb)) {
				sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			} else {
				sector = rbio_stripe_sector(rbio, stripe, sectornr);
			}
			ASSERT(sector->page);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
			unmap_array[stripe] = pointers[stripe];
		}

		/* All raid6 handling here */
		if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
			/* Single failure, rebuild from parity raid5 style */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb)
				swap(faila, failb);

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->bioc->raid_map[faila] ==
				    RAID5_P_STRIPE) {
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->real_stripes,
						  sectorsize, faila, pointers);
			} else {
				raid6_2data_recov(rbio->real_stripes,
						  sectorsize, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, sectorsize);
		}
		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.
If this was a read reconstruction, 2007 * other endio functions will fiddle the uptodate bits 2008 */ 2009 if (rbio->operation == BTRFS_RBIO_WRITE) { 2010 for (i = 0; i < rbio->stripe_nsectors; i++) { 2011 if (faila != -1) { 2012 sector = rbio_stripe_sector(rbio, faila, i); 2013 sector->uptodate = 1; 2014 } 2015 if (failb != -1) { 2016 sector = rbio_stripe_sector(rbio, failb, i); 2017 sector->uptodate = 1; 2018 } 2019 } 2020 } 2021 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) 2022 kunmap_local(unmap_array[stripe]); 2023 } 2024 2025 err = BLK_STS_OK; 2026 cleanup: 2027 kfree(unmap_array); 2028 cleanup_pointers: 2029 kfree(pointers); 2030 2031 cleanup_io: 2032 /* 2033 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 2034 * valid rbio which is consistent with ondisk content, thus such a 2035 * valid rbio can be cached to avoid further disk reads. 2036 */ 2037 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2038 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 2039 /* 2040 * - In case of two failures, where rbio->failb != -1: 2041 * 2042 * Do not cache this rbio since the above read reconstruction 2043 * (raid6_datap_recov() or raid6_2data_recov()) may have 2044 * changed some content of stripes which are not identical to 2045 * on-disk content any more, otherwise, a later write/recover 2046 * may steal stripe_pages from this rbio and end up with 2047 * corruptions or rebuild failures. 2048 * 2049 * - In case of single failure, where rbio->failb == -1: 2050 * 2051 * Cache this rbio iff the above read reconstruction is 2052 * executed without problems. 2053 */ 2054 if (err == BLK_STS_OK && rbio->failb < 0) 2055 cache_rbio_pages(rbio); 2056 else 2057 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2058 2059 rbio_orig_end_io(rbio, err); 2060 } else if (err == BLK_STS_OK) { 2061 rbio->faila = -1; 2062 rbio->failb = -1; 2063 2064 if (rbio->operation == BTRFS_RBIO_WRITE) 2065 finish_rmw(rbio); 2066 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 2067 finish_parity_scrub(rbio, 0); 2068 else 2069 BUG(); 2070 } else { 2071 rbio_orig_end_io(rbio, err); 2072 } 2073 } 2074 2075 /* 2076 * This is called only for stripes we've read from disk to reconstruct the 2077 * parity. 2078 */ 2079 static void raid_recover_end_io_work(struct work_struct *work) 2080 { 2081 struct btrfs_raid_bio *rbio = 2082 container_of(work, struct btrfs_raid_bio, end_io_work); 2083 2084 if (atomic_read(&rbio->error) > rbio->bioc->max_errors) 2085 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2086 else 2087 __raid_recover_end_io(rbio); 2088 } 2089 2090 /* 2091 * reads everything we need off the disk to reconstruct 2092 * the parity. endio handlers trigger final reconstruction 2093 * when the IO is done. 2094 * 2095 * This is used both for reads from the higher layers and for 2096 * parity construction required to finish a rmw cycle. 2097 */ 2098 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2099 { 2100 int bios_to_read = 0; 2101 struct bio_list bio_list; 2102 int ret; 2103 int total_sector_nr; 2104 struct bio *bio; 2105 2106 bio_list_init(&bio_list); 2107 2108 ret = alloc_rbio_pages(rbio); 2109 if (ret) 2110 goto cleanup; 2111 2112 atomic_set(&rbio->error, 0); 2113 2114 /* 2115 * Read everything that hasn't failed. However this time we will 2116 * not trust any cached sector. 2117 * As we may read out some stale data but higher layer is not reading 2118 * that stale part. 2119 * 2120 * So here we always re-read everything in recovery path. 
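	 *
	 * (Indexing used by the loop below, with a hypothetical layout of
	 * stripe_nsectors == 16 as an example: total_sector_nr 0-15 covers
	 * stripe 0, 16-31 covers stripe 1, and so on, with
	 * sectornr = total_sector_nr % stripe_nsectors within each stripe.)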
2121 */ 2122 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2123 total_sector_nr++) { 2124 int stripe = total_sector_nr / rbio->stripe_nsectors; 2125 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2126 struct sector_ptr *sector; 2127 2128 if (rbio->faila == stripe || rbio->failb == stripe) { 2129 atomic_inc(&rbio->error); 2130 /* Skip the current stripe. */ 2131 ASSERT(sectornr == 0); 2132 total_sector_nr += rbio->stripe_nsectors - 1; 2133 continue; 2134 } 2135 sector = rbio_stripe_sector(rbio, stripe, sectornr); 2136 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 2137 sectornr, REQ_OP_READ); 2138 if (ret < 0) 2139 goto cleanup; 2140 } 2141 2142 bios_to_read = bio_list_size(&bio_list); 2143 if (!bios_to_read) { 2144 /* 2145 * we might have no bios to read just because the pages 2146 * were up to date, or we might have no bios to read because 2147 * the devices were gone. 2148 */ 2149 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { 2150 __raid_recover_end_io(rbio); 2151 return 0; 2152 } else { 2153 goto cleanup; 2154 } 2155 } 2156 2157 /* 2158 * The bioc may be freed once we submit the last bio. Make sure not to 2159 * touch it after that. 2160 */ 2161 atomic_set(&rbio->stripes_pending, bios_to_read); 2162 INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); 2163 while ((bio = bio_list_pop(&bio_list))) { 2164 bio->bi_end_io = raid56_bio_end_io; 2165 2166 if (trace_raid56_scrub_read_recover_enabled()) { 2167 struct raid56_bio_trace_info trace_info = { 0 }; 2168 2169 bio_get_trace_info(rbio, bio, &trace_info); 2170 trace_raid56_scrub_read_recover(rbio, bio, &trace_info); 2171 } 2172 submit_bio(bio); 2173 } 2174 2175 return 0; 2176 2177 cleanup: 2178 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2179 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 2180 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2181 2182 while ((bio = bio_list_pop(&bio_list))) 2183 bio_put(bio); 2184 2185 return -EIO; 2186 } 2187 2188 /* 2189 * the main entry point for reads from the higher layers. This 2190 * is really only called when the normal read path had a failure, 2191 * so we assume the bio they send down corresponds to a failed part 2192 * of the drive. 2193 */ 2194 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 2195 int mirror_num) 2196 { 2197 struct btrfs_fs_info *fs_info = bioc->fs_info; 2198 struct btrfs_raid_bio *rbio; 2199 2200 rbio = alloc_rbio(fs_info, bioc); 2201 if (IS_ERR(rbio)) { 2202 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 2203 goto out_end_bio; 2204 } 2205 2206 rbio->operation = BTRFS_RBIO_READ_REBUILD; 2207 rbio_add_bio(rbio, bio); 2208 2209 rbio->faila = find_logical_bio_stripe(rbio, bio); 2210 if (rbio->faila == -1) { 2211 btrfs_warn(fs_info, 2212 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", 2213 __func__, bio->bi_iter.bi_sector << 9, 2214 (u64)bio->bi_iter.bi_size, bioc->map_type); 2215 __free_raid_bio(rbio); 2216 bio->bi_status = BLK_STS_IOERR; 2217 goto out_end_bio; 2218 } 2219 2220 /* 2221 * Loop retry: 2222 * for 'mirror == 2', reconstruct from all other stripes. 2223 * for 'mirror_num > 2', select a stripe to fail on every retry. 2224 */ 2225 if (mirror_num > 2) { 2226 /* 2227 * 'mirror == 3' is to fail the p stripe and 2228 * reconstruct from the q stripe. 'mirror > 3' is to 2229 * fail a data stripe and reconstruct from p+q stripe. 
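		 *
		 * (Worked example, assuming a hypothetical 4-disk RAID6 layout
		 * with real_stripes == 4 and nr_data == 2: mirror_num == 3
		 * gives failb = 4 - (3 - 1) = 2, the P stripe, while
		 * mirror_num == 4 gives failb = 4 - (4 - 1) = 1, a data
		 * stripe; failb is then decremented below when it would land
		 * on or before faila.)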
2230		 */
2231		rbio->failb = rbio->real_stripes - (mirror_num - 1);
2232		ASSERT(rbio->failb > 0);
2233		if (rbio->failb <= rbio->faila)
2234			rbio->failb--;
2235	}
2236
2237	if (lock_stripe_add(rbio))
2238		return;
2239
2240	/*
2241	 * lock_stripe_add() above either queued us behind the current lock
2242	 * owner (and we returned), or we now own the lock and recover here.
2243	 */
2244	__raid56_parity_recover(rbio);
2245	return;
2246
2247 out_end_bio:
2248	bio_endio(bio);
2249 }
2250
2251 static void rmw_work(struct work_struct *work)
2252 {
2253	struct btrfs_raid_bio *rbio;
2254
2255	rbio = container_of(work, struct btrfs_raid_bio, work);
2256	raid56_rmw_stripe(rbio);
2257 }
2258
2259 static void read_rebuild_work(struct work_struct *work)
2260 {
2261	struct btrfs_raid_bio *rbio;
2262
2263	rbio = container_of(work, struct btrfs_raid_bio, work);
2264	__raid56_parity_recover(rbio);
2265 }
2266
2267 /*
2268  * The following code is used to scrub/replace the parity stripe.
2269  *
2270  * Caller must have already increased bio_counter for getting @bioc.
2271  *
2272  * Note: We need to make sure that all the pages added to the scrub/replace
2273  * raid bio are correct and do not change during the scrub/replace, that is,
2274  * those pages only hold metadata or file data protected by a checksum.
2275  */
2276
2277 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2278				struct btrfs_io_context *bioc,
2279				struct btrfs_device *scrub_dev,
2280				unsigned long *dbitmap, int stripe_nsectors)
2281 {
2282	struct btrfs_fs_info *fs_info = bioc->fs_info;
2283	struct btrfs_raid_bio *rbio;
2284	int i;
2285
2286	rbio = alloc_rbio(fs_info, bioc);
2287	if (IS_ERR(rbio))
2288		return NULL;
2289	bio_list_add(&rbio->bio_list, bio);
2290	/*
2291	 * This is a special bio which is only used to hold the completion
2292	 * handler, so that the scrub rbio looks like the other rbio types.
2293	 */
2294	ASSERT(!bio->bi_iter.bi_size);
2295	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2296
2297	/*
2298	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2299	 * to the end position, so this search can start from the first parity
2300	 * stripe.
2301	 */
2302	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2303		if (bioc->stripes[i].dev == scrub_dev) {
2304			rbio->scrubp = i;
2305			break;
2306		}
2307	}
2308	ASSERT(i < rbio->real_stripes);
2309
2310	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2311	return rbio;
2312 }
2313
2314 /* Used for both parity scrub and missing-device rebuild. */
2315 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2316			    unsigned int pgoff, u64 logical)
2317 {
2318	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2319	int stripe_offset;
2320	int index;
2321
2322	ASSERT(logical >= rbio->bioc->raid_map[0]);
2323	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
2324	       BTRFS_STRIPE_LEN * rbio->nr_data);
2325	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2326	index = stripe_offset / sectorsize;
2327	rbio->bio_sectors[index].page = page;
2328	rbio->bio_sectors[index].pgoff = pgoff;
2329 }
2330
2331 /*
2332  * We only scrub the parity of the horizontal stripes where we have correct
2333  * data, so we don't need to allocate pages for all the stripes.
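 *
 * (For illustration, assuming 4K pages and a 4K sectorsize, the page index
 * computed below is simply total_sector_nr; with 64K pages and a 4K
 * sectorsize, sixteen consecutive sectors share one stripe page.)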
2334  */
2335 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2336 {
2337	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2338	int total_sector_nr;
2339
2340	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2341	     total_sector_nr++) {
2342		struct page *page;
2343		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2344		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
2345
2346		if (!test_bit(sectornr, &rbio->dbitmap))
2347			continue;
2348		if (rbio->stripe_pages[index])
2349			continue;
2350		page = alloc_page(GFP_NOFS);
2351		if (!page)
2352			return -ENOMEM;
2353		rbio->stripe_pages[index] = page;
2354	}
2355	index_stripe_sectors(rbio);
2356	return 0;
2357 }
2358
2359 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2360					 int need_check)
2361 {
2362	struct btrfs_io_context *bioc = rbio->bioc;
2363	const u32 sectorsize = bioc->fs_info->sectorsize;
2364	void **pointers = rbio->finish_pointers;
2365	unsigned long *pbitmap = &rbio->finish_pbitmap;
2366	int nr_data = rbio->nr_data;
2367	int stripe;
2368	int sectornr;
2369	bool has_qstripe;
2370	struct sector_ptr p_sector = { 0 };
2371	struct sector_ptr q_sector = { 0 };
2372	struct bio_list bio_list;
2373	struct bio *bio;
2374	int is_replace = 0;
2375	int ret;
2376
2377	bio_list_init(&bio_list);
2378
2379	if (rbio->real_stripes - rbio->nr_data == 1)
2380		has_qstripe = false;
2381	else if (rbio->real_stripes - rbio->nr_data == 2)
2382		has_qstripe = true;
2383	else
2384		BUG();
2385
2386	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
2387		is_replace = 1;
2388		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2389	}
2390
2391	/*
2392	 * The higher layers (the scrubber) are unlikely to use this
2393	 * area of the disk again soon, so don't keep this rbio in the
2394	 * stripe cache.
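	 *
	 * (Sketch of the check pass below: P, and Q for RAID6, are
	 * regenerated from the data sectors into temporary pages and compared
	 * against the parity sector we are scrubbing; sectors that already
	 * match are cleared from dbitmap so only mismatches get written back.)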
2395 */ 2396 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2397 2398 if (!need_check) 2399 goto writeback; 2400 2401 p_sector.page = alloc_page(GFP_NOFS); 2402 if (!p_sector.page) 2403 goto cleanup; 2404 p_sector.pgoff = 0; 2405 p_sector.uptodate = 1; 2406 2407 if (has_qstripe) { 2408 /* RAID6, allocate and map temp space for the Q stripe */ 2409 q_sector.page = alloc_page(GFP_NOFS); 2410 if (!q_sector.page) { 2411 __free_page(p_sector.page); 2412 p_sector.page = NULL; 2413 goto cleanup; 2414 } 2415 q_sector.pgoff = 0; 2416 q_sector.uptodate = 1; 2417 pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); 2418 } 2419 2420 atomic_set(&rbio->error, 0); 2421 2422 /* Map the parity stripe just once */ 2423 pointers[nr_data] = kmap_local_page(p_sector.page); 2424 2425 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { 2426 struct sector_ptr *sector; 2427 void *parity; 2428 2429 /* first collect one page from each data stripe */ 2430 for (stripe = 0; stripe < nr_data; stripe++) { 2431 sector = sector_in_rbio(rbio, stripe, sectornr, 0); 2432 pointers[stripe] = kmap_local_page(sector->page) + 2433 sector->pgoff; 2434 } 2435 2436 if (has_qstripe) { 2437 /* RAID6, call the library function to fill in our P/Q */ 2438 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 2439 pointers); 2440 } else { 2441 /* raid5 */ 2442 memcpy(pointers[nr_data], pointers[0], sectorsize); 2443 run_xor(pointers + 1, nr_data - 1, sectorsize); 2444 } 2445 2446 /* Check scrubbing parity and repair it */ 2447 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 2448 parity = kmap_local_page(sector->page) + sector->pgoff; 2449 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) 2450 memcpy(parity, pointers[rbio->scrubp], sectorsize); 2451 else 2452 /* Parity is right, needn't writeback */ 2453 bitmap_clear(&rbio->dbitmap, sectornr, 1); 2454 kunmap_local(parity); 2455 2456 for (stripe = nr_data - 1; stripe >= 0; stripe--) 2457 kunmap_local(pointers[stripe]); 2458 } 2459 2460 kunmap_local(pointers[nr_data]); 2461 __free_page(p_sector.page); 2462 p_sector.page = NULL; 2463 if (q_sector.page) { 2464 kunmap_local(pointers[rbio->real_stripes - 1]); 2465 __free_page(q_sector.page); 2466 q_sector.page = NULL; 2467 } 2468 2469 writeback: 2470 /* 2471 * time to start writing. Make bios for everything from the 2472 * higher layers (the bio_list in our rbio) and our p/q. Ignore 2473 * everything else. 
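	 *
	 * (Only the sectors still set in dbitmap, i.e. the ones whose
	 * regenerated parity differed above, are written back here. If the
	 * scrubbed device is being replaced, every sector recorded in
	 * pbitmap, the pre-check copy of dbitmap, is additionally queued
	 * against the replace target through tgtdev_map.)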
2474	 */
2475	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2476		struct sector_ptr *sector;
2477
2478		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2479		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2480					 sectornr, REQ_OP_WRITE);
2481		if (ret)
2482			goto cleanup;
2483	}
2484
2485	if (!is_replace)
2486		goto submit_write;
2487
2488	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2489		struct sector_ptr *sector;
2490
2491		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2492		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2493					 bioc->tgtdev_map[rbio->scrubp],
2494					 sectornr, REQ_OP_WRITE);
2495		if (ret)
2496			goto cleanup;
2497	}
2498
2499 submit_write:
2500	nr_data = bio_list_size(&bio_list);
2501	if (!nr_data) {
2502		/* Every parity sector was correct, nothing to write back */
2503		rbio_orig_end_io(rbio, BLK_STS_OK);
2504		return;
2505	}
2506
2507	atomic_set(&rbio->stripes_pending, nr_data);
2508
2509	while ((bio = bio_list_pop(&bio_list))) {
2510		bio->bi_end_io = raid_write_end_io;
2511
2512		if (trace_raid56_scrub_write_stripe_enabled()) {
2513			struct raid56_bio_trace_info trace_info = { 0 };
2514
2515			bio_get_trace_info(rbio, bio, &trace_info);
2516			trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
2517		}
2518		submit_bio(bio);
2519	}
2520	return;
2521
2522 cleanup:
2523	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2524
2525	while ((bio = bio_list_pop(&bio_list)))
2526		bio_put(bio);
2527 }
2528
2529 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2530 {
2531	if (stripe >= 0 && stripe < rbio->nr_data)
2532		return 1;
2533	return 0;
2534 }
2535
2536 /*
2537  * While we're doing the parity check and repair, we could have errors
2538  * in reading pages off the disk. This checks for errors and, if we're
2539  * not able to read a page, triggers parity reconstruction. The
2540  * parity scrub will be finished after we've reconstructed the failed
2541  * stripes.
2542  */
2543 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2544 {
2545	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2546		goto cleanup;
2547
2548	if (rbio->faila >= 0 || rbio->failb >= 0) {
2549		int dfail = 0, failp = -1;
2550
2551		if (is_data_stripe(rbio, rbio->faila))
2552			dfail++;
2553		else if (is_parity_stripe(rbio->faila))
2554			failp = rbio->faila;
2555
2556		if (is_data_stripe(rbio, rbio->failb))
2557			dfail++;
2558		else if (is_parity_stripe(rbio->failb))
2559			failp = rbio->failb;
2560
2561		/*
2562		 * We cannot use the parity that is being scrubbed to repair
2563		 * the data, so our repair capability is reduced by one.
2564		 * (In the case of RAID5 we cannot repair anything.)
2565		 */
2566		if (dfail > rbio->bioc->max_errors - 1)
2567			goto cleanup;
2568
2569		/*
2570		 * If all the data stripes are good, then only the parity is
2571		 * wrong, so just repair (rewrite) the parity.
2572		 */
2573		if (dfail == 0) {
2574			finish_parity_scrub(rbio, 0);
2575			return;
2576		}
2577
2578		/*
2579		 * At this point we have one corrupted data stripe and one
2580		 * corrupted parity on RAID6. If the corrupted parity is the
2581		 * one being scrubbed, we can luckily use the other parity to
2582		 * repair the data; otherwise the data stripe cannot be repaired.
2583		 */
2584		if (failp != rbio->scrubp)
2585			goto cleanup;
2586
2587		__raid_recover_end_io(rbio);
2588	} else {
2589		finish_parity_scrub(rbio, 1);
2590	}
2591	return;
2592
2593 cleanup:
2594	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2595 }
2596
2597 /*
2598  * End io for the read phase of the parity scrub. All the bios here are
2599  * physical stripe bios we've read from the disk so we can recalculate the
2600  * parity of the stripe.
2601  *
2602  * This will usually kick off finish_parity_scrub once all the bios are read
2603  * in, but it may trigger parity reconstruction if we had any errors.
2604  */
2605 static void raid56_parity_scrub_end_io_work(struct work_struct *work)
2606 {
2607	struct btrfs_raid_bio *rbio =
2608		container_of(work, struct btrfs_raid_bio, end_io_work);
2609
2610	/*
2611	 * This will normally call finish_parity_scrub() to start the write,
2612	 * but if there are any failed stripes we'll reconstruct from parity first.
2613	 */
2614	validate_rbio_for_parity_scrub(rbio);
2615 }
2616
2617 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2618 {
2619	int bios_to_read = 0;
2620	struct bio_list bio_list;
2621	int ret;
2622	int total_sector_nr;
2623	struct bio *bio;
2624
2625	bio_list_init(&bio_list);
2626
2627	ret = alloc_rbio_essential_pages(rbio);
2628	if (ret)
2629		goto cleanup;
2630
2631	atomic_set(&rbio->error, 0);
2632	/* Build a list of bios to read all the missing parts. */
2633	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2634	     total_sector_nr++) {
2635		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2636		int stripe = total_sector_nr / rbio->stripe_nsectors;
2637		struct sector_ptr *sector;
2638
2639		/* No data in the vertical stripe, no need to read. */
2640		if (!test_bit(sectornr, &rbio->dbitmap))
2641			continue;
2642
2643		/*
2644		 * We want to find all the sectors missing from the rbio and
2645		 * read them from the disk. If sector_in_rbio() finds a sector
2646		 * in the bio list we don't need to read it off the stripe.
2647		 */
2648		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2649		if (sector)
2650			continue;
2651
2652		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2653		/*
2654		 * The bio cache may have handed us an uptodate sector. If so,
2655		 * use it.
2656		 */
2657		if (sector->uptodate)
2658			continue;
2659
2660		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2661					 sectornr, REQ_OP_READ);
2662		if (ret)
2663			goto cleanup;
2664	}
2665
2666	bios_to_read = bio_list_size(&bio_list);
2667	if (!bios_to_read) {
2668		/*
2669		 * This can happen if others have merged with us; it means
2670		 * there is nothing left to read. But if there are missing
2671		 * devices it may not be safe to do the full stripe write
2672		 * yet.
2673		 */
2674		goto finish;
2675	}
2676
2677	/*
2678	 * The bioc may be freed once we submit the last bio. Make sure not to
2679	 * touch it after that.
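	 *
	 * (stripes_pending is set before the first submit_bio() below so the
	 * final read completion always sees the full count before it queues
	 * raid56_parity_scrub_end_io_work.)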
2680	 */
2681	atomic_set(&rbio->stripes_pending, bios_to_read);
2682	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
2683	while ((bio = bio_list_pop(&bio_list))) {
2684		bio->bi_end_io = raid56_bio_end_io;
2685
2686		if (trace_raid56_scrub_read_enabled()) {
2687			struct raid56_bio_trace_info trace_info = { 0 };
2688
2689			bio_get_trace_info(rbio, bio, &trace_info);
2690			trace_raid56_scrub_read(rbio, bio, &trace_info);
2691		}
2692		submit_bio(bio);
2693	}
2694	/* The actual write will happen once the reads are done. */
2695	return;
2696
2697 cleanup:
2698	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2699
2700	while ((bio = bio_list_pop(&bio_list)))
2701		bio_put(bio);
2702
2703	return;
2704
2705 finish:
2706	validate_rbio_for_parity_scrub(rbio);
2707 }
2708
2709 static void scrub_parity_work(struct work_struct *work)
2710 {
2711	struct btrfs_raid_bio *rbio;
2712
2713	rbio = container_of(work, struct btrfs_raid_bio, work);
2714	raid56_parity_scrub_stripe(rbio);
2715 }
2716
2717 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2718 {
2719	if (!lock_stripe_add(rbio))
2720		start_async_work(rbio, scrub_parity_work);
2721 }
2722
2723 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2724
2725 struct btrfs_raid_bio *
2726 raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
2727 {
2728	struct btrfs_fs_info *fs_info = bioc->fs_info;
2729	struct btrfs_raid_bio *rbio;
2730
2731	rbio = alloc_rbio(fs_info, bioc);
2732	if (IS_ERR(rbio))
2733		return NULL;
2734
2735	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2736	bio_list_add(&rbio->bio_list, bio);
2737	/*
2738	 * This is a special bio which is only used to hold the completion
2739	 * handler, so that this rbio looks like the other rbio types.
2740	 */
2741	ASSERT(!bio->bi_iter.bi_size);
2742
2743	rbio->faila = find_logical_bio_stripe(rbio, bio);
2744	if (rbio->faila == -1) {
2745		BUG();
2746		kfree(rbio);
2747		return NULL;
2748	}
2749
2750	return rbio;
2751 }
2752
2753 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2754 {
2755	if (!lock_stripe_add(rbio))
2756		start_async_work(rbio, read_rebuild_work);
2757 }
2758
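/*
 * Usage sketch for the missing-device path above (illustrative only, not a
 * real caller; the bio, page, pgoff and logical values are placeholders):
 * the dev-replace/scrub code is expected to feed every known-good sector
 * into the rbio before submitting it, roughly as:
 *
 *	rbio = raid56_alloc_missing_rbio(bio, bioc);
 *	if (!rbio)
 *		return -ENOMEM;
 *	raid56_add_scrub_pages(rbio, page, pgoff, logical);
 *	raid56_submit_missing_rbio(rbio);
 *
 * The placeholder bio only carries the completion handler; the missing
 * stripe is rebuilt by read_rebuild_work() once the stripe lock is acquired.
 */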