1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/raid/pq.h> 12 #include <linux/hash.h> 13 #include <linux/list_sort.h> 14 #include <linux/raid/xor.h> 15 #include <linux/mm.h> 16 #include "ctree.h" 17 #include "disk-io.h" 18 #include "volumes.h" 19 #include "raid56.h" 20 #include "async-thread.h" 21 22 /* set when additional merges to this rbio are not allowed */ 23 #define RBIO_RMW_LOCKED_BIT 1 24 25 /* 26 * set when this rbio is sitting in the hash, but it is just a cache 27 * of past RMW 28 */ 29 #define RBIO_CACHE_BIT 2 30 31 /* 32 * set when it is safe to trust the stripe_pages for caching 33 */ 34 #define RBIO_CACHE_READY_BIT 3 35 36 #define RBIO_CACHE_SIZE 1024 37 38 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 39 40 /* Used by the raid56 code to lock stripes for read/modify/write */ 41 struct btrfs_stripe_hash { 42 struct list_head hash_list; 43 spinlock_t lock; 44 }; 45 46 /* Used by the raid56 code to lock stripes for read/modify/write */ 47 struct btrfs_stripe_hash_table { 48 struct list_head stripe_cache; 49 spinlock_t cache_lock; 50 int cache_size; 51 struct btrfs_stripe_hash table[]; 52 }; 53 54 enum btrfs_rbio_ops { 55 BTRFS_RBIO_WRITE, 56 BTRFS_RBIO_READ_REBUILD, 57 BTRFS_RBIO_PARITY_SCRUB, 58 BTRFS_RBIO_REBUILD_MISSING, 59 }; 60 61 struct btrfs_raid_bio { 62 struct btrfs_fs_info *fs_info; 63 struct btrfs_bio *bbio; 64 65 /* while we're doing rmw on a stripe 66 * we put it into a hash table so we can 67 * lock the stripe and merge more rbios 68 * into it. 69 */ 70 struct list_head hash_list; 71 72 /* 73 * LRU list for the stripe cache 74 */ 75 struct list_head stripe_cache; 76 77 /* 78 * for scheduling work in the helper threads 79 */ 80 struct btrfs_work work; 81 82 /* 83 * bio list and bio_list_lock are used 84 * to add more bios into the stripe 85 * in hopes of avoiding the full rmw 86 */ 87 struct bio_list bio_list; 88 spinlock_t bio_list_lock; 89 90 /* also protected by the bio_list_lock, the 91 * plug list is used by the plugging code 92 * to collect partial bios while plugged. The 93 * stripe locking code also uses it to hand off 94 * the stripe lock to the next pending IO 95 */ 96 struct list_head plug_list; 97 98 /* 99 * flags that tell us if it is safe to 100 * merge with this bio 101 */ 102 unsigned long flags; 103 104 /* size of each individual stripe on disk */ 105 int stripe_len; 106 107 /* number of data stripes (no p/q) */ 108 int nr_data; 109 110 int real_stripes; 111 112 int stripe_npages; 113 /* 114 * set if we're doing a parity rebuild 115 * for a read from higher up, which is handled 116 * differently from a parity rebuild as part of 117 * rmw 118 */ 119 enum btrfs_rbio_ops operation; 120 121 /* first bad stripe */ 122 int faila; 123 124 /* second bad stripe (for raid6 use) */ 125 int failb; 126 127 int scrubp; 128 /* 129 * number of pages needed to represent the full 130 * stripe 131 */ 132 int nr_pages; 133 134 /* 135 * size of all the bios in the bio_list. This 136 * helps us decide if the rbio maps to a full 137 * stripe or not 138 */ 139 int bio_list_bytes; 140 141 int generic_bio_cnt; 142 143 refcount_t refs; 144 145 atomic_t stripes_pending; 146 147 atomic_t error; 148 /* 149 * these are two arrays of pointers. 
We allocate the 150 * rbio big enough to hold them both and setup their 151 * locations when the rbio is allocated 152 */ 153 154 /* pointers to pages that we allocated for 155 * reading/writing stripes directly from the disk (including P/Q) 156 */ 157 struct page **stripe_pages; 158 159 /* 160 * pointers to the pages in the bio_list. Stored 161 * here for faster lookup 162 */ 163 struct page **bio_pages; 164 165 /* 166 * bitmap to record which horizontal stripe has data 167 */ 168 unsigned long *dbitmap; 169 170 /* allocated with real_stripes-many pointers for finish_*() calls */ 171 void **finish_pointers; 172 173 /* allocated with stripe_npages-many bits for finish_*() calls */ 174 unsigned long *finish_pbitmap; 175 }; 176 177 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 178 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 179 static void rmw_work(struct btrfs_work *work); 180 static void read_rebuild_work(struct btrfs_work *work); 181 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 182 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 183 static void __free_raid_bio(struct btrfs_raid_bio *rbio); 184 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 185 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 186 187 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 188 int need_check); 189 static void scrub_parity_work(struct btrfs_work *work); 190 191 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) 192 { 193 btrfs_init_work(&rbio->work, work_func, NULL, NULL); 194 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 195 } 196 197 /* 198 * the stripe hash table is used for locking, and to collect 199 * bios in hopes of making a full stripe 200 */ 201 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 202 { 203 struct btrfs_stripe_hash_table *table; 204 struct btrfs_stripe_hash_table *x; 205 struct btrfs_stripe_hash *cur; 206 struct btrfs_stripe_hash *h; 207 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 208 int i; 209 210 if (info->stripe_hash_table) 211 return 0; 212 213 /* 214 * The table is large, starting with order 4 and can go as high as 215 * order 7 in case lock debugging is turned on. 216 * 217 * Try harder to allocate and fallback to vmalloc to lower the chance 218 * of a failing mount. 219 */ 220 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); 221 if (!table) 222 return -ENOMEM; 223 224 spin_lock_init(&table->cache_lock); 225 INIT_LIST_HEAD(&table->stripe_cache); 226 227 h = table->table; 228 229 for (i = 0; i < num_entries; i++) { 230 cur = h + i; 231 INIT_LIST_HEAD(&cur->hash_list); 232 spin_lock_init(&cur->lock); 233 } 234 235 x = cmpxchg(&info->stripe_hash_table, NULL, table); 236 if (x) 237 kvfree(x); 238 return 0; 239 } 240 241 /* 242 * caching an rbio means to copy anything from the 243 * bio_pages array into the stripe_pages array. We 244 * use the page uptodate bit in the stripe cache array 245 * to indicate if it has valid data 246 * 247 * once the caching is done, we set the cache ready 248 * bit. 
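 *
 * the copy itself is a plain copy_page() from each mapped bio page into
 * the matching stripe page; slots with no bio page are skipped and keep
 * whatever the stripe page already held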
249 */ 250 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 251 { 252 int i; 253 char *s; 254 char *d; 255 int ret; 256 257 ret = alloc_rbio_pages(rbio); 258 if (ret) 259 return; 260 261 for (i = 0; i < rbio->nr_pages; i++) { 262 if (!rbio->bio_pages[i]) 263 continue; 264 265 s = kmap(rbio->bio_pages[i]); 266 d = kmap(rbio->stripe_pages[i]); 267 268 copy_page(d, s); 269 270 kunmap(rbio->bio_pages[i]); 271 kunmap(rbio->stripe_pages[i]); 272 SetPageUptodate(rbio->stripe_pages[i]); 273 } 274 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 275 } 276 277 /* 278 * we hash on the first logical address of the stripe 279 */ 280 static int rbio_bucket(struct btrfs_raid_bio *rbio) 281 { 282 u64 num = rbio->bbio->raid_map[0]; 283 284 /* 285 * we shift down quite a bit. We're using byte 286 * addressing, and most of the lower bits are zeros. 287 * This tends to upset hash_64, and it consistently 288 * returns just one or two different values. 289 * 290 * shifting off the lower bits fixes things. 291 */ 292 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 293 } 294 295 /* 296 * stealing an rbio means taking all the uptodate pages from the stripe 297 * array in the source rbio and putting them into the destination rbio 298 */ 299 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 300 { 301 int i; 302 struct page *s; 303 struct page *d; 304 305 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 306 return; 307 308 for (i = 0; i < dest->nr_pages; i++) { 309 s = src->stripe_pages[i]; 310 if (!s || !PageUptodate(s)) { 311 continue; 312 } 313 314 d = dest->stripe_pages[i]; 315 if (d) 316 __free_page(d); 317 318 dest->stripe_pages[i] = s; 319 src->stripe_pages[i] = NULL; 320 } 321 } 322 323 /* 324 * merging means we take the bio_list from the victim and 325 * splice it into the destination. The victim should 326 * be discarded afterwards. 327 * 328 * must be called with dest->rbio_list_lock held 329 */ 330 static void merge_rbio(struct btrfs_raid_bio *dest, 331 struct btrfs_raid_bio *victim) 332 { 333 bio_list_merge(&dest->bio_list, &victim->bio_list); 334 dest->bio_list_bytes += victim->bio_list_bytes; 335 dest->generic_bio_cnt += victim->generic_bio_cnt; 336 bio_list_init(&victim->bio_list); 337 } 338 339 /* 340 * used to prune items that are in the cache. The caller 341 * must hold the hash table lock. 342 */ 343 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 344 { 345 int bucket = rbio_bucket(rbio); 346 struct btrfs_stripe_hash_table *table; 347 struct btrfs_stripe_hash *h; 348 int freeit = 0; 349 350 /* 351 * check the bit again under the hash table lock. 352 */ 353 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 354 return; 355 356 table = rbio->fs_info->stripe_hash_table; 357 h = table->table + bucket; 358 359 /* hold the lock for the bucket because we may be 360 * removing it from the hash table 361 */ 362 spin_lock(&h->lock); 363 364 /* 365 * hold the lock for the bio list because we need 366 * to make sure the bio list is empty 367 */ 368 spin_lock(&rbio->bio_list_lock); 369 370 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 371 list_del_init(&rbio->stripe_cache); 372 table->cache_size -= 1; 373 freeit = 1; 374 375 /* if the bio list isn't empty, this rbio is 376 * still involved in an IO. We take it out 377 * of the cache list, and drop the ref that 378 * was held for the list. 
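		 * (the cache list and the hash list each pin the rbio with
		 * their own reference, taken in cache_rbio() and
		 * lock_stripe_add() respectively)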
379 * 380 * If the bio_list was empty, we also remove 381 * the rbio from the hash_table, and drop 382 * the corresponding ref 383 */ 384 if (bio_list_empty(&rbio->bio_list)) { 385 if (!list_empty(&rbio->hash_list)) { 386 list_del_init(&rbio->hash_list); 387 refcount_dec(&rbio->refs); 388 BUG_ON(!list_empty(&rbio->plug_list)); 389 } 390 } 391 } 392 393 spin_unlock(&rbio->bio_list_lock); 394 spin_unlock(&h->lock); 395 396 if (freeit) 397 __free_raid_bio(rbio); 398 } 399 400 /* 401 * prune a given rbio from the cache 402 */ 403 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 404 { 405 struct btrfs_stripe_hash_table *table; 406 unsigned long flags; 407 408 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 409 return; 410 411 table = rbio->fs_info->stripe_hash_table; 412 413 spin_lock_irqsave(&table->cache_lock, flags); 414 __remove_rbio_from_cache(rbio); 415 spin_unlock_irqrestore(&table->cache_lock, flags); 416 } 417 418 /* 419 * remove everything in the cache 420 */ 421 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 422 { 423 struct btrfs_stripe_hash_table *table; 424 unsigned long flags; 425 struct btrfs_raid_bio *rbio; 426 427 table = info->stripe_hash_table; 428 429 spin_lock_irqsave(&table->cache_lock, flags); 430 while (!list_empty(&table->stripe_cache)) { 431 rbio = list_entry(table->stripe_cache.next, 432 struct btrfs_raid_bio, 433 stripe_cache); 434 __remove_rbio_from_cache(rbio); 435 } 436 spin_unlock_irqrestore(&table->cache_lock, flags); 437 } 438 439 /* 440 * remove all cached entries and free the hash table 441 * used by unmount 442 */ 443 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 444 { 445 if (!info->stripe_hash_table) 446 return; 447 btrfs_clear_rbio_cache(info); 448 kvfree(info->stripe_hash_table); 449 info->stripe_hash_table = NULL; 450 } 451 452 /* 453 * insert an rbio into the stripe cache. It 454 * must have already been prepared by calling 455 * cache_rbio_pages 456 * 457 * If this rbio was already cached, it gets 458 * moved to the front of the lru. 459 * 460 * If the size of the rbio cache is too big, we 461 * prune an item. 462 */ 463 static void cache_rbio(struct btrfs_raid_bio *rbio) 464 { 465 struct btrfs_stripe_hash_table *table; 466 unsigned long flags; 467 468 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 469 return; 470 471 table = rbio->fs_info->stripe_hash_table; 472 473 spin_lock_irqsave(&table->cache_lock, flags); 474 spin_lock(&rbio->bio_list_lock); 475 476 /* bump our ref if we were not in the list before */ 477 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 478 refcount_inc(&rbio->refs); 479 480 if (!list_empty(&rbio->stripe_cache)){ 481 list_move(&rbio->stripe_cache, &table->stripe_cache); 482 } else { 483 list_add(&rbio->stripe_cache, &table->stripe_cache); 484 table->cache_size += 1; 485 } 486 487 spin_unlock(&rbio->bio_list_lock); 488 489 if (table->cache_size > RBIO_CACHE_SIZE) { 490 struct btrfs_raid_bio *found; 491 492 found = list_entry(table->stripe_cache.prev, 493 struct btrfs_raid_bio, 494 stripe_cache); 495 496 if (found != rbio) 497 __remove_rbio_from_cache(found); 498 } 499 500 spin_unlock_irqrestore(&table->cache_lock, flags); 501 } 502 503 /* 504 * helper function to run the xor_blocks api. It is only 505 * able to do MAX_XOR_BLOCKS at a time, so we need to 506 * loop through. 
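 *
 * xor_blocks() takes at most MAX_XOR_BLOCKS source buffers per call, so
 * stripes with more data drives than that are folded into the destination
 * in several passes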
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us. We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bbio->raid_map[0] !=
	    cur->bbio->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * Parity scrub and rebuild-missing operations read the full stripe
	 * from the drives themselves, then check and repair the parity and
	 * write the new results.
	 *
	 * We're not allowed to add any new bios to their bio lists here;
	 * anyone else that wants to change this stripe needs to do their
	 * own rmw.
584 */ 585 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 586 return 0; 587 588 if (last->operation == BTRFS_RBIO_REBUILD_MISSING) 589 return 0; 590 591 if (last->operation == BTRFS_RBIO_READ_REBUILD) { 592 int fa = last->faila; 593 int fb = last->failb; 594 int cur_fa = cur->faila; 595 int cur_fb = cur->failb; 596 597 if (last->faila >= last->failb) { 598 fa = last->failb; 599 fb = last->faila; 600 } 601 602 if (cur->faila >= cur->failb) { 603 cur_fa = cur->failb; 604 cur_fb = cur->faila; 605 } 606 607 if (fa != cur_fa || fb != cur_fb) 608 return 0; 609 } 610 return 1; 611 } 612 613 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, 614 int index) 615 { 616 return stripe * rbio->stripe_npages + index; 617 } 618 619 /* 620 * these are just the pages from the rbio array, not from anything 621 * the FS sent down to us 622 */ 623 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, 624 int index) 625 { 626 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; 627 } 628 629 /* 630 * helper to index into the pstripe 631 */ 632 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 633 { 634 return rbio_stripe_page(rbio, rbio->nr_data, index); 635 } 636 637 /* 638 * helper to index into the qstripe, returns null 639 * if there is no qstripe 640 */ 641 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 642 { 643 if (rbio->nr_data + 1 == rbio->real_stripes) 644 return NULL; 645 return rbio_stripe_page(rbio, rbio->nr_data + 1, index); 646 } 647 648 /* 649 * The first stripe in the table for a logical address 650 * has the lock. rbios are added in one of three ways: 651 * 652 * 1) Nobody has the stripe locked yet. The rbio is given 653 * the lock and 0 is returned. The caller must start the IO 654 * themselves. 655 * 656 * 2) Someone has the stripe locked, but we're able to merge 657 * with the lock owner. The rbio is freed and the IO will 658 * start automatically along with the existing rbio. 1 is returned. 659 * 660 * 3) Someone has the stripe locked, but we're not able to merge. 661 * The rbio is added to the lock owner's plug list, or merged into 662 * an rbio already on the plug list. When the lock owner unlocks, 663 * the next rbio on the list is run and the IO is started automatically. 664 * 1 is returned 665 * 666 * If we return 0, the caller still owns the rbio and must continue with 667 * IO submission. If we return 1, the caller must assume the rbio has 668 * already been freed. 669 */ 670 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 671 { 672 struct btrfs_stripe_hash *h; 673 struct btrfs_raid_bio *cur; 674 struct btrfs_raid_bio *pending; 675 unsigned long flags; 676 struct btrfs_raid_bio *freeit = NULL; 677 struct btrfs_raid_bio *cache_drop = NULL; 678 int ret = 0; 679 680 h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); 681 682 spin_lock_irqsave(&h->lock, flags); 683 list_for_each_entry(cur, &h->hash_list, hash_list) { 684 if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) 685 continue; 686 687 spin_lock(&cur->bio_list_lock); 688 689 /* Can we steal this cached rbio's pages? 
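		 * an idle cached rbio (empty bio_list and plug_list, cached,
		 * not rmw locked) still holds uptodate copies of the stripe,
		 * so we lift its pages into the new rbio instead of reading
		 * them back from disk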
*/ 690 if (bio_list_empty(&cur->bio_list) && 691 list_empty(&cur->plug_list) && 692 test_bit(RBIO_CACHE_BIT, &cur->flags) && 693 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 694 list_del_init(&cur->hash_list); 695 refcount_dec(&cur->refs); 696 697 steal_rbio(cur, rbio); 698 cache_drop = cur; 699 spin_unlock(&cur->bio_list_lock); 700 701 goto lockit; 702 } 703 704 /* Can we merge into the lock owner? */ 705 if (rbio_can_merge(cur, rbio)) { 706 merge_rbio(cur, rbio); 707 spin_unlock(&cur->bio_list_lock); 708 freeit = rbio; 709 ret = 1; 710 goto out; 711 } 712 713 714 /* 715 * We couldn't merge with the running rbio, see if we can merge 716 * with the pending ones. We don't have to check for rmw_locked 717 * because there is no way they are inside finish_rmw right now 718 */ 719 list_for_each_entry(pending, &cur->plug_list, plug_list) { 720 if (rbio_can_merge(pending, rbio)) { 721 merge_rbio(pending, rbio); 722 spin_unlock(&cur->bio_list_lock); 723 freeit = rbio; 724 ret = 1; 725 goto out; 726 } 727 } 728 729 /* 730 * No merging, put us on the tail of the plug list, our rbio 731 * will be started with the currently running rbio unlocks 732 */ 733 list_add_tail(&rbio->plug_list, &cur->plug_list); 734 spin_unlock(&cur->bio_list_lock); 735 ret = 1; 736 goto out; 737 } 738 lockit: 739 refcount_inc(&rbio->refs); 740 list_add(&rbio->hash_list, &h->hash_list); 741 out: 742 spin_unlock_irqrestore(&h->lock, flags); 743 if (cache_drop) 744 remove_rbio_from_cache(cache_drop); 745 if (freeit) 746 __free_raid_bio(freeit); 747 return ret; 748 } 749 750 /* 751 * called as rmw or parity rebuild is completed. If the plug list has more 752 * rbios waiting for this stripe, the next one on the list will be started 753 */ 754 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 755 { 756 int bucket; 757 struct btrfs_stripe_hash *h; 758 unsigned long flags; 759 int keep_cache = 0; 760 761 bucket = rbio_bucket(rbio); 762 h = rbio->fs_info->stripe_hash_table->table + bucket; 763 764 if (list_empty(&rbio->plug_list)) 765 cache_rbio(rbio); 766 767 spin_lock_irqsave(&h->lock, flags); 768 spin_lock(&rbio->bio_list_lock); 769 770 if (!list_empty(&rbio->hash_list)) { 771 /* 772 * if we're still cached and there is no other IO 773 * to perform, just leave this rbio here for others 774 * to steal from later 775 */ 776 if (list_empty(&rbio->plug_list) && 777 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 778 keep_cache = 1; 779 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 780 BUG_ON(!bio_list_empty(&rbio->bio_list)); 781 goto done; 782 } 783 784 list_del_init(&rbio->hash_list); 785 refcount_dec(&rbio->refs); 786 787 /* 788 * we use the plug list to hold all the rbios 789 * waiting for the chance to lock this stripe. 790 * hand the lock over to one of them. 
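		 * the new owner also inherits our cached stripe pages via
		 * steal_rbio() for the write, scrub and rebuild-missing
		 * cases below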
791 */ 792 if (!list_empty(&rbio->plug_list)) { 793 struct btrfs_raid_bio *next; 794 struct list_head *head = rbio->plug_list.next; 795 796 next = list_entry(head, struct btrfs_raid_bio, 797 plug_list); 798 799 list_del_init(&rbio->plug_list); 800 801 list_add(&next->hash_list, &h->hash_list); 802 refcount_inc(&next->refs); 803 spin_unlock(&rbio->bio_list_lock); 804 spin_unlock_irqrestore(&h->lock, flags); 805 806 if (next->operation == BTRFS_RBIO_READ_REBUILD) 807 start_async_work(next, read_rebuild_work); 808 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { 809 steal_rbio(rbio, next); 810 start_async_work(next, read_rebuild_work); 811 } else if (next->operation == BTRFS_RBIO_WRITE) { 812 steal_rbio(rbio, next); 813 start_async_work(next, rmw_work); 814 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 815 steal_rbio(rbio, next); 816 start_async_work(next, scrub_parity_work); 817 } 818 819 goto done_nolock; 820 } 821 } 822 done: 823 spin_unlock(&rbio->bio_list_lock); 824 spin_unlock_irqrestore(&h->lock, flags); 825 826 done_nolock: 827 if (!keep_cache) 828 remove_rbio_from_cache(rbio); 829 } 830 831 static void __free_raid_bio(struct btrfs_raid_bio *rbio) 832 { 833 int i; 834 835 if (!refcount_dec_and_test(&rbio->refs)) 836 return; 837 838 WARN_ON(!list_empty(&rbio->stripe_cache)); 839 WARN_ON(!list_empty(&rbio->hash_list)); 840 WARN_ON(!bio_list_empty(&rbio->bio_list)); 841 842 for (i = 0; i < rbio->nr_pages; i++) { 843 if (rbio->stripe_pages[i]) { 844 __free_page(rbio->stripe_pages[i]); 845 rbio->stripe_pages[i] = NULL; 846 } 847 } 848 849 btrfs_put_bbio(rbio->bbio); 850 kfree(rbio); 851 } 852 853 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 854 { 855 struct bio *next; 856 857 while (cur) { 858 next = cur->bi_next; 859 cur->bi_next = NULL; 860 cur->bi_status = err; 861 bio_endio(cur); 862 cur = next; 863 } 864 } 865 866 /* 867 * this frees the rbio and runs through all the bios in the 868 * bio_list and calls end_io on them 869 */ 870 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) 871 { 872 struct bio *cur = bio_list_get(&rbio->bio_list); 873 struct bio *extra; 874 875 if (rbio->generic_bio_cnt) 876 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); 877 878 /* 879 * At this moment, rbio->bio_list is empty, however since rbio does not 880 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 881 * hash list, rbio may be merged with others so that rbio->bio_list 882 * becomes non-empty. 883 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 884 * more and we can call bio_endio() on all queued bios. 885 */ 886 unlock_stripe(rbio); 887 extra = bio_list_get(&rbio->bio_list); 888 __free_raid_bio(rbio); 889 890 rbio_endio_bio_list(cur, err); 891 if (extra) 892 rbio_endio_bio_list(extra, err); 893 } 894 895 /* 896 * end io function used by finish_rmw. When we finally 897 * get here, we've written a full stripe 898 */ 899 static void raid_write_end_io(struct bio *bio) 900 { 901 struct btrfs_raid_bio *rbio = bio->bi_private; 902 blk_status_t err = bio->bi_status; 903 int max_errors; 904 905 if (err) 906 fail_bio_stripe(rbio, bio); 907 908 bio_put(bio); 909 910 if (!atomic_dec_and_test(&rbio->stripes_pending)) 911 return; 912 913 err = BLK_STS_OK; 914 915 /* OK, we have read all the stripes we need to. */ 916 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
		     0 : rbio->bbio->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else. This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}

/*
 * allocation and initial setup for the btrfs_raid_bio. Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *bbio,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
		       sizeof(*rbio->finish_pbitmap) *
				BITS_TO_LONGS(stripe_npages),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->fs_info = fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages, bio_pages, etc arrays point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_pages, num_pages);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
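	/*
	 * worked example (a sketch only, assuming 64K stripes, 4K pages and
	 * a five device raid6 profile): num_pages = 16 * 5 = 80 and
	 * stripe_npages = 16, so the tail of the allocation holds 80 stripe
	 * page pointers, 80 bio page pointers, 5 finish pointers and two
	 * single-long bitmaps
	 */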
#undef CONSUME_ALLOC 1025 1026 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) 1027 nr_data = real_stripes - 1; 1028 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) 1029 nr_data = real_stripes - 2; 1030 else 1031 BUG(); 1032 1033 rbio->nr_data = nr_data; 1034 return rbio; 1035 } 1036 1037 /* allocate pages for all the stripes in the bio, including parity */ 1038 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1039 { 1040 int i; 1041 struct page *page; 1042 1043 for (i = 0; i < rbio->nr_pages; i++) { 1044 if (rbio->stripe_pages[i]) 1045 continue; 1046 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1047 if (!page) 1048 return -ENOMEM; 1049 rbio->stripe_pages[i] = page; 1050 } 1051 return 0; 1052 } 1053 1054 /* only allocate pages for p/q stripes */ 1055 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1056 { 1057 int i; 1058 struct page *page; 1059 1060 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); 1061 1062 for (; i < rbio->nr_pages; i++) { 1063 if (rbio->stripe_pages[i]) 1064 continue; 1065 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1066 if (!page) 1067 return -ENOMEM; 1068 rbio->stripe_pages[i] = page; 1069 } 1070 return 0; 1071 } 1072 1073 /* 1074 * add a single page from a specific stripe into our list of bios for IO 1075 * this will try to merge into existing bios if possible, and returns 1076 * zero if all went well. 1077 */ 1078 static int rbio_add_io_page(struct btrfs_raid_bio *rbio, 1079 struct bio_list *bio_list, 1080 struct page *page, 1081 int stripe_nr, 1082 unsigned long page_index, 1083 unsigned long bio_max_len) 1084 { 1085 struct bio *last = bio_list->tail; 1086 u64 last_end = 0; 1087 int ret; 1088 struct bio *bio; 1089 struct btrfs_bio_stripe *stripe; 1090 u64 disk_start; 1091 1092 stripe = &rbio->bbio->stripes[stripe_nr]; 1093 disk_start = stripe->physical + (page_index << PAGE_SHIFT); 1094 1095 /* if the device is missing, just fail this stripe */ 1096 if (!stripe->dev->bdev) 1097 return fail_rbio_index(rbio, stripe_nr); 1098 1099 /* see if we can add this page onto our existing bio */ 1100 if (last) { 1101 last_end = (u64)last->bi_iter.bi_sector << 9; 1102 last_end += last->bi_iter.bi_size; 1103 1104 /* 1105 * we can't merge these if they are from different 1106 * devices or if they are not contiguous 1107 */ 1108 if (last_end == disk_start && stripe->dev->bdev && 1109 !last->bi_status && 1110 last->bi_disk == stripe->dev->bdev->bd_disk && 1111 last->bi_partno == stripe->dev->bdev->bd_partno) { 1112 ret = bio_add_page(last, page, PAGE_SIZE, 0); 1113 if (ret == PAGE_SIZE) 1114 return 0; 1115 } 1116 } 1117 1118 /* put a new bio on the list */ 1119 bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); 1120 bio->bi_iter.bi_size = 0; 1121 bio_set_dev(bio, stripe->dev->bdev); 1122 bio->bi_iter.bi_sector = disk_start >> 9; 1123 1124 bio_add_page(bio, page, PAGE_SIZE, 0); 1125 bio_list_add(bio_list, bio); 1126 return 0; 1127 } 1128 1129 /* 1130 * while we're doing the read/modify/write cycle, we could 1131 * have errors in reading pages off the disk. This checks 1132 * for errors and if we're not able to read the page it'll 1133 * trigger parity reconstruction. 
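 * (failures recorded by fail_bio_stripe() send the rbio through
 * __raid56_parity_recover() instead of straight to finish_rmw())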
The rmw will be finished 1134 * after we've reconstructed the failed stripes 1135 */ 1136 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1137 { 1138 if (rbio->faila >= 0 || rbio->failb >= 0) { 1139 BUG_ON(rbio->faila == rbio->real_stripes - 1); 1140 __raid56_parity_recover(rbio); 1141 } else { 1142 finish_rmw(rbio); 1143 } 1144 } 1145 1146 /* 1147 * helper function to walk our bio list and populate the bio_pages array with 1148 * the result. This seems expensive, but it is faster than constantly 1149 * searching through the bio list as we setup the IO in finish_rmw or stripe 1150 * reconstruction. 1151 * 1152 * This must be called before you trust the answers from page_in_rbio 1153 */ 1154 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1155 { 1156 struct bio *bio; 1157 u64 start; 1158 unsigned long stripe_offset; 1159 unsigned long page_index; 1160 1161 spin_lock_irq(&rbio->bio_list_lock); 1162 bio_list_for_each(bio, &rbio->bio_list) { 1163 struct bio_vec bvec; 1164 struct bvec_iter iter; 1165 int i = 0; 1166 1167 start = (u64)bio->bi_iter.bi_sector << 9; 1168 stripe_offset = start - rbio->bbio->raid_map[0]; 1169 page_index = stripe_offset >> PAGE_SHIFT; 1170 1171 if (bio_flagged(bio, BIO_CLONED)) 1172 bio->bi_iter = btrfs_io_bio(bio)->iter; 1173 1174 bio_for_each_segment(bvec, bio, iter) { 1175 rbio->bio_pages[page_index + i] = bvec.bv_page; 1176 i++; 1177 } 1178 } 1179 spin_unlock_irq(&rbio->bio_list_lock); 1180 } 1181 1182 /* 1183 * this is called from one of two situations. We either 1184 * have a full stripe from the higher layers, or we've read all 1185 * the missing bits off disk. 1186 * 1187 * This will calculate the parity and then send down any 1188 * changed blocks. 1189 */ 1190 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1191 { 1192 struct btrfs_bio *bbio = rbio->bbio; 1193 void **pointers = rbio->finish_pointers; 1194 int nr_data = rbio->nr_data; 1195 int stripe; 1196 int pagenr; 1197 bool has_qstripe; 1198 struct bio_list bio_list; 1199 struct bio *bio; 1200 int ret; 1201 1202 bio_list_init(&bio_list); 1203 1204 if (rbio->real_stripes - rbio->nr_data == 1) 1205 has_qstripe = false; 1206 else if (rbio->real_stripes - rbio->nr_data == 2) 1207 has_qstripe = true; 1208 else 1209 BUG(); 1210 1211 /* at this point we either have a full stripe, 1212 * or we've read the full stripe from the drive. 1213 * recalculate the parity and write the new results. 1214 * 1215 * We're not allowed to add any new bios to the 1216 * bio list here, anyone else that wants to 1217 * change this stripe needs to do their own rmw. 1218 */ 1219 spin_lock_irq(&rbio->bio_list_lock); 1220 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1221 spin_unlock_irq(&rbio->bio_list_lock); 1222 1223 atomic_set(&rbio->error, 0); 1224 1225 /* 1226 * now that we've set rmw_locked, run through the 1227 * bio list one last time and map the page pointers 1228 * 1229 * We don't cache full rbios because we're assuming 1230 * the higher layers are unlikely to use this area of 1231 * the disk again soon. If they do use it again, 1232 * hopefully they will send another full bio. 
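	 *
	 * partial stripes are cached right away via cache_rbio_pages() so a
	 * later rmw can steal the pages; full stripes just clear
	 * RBIO_CACHE_READY_BIT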
1233 */ 1234 index_rbio_pages(rbio); 1235 if (!rbio_is_full(rbio)) 1236 cache_rbio_pages(rbio); 1237 else 1238 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1239 1240 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1241 struct page *p; 1242 /* first collect one page from each data stripe */ 1243 for (stripe = 0; stripe < nr_data; stripe++) { 1244 p = page_in_rbio(rbio, stripe, pagenr, 0); 1245 pointers[stripe] = kmap(p); 1246 } 1247 1248 /* then add the parity stripe */ 1249 p = rbio_pstripe_page(rbio, pagenr); 1250 SetPageUptodate(p); 1251 pointers[stripe++] = kmap(p); 1252 1253 if (has_qstripe) { 1254 1255 /* 1256 * raid6, add the qstripe and call the 1257 * library function to fill in our p/q 1258 */ 1259 p = rbio_qstripe_page(rbio, pagenr); 1260 SetPageUptodate(p); 1261 pointers[stripe++] = kmap(p); 1262 1263 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 1264 pointers); 1265 } else { 1266 /* raid5 */ 1267 copy_page(pointers[nr_data], pointers[0]); 1268 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 1269 } 1270 1271 1272 for (stripe = 0; stripe < rbio->real_stripes; stripe++) 1273 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1274 } 1275 1276 /* 1277 * time to start writing. Make bios for everything from the 1278 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1279 * everything else. 1280 */ 1281 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1282 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1283 struct page *page; 1284 if (stripe < rbio->nr_data) { 1285 page = page_in_rbio(rbio, stripe, pagenr, 1); 1286 if (!page) 1287 continue; 1288 } else { 1289 page = rbio_stripe_page(rbio, stripe, pagenr); 1290 } 1291 1292 ret = rbio_add_io_page(rbio, &bio_list, 1293 page, stripe, pagenr, rbio->stripe_len); 1294 if (ret) 1295 goto cleanup; 1296 } 1297 } 1298 1299 if (likely(!bbio->num_tgtdevs)) 1300 goto write_data; 1301 1302 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1303 if (!bbio->tgtdev_map[stripe]) 1304 continue; 1305 1306 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1307 struct page *page; 1308 if (stripe < rbio->nr_data) { 1309 page = page_in_rbio(rbio, stripe, pagenr, 1); 1310 if (!page) 1311 continue; 1312 } else { 1313 page = rbio_stripe_page(rbio, stripe, pagenr); 1314 } 1315 1316 ret = rbio_add_io_page(rbio, &bio_list, page, 1317 rbio->bbio->tgtdev_map[stripe], 1318 pagenr, rbio->stripe_len); 1319 if (ret) 1320 goto cleanup; 1321 } 1322 } 1323 1324 write_data: 1325 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1326 BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1327 1328 while (1) { 1329 bio = bio_list_pop(&bio_list); 1330 if (!bio) 1331 break; 1332 1333 bio->bi_private = rbio; 1334 bio->bi_end_io = raid_write_end_io; 1335 bio->bi_opf = REQ_OP_WRITE; 1336 1337 submit_bio(bio); 1338 } 1339 return; 1340 1341 cleanup: 1342 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1343 1344 while ((bio = bio_list_pop(&bio_list))) 1345 bio_put(bio); 1346 } 1347 1348 /* 1349 * helper to find the stripe number for a given bio. Used to figure out which 1350 * stripe has failed. This expects the bio to correspond to a physical disk, 1351 * so it looks up based on physical sector numbers. 
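 *
 * find_logical_bio_stripe() below is the logical address counterpart,
 * used for bios from the upper layers that have not been mapped yet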
1352 */ 1353 static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1354 struct bio *bio) 1355 { 1356 u64 physical = bio->bi_iter.bi_sector; 1357 u64 stripe_start; 1358 int i; 1359 struct btrfs_bio_stripe *stripe; 1360 1361 physical <<= 9; 1362 1363 for (i = 0; i < rbio->bbio->num_stripes; i++) { 1364 stripe = &rbio->bbio->stripes[i]; 1365 stripe_start = stripe->physical; 1366 if (physical >= stripe_start && 1367 physical < stripe_start + rbio->stripe_len && 1368 stripe->dev->bdev && 1369 bio->bi_disk == stripe->dev->bdev->bd_disk && 1370 bio->bi_partno == stripe->dev->bdev->bd_partno) { 1371 return i; 1372 } 1373 } 1374 return -1; 1375 } 1376 1377 /* 1378 * helper to find the stripe number for a given 1379 * bio (before mapping). Used to figure out which stripe has 1380 * failed. This looks up based on logical block numbers. 1381 */ 1382 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1383 struct bio *bio) 1384 { 1385 u64 logical = bio->bi_iter.bi_sector; 1386 u64 stripe_start; 1387 int i; 1388 1389 logical <<= 9; 1390 1391 for (i = 0; i < rbio->nr_data; i++) { 1392 stripe_start = rbio->bbio->raid_map[i]; 1393 if (logical >= stripe_start && 1394 logical < stripe_start + rbio->stripe_len) { 1395 return i; 1396 } 1397 } 1398 return -1; 1399 } 1400 1401 /* 1402 * returns -EIO if we had too many failures 1403 */ 1404 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1405 { 1406 unsigned long flags; 1407 int ret = 0; 1408 1409 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1410 1411 /* we already know this stripe is bad, move on */ 1412 if (rbio->faila == failed || rbio->failb == failed) 1413 goto out; 1414 1415 if (rbio->faila == -1) { 1416 /* first failure on this rbio */ 1417 rbio->faila = failed; 1418 atomic_inc(&rbio->error); 1419 } else if (rbio->failb == -1) { 1420 /* second failure on this rbio */ 1421 rbio->failb = failed; 1422 atomic_inc(&rbio->error); 1423 } else { 1424 ret = -EIO; 1425 } 1426 out: 1427 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1428 1429 return ret; 1430 } 1431 1432 /* 1433 * helper to fail a stripe based on a physical disk 1434 * bio. 1435 */ 1436 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1437 struct bio *bio) 1438 { 1439 int failed = find_bio_stripe(rbio, bio); 1440 1441 if (failed < 0) 1442 return -EIO; 1443 1444 return fail_rbio_index(rbio, failed); 1445 } 1446 1447 /* 1448 * this sets each page in the bio uptodate. It should only be used on private 1449 * rbio pages, nothing that comes in from the higher layers 1450 */ 1451 static void set_bio_pages_uptodate(struct bio *bio) 1452 { 1453 struct bio_vec *bvec; 1454 struct bvec_iter_all iter_all; 1455 1456 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1457 1458 bio_for_each_segment_all(bvec, bio, iter_all) 1459 SetPageUptodate(bvec->bv_page); 1460 } 1461 1462 /* 1463 * end io for the read phase of the rmw cycle. All the bios here are physical 1464 * stripe bios we've read from the disk so we can recalculate the parity of the 1465 * stripe. 
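 * successful reads flag their pages uptodate and failures are recorded
 * per stripe via fail_bio_stripe()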
1466 * 1467 * This will usually kick off finish_rmw once all the bios are read in, but it 1468 * may trigger parity reconstruction if we had any errors along the way 1469 */ 1470 static void raid_rmw_end_io(struct bio *bio) 1471 { 1472 struct btrfs_raid_bio *rbio = bio->bi_private; 1473 1474 if (bio->bi_status) 1475 fail_bio_stripe(rbio, bio); 1476 else 1477 set_bio_pages_uptodate(bio); 1478 1479 bio_put(bio); 1480 1481 if (!atomic_dec_and_test(&rbio->stripes_pending)) 1482 return; 1483 1484 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 1485 goto cleanup; 1486 1487 /* 1488 * this will normally call finish_rmw to start our write 1489 * but if there are any failed stripes we'll reconstruct 1490 * from parity first 1491 */ 1492 validate_rbio_for_rmw(rbio); 1493 return; 1494 1495 cleanup: 1496 1497 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1498 } 1499 1500 /* 1501 * the stripe must be locked by the caller. It will 1502 * unlock after all the writes are done 1503 */ 1504 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1505 { 1506 int bios_to_read = 0; 1507 struct bio_list bio_list; 1508 int ret; 1509 int pagenr; 1510 int stripe; 1511 struct bio *bio; 1512 1513 bio_list_init(&bio_list); 1514 1515 ret = alloc_rbio_pages(rbio); 1516 if (ret) 1517 goto cleanup; 1518 1519 index_rbio_pages(rbio); 1520 1521 atomic_set(&rbio->error, 0); 1522 /* 1523 * build a list of bios to read all the missing parts of this 1524 * stripe 1525 */ 1526 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1527 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1528 struct page *page; 1529 /* 1530 * we want to find all the pages missing from 1531 * the rbio and read them from the disk. If 1532 * page_in_rbio finds a page in the bio list 1533 * we don't need to read it off the stripe. 1534 */ 1535 page = page_in_rbio(rbio, stripe, pagenr, 1); 1536 if (page) 1537 continue; 1538 1539 page = rbio_stripe_page(rbio, stripe, pagenr); 1540 /* 1541 * the bio cache may have handed us an uptodate 1542 * page. If so, be happy and use it 1543 */ 1544 if (PageUptodate(page)) 1545 continue; 1546 1547 ret = rbio_add_io_page(rbio, &bio_list, page, 1548 stripe, pagenr, rbio->stripe_len); 1549 if (ret) 1550 goto cleanup; 1551 } 1552 } 1553 1554 bios_to_read = bio_list_size(&bio_list); 1555 if (!bios_to_read) { 1556 /* 1557 * this can happen if others have merged with 1558 * us, it means there is nothing left to read. 1559 * But if there are missing devices it may not be 1560 * safe to do the full stripe write yet. 1561 */ 1562 goto finish; 1563 } 1564 1565 /* 1566 * the bbio may be freed once we submit the last bio. Make sure 1567 * not to touch it after that 1568 */ 1569 atomic_set(&rbio->stripes_pending, bios_to_read); 1570 while (1) { 1571 bio = bio_list_pop(&bio_list); 1572 if (!bio) 1573 break; 1574 1575 bio->bi_private = rbio; 1576 bio->bi_end_io = raid_rmw_end_io; 1577 bio->bi_opf = REQ_OP_READ; 1578 1579 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 1580 1581 submit_bio(bio); 1582 } 1583 /* the actual write will happen once the reads are done */ 1584 return 0; 1585 1586 cleanup: 1587 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1588 1589 while ((bio = bio_list_pop(&bio_list))) 1590 bio_put(bio); 1591 1592 return -EIO; 1593 1594 finish: 1595 validate_rbio_for_rmw(rbio); 1596 return 0; 1597 } 1598 1599 /* 1600 * if the upper layers pass in a full stripe, we thank them by only allocating 1601 * enough pages to hold the parity, and sending it all down quickly. 
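 *
 * alloc_rbio_parity_pages() only allocates the p/q pages; the data pages
 * come straight from the bios the upper layers handed us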
1602 */ 1603 static int full_stripe_write(struct btrfs_raid_bio *rbio) 1604 { 1605 int ret; 1606 1607 ret = alloc_rbio_parity_pages(rbio); 1608 if (ret) { 1609 __free_raid_bio(rbio); 1610 return ret; 1611 } 1612 1613 ret = lock_stripe_add(rbio); 1614 if (ret == 0) 1615 finish_rmw(rbio); 1616 return 0; 1617 } 1618 1619 /* 1620 * partial stripe writes get handed over to async helpers. 1621 * We're really hoping to merge a few more writes into this 1622 * rbio before calculating new parity 1623 */ 1624 static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1625 { 1626 int ret; 1627 1628 ret = lock_stripe_add(rbio); 1629 if (ret == 0) 1630 start_async_work(rbio, rmw_work); 1631 return 0; 1632 } 1633 1634 /* 1635 * sometimes while we were reading from the drive to 1636 * recalculate parity, enough new bios come into create 1637 * a full stripe. So we do a check here to see if we can 1638 * go directly to finish_rmw 1639 */ 1640 static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1641 { 1642 /* head off into rmw land if we don't have a full stripe */ 1643 if (!rbio_is_full(rbio)) 1644 return partial_stripe_write(rbio); 1645 return full_stripe_write(rbio); 1646 } 1647 1648 /* 1649 * We use plugging call backs to collect full stripes. 1650 * Any time we get a partial stripe write while plugged 1651 * we collect it into a list. When the unplug comes down, 1652 * we sort the list by logical block number and merge 1653 * everything we can into the same rbios 1654 */ 1655 struct btrfs_plug_cb { 1656 struct blk_plug_cb cb; 1657 struct btrfs_fs_info *info; 1658 struct list_head rbio_list; 1659 struct btrfs_work work; 1660 }; 1661 1662 /* 1663 * rbios on the plug list are sorted for easier merging. 1664 */ 1665 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 1666 { 1667 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1668 plug_list); 1669 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1670 plug_list); 1671 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1672 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1673 1674 if (a_sector < b_sector) 1675 return -1; 1676 if (a_sector > b_sector) 1677 return 1; 1678 return 0; 1679 } 1680 1681 static void run_plug(struct btrfs_plug_cb *plug) 1682 { 1683 struct btrfs_raid_bio *cur; 1684 struct btrfs_raid_bio *last = NULL; 1685 1686 /* 1687 * sort our plug list then try to merge 1688 * everything we can in hopes of creating full 1689 * stripes. 
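	 *
	 * full stripes found after merging are sent down immediately;
	 * whatever cannot be merged into a full stripe goes through the
	 * normal partial stripe rmw path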
1690 */ 1691 list_sort(NULL, &plug->rbio_list, plug_cmp); 1692 while (!list_empty(&plug->rbio_list)) { 1693 cur = list_entry(plug->rbio_list.next, 1694 struct btrfs_raid_bio, plug_list); 1695 list_del_init(&cur->plug_list); 1696 1697 if (rbio_is_full(cur)) { 1698 int ret; 1699 1700 /* we have a full stripe, send it down */ 1701 ret = full_stripe_write(cur); 1702 BUG_ON(ret); 1703 continue; 1704 } 1705 if (last) { 1706 if (rbio_can_merge(last, cur)) { 1707 merge_rbio(last, cur); 1708 __free_raid_bio(cur); 1709 continue; 1710 1711 } 1712 __raid56_parity_write(last); 1713 } 1714 last = cur; 1715 } 1716 if (last) { 1717 __raid56_parity_write(last); 1718 } 1719 kfree(plug); 1720 } 1721 1722 /* 1723 * if the unplug comes from schedule, we have to push the 1724 * work off to a helper thread 1725 */ 1726 static void unplug_work(struct btrfs_work *work) 1727 { 1728 struct btrfs_plug_cb *plug; 1729 plug = container_of(work, struct btrfs_plug_cb, work); 1730 run_plug(plug); 1731 } 1732 1733 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1734 { 1735 struct btrfs_plug_cb *plug; 1736 plug = container_of(cb, struct btrfs_plug_cb, cb); 1737 1738 if (from_schedule) { 1739 btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1740 btrfs_queue_work(plug->info->rmw_workers, 1741 &plug->work); 1742 return; 1743 } 1744 run_plug(plug); 1745 } 1746 1747 /* 1748 * our main entry point for writes from the rest of the FS. 1749 */ 1750 int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, 1751 struct btrfs_bio *bbio, u64 stripe_len) 1752 { 1753 struct btrfs_raid_bio *rbio; 1754 struct btrfs_plug_cb *plug = NULL; 1755 struct blk_plug_cb *cb; 1756 int ret; 1757 1758 rbio = alloc_rbio(fs_info, bbio, stripe_len); 1759 if (IS_ERR(rbio)) { 1760 btrfs_put_bbio(bbio); 1761 return PTR_ERR(rbio); 1762 } 1763 bio_list_add(&rbio->bio_list, bio); 1764 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1765 rbio->operation = BTRFS_RBIO_WRITE; 1766 1767 btrfs_bio_counter_inc_noblocked(fs_info); 1768 rbio->generic_bio_cnt = 1; 1769 1770 /* 1771 * don't plug on full rbios, just get them out the door 1772 * as quickly as we can 1773 */ 1774 if (rbio_is_full(rbio)) { 1775 ret = full_stripe_write(rbio); 1776 if (ret) 1777 btrfs_bio_counter_dec(fs_info); 1778 return ret; 1779 } 1780 1781 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 1782 if (cb) { 1783 plug = container_of(cb, struct btrfs_plug_cb, cb); 1784 if (!plug->info) { 1785 plug->info = fs_info; 1786 INIT_LIST_HEAD(&plug->rbio_list); 1787 } 1788 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1789 ret = 0; 1790 } else { 1791 ret = __raid56_parity_write(rbio); 1792 if (ret) 1793 btrfs_bio_counter_dec(fs_info); 1794 } 1795 return ret; 1796 } 1797 1798 /* 1799 * all parity reconstruction happens here. We've read in everything 1800 * we can find from the drives and this does the heavy lifting of 1801 * sorting the good from the bad. 
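 *
 * a single data failure (raid5, or raid6 with only one bad stripe) is
 * rebuilt by xoring the parity block with the surviving data blocks;
 * double failures on raid6 go through raid6_datap_recov() or
 * raid6_2data_recov() from the raid6 library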
1802 */ 1803 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1804 { 1805 int pagenr, stripe; 1806 void **pointers; 1807 int faila = -1, failb = -1; 1808 struct page *page; 1809 blk_status_t err; 1810 int i; 1811 1812 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1813 if (!pointers) { 1814 err = BLK_STS_RESOURCE; 1815 goto cleanup_io; 1816 } 1817 1818 faila = rbio->faila; 1819 failb = rbio->failb; 1820 1821 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1822 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1823 spin_lock_irq(&rbio->bio_list_lock); 1824 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1825 spin_unlock_irq(&rbio->bio_list_lock); 1826 } 1827 1828 index_rbio_pages(rbio); 1829 1830 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1831 /* 1832 * Now we just use bitmap to mark the horizontal stripes in 1833 * which we have data when doing parity scrub. 1834 */ 1835 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1836 !test_bit(pagenr, rbio->dbitmap)) 1837 continue; 1838 1839 /* setup our array of pointers with pages 1840 * from each stripe 1841 */ 1842 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1843 /* 1844 * if we're rebuilding a read, we have to use 1845 * pages from the bio list 1846 */ 1847 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1848 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1849 (stripe == faila || stripe == failb)) { 1850 page = page_in_rbio(rbio, stripe, pagenr, 0); 1851 } else { 1852 page = rbio_stripe_page(rbio, stripe, pagenr); 1853 } 1854 pointers[stripe] = kmap(page); 1855 } 1856 1857 /* all raid6 handling here */ 1858 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1859 /* 1860 * single failure, rebuild from parity raid5 1861 * style 1862 */ 1863 if (failb < 0) { 1864 if (faila == rbio->nr_data) { 1865 /* 1866 * Just the P stripe has failed, without 1867 * a bad data or Q stripe. 1868 * TODO, we should redo the xor here. 1869 */ 1870 err = BLK_STS_IOERR; 1871 goto cleanup; 1872 } 1873 /* 1874 * a single failure in raid6 is rebuilt 1875 * in the pstripe code below 1876 */ 1877 goto pstripe; 1878 } 1879 1880 /* make sure our ps and qs are in order */ 1881 if (faila > failb) { 1882 int tmp = failb; 1883 failb = faila; 1884 faila = tmp; 1885 } 1886 1887 /* if the q stripe is failed, do a pstripe reconstruction 1888 * from the xors. 1889 * If both the q stripe and the P stripe are failed, we're 1890 * here due to a crc mismatch and we can't give them the 1891 * data they want 1892 */ 1893 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { 1894 if (rbio->bbio->raid_map[faila] == 1895 RAID5_P_STRIPE) { 1896 err = BLK_STS_IOERR; 1897 goto cleanup; 1898 } 1899 /* 1900 * otherwise we have one bad data stripe and 1901 * a good P stripe. raid5! 
1902 */ 1903 goto pstripe; 1904 } 1905 1906 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { 1907 raid6_datap_recov(rbio->real_stripes, 1908 PAGE_SIZE, faila, pointers); 1909 } else { 1910 raid6_2data_recov(rbio->real_stripes, 1911 PAGE_SIZE, faila, failb, 1912 pointers); 1913 } 1914 } else { 1915 void *p; 1916 1917 /* rebuild from P stripe here (raid5 or raid6) */ 1918 BUG_ON(failb != -1); 1919 pstripe: 1920 /* Copy parity block into failed block to start with */ 1921 copy_page(pointers[faila], pointers[rbio->nr_data]); 1922 1923 /* rearrange the pointer array */ 1924 p = pointers[faila]; 1925 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1926 pointers[stripe] = pointers[stripe + 1]; 1927 pointers[rbio->nr_data - 1] = p; 1928 1929 /* xor in the rest */ 1930 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); 1931 } 1932 /* if we're doing this rebuild as part of an rmw, go through 1933 * and set all of our private rbio pages in the 1934 * failed stripes as uptodate. This way finish_rmw will 1935 * know they can be trusted. If this was a read reconstruction, 1936 * other endio functions will fiddle the uptodate bits 1937 */ 1938 if (rbio->operation == BTRFS_RBIO_WRITE) { 1939 for (i = 0; i < rbio->stripe_npages; i++) { 1940 if (faila != -1) { 1941 page = rbio_stripe_page(rbio, faila, i); 1942 SetPageUptodate(page); 1943 } 1944 if (failb != -1) { 1945 page = rbio_stripe_page(rbio, failb, i); 1946 SetPageUptodate(page); 1947 } 1948 } 1949 } 1950 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1951 /* 1952 * if we're rebuilding a read, we have to use 1953 * pages from the bio list 1954 */ 1955 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1956 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1957 (stripe == faila || stripe == failb)) { 1958 page = page_in_rbio(rbio, stripe, pagenr, 0); 1959 } else { 1960 page = rbio_stripe_page(rbio, stripe, pagenr); 1961 } 1962 kunmap(page); 1963 } 1964 } 1965 1966 err = BLK_STS_OK; 1967 cleanup: 1968 kfree(pointers); 1969 1970 cleanup_io: 1971 /* 1972 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 1973 * valid rbio which is consistent with ondisk content, thus such a 1974 * valid rbio can be cached to avoid further disk reads. 1975 */ 1976 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1977 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1978 /* 1979 * - In case of two failures, where rbio->failb != -1: 1980 * 1981 * Do not cache this rbio since the above read reconstruction 1982 * (raid6_datap_recov() or raid6_2data_recov()) may have 1983 * changed some content of stripes which are not identical to 1984 * on-disk content any more, otherwise, a later write/recover 1985 * may steal stripe_pages from this rbio and end up with 1986 * corruptions or rebuild failures. 1987 * 1988 * - In case of single failure, where rbio->failb == -1: 1989 * 1990 * Cache this rbio iff the above read reconstruction is 1991 * executed without problems. 
1992 */ 1993 if (err == BLK_STS_OK && rbio->failb < 0) 1994 cache_rbio_pages(rbio); 1995 else 1996 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1997 1998 rbio_orig_end_io(rbio, err); 1999 } else if (err == BLK_STS_OK) { 2000 rbio->faila = -1; 2001 rbio->failb = -1; 2002 2003 if (rbio->operation == BTRFS_RBIO_WRITE) 2004 finish_rmw(rbio); 2005 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 2006 finish_parity_scrub(rbio, 0); 2007 else 2008 BUG(); 2009 } else { 2010 rbio_orig_end_io(rbio, err); 2011 } 2012 } 2013 2014 /* 2015 * This is called only for stripes we've read from disk to 2016 * reconstruct the parity. 2017 */ 2018 static void raid_recover_end_io(struct bio *bio) 2019 { 2020 struct btrfs_raid_bio *rbio = bio->bi_private; 2021 2022 /* 2023 * we only read stripe pages off the disk, set them 2024 * up to date if there were no errors 2025 */ 2026 if (bio->bi_status) 2027 fail_bio_stripe(rbio, bio); 2028 else 2029 set_bio_pages_uptodate(bio); 2030 bio_put(bio); 2031 2032 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2033 return; 2034 2035 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2036 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2037 else 2038 __raid_recover_end_io(rbio); 2039 } 2040 2041 /* 2042 * reads everything we need off the disk to reconstruct 2043 * the parity. endio handlers trigger final reconstruction 2044 * when the IO is done. 2045 * 2046 * This is used both for reads from the higher layers and for 2047 * parity construction required to finish a rmw cycle. 2048 */ 2049 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2050 { 2051 int bios_to_read = 0; 2052 struct bio_list bio_list; 2053 int ret; 2054 int pagenr; 2055 int stripe; 2056 struct bio *bio; 2057 2058 bio_list_init(&bio_list); 2059 2060 ret = alloc_rbio_pages(rbio); 2061 if (ret) 2062 goto cleanup; 2063 2064 atomic_set(&rbio->error, 0); 2065 2066 /* 2067 * read everything that hasn't failed. Thanks to the 2068 * stripe cache, it is possible that some or all of these 2069 * pages are going to be uptodate. 2070 */ 2071 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2072 if (rbio->faila == stripe || rbio->failb == stripe) { 2073 atomic_inc(&rbio->error); 2074 continue; 2075 } 2076 2077 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 2078 struct page *p; 2079 2080 /* 2081 * the rmw code may have already read this 2082 * page in 2083 */ 2084 p = rbio_stripe_page(rbio, stripe, pagenr); 2085 if (PageUptodate(p)) 2086 continue; 2087 2088 ret = rbio_add_io_page(rbio, &bio_list, 2089 rbio_stripe_page(rbio, stripe, pagenr), 2090 stripe, pagenr, rbio->stripe_len); 2091 if (ret < 0) 2092 goto cleanup; 2093 } 2094 } 2095 2096 bios_to_read = bio_list_size(&bio_list); 2097 if (!bios_to_read) { 2098 /* 2099 * we might have no bios to read just because the pages 2100 * were up to date, or we might have no bios to read because 2101 * the devices were gone. 2102 */ 2103 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { 2104 __raid_recover_end_io(rbio); 2105 goto out; 2106 } else { 2107 goto cleanup; 2108 } 2109 } 2110 2111 /* 2112 * the bbio may be freed once we submit the last bio. 
Make sure 2113 * not to touch it after that 2114 */ 2115 atomic_set(&rbio->stripes_pending, bios_to_read); 2116 while (1) { 2117 bio = bio_list_pop(&bio_list); 2118 if (!bio) 2119 break; 2120 2121 bio->bi_private = rbio; 2122 bio->bi_end_io = raid_recover_end_io; 2123 bio->bi_opf = REQ_OP_READ; 2124 2125 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2126 2127 submit_bio(bio); 2128 } 2129 out: 2130 return 0; 2131 2132 cleanup: 2133 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2134 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 2135 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2136 2137 while ((bio = bio_list_pop(&bio_list))) 2138 bio_put(bio); 2139 2140 return -EIO; 2141 } 2142 2143 /* 2144 * the main entry point for reads from the higher layers. This 2145 * is really only called when the normal read path had a failure, 2146 * so we assume the bio they send down corresponds to a failed part 2147 * of the drive. 2148 */ 2149 int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, 2150 struct btrfs_bio *bbio, u64 stripe_len, 2151 int mirror_num, int generic_io) 2152 { 2153 struct btrfs_raid_bio *rbio; 2154 int ret; 2155 2156 if (generic_io) { 2157 ASSERT(bbio->mirror_num == mirror_num); 2158 btrfs_io_bio(bio)->mirror_num = mirror_num; 2159 } 2160 2161 rbio = alloc_rbio(fs_info, bbio, stripe_len); 2162 if (IS_ERR(rbio)) { 2163 if (generic_io) 2164 btrfs_put_bbio(bbio); 2165 return PTR_ERR(rbio); 2166 } 2167 2168 rbio->operation = BTRFS_RBIO_READ_REBUILD; 2169 bio_list_add(&rbio->bio_list, bio); 2170 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2171 2172 rbio->faila = find_logical_bio_stripe(rbio, bio); 2173 if (rbio->faila == -1) { 2174 btrfs_warn(fs_info, 2175 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", 2176 __func__, (u64)bio->bi_iter.bi_sector << 9, 2177 (u64)bio->bi_iter.bi_size, bbio->map_type); 2178 if (generic_io) 2179 btrfs_put_bbio(bbio); 2180 kfree(rbio); 2181 return -EIO; 2182 } 2183 2184 if (generic_io) { 2185 btrfs_bio_counter_inc_noblocked(fs_info); 2186 rbio->generic_bio_cnt = 1; 2187 } else { 2188 btrfs_get_bbio(bbio); 2189 } 2190 2191 /* 2192 * Loop retry: 2193 * for 'mirror == 2', reconstruct from all other stripes. 2194 * for 'mirror_num > 2', select a stripe to fail on every retry. 2195 */ 2196 if (mirror_num > 2) { 2197 /* 2198 * 'mirror == 3' is to fail the p stripe and 2199 * reconstruct from the q stripe. 'mirror > 3' is to 2200 * fail a data stripe and reconstruct from p+q stripe. 2201 */ 2202 rbio->failb = rbio->real_stripes - (mirror_num - 1); 2203 ASSERT(rbio->failb > 0); 2204 if (rbio->failb <= rbio->faila) 2205 rbio->failb--; 2206 } 2207 2208 ret = lock_stripe_add(rbio); 2209 2210 /* 2211 * __raid56_parity_recover will end the bio with 2212 * any errors it hits. 
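* (Its cleanup path calls rbio_orig_end_io() itself when the read rebuild
* fails.)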
* We don't want to return
2213 * its error value up the stack because our caller
2214 * will end up calling bio_endio with any nonzero
2215 * return
2216 */
2217 if (ret == 0)
2218 __raid56_parity_recover(rbio);
2219 /*
2220 * our rbio has been added to the list of
2221 * rbios that will be handled after the
2222 * current lock owner is done
2223 */
2224 return 0;
2225
2226 }
2227
2228 static void rmw_work(struct btrfs_work *work)
2229 {
2230 struct btrfs_raid_bio *rbio;
2231
2232 rbio = container_of(work, struct btrfs_raid_bio, work);
2233 raid56_rmw_stripe(rbio);
2234 }
2235
2236 static void read_rebuild_work(struct btrfs_work *work)
2237 {
2238 struct btrfs_raid_bio *rbio;
2239
2240 rbio = container_of(work, struct btrfs_raid_bio, work);
2241 __raid56_parity_recover(rbio);
2242 }
2243
2244 /*
2245 * The following code is used to scrub/replace the parity stripe
2246 *
2247 * Caller must have already increased bio_counter for getting @bbio.
2248 *
2249 * Note: We need to make sure that all the pages added to the scrub/replace
2250 * raid bio are correct and will not change during the scrub/replace, i.e.
2251 * the pages hold only metadata or file data protected by a checksum.
2252 */
2253
2254 struct btrfs_raid_bio *
2255 raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2256 struct btrfs_bio *bbio, u64 stripe_len,
2257 struct btrfs_device *scrub_dev,
2258 unsigned long *dbitmap, int stripe_nsectors)
2259 {
2260 struct btrfs_raid_bio *rbio;
2261 int i;
2262
2263 rbio = alloc_rbio(fs_info, bbio, stripe_len);
2264 if (IS_ERR(rbio))
2265 return NULL;
2266 bio_list_add(&rbio->bio_list, bio);
2267 /*
2268 * This is a special bio which is used to hold the completion handler
2269 * and make the scrub rbio similar to the other rbio types
2270 */
2271 ASSERT(!bio->bi_iter.bi_size);
2272 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2273
2274 /*
2275 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2276 * to the end position, so this search can start from the first parity
2277 * stripe.
2278 */
2279 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2280 if (bbio->stripes[i].dev == scrub_dev) {
2281 rbio->scrubp = i;
2282 break;
2283 }
2284 }
2285 ASSERT(i < rbio->real_stripes);
2286
2287 /* For now we only support the case where sectorsize equals page size */
2288 ASSERT(fs_info->sectorsize == PAGE_SIZE);
2289 ASSERT(rbio->stripe_npages == stripe_nsectors);
2290 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2291
2292 /*
2293 * We have already increased bio_counter when getting bbio, record it
2294 * so we can drop it at rbio_orig_end_io().
2295 */
2296 rbio->generic_bio_cnt = 1;
2297
2298 return rbio;
2299 }
2300
2301 /* Used for both parity scrub and missing device rebuild. */
2302 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2303 u64 logical)
2304 {
2305 int stripe_offset;
2306 int index;
2307
2308 ASSERT(logical >= rbio->bbio->raid_map[0]);
2309 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2310 rbio->stripe_len * rbio->nr_data);
2311 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2312 index = stripe_offset >> PAGE_SHIFT;
2313 rbio->bio_pages[index] = page;
2314 }
2315
2316 /*
2317 * We only scrub the parity for the horizontal stripes where we have correct
2318 * data, so we don't need to allocate pages for every stripe.
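* For example (illustrative numbers only): with stripe_npages == 16, the page
* of stripe i at horizontal position 'bit' lives at stripe_pages[i * 16 + bit],
* so only the positions whose dbitmap bit is set get a page allocated below.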
2319 */ 2320 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 2321 { 2322 int i; 2323 int bit; 2324 int index; 2325 struct page *page; 2326 2327 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { 2328 for (i = 0; i < rbio->real_stripes; i++) { 2329 index = i * rbio->stripe_npages + bit; 2330 if (rbio->stripe_pages[index]) 2331 continue; 2332 2333 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2334 if (!page) 2335 return -ENOMEM; 2336 rbio->stripe_pages[index] = page; 2337 } 2338 } 2339 return 0; 2340 } 2341 2342 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 2343 int need_check) 2344 { 2345 struct btrfs_bio *bbio = rbio->bbio; 2346 void **pointers = rbio->finish_pointers; 2347 unsigned long *pbitmap = rbio->finish_pbitmap; 2348 int nr_data = rbio->nr_data; 2349 int stripe; 2350 int pagenr; 2351 bool has_qstripe; 2352 struct page *p_page = NULL; 2353 struct page *q_page = NULL; 2354 struct bio_list bio_list; 2355 struct bio *bio; 2356 int is_replace = 0; 2357 int ret; 2358 2359 bio_list_init(&bio_list); 2360 2361 if (rbio->real_stripes - rbio->nr_data == 1) 2362 has_qstripe = false; 2363 else if (rbio->real_stripes - rbio->nr_data == 2) 2364 has_qstripe = true; 2365 else 2366 BUG(); 2367 2368 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { 2369 is_replace = 1; 2370 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); 2371 } 2372 2373 /* 2374 * Because the higher layers(scrubber) are unlikely to 2375 * use this area of the disk again soon, so don't cache 2376 * it. 2377 */ 2378 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2379 2380 if (!need_check) 2381 goto writeback; 2382 2383 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2384 if (!p_page) 2385 goto cleanup; 2386 SetPageUptodate(p_page); 2387 2388 if (has_qstripe) { 2389 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2390 if (!q_page) { 2391 __free_page(p_page); 2392 goto cleanup; 2393 } 2394 SetPageUptodate(q_page); 2395 } 2396 2397 atomic_set(&rbio->error, 0); 2398 2399 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2400 struct page *p; 2401 void *parity; 2402 /* first collect one page from each data stripe */ 2403 for (stripe = 0; stripe < nr_data; stripe++) { 2404 p = page_in_rbio(rbio, stripe, pagenr, 0); 2405 pointers[stripe] = kmap(p); 2406 } 2407 2408 /* then add the parity stripe */ 2409 pointers[stripe++] = kmap(p_page); 2410 2411 if (has_qstripe) { 2412 /* 2413 * raid6, add the qstripe and call the 2414 * library function to fill in our p/q 2415 */ 2416 pointers[stripe++] = kmap(q_page); 2417 2418 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 2419 pointers); 2420 } else { 2421 /* raid5 */ 2422 copy_page(pointers[nr_data], pointers[0]); 2423 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 2424 } 2425 2426 /* Check scrubbing parity and repair it */ 2427 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2428 parity = kmap(p); 2429 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) 2430 copy_page(parity, pointers[rbio->scrubp]); 2431 else 2432 /* Parity is right, needn't writeback */ 2433 bitmap_clear(rbio->dbitmap, pagenr, 1); 2434 kunmap(p); 2435 2436 for (stripe = 0; stripe < nr_data; stripe++) 2437 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 2438 kunmap(p_page); 2439 } 2440 2441 __free_page(p_page); 2442 if (q_page) 2443 __free_page(q_page); 2444 2445 writeback: 2446 /* 2447 * time to start writing. Make bios for everything from the 2448 * higher layers (the bio_list in our rbio) and our p/q. Ignore 2449 * everything else. 
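* Only the pages whose dbitmap bit is still set are written back: bits were
* cleared above whenever the on-disk parity already matched. In the
* dev-replace case the same pages are also queued for the target device
* (pbitmap / bbio->tgtdev_map).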
2450 */
2451 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2452 struct page *page;
2453
2454 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2455 ret = rbio_add_io_page(rbio, &bio_list,
2456 page, rbio->scrubp, pagenr, rbio->stripe_len);
2457 if (ret)
2458 goto cleanup;
2459 }
2460
2461 if (!is_replace)
2462 goto submit_write;
2463
2464 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2465 struct page *page;
2466
2467 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2468 ret = rbio_add_io_page(rbio, &bio_list, page,
2469 bbio->tgtdev_map[rbio->scrubp],
2470 pagenr, rbio->stripe_len);
2471 if (ret)
2472 goto cleanup;
2473 }
2474
2475 submit_write:
2476 nr_data = bio_list_size(&bio_list);
2477 if (!nr_data) {
2478 /* Every parity is correct, nothing to write back */
2479 rbio_orig_end_io(rbio, BLK_STS_OK);
2480 return;
2481 }
2482
2483 atomic_set(&rbio->stripes_pending, nr_data);
2484
2485 while (1) {
2486 bio = bio_list_pop(&bio_list);
2487 if (!bio)
2488 break;
2489
2490 bio->bi_private = rbio;
2491 bio->bi_end_io = raid_write_end_io;
2492 bio->bi_opf = REQ_OP_WRITE;
2493
2494 submit_bio(bio);
2495 }
2496 return;
2497
2498 cleanup:
2499 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2500
2501 while ((bio = bio_list_pop(&bio_list)))
2502 bio_put(bio);
2503 }
2504
2505 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2506 {
2507 if (stripe >= 0 && stripe < rbio->nr_data)
2508 return 1;
2509 return 0;
2510 }
2511
2512 /*
2513 * While we're doing the parity check and repair, we could have errors
2514 * in reading pages off the disk. This checks for errors and if we're
2515 * not able to read the page it'll trigger parity reconstruction. The
2516 * parity scrub will be finished after we've reconstructed the failed
2517 * stripes.
2518 */
2519 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2520 {
2521 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2522 goto cleanup;
2523
2524 if (rbio->faila >= 0 || rbio->failb >= 0) {
2525 int dfail = 0, failp = -1;
2526
2527 if (is_data_stripe(rbio, rbio->faila))
2528 dfail++;
2529 else if (is_parity_stripe(rbio->faila))
2530 failp = rbio->faila;
2531
2532 if (is_data_stripe(rbio, rbio->failb))
2533 dfail++;
2534 else if (is_parity_stripe(rbio->failb))
2535 failp = rbio->failb;
2536
2537 /*
2538 * Because we cannot use the parity being scrubbed to repair
2539 * data, our repair capability is reduced by one.
2540 * (In the RAID5 case we cannot repair anything.)
2541 */
2542 if (dfail > rbio->bbio->max_errors - 1)
2543 goto cleanup;
2544
2545 /*
2546 * If all the data stripes are good, only the parity is bad,
2547 * so just rewrite the parity.
2548 */
2549 if (dfail == 0) {
2550 finish_parity_scrub(rbio, 0);
2551 return;
2552 }
2553
2554 /*
2555 * Here we have one corrupted data stripe and one corrupted parity
2556 * on RAID6. If the corrupted parity is the one being scrubbed, the
2557 * other parity is still trustworthy and can be used to repair the
2558 * data; otherwise we cannot repair the data stripe.
2559 */
2560 if (failp != rbio->scrubp)
2561 goto cleanup;
2562
2563 __raid_recover_end_io(rbio);
2564 } else {
2565 finish_parity_scrub(rbio, 1);
2566 }
2567 return;
2568
2569 cleanup:
2570 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2571 }
2572
2573 /*
2574 * end io for the read phase of the rmw cycle. All the bios here are physical
2575 * stripe bios we've read from the disk so we can recalculate the parity of the
2576 * stripe.
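* (This wording is shared with the rmw read path; for the scrub rbio the
* finishing step below is validate_rbio_for_parity_scrub(), which normally
* ends in finish_parity_scrub().)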
2577 * 2578 * This will usually kick off finish_rmw once all the bios are read in, but it 2579 * may trigger parity reconstruction if we had any errors along the way 2580 */ 2581 static void raid56_parity_scrub_end_io(struct bio *bio) 2582 { 2583 struct btrfs_raid_bio *rbio = bio->bi_private; 2584 2585 if (bio->bi_status) 2586 fail_bio_stripe(rbio, bio); 2587 else 2588 set_bio_pages_uptodate(bio); 2589 2590 bio_put(bio); 2591 2592 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2593 return; 2594 2595 /* 2596 * this will normally call finish_rmw to start our write 2597 * but if there are any failed stripes we'll reconstruct 2598 * from parity first 2599 */ 2600 validate_rbio_for_parity_scrub(rbio); 2601 } 2602 2603 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) 2604 { 2605 int bios_to_read = 0; 2606 struct bio_list bio_list; 2607 int ret; 2608 int pagenr; 2609 int stripe; 2610 struct bio *bio; 2611 2612 bio_list_init(&bio_list); 2613 2614 ret = alloc_rbio_essential_pages(rbio); 2615 if (ret) 2616 goto cleanup; 2617 2618 atomic_set(&rbio->error, 0); 2619 /* 2620 * build a list of bios to read all the missing parts of this 2621 * stripe 2622 */ 2623 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2624 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2625 struct page *page; 2626 /* 2627 * we want to find all the pages missing from 2628 * the rbio and read them from the disk. If 2629 * page_in_rbio finds a page in the bio list 2630 * we don't need to read it off the stripe. 2631 */ 2632 page = page_in_rbio(rbio, stripe, pagenr, 1); 2633 if (page) 2634 continue; 2635 2636 page = rbio_stripe_page(rbio, stripe, pagenr); 2637 /* 2638 * the bio cache may have handed us an uptodate 2639 * page. If so, be happy and use it 2640 */ 2641 if (PageUptodate(page)) 2642 continue; 2643 2644 ret = rbio_add_io_page(rbio, &bio_list, page, 2645 stripe, pagenr, rbio->stripe_len); 2646 if (ret) 2647 goto cleanup; 2648 } 2649 } 2650 2651 bios_to_read = bio_list_size(&bio_list); 2652 if (!bios_to_read) { 2653 /* 2654 * this can happen if others have merged with 2655 * us, it means there is nothing left to read. 2656 * But if there are missing devices it may not be 2657 * safe to do the full stripe write yet. 2658 */ 2659 goto finish; 2660 } 2661 2662 /* 2663 * the bbio may be freed once we submit the last bio. Make sure 2664 * not to touch it after that 2665 */ 2666 atomic_set(&rbio->stripes_pending, bios_to_read); 2667 while (1) { 2668 bio = bio_list_pop(&bio_list); 2669 if (!bio) 2670 break; 2671 2672 bio->bi_private = rbio; 2673 bio->bi_end_io = raid56_parity_scrub_end_io; 2674 bio->bi_opf = REQ_OP_READ; 2675 2676 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2677 2678 submit_bio(bio); 2679 } 2680 /* the actual write will happen once the reads are done */ 2681 return; 2682 2683 cleanup: 2684 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2685 2686 while ((bio = bio_list_pop(&bio_list))) 2687 bio_put(bio); 2688 2689 return; 2690 2691 finish: 2692 validate_rbio_for_parity_scrub(rbio); 2693 } 2694 2695 static void scrub_parity_work(struct btrfs_work *work) 2696 { 2697 struct btrfs_raid_bio *rbio; 2698 2699 rbio = container_of(work, struct btrfs_raid_bio, work); 2700 raid56_parity_scrub_stripe(rbio); 2701 } 2702 2703 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 2704 { 2705 if (!lock_stripe_add(rbio)) 2706 start_async_work(rbio, scrub_parity_work); 2707 } 2708 2709 /* The following code is used for dev replace of a missing RAID 5/6 device. 
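* The rebuild itself reuses the read-rebuild machinery: the rbio is marked
* BTRFS_RBIO_REBUILD_MISSING and is driven through read_rebuild_work() and
* __raid56_parity_recover(), just like a normal read rebuild.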
*/
2710
2711 struct btrfs_raid_bio *
2712 raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2713 struct btrfs_bio *bbio, u64 length)
2714 {
2715 struct btrfs_raid_bio *rbio;
2716
2717 rbio = alloc_rbio(fs_info, bbio, length);
2718 if (IS_ERR(rbio))
2719 return NULL;
2720
2721 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2722 bio_list_add(&rbio->bio_list, bio);
2723 /*
2724 * This is a special bio which is used to hold the completion handler
2725 * and make the missing rbio similar to the other rbio types
2726 */
2727 ASSERT(!bio->bi_iter.bi_size);
2728
2729 rbio->faila = find_logical_bio_stripe(rbio, bio);
2730 if (rbio->faila == -1) {
2731 BUG();
2732 kfree(rbio);
2733 return NULL;
2734 }
2735
2736 /*
2737 * When we get bbio, we have already increased bio_counter, record it
2738 * so we can drop it at rbio_orig_end_io().
2739 */
2740 rbio->generic_bio_cnt = 1;
2741
2742 return rbio;
2743 }
2744
2745 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2746 {
2747 if (!lock_stripe_add(rbio))
2748 start_async_work(rbio, read_rebuild_work);
2749 }
2750
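/*
 * Illustrative sketch only (not part of the build): roughly how a caller such
 * as the scrub code is expected to drive the parity scrub API above. The
 * bbio, dbitmap, pages and logical addresses are placeholders supplied by the
 * caller; the real user of this interface lives in scrub.c.
 *
 *     rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio, stripe_len,
 *                                           scrub_dev, dbitmap,
 *                                           stripe_nsectors);
 *     if (!rbio)
 *         return -ENOMEM;
 *
 *     (for each data page that was already read and checksum-verified:)
 *         raid56_add_scrub_pages(rbio, page, page_logical);
 *
 *     raid56_parity_submit_scrub_rbio(rbio);
 *
 * The submit call reads whatever else is needed, recomputes and checks the
 * parity stripe that lives on scrub_dev, and completes @bio once any
 * writeback is done.
 */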