// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT 1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT 2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT 3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS 11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};

struct btrfs_raid_bio {
	struct btrfs_io_context *bioc;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged.  The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	int real_stripes;

	int stripe_npages;
	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	int scrubp;
	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	refcount_t refs;

	atomic_t stripes_pending;

	atomic_t error;
	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;

	/* allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/* allocated with stripe_npages-many bits for finish_*() calls */
	unsigned long *finish_pbitmap;
};

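/*
 * Example geometry (illustrative only): a 4-device raid6 chunk with a 64K
 * stripe_len and 4K pages gives real_stripes == 4, nr_data == 2,
 * stripe_npages == 16 and nr_pages == 16 * 4 == 64, so stripe_pages[] and
 * bio_pages[] each hold 64 page pointers covering the full stripe,
 * including P and Q.
 */
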
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct btrfs_work *work);

static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
	btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_pages array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)){
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

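/*
 * Note: the stripe cache behaves as an LRU.  cache_rbio() always puts the
 * rbio at the head of table->stripe_cache, and once the cache grows past
 * RBIO_CACHE_SIZE the entry at the tail (stripe_cache.prev) is the one that
 * gets pruned.
 */
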
/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while(src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

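/*
 * For example, if MAX_XOR_BLOCKS is 4 and src_cnt is 10, run_xor() ends up
 * calling xor_blocks() three times, with 4, 4 and 2 source pages, always
 * xoring into the same destination page pages[src_cnt].
 */
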
/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We've already read the full stripe from the drive and are
	 * checking/repairing the parity and writing the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}

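/*
 * Note that for BTRFS_RBIO_READ_REBUILD the faila/failb pair is compared in
 * sorted order above, so e.g. an rbio that recorded faila == 1, failb == 0
 * can still merge with one that recorded the same two failures as
 * faila == 0, failb == 1.
 */
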
static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				  int index)
{
	return stripe * rbio->stripe_npages + index;
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
				     int index)
{
	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	return rbio_stripe_page(rbio, rbio->nr_data, index);
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}


		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, read_rebuild_work);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, read_rebuild_work);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	kfree(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have read all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bioc->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

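/*
 * For example, with a 64K stripe_len and 4K pages, stripe_len >> PAGE_SHIFT
 * is 16, so stripe index 2, page number 3 maps to bio_pages[35] (and to the
 * matching slot in stripe_pages[]).
 */
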
/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
		       sizeof(*rbio->finish_pbitmap) *
				BITS_TO_LONGS(stripe_npages),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bioc = bioc;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages, bio_pages, etc arrays point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_pages, num_pages);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

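/*
 * The single allocation above is laid out as:
 *
 *   [struct btrfs_raid_bio][stripe_pages][bio_pages][finish_pointers]
 *   [dbitmap][finish_pbitmap]
 *
 * with CONSUME_ALLOC() handing out each array in that order from the memory
 * that follows the struct.
 */
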
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_SIZE, 0);
			if (ret == PAGE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
	btrfs_bio(bio)->device = stripe->dev;
	bio->bi_iter.bi_size = 0;
	bio_set_dev(bio, stripe->dev->bdev);
	bio->bi_iter.bi_sector = disk_start >> 9;

	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		struct bio_vec bvec;
		struct bvec_iter iter;
		int i = 0;

		start = bio->bi_iter.bi_sector << 9;
		stripe_offset = start - rbio->bioc->raid_map[0];
		page_index = stripe_offset >> PAGE_SHIFT;

		if (bio_flagged(bio, BIO_CLONED))
			bio->bi_iter = btrfs_bio(bio)->iter;

		bio_for_each_segment(bvec, bio, iter) {
			rbio->bio_pages[page_index + i] = bvec.bv_page;
			i++;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}

/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	void **pointers = rbio->finish_pointers;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	bool has_qstripe;
	struct bio_list bio_list;
	struct bio *bio;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap_local_page(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap_local_page(p);

		if (has_qstripe) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap_local_page(p);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			copy_page(pointers[nr_data], pointers[0]);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}
		for (stripe = stripe - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	if (likely(!bioc->num_tgtdevs))
		goto write_data;

	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (!bioc->tgtdev_map[stripe])
			continue;

		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list, page,
					       rbio->bioc->tgtdev_map[stripe],
					       pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

write_data:
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		bio->bi_opf = REQ_OP_WRITE;

		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

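/*
 * The parity math above: for raid5, P is the xor of all the data pages in
 * the horizontal stripe (copy_page() seeds P with the first data page and
 * run_xor() folds in the rest).  For raid6, raid6_call.gen_syndrome() is
 * handed all real_stripes pointers and fills in both P and Q.
 */
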
/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	int i;
	struct btrfs_io_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bioc->num_stripes; i++) {
		stripe = &rbio->bioc->stripes[i];
		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_iter.bi_sector << 9;
	int i;

	for (i = 0; i < rbio->nr_data; i++) {
		u64 stripe_start = rbio->bioc->raid_map[i];

		if (in_range(logical, stripe_start, rbio->stripe_len))
			return i;
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all)
		SetPageUptodate(bvec->bv_page);
}

/*
 * end io for the read phase of the rmw cycle.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		goto cleanup;

	/*
	 * this will normally call finish_rmw to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_rmw(rbio);
	return;

cleanup:

	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}

/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_rmw_end_io;
		bio->bi_opf = REQ_OP_READ;

		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);
	return 0;
}

/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		start_async_work(rbio, rmw_work);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct btrfs_work work;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void run_plug(struct btrfs_plug_cb *plug)
{
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
	 */
	list_sort(NULL, &plug->rbio_list, plug_cmp);
	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			int ret;

			/* we have a full stripe, send it down */
			ret = full_stripe_write(cur);
			BUG_ON(ret);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				__free_raid_bio(cur);
				continue;

			}
			__raid56_parity_write(last);
		}
		last = cur;
	}
	if (last) {
		__raid56_parity_write(last);
	}
	kfree(plug);
}

/*
 * if the unplug comes from schedule, we have to push the
 * work off to a helper thread
 */
static void unplug_work(struct btrfs_work *work)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(work, struct btrfs_plug_cb, work);
	run_plug(plug);
}

static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(cb, struct btrfs_plug_cb, cb);

	if (from_schedule) {
		btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
		btrfs_queue_work(plug->info->rmw_workers,
				 &plug->work);
		return;
	}
	run_plug(plug);
}

/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
			u64 stripe_len)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;
	int ret;

	rbio = alloc_rbio(fs_info, bioc, stripe_len);
	if (IS_ERR(rbio)) {
		btrfs_put_bioc(bioc);
		return PTR_ERR(rbio);
	}
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;
	rbio->operation = BTRFS_RBIO_WRITE;

	btrfs_bio_counter_inc_noblocked(fs_info);
	rbio->generic_bio_cnt = 1;

	/*
	 * don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (rbio_is_full(rbio)) {
		ret = full_stripe_write(rbio);
		if (ret)
			btrfs_bio_counter_dec(fs_info);
		return ret;
	}

	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
	if (cb) {
		plug = container_of(cb, struct btrfs_plug_cb, cb);
		if (!plug->info) {
			plug->info = fs_info;
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
		ret = 0;
	} else {
		ret = __raid56_parity_write(rbio);
		if (ret)
			btrfs_bio_counter_dec(fs_info);
	}
	return ret;
}

/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	void **unmap_array;
	int faila = -1, failb = -1;
	struct page *page;
	blk_status_t err;
	int i;

	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers) {
		err = BLK_STS_RESOURCE;
		goto cleanup_io;
	}

	/*
	 * Store copy of pointers that does not get reordered during
	 * reconstruction so that kunmap_local works.
	 */
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!unmap_array) {
		err = BLK_STS_RESOURCE;
		goto cleanup_pointers;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
		/*
		 * Now we just use bitmap to mark the horizontal stripes in
		 * which we have data when doing parity scrub.
		 */
		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
		    !test_bit(pagenr, rbio->dbitmap))
			continue;

		/*
		 * Setup our array of pointers with pages from each stripe
		 *
		 * NOTE: store a duplicate array of pointers to preserve the
		 * pointer order
		 */
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap_local_page(page);
			unmap_array[stripe] = pointers[stripe];
		}

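		/*
		 * The raid6 branch below picks a reconstruction strategy: a
		 * single failed data stripe, or a data stripe failed together
		 * with Q, is rebuilt from P with plain xor (the pstripe:
		 * path); a data stripe failed together with P uses
		 * raid6_datap_recov(); two failed data stripes use
		 * raid6_2data_recov().  Losing P alone, or P and Q together,
		 * cannot be repaired here and returns an error.
		 */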
		/* all raid6 handling here */
		if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb)
				swap(faila, failb);

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->bioc->raid_map[faila] ==
				    RAID5_P_STRIPE) {
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				raid6_2data_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			copy_page(pointers[faila], pointers[rbio->nr_data]);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
		}
		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (rbio->operation == BTRFS_RBIO_WRITE) {
			for (i = 0; i < rbio->stripe_npages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
			kunmap_local(unmap_array[stripe]);
	}

	err = BLK_STS_OK;
cleanup:
	kfree(unmap_array);
cleanup_pointers:
	kfree(pointers);

cleanup_io:
	/*
	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
	 * valid rbio which is consistent with ondisk content, thus such a
	 * valid rbio can be cached to avoid further disk reads.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		/*
		 * - In case of two failures, where rbio->failb != -1:
		 *
		 *   Do not cache this rbio since the above read reconstruction
		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
		 *   changed some content of stripes which are not identical to
		 *   on-disk content any more, otherwise, a later write/recover
		 *   may steal stripe_pages from this rbio and end up with
		 *   corruptions or rebuild failures.
		 *
		 * - In case of single failure, where rbio->failb == -1:
		 *
		 *   Cache this rbio iff the above read reconstruction is
		 *   executed without problems.
		 */
		if (err == BLK_STS_OK && rbio->failb < 0)
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err);
	} else if (err == BLK_STS_OK) {
		rbio->faila = -1;
		rbio->failb = -1;

		if (rbio->operation == BTRFS_RBIO_WRITE)
			finish_rmw(rbio);
		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
			finish_parity_scrub(rbio, 0);
		else
			BUG();
	} else {
		rbio_orig_end_io(rbio, err);
	}
}

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);

	/*
	 * read everything that hasn't failed.  Thanks to the
	 * stripe cache, it is possible that some or all of these
	 * pages are going to be uptodate.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->error);
			continue;
		}

		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
			__raid_recover_end_io(rbio);
			return 0;
		} else {
			goto cleanup;
		}
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;
		bio->bi_opf = REQ_OP_READ;

		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}

	return 0;

cleanup:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;
}

/*
 * the main entry point for reads from the higher layers. This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			  u64 stripe_len, int mirror_num, int generic_io)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int ret;

	if (generic_io) {
		ASSERT(bioc->mirror_num == mirror_num);
		btrfs_bio(bio)->mirror_num = mirror_num;
	}

	rbio = alloc_rbio(fs_info, bioc, stripe_len);
	if (IS_ERR(rbio)) {
		if (generic_io)
			btrfs_put_bioc(bioc);
		return PTR_ERR(rbio);
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
			   __func__, bio->bi_iter.bi_sector << 9,
			   (u64)bio->bi_iter.bi_size, bioc->map_type);
		if (generic_io)
			btrfs_put_bioc(bioc);
		kfree(rbio);
		return -EIO;
	}

	if (generic_io) {
		btrfs_bio_counter_inc_noblocked(fs_info);
		rbio->generic_bio_cnt = 1;
	} else {
		btrfs_get_bioc(bioc);
	}

	/*
	 * Loop retry:
	 * for 'mirror == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2) {
		/*
		 * 'mirror == 3' is to fail the p stripe and
		 * reconstruct from the q stripe.  'mirror > 3' is to
		 * fail a data stripe and reconstruct from p+q stripe.
		 */
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
			rbio->failb--;
	}

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with any errors it hits.
	 * We don't want to return its error value up the stack because our
	 * caller will end up calling bio_endio with any nonzero return.
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * If ret is nonzero, our rbio has been added to the list of rbios
	 * that will be handled after the current lock owner is done.
	 */
	return 0;
}

static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}

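/*
 * rmw_work() and read_rebuild_work() above are deferred entry points: each
 * just resumes its pipeline from a worker thread once the rbio has been
 * handed to the rmw workqueue (see start_async_work(), used for instance by
 * raid56_submit_missing_rbio() below).
 */
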
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need to make sure all the pages that are added to the scrub/replace
 * raid bio are correct and not changed during the scrub/replace.  That is,
 * those pages just hold metadata or file data with checksum.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				u64 stripe_len, struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc, stripe_len);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

	/* For now we only support the case where sectorsize equals page size */
	ASSERT(fs_info->sectorsize == PAGE_SIZE);
	ASSERT(rbio->stripe_npages == stripe_nsectors);
	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	/*
	 * We have already increased bio_counter when getting bioc, record it
	 * so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
			    u64 logical)
{
	int stripe_offset;
	int index;

	ASSERT(logical >= rbio->bioc->raid_map[0]);
	ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
				rbio->stripe_len * rbio->nr_data);
	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
	index = stripe_offset >> PAGE_SHIFT;
	rbio->bio_pages[index] = page;
}

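/*
 * Example of the index math in raid56_add_scrub_pages() above (illustrative,
 * assuming 4K pages): a page whose logical address is raid_map[0] + 68K has
 * stripe_offset == 68K and therefore lands in bio_pages[17].
 */
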
/*
 * We only scrub the parity for which we have correct data on the same
 * horizontal stripe, so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int bit;
	int index;
	struct page *page;

	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
		for (i = 0; i < rbio->real_stripes; i++) {
			index = i * rbio->stripe_npages + bit;
			if (rbio->stripe_pages[index])
				continue;

			page = alloc_page(GFP_NOFS);
			if (!page)
				return -ENOMEM;
			rbio->stripe_pages[index] = page;
		}
	}
	return 0;
}

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	bool has_qstripe;
	struct page *p_page = NULL;
	struct page *q_page = NULL;
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
	}

	/*
	 * The higher layers (the scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	p_page = alloc_page(GFP_NOFS);
	if (!p_page)
		goto cleanup;
	SetPageUptodate(p_page);

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_page = alloc_page(GFP_NOFS);
		if (!q_page) {
			__free_page(p_page);
			goto cleanup;
		}
		SetPageUptodate(q_page);
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
	}

	atomic_set(&rbio->error, 0);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_page);

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p;
		void *parity;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap_local_page(p);
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			copy_page(pointers[nr_data], pointers[0]);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}

		/* Check scrubbing parity and repair it */
		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		parity = kmap_local_page(p);
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
			copy_page(parity, pointers[rbio->scrubp]);
		else
			/* Parity is right, no need to write it back */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_page);
	if (q_page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_page);
	}

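	/*
	 * Bits still set in dbitmap at this point are the pages whose on-disk
	 * parity needs to be rewritten: pages whose parity already matched
	 * were cleared from the bitmap above.  (If need_check was 0 the check
	 * is skipped and everything that was requested gets written.)
	 */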
writeback:
	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list,
			       page, rbio->scrubp, pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list, page,
				       bioc->tgtdev_map[rbio->scrubp],
				       pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, BLK_STS_OK);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		bio->bi_opf = REQ_OP_WRITE;

		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk.  This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction.  The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes.
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * We cannot use the parity stripe that is being scrubbed to
		 * repair data, so our repair capability is reduced by one
		 * (in the RAID5 case we cannot repair anything).
		 */
		if (dfail > rbio->bioc->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data stripes are good, only the parity is wrong,
		 * so just repair the parity.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * Here we have one corrupted data stripe and one corrupted
		 * parity on RAID6.  If the corrupted parity is the one being
		 * scrubbed, we can use the other parity to repair the data;
		 * otherwise we cannot repair the data stripe.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}

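/*
 * Worked example for the tolerance check above, assuming the usual
 * max_errors of 1 for RAID5 and 2 for RAID6: on RAID6 one failed data stripe
 * can still be repaired while scrubbing (dfail == 1 == max_errors - 1), but
 * two failed data stripes cannot; on RAID5 any failed data stripe aborts the
 * scrub, since the only parity is the one being scrubbed.
 */
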
/*
 * end io for the read phase of the parity scrub.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate the
 * parity of the stripe.
 *
 * This will usually kick off finish_parity_scrub once all the bios are read
 * in, but it may trigger parity reconstruction if we had any errors along
 * the way.
 */
static void raid56_parity_scrub_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	/*
	 * this will normally call finish_parity_scrub to start our write,
	 * but if there are any failed stripes we'll reconstruct from parity
	 * first
	 */
	validate_rbio_for_parity_scrub(rbio);
}

static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid56_parity_scrub_end_io;
		bio->bi_opf = REQ_OP_READ;

		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}

static void scrub_parity_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_parity_work);
}

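/*
 * In both raid56_parity_submit_scrub_rbio() above and
 * raid56_submit_missing_rbio() below, a zero return from lock_stripe_add()
 * means this rbio now owns the stripe lock, so the work is queued right
 * away; a nonzero return means the rbio was queued behind the current lock
 * holder and will be started once the lock is handed over.
 */
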
/* The following code is used for dev replace of a missing RAID 5/6 device. */

struct btrfs_raid_bio *
raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
			  u64 length)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc, length);
	if (IS_ERR(rbio))
		return NULL;

	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		kfree(rbio);
		return NULL;
	}

	/*
	 * When we get the bioc we have already increased bio_counter; record
	 * it so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, read_rebuild_work);
}