/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/vmalloc.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE	= 0,
	BTRFS_RBIO_READ_REBUILD	= 1,
	BTRFS_RBIO_PARITY_SCRUB	= 2,
};

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged.  The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	int real_stripes;

	int stripe_npages;

	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	int scrubp;

	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	atomic_t refs;

	atomic_t stripes_pending;

	atomic_t error;

	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void async_scrub_parity(struct btrfs_raid_bio *rbio);

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;
	int table_size;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fall back to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table_size = sizeof(*table) + sizeof(*h) * num_entries;
	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!table) {
		table = vzalloc(table_size);
		if (!table)
			return -ENOMEM;
	}

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
		init_waitqueue_head(&cur->wait);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x)
		kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_pages array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	char *s;
	char *d;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		s = kmap(rbio->bio_pages[i]);
		d = kmap(rbio->stripe_pages[i]);

		memcpy(d, s, PAGE_CACHE_SIZE);

		kunmap(rbio->bio_pages[i]);
		kunmap(rbio->stripe_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bbio->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				atomic_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		atomic_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
	return;
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;

	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	return ret;
}

static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	ret = __rbio_is_full(rbio);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
	return ret;
}
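
/*
 * Illustration (added note, not part of the original source): __rbio_is_full()
 * only reports a full stripe when the queued bios cover exactly
 * nr_data * stripe_len bytes.  For example, with a hypothetical 64KiB
 * stripe_len and two data stripes, 128KiB of contiguous writes to the stripe
 * make the rbio full, and the parity can then be computed without reading
 * anything back from disk first.  The 64KiB value is only an example, not
 * something this code assumes.
 */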

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbio's though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bbio->raid_map[0] !=
	    cur->bbio->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub reads the full stripe from the drive,
	 * checks and repairs the parity and writes the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
	    cur->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	return 1;
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;

	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
		PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	DEFINE_WAIT(wait);
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;
	int walk = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		walk++;
		if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we steal this cached rbio's pages? */
			if (bio_list_empty(&cur->bio_list) &&
			    list_empty(&cur->plug_list) &&
			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				list_del_init(&cur->hash_list);
				atomic_dec(&cur->refs);

				steal_rbio(cur, rbio);
				cache_drop = cur;
				spin_unlock(&cur->bio_list_lock);

				goto lockit;
			}

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}


			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones.  We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list,
			 * our rbio will be started when the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}
lockit:
	atomic_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		atomic_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			atomic_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				async_read_rebuild(next);
			else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				async_rmw_stripe(next);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				async_scrub_parity(next);
			}

			goto done_nolock;
		} else if (waitqueue_active(&h->wait)) {
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);
			wake_up(&h->wait);
			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	WARN_ON(atomic_read(&rbio->refs) < 0);
	if (!atomic_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bbio(rbio->bbio);
	kfree(rbio);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	unlock_stripe(rbio);
	__free_raid_bio(rbio);
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *next;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);

	free_raid_bio(rbio);

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_error = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	int err = bio->bi_error;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;

	/* OK, we have finished writing all the stripes we need to. */
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		err = -EIO;

	rbio_orig_end_io(rbio, err);
	return;
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	unsigned long nr = stripe_len * nr_stripes;
	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
}
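
/*
 * Worked example (added illustration, not part of the original source):
 * assuming 4KiB pages, a 64KiB stripe_len and real_stripes == 3 (two data
 * stripes plus P for raid5), rbio_nr_pages() returns
 * DIV_ROUND_UP(64KiB * 3, 4KiB) == 48.  The stripe_pages array is laid out
 * stripe by stripe, so indexes 0-15 and 16-31 hold the two data stripes and
 * rbio_pstripe_page() maps parity page N to stripe_pages[32 + N].  The 64KiB
 * stripe size here is only an example value, not something this code assumes.
 */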

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
					 struct btrfs_bio *bbio, u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->fs_info = root->fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	atomic_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages and bio_pages array point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
	rbio->stripe_pages = p;
	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
	rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;

	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
		ClearPageUptodate(page);
	}
	return 0;
}

/* allocate pages for just the p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		last_end = (u64)last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    !last->bi_error &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
			if (ret == PAGE_CACHE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_size = 0;
	bio->bi_bdev = stripe->dev->bdev;
	bio->bi_iter.bi_sector = disk_start >> 9;

	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
{
	int index;
	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
	index += page;
	return rbio->stripe_pages[index];
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;
	struct page *p;
	int i;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		start = (u64)bio->bi_iter.bi_sector << 9;
		stripe_offset = start - rbio->bbio->raid_map[0];
		page_index = stripe_offset >> PAGE_CACHE_SHIFT;

		for (i = 0; i < bio->bi_vcnt; i++) {
			p = bio->bi_io_vec[i].bv_page;
			rbio->bio_pages[page_index + i] = p;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}

/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void *pointers[rbio->real_stripes];
	int stripe_len = rbio->stripe_len;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct bio_list bio_list;
	struct bio *bio;
	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1) {
		p_stripe = rbio->real_stripes - 1;
	} else if (rbio->real_stripes - rbio->nr_data == 2) {
		p_stripe = rbio->real_stripes - 2;
		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap(p);

		if (q_stripe != -1) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}


		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	if (likely(!bbio->num_tgtdevs))
		goto write_data;

	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (!bbio->tgtdev_map[stripe])
			continue;

		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list, page,
					       rbio->bbio->tgtdev_map[stripe],
					       pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

write_data:
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO);
}

/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	u64 stripe_start;
	int i;
	struct btrfs_bio_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bbio->num_stripes; i++) {
		stripe = &rbio->bbio->stripes[i];
		stripe_start = stripe->physical;
		if (physical >= stripe_start &&
		    physical < stripe_start + rbio->stripe_len &&
		    bio->bi_bdev == stripe->dev->bdev) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_iter.bi_sector;
	u64 stripe_start;
	int i;

	logical <<= 9;

	for (i = 0; i < rbio->nr_data; i++) {
		stripe_start = rbio->bbio->raid_map[i];
		if (logical >= stripe_start &&
		    logical < stripe_start + rbio->stripe_len) {
			return i;
		}
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	int i;
	struct page *p;

	for (i = 0; i < bio->bi_vcnt; i++) {
		p = bio->bi_io_vec[i].bv_page;
		SetPageUptodate(p);
	}
}

/*
 * end io for the read phase of the rmw cycle.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_error)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	/*
	 * this will normally call finish_rmw to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_rmw(rbio);
	return;

cleanup:

	rbio_orig_end_io(rbio, -EIO);
}

static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
			rmw_work, NULL, NULL);

	btrfs_queue_work(rbio->fs_info->rmw_workers,
			 &rbio->work);
}

static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
			read_rebuild_work, NULL, NULL);

	btrfs_queue_work(rbio->fs_info->rmw_workers,
			 &rbio->work);
}

/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_rmw_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		submit_bio(READ, bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, -EIO);
	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);
	return 0;
}

/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		async_rmw_stripe(rbio);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct btrfs_work work;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						 plug_list);
	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						 plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void run_plug(struct btrfs_plug_cb *plug)
{
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
	 */
	list_sort(NULL, &plug->rbio_list, plug_cmp);
	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			/* we have a full stripe, send it down */
			full_stripe_write(cur);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				__free_raid_bio(cur);
				continue;

			}
			__raid56_parity_write(last);
		}
		last = cur;
	}
	if (last) {
		__raid56_parity_write(last);
	}
	kfree(plug);
}

/*
 * if the unplug comes from schedule, we have to push the
 * work off to a helper thread
 */
static void unplug_work(struct btrfs_work *work)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(work, struct btrfs_plug_cb, work);
	run_plug(plug);
}

static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(cb, struct btrfs_plug_cb, cb);

	if (from_schedule) {
		btrfs_init_work(&plug->work, btrfs_rmw_helper,
				unplug_work, NULL, NULL);
		btrfs_queue_work(plug->info->rmw_workers,
				 &plug->work);
		return;
	}
	run_plug(plug);
}

/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
			struct btrfs_bio *bbio, u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;
	int ret;

	rbio = alloc_rbio(root, bbio, stripe_len);
	if (IS_ERR(rbio)) {
		btrfs_put_bbio(bbio);
		return PTR_ERR(rbio);
	}
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;
	rbio->operation = BTRFS_RBIO_WRITE;

	btrfs_bio_counter_inc_noblocked(root->fs_info);
	rbio->generic_bio_cnt = 1;

	/*
	 * don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (rbio_is_full(rbio)) {
		ret = full_stripe_write(rbio);
		if (ret)
			btrfs_bio_counter_dec(root->fs_info);
		return ret;
	}

	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
			       sizeof(*plug));
	if (cb) {
		plug = container_of(cb, struct btrfs_plug_cb, cb);
		if (!plug->info) {
			plug->info = root->fs_info;
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
		ret = 0;
	} else {
		ret = __raid56_parity_write(rbio);
		if (ret)
			btrfs_bio_counter_dec(root->fs_info);
	}
	return ret;
}
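
/*
 * Illustration (added note, not part of the original source): while the
 * submitting task is plugged, partial writes that land in the same full
 * stripe are queued on plug->rbio_list instead of being submitted one by
 * one.  btrfs_raid_unplug() then calls run_plug(), which sorts the list by
 * starting sector and merges neighbours via rbio_can_merge(), so a burst of
 * small writes can become a single full-stripe write that skips the
 * read-modify-write path entirely.
 */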

/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	int faila = -1, failb = -1;
	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
	struct page *page;
	int err;
	int i;

	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers) {
		err = -ENOMEM;
		goto cleanup_io;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
		/*
		 * Now we just use bitmap to mark the horizontal stripes in
		 * which we have data when doing parity scrub.
		 */
		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
		    !test_bit(pagenr, rbio->dbitmap))
			continue;

		/* setup our array of pointers with pages
		 * from each stripe
		 */
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap(page);
		}

		/* all raid6 handling here */
		if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = -EIO;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb) {
				int tmp = failb;
				failb = faila;
				faila = tmp;
			}

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->bbio->raid_map[faila] ==
				    RAID5_P_STRIPE) {
					err = -EIO;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				raid6_2data_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila],
			       pointers[rbio->nr_data],
			       PAGE_CACHE_SIZE);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
		}
		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (rbio->operation == BTRFS_RBIO_WRITE) {
			for (i = 0; i < nr_pages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			kunmap(page);
		}
	}

	err = 0;
cleanup:
	kfree(pointers);

cleanup_io:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		if (err == 0)
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err);
	} else if (err == 0) {
		rbio->faila = -1;
		rbio->failb = -1;

		if (rbio->operation == BTRFS_RBIO_WRITE)
			finish_rmw(rbio);
		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
			finish_parity_scrub(rbio, 0);
		else
			BUG();
	} else {
		rbio_orig_end_io(rbio, err);
	}
}

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (bio->bi_error)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, -EIO);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity.  endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);

	/*
	 * read everything that hasn't failed.  Thanks to the
	 * stripe cache, it is possible that some or all of these
	 * pages are going to be uptodate.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->error);
			continue;
		}

		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
			__raid_recover_end_io(rbio);
			goto out;
		} else {
			goto cleanup;
		}
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		submit_bio(READ, bio);
	}
out:
	return 0;

cleanup:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
		rbio_orig_end_io(rbio, -EIO);
	return -EIO;
}

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
2109 */
2110 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2111 			  struct btrfs_bio *bbio, u64 stripe_len,
2112 			  int mirror_num, int generic_io)
2113 {
2114 	struct btrfs_raid_bio *rbio;
2115 	int ret;
2116 
2117 	rbio = alloc_rbio(root, bbio, stripe_len);
2118 	if (IS_ERR(rbio)) {
2119 		if (generic_io)
2120 			btrfs_put_bbio(bbio);
2121 		return PTR_ERR(rbio);
2122 	}
2123 
2124 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2125 	bio_list_add(&rbio->bio_list, bio);
2126 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
2127 
2128 	rbio->faila = find_logical_bio_stripe(rbio, bio);
2129 	if (rbio->faila == -1) {
2130 		BUG();
2131 		if (generic_io)
2132 			btrfs_put_bbio(bbio);
2133 		kfree(rbio);
2134 		return -EIO;
2135 	}
2136 
2137 	if (generic_io) {
2138 		btrfs_bio_counter_inc_noblocked(root->fs_info);
2139 		rbio->generic_bio_cnt = 1;
2140 	} else {
2141 		btrfs_get_bbio(bbio);
2142 	}
2143 
2144 	/*
2145 	 * reconstruct from the q stripe if they are
2146 	 * asking for mirror 3
2147 	 */
2148 	if (mirror_num == 3)
2149 		rbio->failb = rbio->real_stripes - 2;
2150 
2151 	ret = lock_stripe_add(rbio);
2152 
2153 	/*
2154 	 * __raid56_parity_recover will end the bio with
2155 	 * any errors it hits.  We don't want to return
2156 	 * its error value up the stack because our caller
2157 	 * will end up calling bio_endio with any nonzero
2158 	 * return
2159 	 */
2160 	if (ret == 0)
2161 		__raid56_parity_recover(rbio);
2162 	/*
2163 	 * our rbio has been added to the list of
2164 	 * rbios that will be handled after the
2165 	 * current lock owner is done
2166 	 */
2167 	return 0;
2168 
2169 }
2170 
2171 static void rmw_work(struct btrfs_work *work)
2172 {
2173 	struct btrfs_raid_bio *rbio;
2174 
2175 	rbio = container_of(work, struct btrfs_raid_bio, work);
2176 	raid56_rmw_stripe(rbio);
2177 }
2178 
2179 static void read_rebuild_work(struct btrfs_work *work)
2180 {
2181 	struct btrfs_raid_bio *rbio;
2182 
2183 	rbio = container_of(work, struct btrfs_raid_bio, work);
2184 	__raid56_parity_recover(rbio);
2185 }
2186 
2187 /*
2188  * The following code is used to scrub/replace the parity stripe.
2189  *
2190  * Note: we need to make sure that all the pages added to the scrub/replace
2191  * raid bio are correct and will not be changed while the scrub/replace runs;
2192  * that is, they only hold metadata or file data protected by a checksum.
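 *
 * The scrub code is expected to drive this machinery roughly as follows
 * (an illustrative sketch only; sbio, page and logical are hypothetical
 * names and error handling is omitted):
 *
 *	rbio = raid56_parity_alloc_scrub_rbio(root, sbio, bbio, stripe_len,
 *					      scrub_dev, dbitmap,
 *					      stripe_nsectors);
 *	if (!rbio)
 *		return -ENOMEM;
 *
 *	// attach the already-verified data pages for every sector set in
 *	// dbitmap, each identified by its logical address
 *	raid56_parity_add_scrub_pages(rbio, page, logical);
 *
 *	// reads whatever else is needed, checks the parity against a
 *	// freshly computed copy and rewrites it where it was wrong
 *	raid56_parity_submit_scrub_rbio(rbio);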
2193 */
2194 
2195 struct btrfs_raid_bio *
2196 raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2197 			       struct btrfs_bio *bbio, u64 stripe_len,
2198 			       struct btrfs_device *scrub_dev,
2199 			       unsigned long *dbitmap, int stripe_nsectors)
2200 {
2201 	struct btrfs_raid_bio *rbio;
2202 	int i;
2203 
2204 	rbio = alloc_rbio(root, bbio, stripe_len);
2205 	if (IS_ERR(rbio))
2206 		return NULL;
2207 	bio_list_add(&rbio->bio_list, bio);
2208 	/*
2209 	 * This is a special bio which is used to hold the completion handler
2210 	 * and make the scrub rbio look similar to the other rbio types
2211 	 */
2212 	ASSERT(!bio->bi_iter.bi_size);
2213 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2214 
2215 	for (i = 0; i < rbio->real_stripes; i++) {
2216 		if (bbio->stripes[i].dev == scrub_dev) {
2217 			rbio->scrubp = i;
2218 			break;
2219 		}
2220 	}
2221 
2222 	/* for now we only support a sectorsize equal to the page size */
2223 	ASSERT(root->sectorsize == PAGE_SIZE);
2224 	ASSERT(rbio->stripe_npages == stripe_nsectors);
2225 	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2226 
2227 	return rbio;
2228 }
2229 
2230 void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2231 				   struct page *page, u64 logical)
2232 {
2233 	int stripe_offset;
2234 	int index;
2235 
2236 	ASSERT(logical >= rbio->bbio->raid_map[0]);
2237 	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2238 				rbio->stripe_len * rbio->nr_data);
2239 	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2240 	index = stripe_offset >> PAGE_CACHE_SHIFT;
2241 	rbio->bio_pages[index] = page;
2242 }
2243 
2244 /*
2245  * We only scrub the parity for the horizontal stripes where we have correct
2246  * data, so we don't need to allocate pages for all of the stripes.
2247  */
2248 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2249 {
2250 	int i;
2251 	int bit;
2252 	int index;
2253 	struct page *page;
2254 
2255 	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2256 		for (i = 0; i < rbio->real_stripes; i++) {
2257 			index = i * rbio->stripe_npages + bit;
2258 			if (rbio->stripe_pages[index])
2259 				continue;
2260 
2261 			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2262 			if (!page)
2263 				return -ENOMEM;
2264 			rbio->stripe_pages[index] = page;
2265 			ClearPageUptodate(page);
2266 		}
2267 	}
2268 	return 0;
2269 }
2270 
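/*
 * The index math used above relies on the data portion of a full stripe
 * being logically contiguous: data stripe i covers the logical range
 * raid_map[0] + i * stripe_len up to stripe_len - 1 past that.  As a worked
 * example with hypothetical numbers (64K stripe_len, 4K pages, so
 * stripe_npages is 16): a page at logical raid_map[0] + 72K has
 * stripe_offset 72K and lands in bio_pages[72K >> PAGE_CACHE_SHIFT] =
 * bio_pages[18], i.e. page 2 of data stripe 1, which is the same slot that
 * i * stripe_npages + bit (1 * 16 + 2 = 18) picks out of stripe_pages.
 */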
2271 /*
2272  * end io function used by finish_parity_scrub.  When we finally
2273  * get here, we've written back all of the parity this scrub needed to repair
2274  */
2275 static void raid_write_parity_end_io(struct bio *bio)
2276 {
2277 	struct btrfs_raid_bio *rbio = bio->bi_private;
2278 	int err = bio->bi_error;
2279 
2280 	if (bio->bi_error)
2281 		fail_bio_stripe(rbio, bio);
2282 
2283 	bio_put(bio);
2284 
2285 	if (!atomic_dec_and_test(&rbio->stripes_pending))
2286 		return;
2287 
2288 	err = 0;
2289 
2290 	if (atomic_read(&rbio->error))
2291 		err = -EIO;
2292 
2293 	rbio_orig_end_io(rbio, err);
2294 }
2295 
2296 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2297 					 int need_check)
2298 {
2299 	struct btrfs_bio *bbio = rbio->bbio;
2300 	void *pointers[rbio->real_stripes];
2301 	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
2302 	int nr_data = rbio->nr_data;
2303 	int stripe;
2304 	int pagenr;
2305 	int p_stripe = -1;
2306 	int q_stripe = -1;
2307 	struct page *p_page = NULL;
2308 	struct page *q_page = NULL;
2309 	struct bio_list bio_list;
2310 	struct bio *bio;
2311 	int is_replace = 0;
2312 	int ret;
2313 
2314 	bio_list_init(&bio_list);
2315 
2316 	if (rbio->real_stripes - rbio->nr_data == 1) {
2317 		p_stripe = rbio->real_stripes - 1;
2318 	} else if (rbio->real_stripes - rbio->nr_data == 2) {
2319 		p_stripe = rbio->real_stripes - 2;
2320 		q_stripe = rbio->real_stripes - 1;
2321 	} else {
2322 		BUG();
2323 	}
2324 
2325 	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2326 		is_replace = 1;
2327 		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2328 	}
2329 
2330 	/*
2331 	 * The higher layers (the scrubber) are unlikely to use
2332 	 * this area of the disk again soon, so don't cache
2333 	 * it.
2334 	 */
2335 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2336 
2337 	if (!need_check)
2338 		goto writeback;
2339 
2340 	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2341 	if (!p_page)
2342 		goto cleanup;
2343 	SetPageUptodate(p_page);
2344 
2345 	if (q_stripe != -1) {
2346 		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2347 		if (!q_page) {
2348 			__free_page(p_page);
2349 			goto cleanup;
2350 		}
2351 		SetPageUptodate(q_page);
2352 	}
2353 
2354 	atomic_set(&rbio->error, 0);
2355 
2356 	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2357 		struct page *p;
2358 		void *parity;
2359 		/* first collect one page from each data stripe */
2360 		for (stripe = 0; stripe < nr_data; stripe++) {
2361 			p = page_in_rbio(rbio, stripe, pagenr, 0);
2362 			pointers[stripe] = kmap(p);
2363 		}
2364 
2365 		/* then add the parity stripe */
2366 		pointers[stripe++] = kmap(p_page);
2367 
2368 		if (q_stripe != -1) {
2369 
2370 			/*
2371 			 * raid6, add the qstripe and call the
2372 			 * library function to fill in our p/q
2373 			 */
2374 			pointers[stripe++] = kmap(q_page);
2375 
2376 			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2377 						pointers);
2378 		} else {
2379 			/* raid5 */
2380 			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
2381 			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
2382 		}
2383 
2384 		/* check the parity being scrubbed and repair it if it differs */
2385 		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2386 		parity = kmap(p);
2387 		if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
2388 			memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
2389 		else
2390 			/* parity is right, no need to write it back */
2391 			bitmap_clear(rbio->dbitmap, pagenr, 1);
2392 		kunmap(p);
2393 
2394 		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
2395 			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2396 	}
2397 
2398 	__free_page(p_page);
2399 	if (q_page)
2400 		__free_page(q_page);
2401 
2402 writeback:
2403 	/*
2404 	 * time to start writing.  Make bios for everything from the
2405 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2406 	 * everything else.
2407 	 */
2408 	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2409 		struct page *page;
2410 
2411 		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2412 		ret = rbio_add_io_page(rbio, &bio_list,
2413 			       page, rbio->scrubp, pagenr, rbio->stripe_len);
2414 		if (ret)
2415 			goto cleanup;
2416 	}
2417 
2418 	if (!is_replace)
2419 		goto submit_write;
2420 
2421 	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2422 		struct page *page;
2423 
2424 		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2425 		ret = rbio_add_io_page(rbio, &bio_list, page,
2426 				       bbio->tgtdev_map[rbio->scrubp],
2427 				       pagenr, rbio->stripe_len);
2428 		if (ret)
2429 			goto cleanup;
2430 	}
2431 
2432 submit_write:
2433 	nr_data = bio_list_size(&bio_list);
2434 	if (!nr_data) {
2435 		/* every parity was already right, nothing to write back */
2436 		rbio_orig_end_io(rbio, 0);
2437 		return;
2438 	}
2439 
2440 	atomic_set(&rbio->stripes_pending, nr_data);
2441 
2442 	while (1) {
2443 		bio = bio_list_pop(&bio_list);
2444 		if (!bio)
2445 			break;
2446 
2447 		bio->bi_private = rbio;
2448 		bio->bi_end_io = raid_write_parity_end_io;
2449 		submit_bio(WRITE, bio);
2450 	}
2451 	return;
2452 
2453 cleanup:
2454 	rbio_orig_end_io(rbio, -EIO);
2455 }
2456 
2457 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2458 {
2459 	if (stripe >= 0 && stripe < rbio->nr_data)
2460 		return 1;
2461 	return 0;
2462 }
2463 
2464 /*
2465  * While we're doing the parity check and repair, we could have errors
2466  * in reading pages off the disk.  This checks for errors and if we're
2467  * not able to read the page it'll trigger parity reconstruction.  The
2468  * parity scrub will be finished after we've reconstructed the failed
2469  * stripes
2470  */
2471 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2472 {
2473 	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2474 		goto cleanup;
2475 
2476 	if (rbio->faila >= 0 || rbio->failb >= 0) {
2477 		int dfail = 0, failp = -1;
2478 
2479 		if (is_data_stripe(rbio, rbio->faila))
2480 			dfail++;
2481 		else if (is_parity_stripe(rbio->faila))
2482 			failp = rbio->faila;
2483 
2484 		if (is_data_stripe(rbio, rbio->failb))
2485 			dfail++;
2486 		else if (is_parity_stripe(rbio->failb))
2487 			failp = rbio->failb;
2488 
2489 		/*
2490 		 * Since we can't use the parity being scrubbed to repair
2491 		 * data, our repair capability is reduced by one.
2492 		 * (In the case of RAID5, we can't repair anything.)
2493 		 */
2494 		if (dfail > rbio->bbio->max_errors - 1)
2495 			goto cleanup;
2496 
2497 		/*
2498 		 * If all the data is good and only the parity is bad,
2499 		 * just repair the parity.
2500 		 */
2501 		if (dfail == 0) {
2502 			finish_parity_scrub(rbio, 0);
2503 			return;
2504 		}
2505 
2506 		/*
2507 		 * Getting here means we have one corrupted data stripe and
2508 		 * one corrupted parity on RAID6.  If the corrupted parity
2509 		 * is the one being scrubbed, we can luckily use the other
2510 		 * parity to repair the data; otherwise we can't repair it.
2511 		 */
2512 		if (failp != rbio->scrubp)
2513 			goto cleanup;
2514 
2515 		__raid_recover_end_io(rbio);
2516 	} else {
2517 		finish_parity_scrub(rbio, 1);
2518 	}
2519 	return;
2520 
2521 cleanup:
2522 	rbio_orig_end_io(rbio, -EIO);
2523 }
2524 
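/*
 * For reference, the repair decisions validate_rbio_for_parity_scrub()
 * makes above can be summarized as:
 *
 *	read errors beyond max_errors        -> end the rbio with -EIO
 *	no failed stripes                    -> finish_parity_scrub(rbio, 1)
 *	only parity stripes failed           -> finish_parity_scrub(rbio, 0)
 *	a data stripe failed and the failed
 *	parity is the one being scrubbed     -> rebuild the data from the
 *						other, trusted parity via
 *						__raid_recover_end_io(), then
 *						finish the scrub
 *	any other data stripe failure        -> end the rbio with -EIO
 */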
2525 /*
2526  * end io for the read phase of the parity scrub.  All the bios here are
2527  * physical stripe bios we've read from the disk so we can recalculate the
2528  * parity of the stripe.
2529  *
2530  * This will usually kick off finish_parity_scrub once all the bios are read
2531  * in, but it may trigger parity reconstruction first if we hit any errors
2532  */
2533 static void raid56_parity_scrub_end_io(struct bio *bio)
2534 {
2535 	struct btrfs_raid_bio *rbio = bio->bi_private;
2536 
2537 	if (bio->bi_error)
2538 		fail_bio_stripe(rbio, bio);
2539 	else
2540 		set_bio_pages_uptodate(bio);
2541 
2542 	bio_put(bio);
2543 
2544 	if (!atomic_dec_and_test(&rbio->stripes_pending))
2545 		return;
2546 
2547 	/*
2548 	 * this will normally call finish_parity_scrub to start our write,
2549 	 * but if there are any failed stripes we'll reconstruct
2550 	 * from parity first
2551 	 */
2552 	validate_rbio_for_parity_scrub(rbio);
2553 }
2554 
2555 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2556 {
2557 	int bios_to_read = 0;
2558 	struct bio_list bio_list;
2559 	int ret;
2560 	int pagenr;
2561 	int stripe;
2562 	struct bio *bio;
2563 
2564 	ret = alloc_rbio_essential_pages(rbio);
2565 	if (ret)
2566 		goto cleanup;
2567 
2568 	bio_list_init(&bio_list);
2569 
2570 	atomic_set(&rbio->error, 0);
2571 	/*
2572 	 * build a list of bios to read all the missing parts of this
2573 	 * stripe
2574 	 */
2575 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2576 		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2577 			struct page *page;
2578 			/*
2579 			 * we want to find all the pages missing from
2580 			 * the rbio and read them from the disk.  If
2581 			 * page_in_rbio finds a page in the bio list
2582 			 * we don't need to read it off the stripe.
2583 			 */
2584 			page = page_in_rbio(rbio, stripe, pagenr, 1);
2585 			if (page)
2586 				continue;
2587 
2588 			page = rbio_stripe_page(rbio, stripe, pagenr);
2589 			/*
2590 			 * the bio cache may have handed us an uptodate
2591 			 * page.  If so, be happy and use it
2592 			 */
2593 			if (PageUptodate(page))
2594 				continue;
2595 
2596 			ret = rbio_add_io_page(rbio, &bio_list, page,
2597 				       stripe, pagenr, rbio->stripe_len);
2598 			if (ret)
2599 				goto cleanup;
2600 		}
2601 	}
2602 
2603 	bios_to_read = bio_list_size(&bio_list);
2604 	if (!bios_to_read) {
2605 		/*
2606 		 * this can happen if others have merged with
2607 		 * us; it means there is nothing left to read.
2608 		 * But if there are missing devices it may not be
2609 		 * safe to do the full stripe write yet.
2610 		 */
2611 		goto finish;
2612 	}
2613 
2614 	/*
2615 	 * the bbio may be freed once we submit the last bio.
Make sure 2616 * not to touch it after that 2617 */ 2618 atomic_set(&rbio->stripes_pending, bios_to_read); 2619 while (1) { 2620 bio = bio_list_pop(&bio_list); 2621 if (!bio) 2622 break; 2623 2624 bio->bi_private = rbio; 2625 bio->bi_end_io = raid56_parity_scrub_end_io; 2626 2627 btrfs_bio_wq_end_io(rbio->fs_info, bio, 2628 BTRFS_WQ_ENDIO_RAID56); 2629 2630 submit_bio(READ, bio); 2631 } 2632 /* the actual write will happen once the reads are done */ 2633 return; 2634 2635 cleanup: 2636 rbio_orig_end_io(rbio, -EIO); 2637 return; 2638 2639 finish: 2640 validate_rbio_for_parity_scrub(rbio); 2641 } 2642 2643 static void scrub_parity_work(struct btrfs_work *work) 2644 { 2645 struct btrfs_raid_bio *rbio; 2646 2647 rbio = container_of(work, struct btrfs_raid_bio, work); 2648 raid56_parity_scrub_stripe(rbio); 2649 } 2650 2651 static void async_scrub_parity(struct btrfs_raid_bio *rbio) 2652 { 2653 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 2654 scrub_parity_work, NULL, NULL); 2655 2656 btrfs_queue_work(rbio->fs_info->rmw_workers, 2657 &rbio->work); 2658 } 2659 2660 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 2661 { 2662 if (!lock_stripe_add(rbio)) 2663 async_scrub_parity(rbio); 2664 } 2665
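/*
 * Putting the scrub pieces together, the flow for one full stripe is
 * roughly:
 *
 *	raid56_parity_alloc_scrub_rbio()
 *	raid56_parity_add_scrub_pages()		(one call per data page)
 *	raid56_parity_submit_scrub_rbio()
 *	  -> lock_stripe_add()			(may queue behind another rbio)
 *	  -> async_scrub_parity() / scrub_parity_work()
 *	  -> raid56_parity_scrub_stripe()	(read the missing pages)
 *	  -> raid56_parity_scrub_end_io()
 *	  -> validate_rbio_for_parity_scrub()	(maybe rebuild first)
 *	  -> finish_parity_scrub()		(recompute, compare, rewrite)
 *	  -> raid_write_parity_end_io()		(ends the original rbio)
 */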