1 /* 2 * Copyright (C) 2012 Fusion-io All rights reserved. 3 * Copyright (C) 2012 Intel Corp. All rights reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public 7 * License v2 as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public 15 * License along with this program; if not, write to the 16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 17 * Boston, MA 021110-1307, USA. 18 */ 19 #include <linux/sched.h> 20 #include <linux/wait.h> 21 #include <linux/bio.h> 22 #include <linux/slab.h> 23 #include <linux/buffer_head.h> 24 #include <linux/blkdev.h> 25 #include <linux/random.h> 26 #include <linux/iocontext.h> 27 #include <linux/capability.h> 28 #include <linux/ratelimit.h> 29 #include <linux/kthread.h> 30 #include <linux/raid/pq.h> 31 #include <linux/hash.h> 32 #include <linux/list_sort.h> 33 #include <linux/raid/xor.h> 34 #include <linux/vmalloc.h> 35 #include <asm/div64.h> 36 #include "ctree.h" 37 #include "extent_map.h" 38 #include "disk-io.h" 39 #include "transaction.h" 40 #include "print-tree.h" 41 #include "volumes.h" 42 #include "raid56.h" 43 #include "async-thread.h" 44 #include "check-integrity.h" 45 #include "rcu-string.h" 46 47 /* set when additional merges to this rbio are not allowed */ 48 #define RBIO_RMW_LOCKED_BIT 1 49 50 /* 51 * set when this rbio is sitting in the hash, but it is just a cache 52 * of past RMW 53 */ 54 #define RBIO_CACHE_BIT 2 55 56 /* 57 * set when it is safe to trust the stripe_pages for caching 58 */ 59 #define RBIO_CACHE_READY_BIT 3 60 61 62 #define RBIO_CACHE_SIZE 1024 63 64 struct btrfs_raid_bio { 65 struct btrfs_fs_info *fs_info; 66 struct btrfs_bio *bbio; 67 68 /* 69 * logical block numbers for the start of each stripe 70 * The last one or two are p/q. These are sorted, 71 * so raid_map[0] is the start of our full stripe 72 */ 73 u64 *raid_map; 74 75 /* while we're doing rmw on a stripe 76 * we put it into a hash table so we can 77 * lock the stripe and merge more rbios 78 * into it. 79 */ 80 struct list_head hash_list; 81 82 /* 83 * LRU list for the stripe cache 84 */ 85 struct list_head stripe_cache; 86 87 /* 88 * for scheduling work in the helper threads 89 */ 90 struct btrfs_work work; 91 92 /* 93 * bio list and bio_list_lock are used 94 * to add more bios into the stripe 95 * in hopes of avoiding the full rmw 96 */ 97 struct bio_list bio_list; 98 spinlock_t bio_list_lock; 99 100 /* also protected by the bio_list_lock, the 101 * plug list is used by the plugging code 102 * to collect partial bios while plugged. The 103 * stripe locking code also uses it to hand off 104 * the stripe lock to the next pending IO 105 */ 106 struct list_head plug_list; 107 108 /* 109 * flags that tell us if it is safe to 110 * merge with this bio 111 */ 112 unsigned long flags; 113 114 /* size of each individual stripe on disk */ 115 int stripe_len; 116 117 /* number of data stripes (no p/q) */ 118 int nr_data; 119 120 /* 121 * set if we're doing a parity rebuild 122 * for a read from higher up, which is handled 123 * differently from a parity rebuild as part of 124 * rmw 125 */ 126 int read_rebuild; 127 128 /* first bad stripe */ 129 int faila; 130 131 /* second bad stripe (for raid6 use) */ 132 int failb; 133 134 /* 135 * number of pages needed to represent the full 136 * stripe 137 */ 138 int nr_pages; 139 140 /* 141 * size of all the bios in the bio_list. This 142 * helps us decide if the rbio maps to a full 143 * stripe or not 144 */ 145 int bio_list_bytes; 146 147 atomic_t refs; 148 149 /* 150 * these are two arrays of pointers. We allocate the 151 * rbio big enough to hold them both and setup their 152 * locations when the rbio is allocated 153 */ 154 155 /* pointers to pages that we allocated for 156 * reading/writing stripes directly from the disk (including P/Q) 157 */ 158 struct page **stripe_pages; 159 160 /* 161 * pointers to the pages in the bio_list. Stored 162 * here for faster lookup 163 */ 164 struct page **bio_pages; 165 }; 166 167 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 168 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 169 static void rmw_work(struct btrfs_work *work); 170 static void read_rebuild_work(struct btrfs_work *work); 171 static void async_rmw_stripe(struct btrfs_raid_bio *rbio); 172 static void async_read_rebuild(struct btrfs_raid_bio *rbio); 173 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 174 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 175 static void __free_raid_bio(struct btrfs_raid_bio *rbio); 176 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 177 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 178 179 /* 180 * the stripe hash table is used for locking, and to collect 181 * bios in hopes of making a full stripe 182 */ 183 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 184 { 185 struct btrfs_stripe_hash_table *table; 186 struct btrfs_stripe_hash_table *x; 187 struct btrfs_stripe_hash *cur; 188 struct btrfs_stripe_hash *h; 189 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 190 int i; 191 int table_size; 192 193 if (info->stripe_hash_table) 194 return 0; 195 196 /* 197 * The table is large, starting with order 4 and can go as high as 198 * order 7 in case lock debugging is turned on. 199 * 200 * Try harder to allocate and fallback to vmalloc to lower the chance 201 * of a failing mount. 202 */ 203 table_size = sizeof(*table) + sizeof(*h) * num_entries; 204 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 205 if (!table) { 206 table = vzalloc(table_size); 207 if (!table) 208 return -ENOMEM; 209 } 210 211 spin_lock_init(&table->cache_lock); 212 INIT_LIST_HEAD(&table->stripe_cache); 213 214 h = table->table; 215 216 for (i = 0; i < num_entries; i++) { 217 cur = h + i; 218 INIT_LIST_HEAD(&cur->hash_list); 219 spin_lock_init(&cur->lock); 220 init_waitqueue_head(&cur->wait); 221 } 222 223 x = cmpxchg(&info->stripe_hash_table, NULL, table); 224 if (x) { 225 if (is_vmalloc_addr(x)) 226 vfree(x); 227 else 228 kfree(x); 229 } 230 return 0; 231 } 232 233 /* 234 * caching an rbio means to copy anything from the 235 * bio_pages array into the stripe_pages array. We 236 * use the page uptodate bit in the stripe cache array 237 * to indicate if it has valid data 238 * 239 * once the caching is done, we set the cache ready 240 * bit. 241 */ 242 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 243 { 244 int i; 245 char *s; 246 char *d; 247 int ret; 248 249 ret = alloc_rbio_pages(rbio); 250 if (ret) 251 return; 252 253 for (i = 0; i < rbio->nr_pages; i++) { 254 if (!rbio->bio_pages[i]) 255 continue; 256 257 s = kmap(rbio->bio_pages[i]); 258 d = kmap(rbio->stripe_pages[i]); 259 260 memcpy(d, s, PAGE_CACHE_SIZE); 261 262 kunmap(rbio->bio_pages[i]); 263 kunmap(rbio->stripe_pages[i]); 264 SetPageUptodate(rbio->stripe_pages[i]); 265 } 266 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 267 } 268 269 /* 270 * we hash on the first logical address of the stripe 271 */ 272 static int rbio_bucket(struct btrfs_raid_bio *rbio) 273 { 274 u64 num = rbio->raid_map[0]; 275 276 /* 277 * we shift down quite a bit. We're using byte 278 * addressing, and most of the lower bits are zeros. 279 * This tends to upset hash_64, and it consistently 280 * returns just one or two different values. 281 * 282 * shifting off the lower bits fixes things. 283 */ 284 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 285 } 286 287 /* 288 * stealing an rbio means taking all the uptodate pages from the stripe 289 * array in the source rbio and putting them into the destination rbio 290 */ 291 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 292 { 293 int i; 294 struct page *s; 295 struct page *d; 296 297 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 298 return; 299 300 for (i = 0; i < dest->nr_pages; i++) { 301 s = src->stripe_pages[i]; 302 if (!s || !PageUptodate(s)) { 303 continue; 304 } 305 306 d = dest->stripe_pages[i]; 307 if (d) 308 __free_page(d); 309 310 dest->stripe_pages[i] = s; 311 src->stripe_pages[i] = NULL; 312 } 313 } 314 315 /* 316 * merging means we take the bio_list from the victim and 317 * splice it into the destination. The victim should 318 * be discarded afterwards. 319 * 320 * must be called with dest->rbio_list_lock held 321 */ 322 static void merge_rbio(struct btrfs_raid_bio *dest, 323 struct btrfs_raid_bio *victim) 324 { 325 bio_list_merge(&dest->bio_list, &victim->bio_list); 326 dest->bio_list_bytes += victim->bio_list_bytes; 327 bio_list_init(&victim->bio_list); 328 } 329 330 /* 331 * used to prune items that are in the cache. The caller 332 * must hold the hash table lock. 333 */ 334 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 335 { 336 int bucket = rbio_bucket(rbio); 337 struct btrfs_stripe_hash_table *table; 338 struct btrfs_stripe_hash *h; 339 int freeit = 0; 340 341 /* 342 * check the bit again under the hash table lock. 343 */ 344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 345 return; 346 347 table = rbio->fs_info->stripe_hash_table; 348 h = table->table + bucket; 349 350 /* hold the lock for the bucket because we may be 351 * removing it from the hash table 352 */ 353 spin_lock(&h->lock); 354 355 /* 356 * hold the lock for the bio list because we need 357 * to make sure the bio list is empty 358 */ 359 spin_lock(&rbio->bio_list_lock); 360 361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 362 list_del_init(&rbio->stripe_cache); 363 table->cache_size -= 1; 364 freeit = 1; 365 366 /* if the bio list isn't empty, this rbio is 367 * still involved in an IO. We take it out 368 * of the cache list, and drop the ref that 369 * was held for the list. 370 * 371 * If the bio_list was empty, we also remove 372 * the rbio from the hash_table, and drop 373 * the corresponding ref 374 */ 375 if (bio_list_empty(&rbio->bio_list)) { 376 if (!list_empty(&rbio->hash_list)) { 377 list_del_init(&rbio->hash_list); 378 atomic_dec(&rbio->refs); 379 BUG_ON(!list_empty(&rbio->plug_list)); 380 } 381 } 382 } 383 384 spin_unlock(&rbio->bio_list_lock); 385 spin_unlock(&h->lock); 386 387 if (freeit) 388 __free_raid_bio(rbio); 389 } 390 391 /* 392 * prune a given rbio from the cache 393 */ 394 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 395 { 396 struct btrfs_stripe_hash_table *table; 397 unsigned long flags; 398 399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 400 return; 401 402 table = rbio->fs_info->stripe_hash_table; 403 404 spin_lock_irqsave(&table->cache_lock, flags); 405 __remove_rbio_from_cache(rbio); 406 spin_unlock_irqrestore(&table->cache_lock, flags); 407 } 408 409 /* 410 * remove everything in the cache 411 */ 412 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 413 { 414 struct btrfs_stripe_hash_table *table; 415 unsigned long flags; 416 struct btrfs_raid_bio *rbio; 417 418 table = info->stripe_hash_table; 419 420 spin_lock_irqsave(&table->cache_lock, flags); 421 while (!list_empty(&table->stripe_cache)) { 422 rbio = list_entry(table->stripe_cache.next, 423 struct btrfs_raid_bio, 424 stripe_cache); 425 __remove_rbio_from_cache(rbio); 426 } 427 spin_unlock_irqrestore(&table->cache_lock, flags); 428 } 429 430 /* 431 * remove all cached entries and free the hash table 432 * used by unmount 433 */ 434 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 435 { 436 if (!info->stripe_hash_table) 437 return; 438 btrfs_clear_rbio_cache(info); 439 if (is_vmalloc_addr(info->stripe_hash_table)) 440 vfree(info->stripe_hash_table); 441 else 442 kfree(info->stripe_hash_table); 443 info->stripe_hash_table = NULL; 444 } 445 446 /* 447 * insert an rbio into the stripe cache. It 448 * must have already been prepared by calling 449 * cache_rbio_pages 450 * 451 * If this rbio was already cached, it gets 452 * moved to the front of the lru. 453 * 454 * If the size of the rbio cache is too big, we 455 * prune an item. 456 */ 457 static void cache_rbio(struct btrfs_raid_bio *rbio) 458 { 459 struct btrfs_stripe_hash_table *table; 460 unsigned long flags; 461 462 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 463 return; 464 465 table = rbio->fs_info->stripe_hash_table; 466 467 spin_lock_irqsave(&table->cache_lock, flags); 468 spin_lock(&rbio->bio_list_lock); 469 470 /* bump our ref if we were not in the list before */ 471 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 472 atomic_inc(&rbio->refs); 473 474 if (!list_empty(&rbio->stripe_cache)){ 475 list_move(&rbio->stripe_cache, &table->stripe_cache); 476 } else { 477 list_add(&rbio->stripe_cache, &table->stripe_cache); 478 table->cache_size += 1; 479 } 480 481 spin_unlock(&rbio->bio_list_lock); 482 483 if (table->cache_size > RBIO_CACHE_SIZE) { 484 struct btrfs_raid_bio *found; 485 486 found = list_entry(table->stripe_cache.prev, 487 struct btrfs_raid_bio, 488 stripe_cache); 489 490 if (found != rbio) 491 __remove_rbio_from_cache(found); 492 } 493 494 spin_unlock_irqrestore(&table->cache_lock, flags); 495 return; 496 } 497 498 /* 499 * helper function to run the xor_blocks api. It is only 500 * able to do MAX_XOR_BLOCKS at a time, so we need to 501 * loop through. 502 */ 503 static void run_xor(void **pages, int src_cnt, ssize_t len) 504 { 505 int src_off = 0; 506 int xor_src_cnt = 0; 507 void *dest = pages[src_cnt]; 508 509 while(src_cnt > 0) { 510 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 511 xor_blocks(xor_src_cnt, len, dest, pages + src_off); 512 513 src_cnt -= xor_src_cnt; 514 src_off += xor_src_cnt; 515 } 516 } 517 518 /* 519 * returns true if the bio list inside this rbio 520 * covers an entire stripe (no rmw required). 521 * Must be called with the bio list lock held, or 522 * at a time when you know it is impossible to add 523 * new bios into the list 524 */ 525 static int __rbio_is_full(struct btrfs_raid_bio *rbio) 526 { 527 unsigned long size = rbio->bio_list_bytes; 528 int ret = 1; 529 530 if (size != rbio->nr_data * rbio->stripe_len) 531 ret = 0; 532 533 BUG_ON(size > rbio->nr_data * rbio->stripe_len); 534 return ret; 535 } 536 537 static int rbio_is_full(struct btrfs_raid_bio *rbio) 538 { 539 unsigned long flags; 540 int ret; 541 542 spin_lock_irqsave(&rbio->bio_list_lock, flags); 543 ret = __rbio_is_full(rbio); 544 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 545 return ret; 546 } 547 548 /* 549 * returns 1 if it is safe to merge two rbios together. 550 * The merging is safe if the two rbios correspond to 551 * the same stripe and if they are both going in the same 552 * direction (read vs write), and if neither one is 553 * locked for final IO 554 * 555 * The caller is responsible for locking such that 556 * rmw_locked is safe to test 557 */ 558 static int rbio_can_merge(struct btrfs_raid_bio *last, 559 struct btrfs_raid_bio *cur) 560 { 561 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 562 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 563 return 0; 564 565 /* 566 * we can't merge with cached rbios, since the 567 * idea is that when we merge the destination 568 * rbio is going to run our IO for us. We can 569 * steal from cached rbio's though, other functions 570 * handle that. 571 */ 572 if (test_bit(RBIO_CACHE_BIT, &last->flags) || 573 test_bit(RBIO_CACHE_BIT, &cur->flags)) 574 return 0; 575 576 if (last->raid_map[0] != 577 cur->raid_map[0]) 578 return 0; 579 580 /* reads can't merge with writes */ 581 if (last->read_rebuild != 582 cur->read_rebuild) { 583 return 0; 584 } 585 586 return 1; 587 } 588 589 /* 590 * helper to index into the pstripe 591 */ 592 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 593 { 594 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 595 return rbio->stripe_pages[index]; 596 } 597 598 /* 599 * helper to index into the qstripe, returns null 600 * if there is no qstripe 601 */ 602 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 603 { 604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes) 605 return NULL; 606 607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 608 PAGE_CACHE_SHIFT; 609 return rbio->stripe_pages[index]; 610 } 611 612 /* 613 * The first stripe in the table for a logical address 614 * has the lock. rbios are added in one of three ways: 615 * 616 * 1) Nobody has the stripe locked yet. The rbio is given 617 * the lock and 0 is returned. The caller must start the IO 618 * themselves. 619 * 620 * 2) Someone has the stripe locked, but we're able to merge 621 * with the lock owner. The rbio is freed and the IO will 622 * start automatically along with the existing rbio. 1 is returned. 623 * 624 * 3) Someone has the stripe locked, but we're not able to merge. 625 * The rbio is added to the lock owner's plug list, or merged into 626 * an rbio already on the plug list. When the lock owner unlocks, 627 * the next rbio on the list is run and the IO is started automatically. 628 * 1 is returned 629 * 630 * If we return 0, the caller still owns the rbio and must continue with 631 * IO submission. If we return 1, the caller must assume the rbio has 632 * already been freed. 633 */ 634 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 635 { 636 int bucket = rbio_bucket(rbio); 637 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; 638 struct btrfs_raid_bio *cur; 639 struct btrfs_raid_bio *pending; 640 unsigned long flags; 641 DEFINE_WAIT(wait); 642 struct btrfs_raid_bio *freeit = NULL; 643 struct btrfs_raid_bio *cache_drop = NULL; 644 int ret = 0; 645 int walk = 0; 646 647 spin_lock_irqsave(&h->lock, flags); 648 list_for_each_entry(cur, &h->hash_list, hash_list) { 649 walk++; 650 if (cur->raid_map[0] == rbio->raid_map[0]) { 651 spin_lock(&cur->bio_list_lock); 652 653 /* can we steal this cached rbio's pages? */ 654 if (bio_list_empty(&cur->bio_list) && 655 list_empty(&cur->plug_list) && 656 test_bit(RBIO_CACHE_BIT, &cur->flags) && 657 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 658 list_del_init(&cur->hash_list); 659 atomic_dec(&cur->refs); 660 661 steal_rbio(cur, rbio); 662 cache_drop = cur; 663 spin_unlock(&cur->bio_list_lock); 664 665 goto lockit; 666 } 667 668 /* can we merge into the lock owner? */ 669 if (rbio_can_merge(cur, rbio)) { 670 merge_rbio(cur, rbio); 671 spin_unlock(&cur->bio_list_lock); 672 freeit = rbio; 673 ret = 1; 674 goto out; 675 } 676 677 678 /* 679 * we couldn't merge with the running 680 * rbio, see if we can merge with the 681 * pending ones. We don't have to 682 * check for rmw_locked because there 683 * is no way they are inside finish_rmw 684 * right now 685 */ 686 list_for_each_entry(pending, &cur->plug_list, 687 plug_list) { 688 if (rbio_can_merge(pending, rbio)) { 689 merge_rbio(pending, rbio); 690 spin_unlock(&cur->bio_list_lock); 691 freeit = rbio; 692 ret = 1; 693 goto out; 694 } 695 } 696 697 /* no merging, put us on the tail of the plug list, 698 * our rbio will be started with the currently 699 * running rbio unlocks 700 */ 701 list_add_tail(&rbio->plug_list, &cur->plug_list); 702 spin_unlock(&cur->bio_list_lock); 703 ret = 1; 704 goto out; 705 } 706 } 707 lockit: 708 atomic_inc(&rbio->refs); 709 list_add(&rbio->hash_list, &h->hash_list); 710 out: 711 spin_unlock_irqrestore(&h->lock, flags); 712 if (cache_drop) 713 remove_rbio_from_cache(cache_drop); 714 if (freeit) 715 __free_raid_bio(freeit); 716 return ret; 717 } 718 719 /* 720 * called as rmw or parity rebuild is completed. If the plug list has more 721 * rbios waiting for this stripe, the next one on the list will be started 722 */ 723 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 724 { 725 int bucket; 726 struct btrfs_stripe_hash *h; 727 unsigned long flags; 728 int keep_cache = 0; 729 730 bucket = rbio_bucket(rbio); 731 h = rbio->fs_info->stripe_hash_table->table + bucket; 732 733 if (list_empty(&rbio->plug_list)) 734 cache_rbio(rbio); 735 736 spin_lock_irqsave(&h->lock, flags); 737 spin_lock(&rbio->bio_list_lock); 738 739 if (!list_empty(&rbio->hash_list)) { 740 /* 741 * if we're still cached and there is no other IO 742 * to perform, just leave this rbio here for others 743 * to steal from later 744 */ 745 if (list_empty(&rbio->plug_list) && 746 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 747 keep_cache = 1; 748 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 749 BUG_ON(!bio_list_empty(&rbio->bio_list)); 750 goto done; 751 } 752 753 list_del_init(&rbio->hash_list); 754 atomic_dec(&rbio->refs); 755 756 /* 757 * we use the plug list to hold all the rbios 758 * waiting for the chance to lock this stripe. 759 * hand the lock over to one of them. 760 */ 761 if (!list_empty(&rbio->plug_list)) { 762 struct btrfs_raid_bio *next; 763 struct list_head *head = rbio->plug_list.next; 764 765 next = list_entry(head, struct btrfs_raid_bio, 766 plug_list); 767 768 list_del_init(&rbio->plug_list); 769 770 list_add(&next->hash_list, &h->hash_list); 771 atomic_inc(&next->refs); 772 spin_unlock(&rbio->bio_list_lock); 773 spin_unlock_irqrestore(&h->lock, flags); 774 775 if (next->read_rebuild) 776 async_read_rebuild(next); 777 else { 778 steal_rbio(rbio, next); 779 async_rmw_stripe(next); 780 } 781 782 goto done_nolock; 783 } else if (waitqueue_active(&h->wait)) { 784 spin_unlock(&rbio->bio_list_lock); 785 spin_unlock_irqrestore(&h->lock, flags); 786 wake_up(&h->wait); 787 goto done_nolock; 788 } 789 } 790 done: 791 spin_unlock(&rbio->bio_list_lock); 792 spin_unlock_irqrestore(&h->lock, flags); 793 794 done_nolock: 795 if (!keep_cache) 796 remove_rbio_from_cache(rbio); 797 } 798 799 static void __free_raid_bio(struct btrfs_raid_bio *rbio) 800 { 801 int i; 802 803 WARN_ON(atomic_read(&rbio->refs) < 0); 804 if (!atomic_dec_and_test(&rbio->refs)) 805 return; 806 807 WARN_ON(!list_empty(&rbio->stripe_cache)); 808 WARN_ON(!list_empty(&rbio->hash_list)); 809 WARN_ON(!bio_list_empty(&rbio->bio_list)); 810 811 for (i = 0; i < rbio->nr_pages; i++) { 812 if (rbio->stripe_pages[i]) { 813 __free_page(rbio->stripe_pages[i]); 814 rbio->stripe_pages[i] = NULL; 815 } 816 } 817 kfree(rbio->raid_map); 818 kfree(rbio->bbio); 819 kfree(rbio); 820 } 821 822 static void free_raid_bio(struct btrfs_raid_bio *rbio) 823 { 824 unlock_stripe(rbio); 825 __free_raid_bio(rbio); 826 } 827 828 /* 829 * this frees the rbio and runs through all the bios in the 830 * bio_list and calls end_io on them 831 */ 832 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) 833 { 834 struct bio *cur = bio_list_get(&rbio->bio_list); 835 struct bio *next; 836 free_raid_bio(rbio); 837 838 while (cur) { 839 next = cur->bi_next; 840 cur->bi_next = NULL; 841 if (uptodate) 842 set_bit(BIO_UPTODATE, &cur->bi_flags); 843 bio_endio(cur, err); 844 cur = next; 845 } 846 } 847 848 /* 849 * end io function used by finish_rmw. When we finally 850 * get here, we've written a full stripe 851 */ 852 static void raid_write_end_io(struct bio *bio, int err) 853 { 854 struct btrfs_raid_bio *rbio = bio->bi_private; 855 856 if (err) 857 fail_bio_stripe(rbio, bio); 858 859 bio_put(bio); 860 861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 862 return; 863 864 err = 0; 865 866 /* OK, we have read all the stripes we need to. */ 867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 868 err = -EIO; 869 870 rbio_orig_end_io(rbio, err, 0); 871 return; 872 } 873 874 /* 875 * the read/modify/write code wants to use the original bio for 876 * any pages it included, and then use the rbio for everything 877 * else. This function decides if a given index (stripe number) 878 * and page number in that stripe fall inside the original bio 879 * or the rbio. 880 * 881 * if you set bio_list_only, you'll get a NULL back for any ranges 882 * that are outside the bio_list 883 * 884 * This doesn't take any refs on anything, you get a bare page pointer 885 * and the caller must bump refs as required. 886 * 887 * You must call index_rbio_pages once before you can trust 888 * the answers from this function. 889 */ 890 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 891 int index, int pagenr, int bio_list_only) 892 { 893 int chunk_page; 894 struct page *p = NULL; 895 896 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 897 898 spin_lock_irq(&rbio->bio_list_lock); 899 p = rbio->bio_pages[chunk_page]; 900 spin_unlock_irq(&rbio->bio_list_lock); 901 902 if (p || bio_list_only) 903 return p; 904 905 return rbio->stripe_pages[chunk_page]; 906 } 907 908 /* 909 * number of pages we need for the entire stripe across all the 910 * drives 911 */ 912 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 913 { 914 unsigned long nr = stripe_len * nr_stripes; 915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 916 } 917 918 /* 919 * allocation and initial setup for the btrfs_raid_bio. Not 920 * this does not allocate any pages for rbio->pages. 921 */ 922 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, 923 struct btrfs_bio *bbio, u64 *raid_map, 924 u64 stripe_len) 925 { 926 struct btrfs_raid_bio *rbio; 927 int nr_data = 0; 928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 929 void *p; 930 931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 932 GFP_NOFS); 933 if (!rbio) { 934 kfree(raid_map); 935 kfree(bbio); 936 return ERR_PTR(-ENOMEM); 937 } 938 939 bio_list_init(&rbio->bio_list); 940 INIT_LIST_HEAD(&rbio->plug_list); 941 spin_lock_init(&rbio->bio_list_lock); 942 INIT_LIST_HEAD(&rbio->stripe_cache); 943 INIT_LIST_HEAD(&rbio->hash_list); 944 rbio->bbio = bbio; 945 rbio->raid_map = raid_map; 946 rbio->fs_info = root->fs_info; 947 rbio->stripe_len = stripe_len; 948 rbio->nr_pages = num_pages; 949 rbio->faila = -1; 950 rbio->failb = -1; 951 atomic_set(&rbio->refs, 1); 952 953 /* 954 * the stripe_pages and bio_pages array point to the extra 955 * memory we allocated past the end of the rbio 956 */ 957 p = rbio + 1; 958 rbio->stripe_pages = p; 959 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 960 961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 962 nr_data = bbio->num_stripes - 2; 963 else 964 nr_data = bbio->num_stripes - 1; 965 966 rbio->nr_data = nr_data; 967 return rbio; 968 } 969 970 /* allocate pages for all the stripes in the bio, including parity */ 971 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 972 { 973 int i; 974 struct page *page; 975 976 for (i = 0; i < rbio->nr_pages; i++) { 977 if (rbio->stripe_pages[i]) 978 continue; 979 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 980 if (!page) 981 return -ENOMEM; 982 rbio->stripe_pages[i] = page; 983 ClearPageUptodate(page); 984 } 985 return 0; 986 } 987 988 /* allocate pages for just the p/q stripes */ 989 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 990 { 991 int i; 992 struct page *page; 993 994 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 995 996 for (; i < rbio->nr_pages; i++) { 997 if (rbio->stripe_pages[i]) 998 continue; 999 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1000 if (!page) 1001 return -ENOMEM; 1002 rbio->stripe_pages[i] = page; 1003 } 1004 return 0; 1005 } 1006 1007 /* 1008 * add a single page from a specific stripe into our list of bios for IO 1009 * this will try to merge into existing bios if possible, and returns 1010 * zero if all went well. 1011 */ 1012 static int rbio_add_io_page(struct btrfs_raid_bio *rbio, 1013 struct bio_list *bio_list, 1014 struct page *page, 1015 int stripe_nr, 1016 unsigned long page_index, 1017 unsigned long bio_max_len) 1018 { 1019 struct bio *last = bio_list->tail; 1020 u64 last_end = 0; 1021 int ret; 1022 struct bio *bio; 1023 struct btrfs_bio_stripe *stripe; 1024 u64 disk_start; 1025 1026 stripe = &rbio->bbio->stripes[stripe_nr]; 1027 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); 1028 1029 /* if the device is missing, just fail this stripe */ 1030 if (!stripe->dev->bdev) 1031 return fail_rbio_index(rbio, stripe_nr); 1032 1033 /* see if we can add this page onto our existing bio */ 1034 if (last) { 1035 last_end = (u64)last->bi_iter.bi_sector << 9; 1036 last_end += last->bi_iter.bi_size; 1037 1038 /* 1039 * we can't merge these if they are from different 1040 * devices or if they are not contiguous 1041 */ 1042 if (last_end == disk_start && stripe->dev->bdev && 1043 test_bit(BIO_UPTODATE, &last->bi_flags) && 1044 last->bi_bdev == stripe->dev->bdev) { 1045 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); 1046 if (ret == PAGE_CACHE_SIZE) 1047 return 0; 1048 } 1049 } 1050 1051 /* put a new bio on the list */ 1052 bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); 1053 if (!bio) 1054 return -ENOMEM; 1055 1056 bio->bi_iter.bi_size = 0; 1057 bio->bi_bdev = stripe->dev->bdev; 1058 bio->bi_iter.bi_sector = disk_start >> 9; 1059 set_bit(BIO_UPTODATE, &bio->bi_flags); 1060 1061 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 1062 bio_list_add(bio_list, bio); 1063 return 0; 1064 } 1065 1066 /* 1067 * while we're doing the read/modify/write cycle, we could 1068 * have errors in reading pages off the disk. This checks 1069 * for errors and if we're not able to read the page it'll 1070 * trigger parity reconstruction. The rmw will be finished 1071 * after we've reconstructed the failed stripes 1072 */ 1073 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1074 { 1075 if (rbio->faila >= 0 || rbio->failb >= 0) { 1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 1077 __raid56_parity_recover(rbio); 1078 } else { 1079 finish_rmw(rbio); 1080 } 1081 } 1082 1083 /* 1084 * these are just the pages from the rbio array, not from anything 1085 * the FS sent down to us 1086 */ 1087 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) 1088 { 1089 int index; 1090 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); 1091 index += page; 1092 return rbio->stripe_pages[index]; 1093 } 1094 1095 /* 1096 * helper function to walk our bio list and populate the bio_pages array with 1097 * the result. This seems expensive, but it is faster than constantly 1098 * searching through the bio list as we setup the IO in finish_rmw or stripe 1099 * reconstruction. 1100 * 1101 * This must be called before you trust the answers from page_in_rbio 1102 */ 1103 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1104 { 1105 struct bio *bio; 1106 u64 start; 1107 unsigned long stripe_offset; 1108 unsigned long page_index; 1109 struct page *p; 1110 int i; 1111 1112 spin_lock_irq(&rbio->bio_list_lock); 1113 bio_list_for_each(bio, &rbio->bio_list) { 1114 start = (u64)bio->bi_iter.bi_sector << 9; 1115 stripe_offset = start - rbio->raid_map[0]; 1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT; 1117 1118 for (i = 0; i < bio->bi_vcnt; i++) { 1119 p = bio->bi_io_vec[i].bv_page; 1120 rbio->bio_pages[page_index + i] = p; 1121 } 1122 } 1123 spin_unlock_irq(&rbio->bio_list_lock); 1124 } 1125 1126 /* 1127 * this is called from one of two situations. We either 1128 * have a full stripe from the higher layers, or we've read all 1129 * the missing bits off disk. 1130 * 1131 * This will calculate the parity and then send down any 1132 * changed blocks. 1133 */ 1134 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1135 { 1136 struct btrfs_bio *bbio = rbio->bbio; 1137 void *pointers[bbio->num_stripes]; 1138 int stripe_len = rbio->stripe_len; 1139 int nr_data = rbio->nr_data; 1140 int stripe; 1141 int pagenr; 1142 int p_stripe = -1; 1143 int q_stripe = -1; 1144 struct bio_list bio_list; 1145 struct bio *bio; 1146 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; 1147 int ret; 1148 1149 bio_list_init(&bio_list); 1150 1151 if (bbio->num_stripes - rbio->nr_data == 1) { 1152 p_stripe = bbio->num_stripes - 1; 1153 } else if (bbio->num_stripes - rbio->nr_data == 2) { 1154 p_stripe = bbio->num_stripes - 2; 1155 q_stripe = bbio->num_stripes - 1; 1156 } else { 1157 BUG(); 1158 } 1159 1160 /* at this point we either have a full stripe, 1161 * or we've read the full stripe from the drive. 1162 * recalculate the parity and write the new results. 1163 * 1164 * We're not allowed to add any new bios to the 1165 * bio list here, anyone else that wants to 1166 * change this stripe needs to do their own rmw. 1167 */ 1168 spin_lock_irq(&rbio->bio_list_lock); 1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1170 spin_unlock_irq(&rbio->bio_list_lock); 1171 1172 atomic_set(&rbio->bbio->error, 0); 1173 1174 /* 1175 * now that we've set rmw_locked, run through the 1176 * bio list one last time and map the page pointers 1177 * 1178 * We don't cache full rbios because we're assuming 1179 * the higher layers are unlikely to use this area of 1180 * the disk again soon. If they do use it again, 1181 * hopefully they will send another full bio. 1182 */ 1183 index_rbio_pages(rbio); 1184 if (!rbio_is_full(rbio)) 1185 cache_rbio_pages(rbio); 1186 else 1187 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1188 1189 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1190 struct page *p; 1191 /* first collect one page from each data stripe */ 1192 for (stripe = 0; stripe < nr_data; stripe++) { 1193 p = page_in_rbio(rbio, stripe, pagenr, 0); 1194 pointers[stripe] = kmap(p); 1195 } 1196 1197 /* then add the parity stripe */ 1198 p = rbio_pstripe_page(rbio, pagenr); 1199 SetPageUptodate(p); 1200 pointers[stripe++] = kmap(p); 1201 1202 if (q_stripe != -1) { 1203 1204 /* 1205 * raid6, add the qstripe and call the 1206 * library function to fill in our p/q 1207 */ 1208 p = rbio_qstripe_page(rbio, pagenr); 1209 SetPageUptodate(p); 1210 pointers[stripe++] = kmap(p); 1211 1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 1213 pointers); 1214 } else { 1215 /* raid5 */ 1216 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 1217 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); 1218 } 1219 1220 1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++) 1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1223 } 1224 1225 /* 1226 * time to start writing. Make bios for everything from the 1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1228 * everything else. 1229 */ 1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1232 struct page *page; 1233 if (stripe < rbio->nr_data) { 1234 page = page_in_rbio(rbio, stripe, pagenr, 1); 1235 if (!page) 1236 continue; 1237 } else { 1238 page = rbio_stripe_page(rbio, stripe, pagenr); 1239 } 1240 1241 ret = rbio_add_io_page(rbio, &bio_list, 1242 page, stripe, pagenr, rbio->stripe_len); 1243 if (ret) 1244 goto cleanup; 1245 } 1246 } 1247 1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 1250 1251 while (1) { 1252 bio = bio_list_pop(&bio_list); 1253 if (!bio) 1254 break; 1255 1256 bio->bi_private = rbio; 1257 bio->bi_end_io = raid_write_end_io; 1258 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 1259 submit_bio(WRITE, bio); 1260 } 1261 return; 1262 1263 cleanup: 1264 rbio_orig_end_io(rbio, -EIO, 0); 1265 } 1266 1267 /* 1268 * helper to find the stripe number for a given bio. Used to figure out which 1269 * stripe has failed. This expects the bio to correspond to a physical disk, 1270 * so it looks up based on physical sector numbers. 1271 */ 1272 static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1273 struct bio *bio) 1274 { 1275 u64 physical = bio->bi_iter.bi_sector; 1276 u64 stripe_start; 1277 int i; 1278 struct btrfs_bio_stripe *stripe; 1279 1280 physical <<= 9; 1281 1282 for (i = 0; i < rbio->bbio->num_stripes; i++) { 1283 stripe = &rbio->bbio->stripes[i]; 1284 stripe_start = stripe->physical; 1285 if (physical >= stripe_start && 1286 physical < stripe_start + rbio->stripe_len) { 1287 return i; 1288 } 1289 } 1290 return -1; 1291 } 1292 1293 /* 1294 * helper to find the stripe number for a given 1295 * bio (before mapping). Used to figure out which stripe has 1296 * failed. This looks up based on logical block numbers. 1297 */ 1298 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1299 struct bio *bio) 1300 { 1301 u64 logical = bio->bi_iter.bi_sector; 1302 u64 stripe_start; 1303 int i; 1304 1305 logical <<= 9; 1306 1307 for (i = 0; i < rbio->nr_data; i++) { 1308 stripe_start = rbio->raid_map[i]; 1309 if (logical >= stripe_start && 1310 logical < stripe_start + rbio->stripe_len) { 1311 return i; 1312 } 1313 } 1314 return -1; 1315 } 1316 1317 /* 1318 * returns -EIO if we had too many failures 1319 */ 1320 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1321 { 1322 unsigned long flags; 1323 int ret = 0; 1324 1325 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1326 1327 /* we already know this stripe is bad, move on */ 1328 if (rbio->faila == failed || rbio->failb == failed) 1329 goto out; 1330 1331 if (rbio->faila == -1) { 1332 /* first failure on this rbio */ 1333 rbio->faila = failed; 1334 atomic_inc(&rbio->bbio->error); 1335 } else if (rbio->failb == -1) { 1336 /* second failure on this rbio */ 1337 rbio->failb = failed; 1338 atomic_inc(&rbio->bbio->error); 1339 } else { 1340 ret = -EIO; 1341 } 1342 out: 1343 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1344 1345 return ret; 1346 } 1347 1348 /* 1349 * helper to fail a stripe based on a physical disk 1350 * bio. 1351 */ 1352 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1353 struct bio *bio) 1354 { 1355 int failed = find_bio_stripe(rbio, bio); 1356 1357 if (failed < 0) 1358 return -EIO; 1359 1360 return fail_rbio_index(rbio, failed); 1361 } 1362 1363 /* 1364 * this sets each page in the bio uptodate. It should only be used on private 1365 * rbio pages, nothing that comes in from the higher layers 1366 */ 1367 static void set_bio_pages_uptodate(struct bio *bio) 1368 { 1369 int i; 1370 struct page *p; 1371 1372 for (i = 0; i < bio->bi_vcnt; i++) { 1373 p = bio->bi_io_vec[i].bv_page; 1374 SetPageUptodate(p); 1375 } 1376 } 1377 1378 /* 1379 * end io for the read phase of the rmw cycle. All the bios here are physical 1380 * stripe bios we've read from the disk so we can recalculate the parity of the 1381 * stripe. 1382 * 1383 * This will usually kick off finish_rmw once all the bios are read in, but it 1384 * may trigger parity reconstruction if we had any errors along the way 1385 */ 1386 static void raid_rmw_end_io(struct bio *bio, int err) 1387 { 1388 struct btrfs_raid_bio *rbio = bio->bi_private; 1389 1390 if (err) 1391 fail_bio_stripe(rbio, bio); 1392 else 1393 set_bio_pages_uptodate(bio); 1394 1395 bio_put(bio); 1396 1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1398 return; 1399 1400 err = 0; 1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1402 goto cleanup; 1403 1404 /* 1405 * this will normally call finish_rmw to start our write 1406 * but if there are any failed stripes we'll reconstruct 1407 * from parity first 1408 */ 1409 validate_rbio_for_rmw(rbio); 1410 return; 1411 1412 cleanup: 1413 1414 rbio_orig_end_io(rbio, -EIO, 0); 1415 } 1416 1417 static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1418 { 1419 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 1420 rmw_work, NULL, NULL); 1421 1422 btrfs_queue_work(rbio->fs_info->rmw_workers, 1423 &rbio->work); 1424 } 1425 1426 static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1427 { 1428 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 1429 read_rebuild_work, NULL, NULL); 1430 1431 btrfs_queue_work(rbio->fs_info->rmw_workers, 1432 &rbio->work); 1433 } 1434 1435 /* 1436 * the stripe must be locked by the caller. It will 1437 * unlock after all the writes are done 1438 */ 1439 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1440 { 1441 int bios_to_read = 0; 1442 struct btrfs_bio *bbio = rbio->bbio; 1443 struct bio_list bio_list; 1444 int ret; 1445 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1446 int pagenr; 1447 int stripe; 1448 struct bio *bio; 1449 1450 bio_list_init(&bio_list); 1451 1452 ret = alloc_rbio_pages(rbio); 1453 if (ret) 1454 goto cleanup; 1455 1456 index_rbio_pages(rbio); 1457 1458 atomic_set(&rbio->bbio->error, 0); 1459 /* 1460 * build a list of bios to read all the missing parts of this 1461 * stripe 1462 */ 1463 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1464 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1465 struct page *page; 1466 /* 1467 * we want to find all the pages missing from 1468 * the rbio and read them from the disk. If 1469 * page_in_rbio finds a page in the bio list 1470 * we don't need to read it off the stripe. 1471 */ 1472 page = page_in_rbio(rbio, stripe, pagenr, 1); 1473 if (page) 1474 continue; 1475 1476 page = rbio_stripe_page(rbio, stripe, pagenr); 1477 /* 1478 * the bio cache may have handed us an uptodate 1479 * page. If so, be happy and use it 1480 */ 1481 if (PageUptodate(page)) 1482 continue; 1483 1484 ret = rbio_add_io_page(rbio, &bio_list, page, 1485 stripe, pagenr, rbio->stripe_len); 1486 if (ret) 1487 goto cleanup; 1488 } 1489 } 1490 1491 bios_to_read = bio_list_size(&bio_list); 1492 if (!bios_to_read) { 1493 /* 1494 * this can happen if others have merged with 1495 * us, it means there is nothing left to read. 1496 * But if there are missing devices it may not be 1497 * safe to do the full stripe write yet. 1498 */ 1499 goto finish; 1500 } 1501 1502 /* 1503 * the bbio may be freed once we submit the last bio. Make sure 1504 * not to touch it after that 1505 */ 1506 atomic_set(&bbio->stripes_pending, bios_to_read); 1507 while (1) { 1508 bio = bio_list_pop(&bio_list); 1509 if (!bio) 1510 break; 1511 1512 bio->bi_private = rbio; 1513 bio->bi_end_io = raid_rmw_end_io; 1514 1515 btrfs_bio_wq_end_io(rbio->fs_info, bio, 1516 BTRFS_WQ_ENDIO_RAID56); 1517 1518 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 1519 submit_bio(READ, bio); 1520 } 1521 /* the actual write will happen once the reads are done */ 1522 return 0; 1523 1524 cleanup: 1525 rbio_orig_end_io(rbio, -EIO, 0); 1526 return -EIO; 1527 1528 finish: 1529 validate_rbio_for_rmw(rbio); 1530 return 0; 1531 } 1532 1533 /* 1534 * if the upper layers pass in a full stripe, we thank them by only allocating 1535 * enough pages to hold the parity, and sending it all down quickly. 1536 */ 1537 static int full_stripe_write(struct btrfs_raid_bio *rbio) 1538 { 1539 int ret; 1540 1541 ret = alloc_rbio_parity_pages(rbio); 1542 if (ret) { 1543 __free_raid_bio(rbio); 1544 return ret; 1545 } 1546 1547 ret = lock_stripe_add(rbio); 1548 if (ret == 0) 1549 finish_rmw(rbio); 1550 return 0; 1551 } 1552 1553 /* 1554 * partial stripe writes get handed over to async helpers. 1555 * We're really hoping to merge a few more writes into this 1556 * rbio before calculating new parity 1557 */ 1558 static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1559 { 1560 int ret; 1561 1562 ret = lock_stripe_add(rbio); 1563 if (ret == 0) 1564 async_rmw_stripe(rbio); 1565 return 0; 1566 } 1567 1568 /* 1569 * sometimes while we were reading from the drive to 1570 * recalculate parity, enough new bios come into create 1571 * a full stripe. So we do a check here to see if we can 1572 * go directly to finish_rmw 1573 */ 1574 static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1575 { 1576 /* head off into rmw land if we don't have a full stripe */ 1577 if (!rbio_is_full(rbio)) 1578 return partial_stripe_write(rbio); 1579 return full_stripe_write(rbio); 1580 } 1581 1582 /* 1583 * We use plugging call backs to collect full stripes. 1584 * Any time we get a partial stripe write while plugged 1585 * we collect it into a list. When the unplug comes down, 1586 * we sort the list by logical block number and merge 1587 * everything we can into the same rbios 1588 */ 1589 struct btrfs_plug_cb { 1590 struct blk_plug_cb cb; 1591 struct btrfs_fs_info *info; 1592 struct list_head rbio_list; 1593 struct btrfs_work work; 1594 }; 1595 1596 /* 1597 * rbios on the plug list are sorted for easier merging. 1598 */ 1599 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 1600 { 1601 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1602 plug_list); 1603 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1604 plug_list); 1605 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1606 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1607 1608 if (a_sector < b_sector) 1609 return -1; 1610 if (a_sector > b_sector) 1611 return 1; 1612 return 0; 1613 } 1614 1615 static void run_plug(struct btrfs_plug_cb *plug) 1616 { 1617 struct btrfs_raid_bio *cur; 1618 struct btrfs_raid_bio *last = NULL; 1619 1620 /* 1621 * sort our plug list then try to merge 1622 * everything we can in hopes of creating full 1623 * stripes. 1624 */ 1625 list_sort(NULL, &plug->rbio_list, plug_cmp); 1626 while (!list_empty(&plug->rbio_list)) { 1627 cur = list_entry(plug->rbio_list.next, 1628 struct btrfs_raid_bio, plug_list); 1629 list_del_init(&cur->plug_list); 1630 1631 if (rbio_is_full(cur)) { 1632 /* we have a full stripe, send it down */ 1633 full_stripe_write(cur); 1634 continue; 1635 } 1636 if (last) { 1637 if (rbio_can_merge(last, cur)) { 1638 merge_rbio(last, cur); 1639 __free_raid_bio(cur); 1640 continue; 1641 1642 } 1643 __raid56_parity_write(last); 1644 } 1645 last = cur; 1646 } 1647 if (last) { 1648 __raid56_parity_write(last); 1649 } 1650 kfree(plug); 1651 } 1652 1653 /* 1654 * if the unplug comes from schedule, we have to push the 1655 * work off to a helper thread 1656 */ 1657 static void unplug_work(struct btrfs_work *work) 1658 { 1659 struct btrfs_plug_cb *plug; 1660 plug = container_of(work, struct btrfs_plug_cb, work); 1661 run_plug(plug); 1662 } 1663 1664 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1665 { 1666 struct btrfs_plug_cb *plug; 1667 plug = container_of(cb, struct btrfs_plug_cb, cb); 1668 1669 if (from_schedule) { 1670 btrfs_init_work(&plug->work, btrfs_rmw_helper, 1671 unplug_work, NULL, NULL); 1672 btrfs_queue_work(plug->info->rmw_workers, 1673 &plug->work); 1674 return; 1675 } 1676 run_plug(plug); 1677 } 1678 1679 /* 1680 * our main entry point for writes from the rest of the FS. 1681 */ 1682 int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 1683 struct btrfs_bio *bbio, u64 *raid_map, 1684 u64 stripe_len) 1685 { 1686 struct btrfs_raid_bio *rbio; 1687 struct btrfs_plug_cb *plug = NULL; 1688 struct blk_plug_cb *cb; 1689 1690 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1691 if (IS_ERR(rbio)) 1692 return PTR_ERR(rbio); 1693 bio_list_add(&rbio->bio_list, bio); 1694 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1695 1696 /* 1697 * don't plug on full rbios, just get them out the door 1698 * as quickly as we can 1699 */ 1700 if (rbio_is_full(rbio)) 1701 return full_stripe_write(rbio); 1702 1703 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 1704 sizeof(*plug)); 1705 if (cb) { 1706 plug = container_of(cb, struct btrfs_plug_cb, cb); 1707 if (!plug->info) { 1708 plug->info = root->fs_info; 1709 INIT_LIST_HEAD(&plug->rbio_list); 1710 } 1711 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1712 } else { 1713 return __raid56_parity_write(rbio); 1714 } 1715 return 0; 1716 } 1717 1718 /* 1719 * all parity reconstruction happens here. We've read in everything 1720 * we can find from the drives and this does the heavy lifting of 1721 * sorting the good from the bad. 1722 */ 1723 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1724 { 1725 int pagenr, stripe; 1726 void **pointers; 1727 int faila = -1, failb = -1; 1728 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1729 struct page *page; 1730 int err; 1731 int i; 1732 1733 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), 1734 GFP_NOFS); 1735 if (!pointers) { 1736 err = -ENOMEM; 1737 goto cleanup_io; 1738 } 1739 1740 faila = rbio->faila; 1741 failb = rbio->failb; 1742 1743 if (rbio->read_rebuild) { 1744 spin_lock_irq(&rbio->bio_list_lock); 1745 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1746 spin_unlock_irq(&rbio->bio_list_lock); 1747 } 1748 1749 index_rbio_pages(rbio); 1750 1751 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1752 /* setup our array of pointers with pages 1753 * from each stripe 1754 */ 1755 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1756 /* 1757 * if we're rebuilding a read, we have to use 1758 * pages from the bio list 1759 */ 1760 if (rbio->read_rebuild && 1761 (stripe == faila || stripe == failb)) { 1762 page = page_in_rbio(rbio, stripe, pagenr, 0); 1763 } else { 1764 page = rbio_stripe_page(rbio, stripe, pagenr); 1765 } 1766 pointers[stripe] = kmap(page); 1767 } 1768 1769 /* all raid6 handling here */ 1770 if (rbio->raid_map[rbio->bbio->num_stripes - 1] == 1771 RAID6_Q_STRIPE) { 1772 1773 /* 1774 * single failure, rebuild from parity raid5 1775 * style 1776 */ 1777 if (failb < 0) { 1778 if (faila == rbio->nr_data) { 1779 /* 1780 * Just the P stripe has failed, without 1781 * a bad data or Q stripe. 1782 * TODO, we should redo the xor here. 1783 */ 1784 err = -EIO; 1785 goto cleanup; 1786 } 1787 /* 1788 * a single failure in raid6 is rebuilt 1789 * in the pstripe code below 1790 */ 1791 goto pstripe; 1792 } 1793 1794 /* make sure our ps and qs are in order */ 1795 if (faila > failb) { 1796 int tmp = failb; 1797 failb = faila; 1798 faila = tmp; 1799 } 1800 1801 /* if the q stripe is failed, do a pstripe reconstruction 1802 * from the xors. 1803 * If both the q stripe and the P stripe are failed, we're 1804 * here due to a crc mismatch and we can't give them the 1805 * data they want 1806 */ 1807 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { 1808 if (rbio->raid_map[faila] == RAID5_P_STRIPE) { 1809 err = -EIO; 1810 goto cleanup; 1811 } 1812 /* 1813 * otherwise we have one bad data stripe and 1814 * a good P stripe. raid5! 1815 */ 1816 goto pstripe; 1817 } 1818 1819 if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1820 raid6_datap_recov(rbio->bbio->num_stripes, 1821 PAGE_SIZE, faila, pointers); 1822 } else { 1823 raid6_2data_recov(rbio->bbio->num_stripes, 1824 PAGE_SIZE, faila, failb, 1825 pointers); 1826 } 1827 } else { 1828 void *p; 1829 1830 /* rebuild from P stripe here (raid5 or raid6) */ 1831 BUG_ON(failb != -1); 1832 pstripe: 1833 /* Copy parity block into failed block to start with */ 1834 memcpy(pointers[faila], 1835 pointers[rbio->nr_data], 1836 PAGE_CACHE_SIZE); 1837 1838 /* rearrange the pointer array */ 1839 p = pointers[faila]; 1840 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1841 pointers[stripe] = pointers[stripe + 1]; 1842 pointers[rbio->nr_data - 1] = p; 1843 1844 /* xor in the rest */ 1845 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); 1846 } 1847 /* if we're doing this rebuild as part of an rmw, go through 1848 * and set all of our private rbio pages in the 1849 * failed stripes as uptodate. This way finish_rmw will 1850 * know they can be trusted. If this was a read reconstruction, 1851 * other endio functions will fiddle the uptodate bits 1852 */ 1853 if (!rbio->read_rebuild) { 1854 for (i = 0; i < nr_pages; i++) { 1855 if (faila != -1) { 1856 page = rbio_stripe_page(rbio, faila, i); 1857 SetPageUptodate(page); 1858 } 1859 if (failb != -1) { 1860 page = rbio_stripe_page(rbio, failb, i); 1861 SetPageUptodate(page); 1862 } 1863 } 1864 } 1865 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1866 /* 1867 * if we're rebuilding a read, we have to use 1868 * pages from the bio list 1869 */ 1870 if (rbio->read_rebuild && 1871 (stripe == faila || stripe == failb)) { 1872 page = page_in_rbio(rbio, stripe, pagenr, 0); 1873 } else { 1874 page = rbio_stripe_page(rbio, stripe, pagenr); 1875 } 1876 kunmap(page); 1877 } 1878 } 1879 1880 err = 0; 1881 cleanup: 1882 kfree(pointers); 1883 1884 cleanup_io: 1885 1886 if (rbio->read_rebuild) { 1887 if (err == 0) 1888 cache_rbio_pages(rbio); 1889 else 1890 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1891 1892 rbio_orig_end_io(rbio, err, err == 0); 1893 } else if (err == 0) { 1894 rbio->faila = -1; 1895 rbio->failb = -1; 1896 finish_rmw(rbio); 1897 } else { 1898 rbio_orig_end_io(rbio, err, 0); 1899 } 1900 } 1901 1902 /* 1903 * This is called only for stripes we've read from disk to 1904 * reconstruct the parity. 1905 */ 1906 static void raid_recover_end_io(struct bio *bio, int err) 1907 { 1908 struct btrfs_raid_bio *rbio = bio->bi_private; 1909 1910 /* 1911 * we only read stripe pages off the disk, set them 1912 * up to date if there were no errors 1913 */ 1914 if (err) 1915 fail_bio_stripe(rbio, bio); 1916 else 1917 set_bio_pages_uptodate(bio); 1918 bio_put(bio); 1919 1920 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1921 return; 1922 1923 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1924 rbio_orig_end_io(rbio, -EIO, 0); 1925 else 1926 __raid_recover_end_io(rbio); 1927 } 1928 1929 /* 1930 * reads everything we need off the disk to reconstruct 1931 * the parity. endio handlers trigger final reconstruction 1932 * when the IO is done. 1933 * 1934 * This is used both for reads from the higher layers and for 1935 * parity construction required to finish a rmw cycle. 1936 */ 1937 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 1938 { 1939 int bios_to_read = 0; 1940 struct btrfs_bio *bbio = rbio->bbio; 1941 struct bio_list bio_list; 1942 int ret; 1943 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1944 int pagenr; 1945 int stripe; 1946 struct bio *bio; 1947 1948 bio_list_init(&bio_list); 1949 1950 ret = alloc_rbio_pages(rbio); 1951 if (ret) 1952 goto cleanup; 1953 1954 atomic_set(&rbio->bbio->error, 0); 1955 1956 /* 1957 * read everything that hasn't failed. Thanks to the 1958 * stripe cache, it is possible that some or all of these 1959 * pages are going to be uptodate. 1960 */ 1961 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1962 if (rbio->faila == stripe || rbio->failb == stripe) { 1963 atomic_inc(&rbio->bbio->error); 1964 continue; 1965 } 1966 1967 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1968 struct page *p; 1969 1970 /* 1971 * the rmw code may have already read this 1972 * page in 1973 */ 1974 p = rbio_stripe_page(rbio, stripe, pagenr); 1975 if (PageUptodate(p)) 1976 continue; 1977 1978 ret = rbio_add_io_page(rbio, &bio_list, 1979 rbio_stripe_page(rbio, stripe, pagenr), 1980 stripe, pagenr, rbio->stripe_len); 1981 if (ret < 0) 1982 goto cleanup; 1983 } 1984 } 1985 1986 bios_to_read = bio_list_size(&bio_list); 1987 if (!bios_to_read) { 1988 /* 1989 * we might have no bios to read just because the pages 1990 * were up to date, or we might have no bios to read because 1991 * the devices were gone. 1992 */ 1993 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { 1994 __raid_recover_end_io(rbio); 1995 goto out; 1996 } else { 1997 goto cleanup; 1998 } 1999 } 2000 2001 /* 2002 * the bbio may be freed once we submit the last bio. Make sure 2003 * not to touch it after that 2004 */ 2005 atomic_set(&bbio->stripes_pending, bios_to_read); 2006 while (1) { 2007 bio = bio_list_pop(&bio_list); 2008 if (!bio) 2009 break; 2010 2011 bio->bi_private = rbio; 2012 bio->bi_end_io = raid_recover_end_io; 2013 2014 btrfs_bio_wq_end_io(rbio->fs_info, bio, 2015 BTRFS_WQ_ENDIO_RAID56); 2016 2017 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 2018 submit_bio(READ, bio); 2019 } 2020 out: 2021 return 0; 2022 2023 cleanup: 2024 if (rbio->read_rebuild) 2025 rbio_orig_end_io(rbio, -EIO, 0); 2026 return -EIO; 2027 } 2028 2029 /* 2030 * the main entry point for reads from the higher layers. This 2031 * is really only called when the normal read path had a failure, 2032 * so we assume the bio they send down corresponds to a failed part 2033 * of the drive. 2034 */ 2035 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2036 struct btrfs_bio *bbio, u64 *raid_map, 2037 u64 stripe_len, int mirror_num) 2038 { 2039 struct btrfs_raid_bio *rbio; 2040 int ret; 2041 2042 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2043 if (IS_ERR(rbio)) 2044 return PTR_ERR(rbio); 2045 2046 rbio->read_rebuild = 1; 2047 bio_list_add(&rbio->bio_list, bio); 2048 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2049 2050 rbio->faila = find_logical_bio_stripe(rbio, bio); 2051 if (rbio->faila == -1) { 2052 BUG(); 2053 kfree(raid_map); 2054 kfree(bbio); 2055 kfree(rbio); 2056 return -EIO; 2057 } 2058 2059 /* 2060 * reconstruct from the q stripe if they are 2061 * asking for mirror 3 2062 */ 2063 if (mirror_num == 3) 2064 rbio->failb = bbio->num_stripes - 2; 2065 2066 ret = lock_stripe_add(rbio); 2067 2068 /* 2069 * __raid56_parity_recover will end the bio with 2070 * any errors it hits. We don't want to return 2071 * its error value up the stack because our caller 2072 * will end up calling bio_endio with any nonzero 2073 * return 2074 */ 2075 if (ret == 0) 2076 __raid56_parity_recover(rbio); 2077 /* 2078 * our rbio has been added to the list of 2079 * rbios that will be handled after the 2080 * currently lock owner is done 2081 */ 2082 return 0; 2083 2084 } 2085 2086 static void rmw_work(struct btrfs_work *work) 2087 { 2088 struct btrfs_raid_bio *rbio; 2089 2090 rbio = container_of(work, struct btrfs_raid_bio, work); 2091 raid56_rmw_stripe(rbio); 2092 } 2093 2094 static void read_rebuild_work(struct btrfs_work *work) 2095 { 2096 struct btrfs_raid_bio *rbio; 2097 2098 rbio = container_of(work, struct btrfs_raid_bio, work); 2099 __raid56_parity_recover(rbio); 2100 } 2101