/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}
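/*
 * Illustrative sketch (not used by the driver): the BITMAP UNPLUGGING
 * comment above describes batches by number.  The test actually used
 * later (in __release_stripe) when deciding whether a stripe must wait
 * on the bitmap_list is the signed comparison below; the helper name is
 * hypothetical and only restates that rule in code.
 */
static inline int stripe_bitmap_batch_pending(struct r5conf *conf,
					      struct stripe_head *sh)
{
	/* true while the stripe's batch has not yet been written out */
	return sh->bm_seq - conf->seq_write > 0;
}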
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the
 * sector of the current stripe+device
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio->bi_size >> 9;
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline int raid5_dec_bi_hw_segments(struct bio *bio)
{
	unsigned short val = raid5_bi_hw_segments(bio);

	--val;
	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
	return val;
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always start from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
154 */ 155 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 156 int *count, int syndrome_disks) 157 { 158 int slot = *count; 159 160 if (sh->ddf_layout) 161 (*count)++; 162 if (idx == sh->pd_idx) 163 return syndrome_disks; 164 if (idx == sh->qd_idx) 165 return syndrome_disks + 1; 166 if (!sh->ddf_layout) 167 (*count)++; 168 return slot; 169 } 170 171 static void return_io(struct bio *return_bi) 172 { 173 struct bio *bi = return_bi; 174 while (bi) { 175 176 return_bi = bi->bi_next; 177 bi->bi_next = NULL; 178 bi->bi_size = 0; 179 bio_endio(bi, 0); 180 bi = return_bi; 181 } 182 } 183 184 static void print_raid5_conf (struct r5conf *conf); 185 186 static int stripe_operations_active(struct stripe_head *sh) 187 { 188 return sh->check_state || sh->reconstruct_state || 189 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 190 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 191 } 192 193 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 194 { 195 if (atomic_dec_and_test(&sh->count)) { 196 BUG_ON(!list_empty(&sh->lru)); 197 BUG_ON(atomic_read(&conf->active_stripes)==0); 198 if (test_bit(STRIPE_HANDLE, &sh->state)) { 199 if (test_bit(STRIPE_DELAYED, &sh->state) && 200 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 201 list_add_tail(&sh->lru, &conf->delayed_list); 202 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 203 sh->bm_seq - conf->seq_write > 0) 204 list_add_tail(&sh->lru, &conf->bitmap_list); 205 else { 206 clear_bit(STRIPE_DELAYED, &sh->state); 207 clear_bit(STRIPE_BIT_DELAY, &sh->state); 208 list_add_tail(&sh->lru, &conf->handle_list); 209 } 210 md_wakeup_thread(conf->mddev->thread); 211 } else { 212 BUG_ON(stripe_operations_active(sh)); 213 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 214 if (atomic_dec_return(&conf->preread_active_stripes) 215 < IO_THRESHOLD) 216 md_wakeup_thread(conf->mddev->thread); 217 atomic_dec(&conf->active_stripes); 218 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 219 list_add_tail(&sh->lru, &conf->inactive_list); 220 wake_up(&conf->wait_for_stripe); 221 if (conf->retry_read_aligned) 222 md_wakeup_thread(conf->mddev->thread); 223 } 224 } 225 } 226 } 227 228 static void release_stripe(struct stripe_head *sh) 229 { 230 struct r5conf *conf = sh->raid_conf; 231 unsigned long flags; 232 233 spin_lock_irqsave(&conf->device_lock, flags); 234 __release_stripe(conf, sh); 235 spin_unlock_irqrestore(&conf->device_lock, flags); 236 } 237 238 static inline void remove_hash(struct stripe_head *sh) 239 { 240 pr_debug("remove_hash(), stripe %llu\n", 241 (unsigned long long)sh->sector); 242 243 hlist_del_init(&sh->hash); 244 } 245 246 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 247 { 248 struct hlist_head *hp = stripe_hash(conf, sh->sector); 249 250 pr_debug("insert_hash(), stripe %llu\n", 251 (unsigned long long)sh->sector); 252 253 hlist_add_head(&sh->hash, hp); 254 } 255 256 257 /* find an idle stripe, make sure it is unhashed, and return it. 
*/ 258 static struct stripe_head *get_free_stripe(struct r5conf *conf) 259 { 260 struct stripe_head *sh = NULL; 261 struct list_head *first; 262 263 if (list_empty(&conf->inactive_list)) 264 goto out; 265 first = conf->inactive_list.next; 266 sh = list_entry(first, struct stripe_head, lru); 267 list_del_init(first); 268 remove_hash(sh); 269 atomic_inc(&conf->active_stripes); 270 out: 271 return sh; 272 } 273 274 static void shrink_buffers(struct stripe_head *sh) 275 { 276 struct page *p; 277 int i; 278 int num = sh->raid_conf->pool_size; 279 280 for (i = 0; i < num ; i++) { 281 p = sh->dev[i].page; 282 if (!p) 283 continue; 284 sh->dev[i].page = NULL; 285 put_page(p); 286 } 287 } 288 289 static int grow_buffers(struct stripe_head *sh) 290 { 291 int i; 292 int num = sh->raid_conf->pool_size; 293 294 for (i = 0; i < num; i++) { 295 struct page *page; 296 297 if (!(page = alloc_page(GFP_KERNEL))) { 298 return 1; 299 } 300 sh->dev[i].page = page; 301 } 302 return 0; 303 } 304 305 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 306 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 307 struct stripe_head *sh); 308 309 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 310 { 311 struct r5conf *conf = sh->raid_conf; 312 int i; 313 314 BUG_ON(atomic_read(&sh->count) != 0); 315 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 316 BUG_ON(stripe_operations_active(sh)); 317 318 pr_debug("init_stripe called, stripe %llu\n", 319 (unsigned long long)sh->sector); 320 321 remove_hash(sh); 322 323 sh->generation = conf->generation - previous; 324 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 325 sh->sector = sector; 326 stripe_set_idx(sector, conf, previous, sh); 327 sh->state = 0; 328 329 330 for (i = sh->disks; i--; ) { 331 struct r5dev *dev = &sh->dev[i]; 332 333 if (dev->toread || dev->read || dev->towrite || dev->written || 334 test_bit(R5_LOCKED, &dev->flags)) { 335 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 336 (unsigned long long)sh->sector, i, dev->toread, 337 dev->read, dev->towrite, dev->written, 338 test_bit(R5_LOCKED, &dev->flags)); 339 WARN_ON(1); 340 } 341 dev->flags = 0; 342 raid5_build_block(sh, i, previous); 343 } 344 insert_hash(conf, sh); 345 } 346 347 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 348 short generation) 349 { 350 struct stripe_head *sh; 351 struct hlist_node *hn; 352 353 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 354 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 355 if (sh->sector == sector && sh->generation == generation) 356 return sh; 357 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 358 return NULL; 359 } 360 361 /* 362 * Need to check if array has failed when deciding whether to: 363 * - start an array 364 * - remove non-faulty devices 365 * - add a spare 366 * - allow a reshape 367 * This determination is simple when no reshape is happening. 368 * However if there is a reshape, we need to carefully check 369 * both the before and after sections. 370 * This is because some failed devices may only affect one 371 * of the two sections, and some non-in_sync devices may 372 * be insync in the section most affected by failed devices. 
373 */ 374 static int calc_degraded(struct r5conf *conf) 375 { 376 int degraded, degraded2; 377 int i; 378 379 rcu_read_lock(); 380 degraded = 0; 381 for (i = 0; i < conf->previous_raid_disks; i++) { 382 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 383 if (!rdev || test_bit(Faulty, &rdev->flags)) 384 degraded++; 385 else if (test_bit(In_sync, &rdev->flags)) 386 ; 387 else 388 /* not in-sync or faulty. 389 * If the reshape increases the number of devices, 390 * this is being recovered by the reshape, so 391 * this 'previous' section is not in_sync. 392 * If the number of devices is being reduced however, 393 * the device can only be part of the array if 394 * we are reverting a reshape, so this section will 395 * be in-sync. 396 */ 397 if (conf->raid_disks >= conf->previous_raid_disks) 398 degraded++; 399 } 400 rcu_read_unlock(); 401 if (conf->raid_disks == conf->previous_raid_disks) 402 return degraded; 403 rcu_read_lock(); 404 degraded2 = 0; 405 for (i = 0; i < conf->raid_disks; i++) { 406 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 407 if (!rdev || test_bit(Faulty, &rdev->flags)) 408 degraded2++; 409 else if (test_bit(In_sync, &rdev->flags)) 410 ; 411 else 412 /* not in-sync or faulty. 413 * If reshape increases the number of devices, this 414 * section has already been recovered, else it 415 * almost certainly hasn't. 416 */ 417 if (conf->raid_disks <= conf->previous_raid_disks) 418 degraded2++; 419 } 420 rcu_read_unlock(); 421 if (degraded2 > degraded) 422 return degraded2; 423 return degraded; 424 } 425 426 static int has_failed(struct r5conf *conf) 427 { 428 int degraded; 429 430 if (conf->mddev->reshape_position == MaxSector) 431 return conf->mddev->degraded > conf->max_degraded; 432 433 degraded = calc_degraded(conf); 434 if (degraded > conf->max_degraded) 435 return 1; 436 return 0; 437 } 438 439 static struct stripe_head * 440 get_active_stripe(struct r5conf *conf, sector_t sector, 441 int previous, int noblock, int noquiesce) 442 { 443 struct stripe_head *sh; 444 445 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 446 447 spin_lock_irq(&conf->device_lock); 448 449 do { 450 wait_event_lock_irq(conf->wait_for_stripe, 451 conf->quiesce == 0 || noquiesce, 452 conf->device_lock, /* nothing */); 453 sh = __find_stripe(conf, sector, conf->generation - previous); 454 if (!sh) { 455 if (!conf->inactive_blocked) 456 sh = get_free_stripe(conf); 457 if (noblock && sh == NULL) 458 break; 459 if (!sh) { 460 conf->inactive_blocked = 1; 461 wait_event_lock_irq(conf->wait_for_stripe, 462 !list_empty(&conf->inactive_list) && 463 (atomic_read(&conf->active_stripes) 464 < (conf->max_nr_stripes *3/4) 465 || !conf->inactive_blocked), 466 conf->device_lock, 467 ); 468 conf->inactive_blocked = 0; 469 } else 470 init_stripe(sh, sector, previous); 471 } else { 472 if (atomic_read(&sh->count)) { 473 BUG_ON(!list_empty(&sh->lru) 474 && !test_bit(STRIPE_EXPANDING, &sh->state)); 475 } else { 476 if (!test_bit(STRIPE_HANDLE, &sh->state)) 477 atomic_inc(&conf->active_stripes); 478 if (list_empty(&sh->lru) && 479 !test_bit(STRIPE_EXPANDING, &sh->state)) 480 BUG(); 481 list_del_init(&sh->lru); 482 } 483 } 484 } while (sh == NULL); 485 486 if (sh) 487 atomic_inc(&sh->count); 488 489 spin_unlock_irq(&conf->device_lock); 490 return sh; 491 } 492 493 /* Determine if 'data_offset' or 'new_data_offset' should be used 494 * in this stripe_head. 
495 */ 496 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 497 { 498 sector_t progress = conf->reshape_progress; 499 /* Need a memory barrier to make sure we see the value 500 * of conf->generation, or ->data_offset that was set before 501 * reshape_progress was updated. 502 */ 503 smp_rmb(); 504 if (progress == MaxSector) 505 return 0; 506 if (sh->generation == conf->generation - 1) 507 return 0; 508 /* We are in a reshape, and this is a new-generation stripe, 509 * so use new_data_offset. 510 */ 511 return 1; 512 } 513 514 static void 515 raid5_end_read_request(struct bio *bi, int error); 516 static void 517 raid5_end_write_request(struct bio *bi, int error); 518 519 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 520 { 521 struct r5conf *conf = sh->raid_conf; 522 int i, disks = sh->disks; 523 524 might_sleep(); 525 526 for (i = disks; i--; ) { 527 int rw; 528 int replace_only = 0; 529 struct bio *bi, *rbi; 530 struct md_rdev *rdev, *rrdev = NULL; 531 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 532 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 533 rw = WRITE_FUA; 534 else 535 rw = WRITE; 536 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 537 rw = READ; 538 else if (test_and_clear_bit(R5_WantReplace, 539 &sh->dev[i].flags)) { 540 rw = WRITE; 541 replace_only = 1; 542 } else 543 continue; 544 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 545 rw |= REQ_SYNC; 546 547 bi = &sh->dev[i].req; 548 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 549 550 bi->bi_rw = rw; 551 rbi->bi_rw = rw; 552 if (rw & WRITE) { 553 bi->bi_end_io = raid5_end_write_request; 554 rbi->bi_end_io = raid5_end_write_request; 555 } else 556 bi->bi_end_io = raid5_end_read_request; 557 558 rcu_read_lock(); 559 rrdev = rcu_dereference(conf->disks[i].replacement); 560 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 561 rdev = rcu_dereference(conf->disks[i].rdev); 562 if (!rdev) { 563 rdev = rrdev; 564 rrdev = NULL; 565 } 566 if (rw & WRITE) { 567 if (replace_only) 568 rdev = NULL; 569 if (rdev == rrdev) 570 /* We raced and saw duplicates */ 571 rrdev = NULL; 572 } else { 573 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) 574 rdev = rrdev; 575 rrdev = NULL; 576 } 577 578 if (rdev && test_bit(Faulty, &rdev->flags)) 579 rdev = NULL; 580 if (rdev) 581 atomic_inc(&rdev->nr_pending); 582 if (rrdev && test_bit(Faulty, &rrdev->flags)) 583 rrdev = NULL; 584 if (rrdev) 585 atomic_inc(&rrdev->nr_pending); 586 rcu_read_unlock(); 587 588 /* We have already checked bad blocks for reads. Now 589 * need to check for writes. We never accept write errors 590 * on the replacement, so we don't to check rrdev. 591 */ 592 while ((rw & WRITE) && rdev && 593 test_bit(WriteErrorSeen, &rdev->flags)) { 594 sector_t first_bad; 595 int bad_sectors; 596 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 597 &first_bad, &bad_sectors); 598 if (!bad) 599 break; 600 601 if (bad < 0) { 602 set_bit(BlockedBadBlocks, &rdev->flags); 603 if (!conf->mddev->external && 604 conf->mddev->flags) { 605 /* It is very unlikely, but we might 606 * still need to write out the 607 * bad block log - better give it 608 * a chance*/ 609 md_check_recovery(conf->mddev); 610 } 611 /* 612 * Because md_wait_for_blocked_rdev 613 * will dec nr_pending, we must 614 * increment it first. 
615 */ 616 atomic_inc(&rdev->nr_pending); 617 md_wait_for_blocked_rdev(rdev, conf->mddev); 618 } else { 619 /* Acknowledged bad block - skip the write */ 620 rdev_dec_pending(rdev, conf->mddev); 621 rdev = NULL; 622 } 623 } 624 625 if (rdev) { 626 if (s->syncing || s->expanding || s->expanded 627 || s->replacing) 628 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 629 630 set_bit(STRIPE_IO_STARTED, &sh->state); 631 632 bi->bi_bdev = rdev->bdev; 633 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 634 __func__, (unsigned long long)sh->sector, 635 bi->bi_rw, i); 636 atomic_inc(&sh->count); 637 if (use_new_offset(conf, sh)) 638 bi->bi_sector = (sh->sector 639 + rdev->new_data_offset); 640 else 641 bi->bi_sector = (sh->sector 642 + rdev->data_offset); 643 bi->bi_flags = 1 << BIO_UPTODATE; 644 bi->bi_idx = 0; 645 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 646 bi->bi_io_vec[0].bv_offset = 0; 647 bi->bi_size = STRIPE_SIZE; 648 bi->bi_next = NULL; 649 if (rrdev) 650 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 651 generic_make_request(bi); 652 } 653 if (rrdev) { 654 if (s->syncing || s->expanding || s->expanded 655 || s->replacing) 656 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 657 658 set_bit(STRIPE_IO_STARTED, &sh->state); 659 660 rbi->bi_bdev = rrdev->bdev; 661 pr_debug("%s: for %llu schedule op %ld on " 662 "replacement disc %d\n", 663 __func__, (unsigned long long)sh->sector, 664 rbi->bi_rw, i); 665 atomic_inc(&sh->count); 666 if (use_new_offset(conf, sh)) 667 rbi->bi_sector = (sh->sector 668 + rrdev->new_data_offset); 669 else 670 rbi->bi_sector = (sh->sector 671 + rrdev->data_offset); 672 rbi->bi_flags = 1 << BIO_UPTODATE; 673 rbi->bi_idx = 0; 674 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 675 rbi->bi_io_vec[0].bv_offset = 0; 676 rbi->bi_size = STRIPE_SIZE; 677 rbi->bi_next = NULL; 678 generic_make_request(rbi); 679 } 680 if (!rdev && !rrdev) { 681 if (rw & WRITE) 682 set_bit(STRIPE_DEGRADED, &sh->state); 683 pr_debug("skip op %ld on disc %d for sector %llu\n", 684 bi->bi_rw, i, (unsigned long long)sh->sector); 685 clear_bit(R5_LOCKED, &sh->dev[i].flags); 686 set_bit(STRIPE_HANDLE, &sh->state); 687 } 688 } 689 } 690 691 static struct dma_async_tx_descriptor * 692 async_copy_data(int frombio, struct bio *bio, struct page *page, 693 sector_t sector, struct dma_async_tx_descriptor *tx) 694 { 695 struct bio_vec *bvl; 696 struct page *bio_page; 697 int i; 698 int page_offset; 699 struct async_submit_ctl submit; 700 enum async_tx_flags flags = 0; 701 702 if (bio->bi_sector >= sector) 703 page_offset = (signed)(bio->bi_sector - sector) * 512; 704 else 705 page_offset = (signed)(sector - bio->bi_sector) * -512; 706 707 if (frombio) 708 flags |= ASYNC_TX_FENCE; 709 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 710 711 bio_for_each_segment(bvl, bio, i) { 712 int len = bvl->bv_len; 713 int clen; 714 int b_offset = 0; 715 716 if (page_offset < 0) { 717 b_offset = -page_offset; 718 page_offset += b_offset; 719 len -= b_offset; 720 } 721 722 if (len > 0 && page_offset + len > STRIPE_SIZE) 723 clen = STRIPE_SIZE - page_offset; 724 else 725 clen = len; 726 727 if (clen > 0) { 728 b_offset += bvl->bv_offset; 729 bio_page = bvl->bv_page; 730 if (frombio) 731 tx = async_memcpy(page, bio_page, page_offset, 732 b_offset, clen, &submit); 733 else 734 tx = async_memcpy(bio_page, page, b_offset, 735 page_offset, clen, &submit); 736 } 737 /* chain the operations */ 738 submit.depend_tx = tx; 739 740 if (clen < len) /* hit end of page */ 741 break; 742 page_offset += len; 743 } 744 745 return tx; 746 } 747 748 
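/*
 * Illustrative sketch (not called anywhere): restates the initial offset
 * computation at the top of async_copy_data() above.  A bio starting at
 * or above the stripe's device sector yields a positive byte offset into
 * the stripe page; one starting below yields a negative offset, which
 * the copy loop then skips over via b_offset.  The helper name is
 * hypothetical and exists for illustration only.
 */
static inline int r5_bio_page_offset_sketch(struct bio *bio, sector_t dev_sector)
{
	if (bio->bi_sector >= dev_sector)
		return (signed)(bio->bi_sector - dev_sector) * 512;
	else
		return (signed)(dev_sector - bio->bi_sector) * -512;
}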
static void ops_complete_biofill(void *stripe_head_ref) 749 { 750 struct stripe_head *sh = stripe_head_ref; 751 struct bio *return_bi = NULL; 752 struct r5conf *conf = sh->raid_conf; 753 int i; 754 755 pr_debug("%s: stripe %llu\n", __func__, 756 (unsigned long long)sh->sector); 757 758 /* clear completed biofills */ 759 spin_lock_irq(&conf->device_lock); 760 for (i = sh->disks; i--; ) { 761 struct r5dev *dev = &sh->dev[i]; 762 763 /* acknowledge completion of a biofill operation */ 764 /* and check if we need to reply to a read request, 765 * new R5_Wantfill requests are held off until 766 * !STRIPE_BIOFILL_RUN 767 */ 768 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 769 struct bio *rbi, *rbi2; 770 771 BUG_ON(!dev->read); 772 rbi = dev->read; 773 dev->read = NULL; 774 while (rbi && rbi->bi_sector < 775 dev->sector + STRIPE_SECTORS) { 776 rbi2 = r5_next_bio(rbi, dev->sector); 777 if (!raid5_dec_bi_phys_segments(rbi)) { 778 rbi->bi_next = return_bi; 779 return_bi = rbi; 780 } 781 rbi = rbi2; 782 } 783 } 784 } 785 spin_unlock_irq(&conf->device_lock); 786 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 787 788 return_io(return_bi); 789 790 set_bit(STRIPE_HANDLE, &sh->state); 791 release_stripe(sh); 792 } 793 794 static void ops_run_biofill(struct stripe_head *sh) 795 { 796 struct dma_async_tx_descriptor *tx = NULL; 797 struct r5conf *conf = sh->raid_conf; 798 struct async_submit_ctl submit; 799 int i; 800 801 pr_debug("%s: stripe %llu\n", __func__, 802 (unsigned long long)sh->sector); 803 804 for (i = sh->disks; i--; ) { 805 struct r5dev *dev = &sh->dev[i]; 806 if (test_bit(R5_Wantfill, &dev->flags)) { 807 struct bio *rbi; 808 spin_lock_irq(&conf->device_lock); 809 dev->read = rbi = dev->toread; 810 dev->toread = NULL; 811 spin_unlock_irq(&conf->device_lock); 812 while (rbi && rbi->bi_sector < 813 dev->sector + STRIPE_SECTORS) { 814 tx = async_copy_data(0, rbi, dev->page, 815 dev->sector, tx); 816 rbi = r5_next_bio(rbi, dev->sector); 817 } 818 } 819 } 820 821 atomic_inc(&sh->count); 822 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 823 async_trigger_callback(&submit); 824 } 825 826 static void mark_target_uptodate(struct stripe_head *sh, int target) 827 { 828 struct r5dev *tgt; 829 830 if (target < 0) 831 return; 832 833 tgt = &sh->dev[target]; 834 set_bit(R5_UPTODATE, &tgt->flags); 835 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 836 clear_bit(R5_Wantcompute, &tgt->flags); 837 } 838 839 static void ops_complete_compute(void *stripe_head_ref) 840 { 841 struct stripe_head *sh = stripe_head_ref; 842 843 pr_debug("%s: stripe %llu\n", __func__, 844 (unsigned long long)sh->sector); 845 846 /* mark the computed target(s) as uptodate */ 847 mark_target_uptodate(sh, sh->ops.target); 848 mark_target_uptodate(sh, sh->ops.target2); 849 850 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 851 if (sh->check_state == check_state_compute_run) 852 sh->check_state = check_state_compute_result; 853 set_bit(STRIPE_HANDLE, &sh->state); 854 release_stripe(sh); 855 } 856 857 /* return a pointer to the address conversion region of the scribble buffer */ 858 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 859 struct raid5_percpu *percpu) 860 { 861 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 862 } 863 864 static struct dma_async_tx_descriptor * 865 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 866 { 867 int disks = sh->disks; 868 struct page **xor_srcs = percpu->scribble; 869 int target = sh->ops.target; 870 struct r5dev *tgt = 
&sh->dev[target]; 871 struct page *xor_dest = tgt->page; 872 int count = 0; 873 struct dma_async_tx_descriptor *tx; 874 struct async_submit_ctl submit; 875 int i; 876 877 pr_debug("%s: stripe %llu block: %d\n", 878 __func__, (unsigned long long)sh->sector, target); 879 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 880 881 for (i = disks; i--; ) 882 if (i != target) 883 xor_srcs[count++] = sh->dev[i].page; 884 885 atomic_inc(&sh->count); 886 887 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 888 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 889 if (unlikely(count == 1)) 890 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 891 else 892 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 893 894 return tx; 895 } 896 897 /* set_syndrome_sources - populate source buffers for gen_syndrome 898 * @srcs - (struct page *) array of size sh->disks 899 * @sh - stripe_head to parse 900 * 901 * Populates srcs in proper layout order for the stripe and returns the 902 * 'count' of sources to be used in a call to async_gen_syndrome. The P 903 * destination buffer is recorded in srcs[count] and the Q destination 904 * is recorded in srcs[count+1]]. 905 */ 906 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 907 { 908 int disks = sh->disks; 909 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 910 int d0_idx = raid6_d0(sh); 911 int count; 912 int i; 913 914 for (i = 0; i < disks; i++) 915 srcs[i] = NULL; 916 917 count = 0; 918 i = d0_idx; 919 do { 920 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 921 922 srcs[slot] = sh->dev[i].page; 923 i = raid6_next_disk(i, disks); 924 } while (i != d0_idx); 925 926 return syndrome_disks; 927 } 928 929 static struct dma_async_tx_descriptor * 930 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 931 { 932 int disks = sh->disks; 933 struct page **blocks = percpu->scribble; 934 int target; 935 int qd_idx = sh->qd_idx; 936 struct dma_async_tx_descriptor *tx; 937 struct async_submit_ctl submit; 938 struct r5dev *tgt; 939 struct page *dest; 940 int i; 941 int count; 942 943 if (sh->ops.target < 0) 944 target = sh->ops.target2; 945 else if (sh->ops.target2 < 0) 946 target = sh->ops.target; 947 else 948 /* we should only have one valid target */ 949 BUG(); 950 BUG_ON(target < 0); 951 pr_debug("%s: stripe %llu block: %d\n", 952 __func__, (unsigned long long)sh->sector, target); 953 954 tgt = &sh->dev[target]; 955 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 956 dest = tgt->page; 957 958 atomic_inc(&sh->count); 959 960 if (target == qd_idx) { 961 count = set_syndrome_sources(blocks, sh); 962 blocks[count] = NULL; /* regenerating p is not necessary */ 963 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 964 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 965 ops_complete_compute, sh, 966 to_addr_conv(sh, percpu)); 967 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 968 } else { 969 /* Compute any data- or p-drive using XOR */ 970 count = 0; 971 for (i = disks; i-- ; ) { 972 if (i == target || i == qd_idx) 973 continue; 974 blocks[count++] = sh->dev[i].page; 975 } 976 977 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 978 NULL, ops_complete_compute, sh, 979 to_addr_conv(sh, percpu)); 980 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 981 } 982 983 return tx; 984 } 985 986 static struct dma_async_tx_descriptor * 987 ops_run_compute6_2(struct stripe_head *sh, struct 
raid5_percpu *percpu) 988 { 989 int i, count, disks = sh->disks; 990 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 991 int d0_idx = raid6_d0(sh); 992 int faila = -1, failb = -1; 993 int target = sh->ops.target; 994 int target2 = sh->ops.target2; 995 struct r5dev *tgt = &sh->dev[target]; 996 struct r5dev *tgt2 = &sh->dev[target2]; 997 struct dma_async_tx_descriptor *tx; 998 struct page **blocks = percpu->scribble; 999 struct async_submit_ctl submit; 1000 1001 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1002 __func__, (unsigned long long)sh->sector, target, target2); 1003 BUG_ON(target < 0 || target2 < 0); 1004 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1005 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1006 1007 /* we need to open-code set_syndrome_sources to handle the 1008 * slot number conversion for 'faila' and 'failb' 1009 */ 1010 for (i = 0; i < disks ; i++) 1011 blocks[i] = NULL; 1012 count = 0; 1013 i = d0_idx; 1014 do { 1015 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1016 1017 blocks[slot] = sh->dev[i].page; 1018 1019 if (i == target) 1020 faila = slot; 1021 if (i == target2) 1022 failb = slot; 1023 i = raid6_next_disk(i, disks); 1024 } while (i != d0_idx); 1025 1026 BUG_ON(faila == failb); 1027 if (failb < faila) 1028 swap(faila, failb); 1029 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1030 __func__, (unsigned long long)sh->sector, faila, failb); 1031 1032 atomic_inc(&sh->count); 1033 1034 if (failb == syndrome_disks+1) { 1035 /* Q disk is one of the missing disks */ 1036 if (faila == syndrome_disks) { 1037 /* Missing P+Q, just recompute */ 1038 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1039 ops_complete_compute, sh, 1040 to_addr_conv(sh, percpu)); 1041 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1042 STRIPE_SIZE, &submit); 1043 } else { 1044 struct page *dest; 1045 int data_target; 1046 int qd_idx = sh->qd_idx; 1047 1048 /* Missing D+Q: recompute D from P, then recompute Q */ 1049 if (target == qd_idx) 1050 data_target = target2; 1051 else 1052 data_target = target; 1053 1054 count = 0; 1055 for (i = disks; i-- ; ) { 1056 if (i == data_target || i == qd_idx) 1057 continue; 1058 blocks[count++] = sh->dev[i].page; 1059 } 1060 dest = sh->dev[data_target].page; 1061 init_async_submit(&submit, 1062 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1063 NULL, NULL, NULL, 1064 to_addr_conv(sh, percpu)); 1065 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1066 &submit); 1067 1068 count = set_syndrome_sources(blocks, sh); 1069 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1070 ops_complete_compute, sh, 1071 to_addr_conv(sh, percpu)); 1072 return async_gen_syndrome(blocks, 0, count+2, 1073 STRIPE_SIZE, &submit); 1074 } 1075 } else { 1076 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1077 ops_complete_compute, sh, 1078 to_addr_conv(sh, percpu)); 1079 if (failb == syndrome_disks) { 1080 /* We're missing D+P. */ 1081 return async_raid6_datap_recov(syndrome_disks+2, 1082 STRIPE_SIZE, faila, 1083 blocks, &submit); 1084 } else { 1085 /* We're missing D+D. 
*/ 1086 return async_raid6_2data_recov(syndrome_disks+2, 1087 STRIPE_SIZE, faila, failb, 1088 blocks, &submit); 1089 } 1090 } 1091 } 1092 1093 1094 static void ops_complete_prexor(void *stripe_head_ref) 1095 { 1096 struct stripe_head *sh = stripe_head_ref; 1097 1098 pr_debug("%s: stripe %llu\n", __func__, 1099 (unsigned long long)sh->sector); 1100 } 1101 1102 static struct dma_async_tx_descriptor * 1103 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 1104 struct dma_async_tx_descriptor *tx) 1105 { 1106 int disks = sh->disks; 1107 struct page **xor_srcs = percpu->scribble; 1108 int count = 0, pd_idx = sh->pd_idx, i; 1109 struct async_submit_ctl submit; 1110 1111 /* existing parity data subtracted */ 1112 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1113 1114 pr_debug("%s: stripe %llu\n", __func__, 1115 (unsigned long long)sh->sector); 1116 1117 for (i = disks; i--; ) { 1118 struct r5dev *dev = &sh->dev[i]; 1119 /* Only process blocks that are known to be uptodate */ 1120 if (test_bit(R5_Wantdrain, &dev->flags)) 1121 xor_srcs[count++] = dev->page; 1122 } 1123 1124 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1125 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1126 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1127 1128 return tx; 1129 } 1130 1131 static struct dma_async_tx_descriptor * 1132 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1133 { 1134 int disks = sh->disks; 1135 int i; 1136 1137 pr_debug("%s: stripe %llu\n", __func__, 1138 (unsigned long long)sh->sector); 1139 1140 for (i = disks; i--; ) { 1141 struct r5dev *dev = &sh->dev[i]; 1142 struct bio *chosen; 1143 1144 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1145 struct bio *wbi; 1146 1147 spin_lock_irq(&sh->raid_conf->device_lock); 1148 chosen = dev->towrite; 1149 dev->towrite = NULL; 1150 BUG_ON(dev->written); 1151 wbi = dev->written = chosen; 1152 spin_unlock_irq(&sh->raid_conf->device_lock); 1153 1154 while (wbi && wbi->bi_sector < 1155 dev->sector + STRIPE_SECTORS) { 1156 if (wbi->bi_rw & REQ_FUA) 1157 set_bit(R5_WantFUA, &dev->flags); 1158 if (wbi->bi_rw & REQ_SYNC) 1159 set_bit(R5_SyncIO, &dev->flags); 1160 tx = async_copy_data(1, wbi, dev->page, 1161 dev->sector, tx); 1162 wbi = r5_next_bio(wbi, dev->sector); 1163 } 1164 } 1165 } 1166 1167 return tx; 1168 } 1169 1170 static void ops_complete_reconstruct(void *stripe_head_ref) 1171 { 1172 struct stripe_head *sh = stripe_head_ref; 1173 int disks = sh->disks; 1174 int pd_idx = sh->pd_idx; 1175 int qd_idx = sh->qd_idx; 1176 int i; 1177 bool fua = false, sync = false; 1178 1179 pr_debug("%s: stripe %llu\n", __func__, 1180 (unsigned long long)sh->sector); 1181 1182 for (i = disks; i--; ) { 1183 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1184 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1185 } 1186 1187 for (i = disks; i--; ) { 1188 struct r5dev *dev = &sh->dev[i]; 1189 1190 if (dev->written || i == pd_idx || i == qd_idx) { 1191 set_bit(R5_UPTODATE, &dev->flags); 1192 if (fua) 1193 set_bit(R5_WantFUA, &dev->flags); 1194 if (sync) 1195 set_bit(R5_SyncIO, &dev->flags); 1196 } 1197 } 1198 1199 if (sh->reconstruct_state == reconstruct_state_drain_run) 1200 sh->reconstruct_state = reconstruct_state_drain_result; 1201 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1202 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1203 else { 1204 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1205 
sh->reconstruct_state = reconstruct_state_result; 1206 } 1207 1208 set_bit(STRIPE_HANDLE, &sh->state); 1209 release_stripe(sh); 1210 } 1211 1212 static void 1213 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1214 struct dma_async_tx_descriptor *tx) 1215 { 1216 int disks = sh->disks; 1217 struct page **xor_srcs = percpu->scribble; 1218 struct async_submit_ctl submit; 1219 int count = 0, pd_idx = sh->pd_idx, i; 1220 struct page *xor_dest; 1221 int prexor = 0; 1222 unsigned long flags; 1223 1224 pr_debug("%s: stripe %llu\n", __func__, 1225 (unsigned long long)sh->sector); 1226 1227 /* check if prexor is active which means only process blocks 1228 * that are part of a read-modify-write (written) 1229 */ 1230 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1231 prexor = 1; 1232 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1233 for (i = disks; i--; ) { 1234 struct r5dev *dev = &sh->dev[i]; 1235 if (dev->written) 1236 xor_srcs[count++] = dev->page; 1237 } 1238 } else { 1239 xor_dest = sh->dev[pd_idx].page; 1240 for (i = disks; i--; ) { 1241 struct r5dev *dev = &sh->dev[i]; 1242 if (i != pd_idx) 1243 xor_srcs[count++] = dev->page; 1244 } 1245 } 1246 1247 /* 1/ if we prexor'd then the dest is reused as a source 1248 * 2/ if we did not prexor then we are redoing the parity 1249 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1250 * for the synchronous xor case 1251 */ 1252 flags = ASYNC_TX_ACK | 1253 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1254 1255 atomic_inc(&sh->count); 1256 1257 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1258 to_addr_conv(sh, percpu)); 1259 if (unlikely(count == 1)) 1260 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1261 else 1262 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1263 } 1264 1265 static void 1266 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1267 struct dma_async_tx_descriptor *tx) 1268 { 1269 struct async_submit_ctl submit; 1270 struct page **blocks = percpu->scribble; 1271 int count; 1272 1273 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1274 1275 count = set_syndrome_sources(blocks, sh); 1276 1277 atomic_inc(&sh->count); 1278 1279 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1280 sh, to_addr_conv(sh, percpu)); 1281 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1282 } 1283 1284 static void ops_complete_check(void *stripe_head_ref) 1285 { 1286 struct stripe_head *sh = stripe_head_ref; 1287 1288 pr_debug("%s: stripe %llu\n", __func__, 1289 (unsigned long long)sh->sector); 1290 1291 sh->check_state = check_state_check_result; 1292 set_bit(STRIPE_HANDLE, &sh->state); 1293 release_stripe(sh); 1294 } 1295 1296 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1297 { 1298 int disks = sh->disks; 1299 int pd_idx = sh->pd_idx; 1300 int qd_idx = sh->qd_idx; 1301 struct page *xor_dest; 1302 struct page **xor_srcs = percpu->scribble; 1303 struct dma_async_tx_descriptor *tx; 1304 struct async_submit_ctl submit; 1305 int count; 1306 int i; 1307 1308 pr_debug("%s: stripe %llu\n", __func__, 1309 (unsigned long long)sh->sector); 1310 1311 count = 0; 1312 xor_dest = sh->dev[pd_idx].page; 1313 xor_srcs[count++] = xor_dest; 1314 for (i = disks; i--; ) { 1315 if (i == pd_idx || i == qd_idx) 1316 continue; 1317 xor_srcs[count++] = sh->dev[i].page; 1318 } 1319 1320 init_async_submit(&submit, 0, NULL, NULL, NULL, 
1321 to_addr_conv(sh, percpu)); 1322 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1323 &sh->ops.zero_sum_result, &submit); 1324 1325 atomic_inc(&sh->count); 1326 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1327 tx = async_trigger_callback(&submit); 1328 } 1329 1330 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1331 { 1332 struct page **srcs = percpu->scribble; 1333 struct async_submit_ctl submit; 1334 int count; 1335 1336 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1337 (unsigned long long)sh->sector, checkp); 1338 1339 count = set_syndrome_sources(srcs, sh); 1340 if (!checkp) 1341 srcs[count] = NULL; 1342 1343 atomic_inc(&sh->count); 1344 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1345 sh, to_addr_conv(sh, percpu)); 1346 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1347 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1348 } 1349 1350 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1351 { 1352 int overlap_clear = 0, i, disks = sh->disks; 1353 struct dma_async_tx_descriptor *tx = NULL; 1354 struct r5conf *conf = sh->raid_conf; 1355 int level = conf->level; 1356 struct raid5_percpu *percpu; 1357 unsigned long cpu; 1358 1359 cpu = get_cpu(); 1360 percpu = per_cpu_ptr(conf->percpu, cpu); 1361 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1362 ops_run_biofill(sh); 1363 overlap_clear++; 1364 } 1365 1366 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1367 if (level < 6) 1368 tx = ops_run_compute5(sh, percpu); 1369 else { 1370 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1371 tx = ops_run_compute6_1(sh, percpu); 1372 else 1373 tx = ops_run_compute6_2(sh, percpu); 1374 } 1375 /* terminate the chain if reconstruct is not set to be run */ 1376 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1377 async_tx_ack(tx); 1378 } 1379 1380 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1381 tx = ops_run_prexor(sh, percpu, tx); 1382 1383 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1384 tx = ops_run_biodrain(sh, tx); 1385 overlap_clear++; 1386 } 1387 1388 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1389 if (level < 6) 1390 ops_run_reconstruct5(sh, percpu, tx); 1391 else 1392 ops_run_reconstruct6(sh, percpu, tx); 1393 } 1394 1395 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1396 if (sh->check_state == check_state_run) 1397 ops_run_check_p(sh, percpu); 1398 else if (sh->check_state == check_state_run_q) 1399 ops_run_check_pq(sh, percpu, 0); 1400 else if (sh->check_state == check_state_run_pq) 1401 ops_run_check_pq(sh, percpu, 1); 1402 else 1403 BUG(); 1404 } 1405 1406 if (overlap_clear) 1407 for (i = disks; i--; ) { 1408 struct r5dev *dev = &sh->dev[i]; 1409 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1410 wake_up(&sh->raid_conf->wait_for_overlap); 1411 } 1412 put_cpu(); 1413 } 1414 1415 #ifdef CONFIG_MULTICORE_RAID456 1416 static void async_run_ops(void *param, async_cookie_t cookie) 1417 { 1418 struct stripe_head *sh = param; 1419 unsigned long ops_request = sh->ops.request; 1420 1421 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); 1422 wake_up(&sh->ops.wait_for_ops); 1423 1424 __raid_run_ops(sh, ops_request); 1425 release_stripe(sh); 1426 } 1427 1428 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1429 { 1430 /* since handle_stripe can be called outside of raid5d context 1431 * we need to ensure sh->ops.request is de-staged before another 1432 * request arrives 1433 */ 
	wait_event(sh->ops.wait_for_ops,
		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
	sh->ops.request = ops_request;

	atomic_inc(&sh->count);
	async_schedule(async_run_ops, sh);
}
#else
#define raid_run_ops __raid_run_ops
#endif

static int grow_one_stripe(struct r5conf *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;

	sh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
#endif

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(struct r5conf *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}
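/*
 * Illustrative sketch (not used by the driver): the scribble region
 * sized by scribble_len() above is laid out as (num + 2) struct page
 * pointers followed by (num + 2) addr_conv_t slots; to_addr_conv()
 * returns the second half.  The hypothetical helper below only names
 * the first half, i.e. the source/destination page list handed to the
 * async xor/syndrome routines.
 */
static inline struct page **scribble_pages_sketch(struct raid5_percpu *percpu)
{
	/* the page-pointer array sits at the start of the scribble buffer */
	return percpu->scribble;
}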
static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	unsigned long cpu;
	int err;
	struct kmem_cache *sc;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		nsh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
		init_waitqueue_head(&nsh->ops.wait_for_ops);
#endif

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(nsh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock,
				    );
		osh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);
		atomic_set(&nsh->count, 1);
		for(i=0; i<conf->pool_size; i++)
			nsh->dev[i].page = osh->dev[i].page;
		for( ; i<newsize; i++)
			nsh->dev[i].page = NULL;
		kmem_cache_free(conf->slab_cache, osh);
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
1610 * At this point, we are holding all the stripes so the array 1611 * is completely stalled, so now is a good time to resize 1612 * conf->disks and the scribble region 1613 */ 1614 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1615 if (ndisks) { 1616 for (i=0; i<conf->raid_disks; i++) 1617 ndisks[i] = conf->disks[i]; 1618 kfree(conf->disks); 1619 conf->disks = ndisks; 1620 } else 1621 err = -ENOMEM; 1622 1623 get_online_cpus(); 1624 conf->scribble_len = scribble_len(newsize); 1625 for_each_present_cpu(cpu) { 1626 struct raid5_percpu *percpu; 1627 void *scribble; 1628 1629 percpu = per_cpu_ptr(conf->percpu, cpu); 1630 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1631 1632 if (scribble) { 1633 kfree(percpu->scribble); 1634 percpu->scribble = scribble; 1635 } else { 1636 err = -ENOMEM; 1637 break; 1638 } 1639 } 1640 put_online_cpus(); 1641 1642 /* Step 4, return new stripes to service */ 1643 while(!list_empty(&newstripes)) { 1644 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1645 list_del_init(&nsh->lru); 1646 1647 for (i=conf->raid_disks; i < newsize; i++) 1648 if (nsh->dev[i].page == NULL) { 1649 struct page *p = alloc_page(GFP_NOIO); 1650 nsh->dev[i].page = p; 1651 if (!p) 1652 err = -ENOMEM; 1653 } 1654 release_stripe(nsh); 1655 } 1656 /* critical section pass, GFP_NOIO no longer needed */ 1657 1658 conf->slab_cache = sc; 1659 conf->active_name = 1-conf->active_name; 1660 conf->pool_size = newsize; 1661 return err; 1662 } 1663 1664 static int drop_one_stripe(struct r5conf *conf) 1665 { 1666 struct stripe_head *sh; 1667 1668 spin_lock_irq(&conf->device_lock); 1669 sh = get_free_stripe(conf); 1670 spin_unlock_irq(&conf->device_lock); 1671 if (!sh) 1672 return 0; 1673 BUG_ON(atomic_read(&sh->count)); 1674 shrink_buffers(sh); 1675 kmem_cache_free(conf->slab_cache, sh); 1676 atomic_dec(&conf->active_stripes); 1677 return 1; 1678 } 1679 1680 static void shrink_stripes(struct r5conf *conf) 1681 { 1682 while (drop_one_stripe(conf)) 1683 ; 1684 1685 if (conf->slab_cache) 1686 kmem_cache_destroy(conf->slab_cache); 1687 conf->slab_cache = NULL; 1688 } 1689 1690 static void raid5_end_read_request(struct bio * bi, int error) 1691 { 1692 struct stripe_head *sh = bi->bi_private; 1693 struct r5conf *conf = sh->raid_conf; 1694 int disks = sh->disks, i; 1695 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1696 char b[BDEVNAME_SIZE]; 1697 struct md_rdev *rdev = NULL; 1698 sector_t s; 1699 1700 for (i=0 ; i<disks; i++) 1701 if (bi == &sh->dev[i].req) 1702 break; 1703 1704 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1705 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1706 uptodate); 1707 if (i == disks) { 1708 BUG(); 1709 return; 1710 } 1711 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1712 /* If replacement finished while this request was outstanding, 1713 * 'replacement' might be NULL already. 1714 * In that case it moved down to 'rdev'. 1715 * rdev is not removed until all requests are finished. 1716 */ 1717 rdev = conf->disks[i].replacement; 1718 if (!rdev) 1719 rdev = conf->disks[i].rdev; 1720 1721 if (use_new_offset(conf, sh)) 1722 s = sh->sector + rdev->new_data_offset; 1723 else 1724 s = sh->sector + rdev->data_offset; 1725 if (uptodate) { 1726 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1727 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1728 /* Note that this cannot happen on a 1729 * replacement device. 
We just fail those on 1730 * any error 1731 */ 1732 printk_ratelimited( 1733 KERN_INFO 1734 "md/raid:%s: read error corrected" 1735 " (%lu sectors at %llu on %s)\n", 1736 mdname(conf->mddev), STRIPE_SECTORS, 1737 (unsigned long long)s, 1738 bdevname(rdev->bdev, b)); 1739 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1740 clear_bit(R5_ReadError, &sh->dev[i].flags); 1741 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1742 } 1743 if (atomic_read(&rdev->read_errors)) 1744 atomic_set(&rdev->read_errors, 0); 1745 } else { 1746 const char *bdn = bdevname(rdev->bdev, b); 1747 int retry = 0; 1748 int set_bad = 0; 1749 1750 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1751 atomic_inc(&rdev->read_errors); 1752 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1753 printk_ratelimited( 1754 KERN_WARNING 1755 "md/raid:%s: read error on replacement device " 1756 "(sector %llu on %s).\n", 1757 mdname(conf->mddev), 1758 (unsigned long long)s, 1759 bdn); 1760 else if (conf->mddev->degraded >= conf->max_degraded) { 1761 set_bad = 1; 1762 printk_ratelimited( 1763 KERN_WARNING 1764 "md/raid:%s: read error not correctable " 1765 "(sector %llu on %s).\n", 1766 mdname(conf->mddev), 1767 (unsigned long long)s, 1768 bdn); 1769 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 1770 /* Oh, no!!! */ 1771 set_bad = 1; 1772 printk_ratelimited( 1773 KERN_WARNING 1774 "md/raid:%s: read error NOT corrected!! " 1775 "(sector %llu on %s).\n", 1776 mdname(conf->mddev), 1777 (unsigned long long)s, 1778 bdn); 1779 } else if (atomic_read(&rdev->read_errors) 1780 > conf->max_nr_stripes) 1781 printk(KERN_WARNING 1782 "md/raid:%s: Too many read errors, failing device %s.\n", 1783 mdname(conf->mddev), bdn); 1784 else 1785 retry = 1; 1786 if (retry) 1787 set_bit(R5_ReadError, &sh->dev[i].flags); 1788 else { 1789 clear_bit(R5_ReadError, &sh->dev[i].flags); 1790 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1791 if (!(set_bad 1792 && test_bit(In_sync, &rdev->flags) 1793 && rdev_set_badblocks( 1794 rdev, sh->sector, STRIPE_SECTORS, 0))) 1795 md_error(conf->mddev, rdev); 1796 } 1797 } 1798 rdev_dec_pending(rdev, conf->mddev); 1799 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1800 set_bit(STRIPE_HANDLE, &sh->state); 1801 release_stripe(sh); 1802 } 1803 1804 static void raid5_end_write_request(struct bio *bi, int error) 1805 { 1806 struct stripe_head *sh = bi->bi_private; 1807 struct r5conf *conf = sh->raid_conf; 1808 int disks = sh->disks, i; 1809 struct md_rdev *uninitialized_var(rdev); 1810 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1811 sector_t first_bad; 1812 int bad_sectors; 1813 int replacement = 0; 1814 1815 for (i = 0 ; i < disks; i++) { 1816 if (bi == &sh->dev[i].req) { 1817 rdev = conf->disks[i].rdev; 1818 break; 1819 } 1820 if (bi == &sh->dev[i].rreq) { 1821 rdev = conf->disks[i].replacement; 1822 if (rdev) 1823 replacement = 1; 1824 else 1825 /* rdev was removed and 'replacement' 1826 * replaced it. rdev is not removed 1827 * until all requests are finished. 
1828 */ 1829 rdev = conf->disks[i].rdev; 1830 break; 1831 } 1832 } 1833 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1834 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1835 uptodate); 1836 if (i == disks) { 1837 BUG(); 1838 return; 1839 } 1840 1841 if (replacement) { 1842 if (!uptodate) 1843 md_error(conf->mddev, rdev); 1844 else if (is_badblock(rdev, sh->sector, 1845 STRIPE_SECTORS, 1846 &first_bad, &bad_sectors)) 1847 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1848 } else { 1849 if (!uptodate) { 1850 set_bit(WriteErrorSeen, &rdev->flags); 1851 set_bit(R5_WriteError, &sh->dev[i].flags); 1852 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1853 set_bit(MD_RECOVERY_NEEDED, 1854 &rdev->mddev->recovery); 1855 } else if (is_badblock(rdev, sh->sector, 1856 STRIPE_SECTORS, 1857 &first_bad, &bad_sectors)) 1858 set_bit(R5_MadeGood, &sh->dev[i].flags); 1859 } 1860 rdev_dec_pending(rdev, conf->mddev); 1861 1862 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1863 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1864 set_bit(STRIPE_HANDLE, &sh->state); 1865 release_stripe(sh); 1866 } 1867 1868 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1869 1870 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1871 { 1872 struct r5dev *dev = &sh->dev[i]; 1873 1874 bio_init(&dev->req); 1875 dev->req.bi_io_vec = &dev->vec; 1876 dev->req.bi_vcnt++; 1877 dev->req.bi_max_vecs++; 1878 dev->req.bi_private = sh; 1879 dev->vec.bv_page = dev->page; 1880 1881 bio_init(&dev->rreq); 1882 dev->rreq.bi_io_vec = &dev->rvec; 1883 dev->rreq.bi_vcnt++; 1884 dev->rreq.bi_max_vecs++; 1885 dev->rreq.bi_private = sh; 1886 dev->rvec.bv_page = dev->page; 1887 1888 dev->flags = 0; 1889 dev->sector = compute_blocknr(sh, i, previous); 1890 } 1891 1892 static void error(struct mddev *mddev, struct md_rdev *rdev) 1893 { 1894 char b[BDEVNAME_SIZE]; 1895 struct r5conf *conf = mddev->private; 1896 unsigned long flags; 1897 pr_debug("raid456: error called\n"); 1898 1899 spin_lock_irqsave(&conf->device_lock, flags); 1900 clear_bit(In_sync, &rdev->flags); 1901 mddev->degraded = calc_degraded(conf); 1902 spin_unlock_irqrestore(&conf->device_lock, flags); 1903 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1904 1905 set_bit(Blocked, &rdev->flags); 1906 set_bit(Faulty, &rdev->flags); 1907 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1908 printk(KERN_ALERT 1909 "md/raid:%s: Disk failure on %s, disabling device.\n" 1910 "md/raid:%s: Operation continuing on %d devices.\n", 1911 mdname(mddev), 1912 bdevname(rdev->bdev, b), 1913 mdname(mddev), 1914 conf->raid_disks - mddev->degraded); 1915 } 1916 1917 /* 1918 * Input: a 'big' sector number, 1919 * Output: index of the data and parity disk, and the sector # in them. 1920 */ 1921 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1922 int previous, int *dd_idx, 1923 struct stripe_head *sh) 1924 { 1925 sector_t stripe, stripe2; 1926 sector_t chunk_number; 1927 unsigned int chunk_offset; 1928 int pd_idx, qd_idx; 1929 int ddf_layout = 0; 1930 sector_t new_sector; 1931 int algorithm = previous ? conf->prev_algo 1932 : conf->algorithm; 1933 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1934 : conf->chunk_sectors; 1935 int raid_disks = previous ? 
conf->previous_raid_disks 1936 : conf->raid_disks; 1937 int data_disks = raid_disks - conf->max_degraded; 1938 1939 /* First compute the information on this sector */ 1940 1941 /* 1942 * Compute the chunk number and the sector offset inside the chunk 1943 */ 1944 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1945 chunk_number = r_sector; 1946 1947 /* 1948 * Compute the stripe number 1949 */ 1950 stripe = chunk_number; 1951 *dd_idx = sector_div(stripe, data_disks); 1952 stripe2 = stripe; 1953 /* 1954 * Select the parity disk based on the user selected algorithm. 1955 */ 1956 pd_idx = qd_idx = -1; 1957 switch(conf->level) { 1958 case 4: 1959 pd_idx = data_disks; 1960 break; 1961 case 5: 1962 switch (algorithm) { 1963 case ALGORITHM_LEFT_ASYMMETRIC: 1964 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1965 if (*dd_idx >= pd_idx) 1966 (*dd_idx)++; 1967 break; 1968 case ALGORITHM_RIGHT_ASYMMETRIC: 1969 pd_idx = sector_div(stripe2, raid_disks); 1970 if (*dd_idx >= pd_idx) 1971 (*dd_idx)++; 1972 break; 1973 case ALGORITHM_LEFT_SYMMETRIC: 1974 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1975 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1976 break; 1977 case ALGORITHM_RIGHT_SYMMETRIC: 1978 pd_idx = sector_div(stripe2, raid_disks); 1979 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1980 break; 1981 case ALGORITHM_PARITY_0: 1982 pd_idx = 0; 1983 (*dd_idx)++; 1984 break; 1985 case ALGORITHM_PARITY_N: 1986 pd_idx = data_disks; 1987 break; 1988 default: 1989 BUG(); 1990 } 1991 break; 1992 case 6: 1993 1994 switch (algorithm) { 1995 case ALGORITHM_LEFT_ASYMMETRIC: 1996 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1997 qd_idx = pd_idx + 1; 1998 if (pd_idx == raid_disks-1) { 1999 (*dd_idx)++; /* Q D D D P */ 2000 qd_idx = 0; 2001 } else if (*dd_idx >= pd_idx) 2002 (*dd_idx) += 2; /* D D P Q D */ 2003 break; 2004 case ALGORITHM_RIGHT_ASYMMETRIC: 2005 pd_idx = sector_div(stripe2, raid_disks); 2006 qd_idx = pd_idx + 1; 2007 if (pd_idx == raid_disks-1) { 2008 (*dd_idx)++; /* Q D D D P */ 2009 qd_idx = 0; 2010 } else if (*dd_idx >= pd_idx) 2011 (*dd_idx) += 2; /* D D P Q D */ 2012 break; 2013 case ALGORITHM_LEFT_SYMMETRIC: 2014 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2015 qd_idx = (pd_idx + 1) % raid_disks; 2016 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2017 break; 2018 case ALGORITHM_RIGHT_SYMMETRIC: 2019 pd_idx = sector_div(stripe2, raid_disks); 2020 qd_idx = (pd_idx + 1) % raid_disks; 2021 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2022 break; 2023 2024 case ALGORITHM_PARITY_0: 2025 pd_idx = 0; 2026 qd_idx = 1; 2027 (*dd_idx) += 2; 2028 break; 2029 case ALGORITHM_PARITY_N: 2030 pd_idx = data_disks; 2031 qd_idx = data_disks + 1; 2032 break; 2033 2034 case ALGORITHM_ROTATING_ZERO_RESTART: 2035 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2036 * of blocks for computing Q is different. 
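			 * (i.e. the order in which the blocks are fed into the
			 * Q-syndrome computation differs: the DDF layout counts
			 * the syndrome disks starting from device 0, which is
			 * why ddf_layout is set for this case)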
2037			 */
2038			pd_idx = sector_div(stripe2, raid_disks);
2039			qd_idx = pd_idx + 1;
2040			if (pd_idx == raid_disks-1) {
2041				(*dd_idx)++;	/* Q D D D P */
2042				qd_idx = 0;
2043			} else if (*dd_idx >= pd_idx)
2044				(*dd_idx) += 2; /* D D P Q D */
2045			ddf_layout = 1;
2046			break;
2047
2048		case ALGORITHM_ROTATING_N_RESTART:
2049			/* Same as left_asymmetric, but the first stripe is
2050			 * D D D P Q  rather than
2051			 * Q D D D P
2052			 */
2053			stripe2 += 1;
2054			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2055			qd_idx = pd_idx + 1;
2056			if (pd_idx == raid_disks-1) {
2057				(*dd_idx)++;	/* Q D D D P */
2058				qd_idx = 0;
2059			} else if (*dd_idx >= pd_idx)
2060				(*dd_idx) += 2; /* D D P Q D */
2061			ddf_layout = 1;
2062			break;
2063
2064		case ALGORITHM_ROTATING_N_CONTINUE:
2065			/* Same as left_symmetric but Q is before P */
2066			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2067			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2068			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2069			ddf_layout = 1;
2070			break;
2071
2072		case ALGORITHM_LEFT_ASYMMETRIC_6:
2073			/* RAID5 left_asymmetric, with Q on last device */
2074			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2075			if (*dd_idx >= pd_idx)
2076				(*dd_idx)++;
2077			qd_idx = raid_disks - 1;
2078			break;
2079
2080		case ALGORITHM_RIGHT_ASYMMETRIC_6:
2081			pd_idx = sector_div(stripe2, raid_disks-1);
2082			if (*dd_idx >= pd_idx)
2083				(*dd_idx)++;
2084			qd_idx = raid_disks - 1;
2085			break;
2086
2087		case ALGORITHM_LEFT_SYMMETRIC_6:
2088			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2089			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2090			qd_idx = raid_disks - 1;
2091			break;
2092
2093		case ALGORITHM_RIGHT_SYMMETRIC_6:
2094			pd_idx = sector_div(stripe2, raid_disks-1);
2095			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2096			qd_idx = raid_disks - 1;
2097			break;
2098
2099		case ALGORITHM_PARITY_0_6:
2100			pd_idx = 0;
2101			(*dd_idx)++;
2102			qd_idx = raid_disks - 1;
2103			break;
2104
2105		default:
2106			BUG();
2107		}
2108		break;
2109	}
2110
2111	if (sh) {
2112		sh->pd_idx = pd_idx;
2113		sh->qd_idx = qd_idx;
2114		sh->ddf_layout = ddf_layout;
2115	}
2116	/*
2117	 * Finally, compute the new sector number
2118	 */
2119	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2120	return new_sector;
2121 }
2122
2123
2124 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
2125 {
2126	struct r5conf *conf = sh->raid_conf;
2127	int raid_disks = sh->disks;
2128	int data_disks = raid_disks - conf->max_degraded;
2129	sector_t new_sector = sh->sector, check;
2130	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2131					  : conf->chunk_sectors;
2132	int algorithm = previous ?
conf->prev_algo 2133 : conf->algorithm; 2134 sector_t stripe; 2135 int chunk_offset; 2136 sector_t chunk_number; 2137 int dummy1, dd_idx = i; 2138 sector_t r_sector; 2139 struct stripe_head sh2; 2140 2141 2142 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2143 stripe = new_sector; 2144 2145 if (i == sh->pd_idx) 2146 return 0; 2147 switch(conf->level) { 2148 case 4: break; 2149 case 5: 2150 switch (algorithm) { 2151 case ALGORITHM_LEFT_ASYMMETRIC: 2152 case ALGORITHM_RIGHT_ASYMMETRIC: 2153 if (i > sh->pd_idx) 2154 i--; 2155 break; 2156 case ALGORITHM_LEFT_SYMMETRIC: 2157 case ALGORITHM_RIGHT_SYMMETRIC: 2158 if (i < sh->pd_idx) 2159 i += raid_disks; 2160 i -= (sh->pd_idx + 1); 2161 break; 2162 case ALGORITHM_PARITY_0: 2163 i -= 1; 2164 break; 2165 case ALGORITHM_PARITY_N: 2166 break; 2167 default: 2168 BUG(); 2169 } 2170 break; 2171 case 6: 2172 if (i == sh->qd_idx) 2173 return 0; /* It is the Q disk */ 2174 switch (algorithm) { 2175 case ALGORITHM_LEFT_ASYMMETRIC: 2176 case ALGORITHM_RIGHT_ASYMMETRIC: 2177 case ALGORITHM_ROTATING_ZERO_RESTART: 2178 case ALGORITHM_ROTATING_N_RESTART: 2179 if (sh->pd_idx == raid_disks-1) 2180 i--; /* Q D D D P */ 2181 else if (i > sh->pd_idx) 2182 i -= 2; /* D D P Q D */ 2183 break; 2184 case ALGORITHM_LEFT_SYMMETRIC: 2185 case ALGORITHM_RIGHT_SYMMETRIC: 2186 if (sh->pd_idx == raid_disks-1) 2187 i--; /* Q D D D P */ 2188 else { 2189 /* D D P Q D */ 2190 if (i < sh->pd_idx) 2191 i += raid_disks; 2192 i -= (sh->pd_idx + 2); 2193 } 2194 break; 2195 case ALGORITHM_PARITY_0: 2196 i -= 2; 2197 break; 2198 case ALGORITHM_PARITY_N: 2199 break; 2200 case ALGORITHM_ROTATING_N_CONTINUE: 2201 /* Like left_symmetric, but P is before Q */ 2202 if (sh->pd_idx == 0) 2203 i--; /* P D D D Q */ 2204 else { 2205 /* D D Q P D */ 2206 if (i < sh->pd_idx) 2207 i += raid_disks; 2208 i -= (sh->pd_idx + 1); 2209 } 2210 break; 2211 case ALGORITHM_LEFT_ASYMMETRIC_6: 2212 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2213 if (i > sh->pd_idx) 2214 i--; 2215 break; 2216 case ALGORITHM_LEFT_SYMMETRIC_6: 2217 case ALGORITHM_RIGHT_SYMMETRIC_6: 2218 if (i < sh->pd_idx) 2219 i += data_disks + 1; 2220 i -= (sh->pd_idx + 1); 2221 break; 2222 case ALGORITHM_PARITY_0_6: 2223 i -= 1; 2224 break; 2225 default: 2226 BUG(); 2227 } 2228 break; 2229 } 2230 2231 chunk_number = stripe * data_disks + i; 2232 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2233 2234 check = raid5_compute_sector(conf, r_sector, 2235 previous, &dummy1, &sh2); 2236 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2237 || sh2.qd_idx != sh->qd_idx) { 2238 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2239 mdname(conf->mddev)); 2240 return 0; 2241 } 2242 return r_sector; 2243 } 2244 2245 2246 static void 2247 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2248 int rcw, int expand) 2249 { 2250 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2251 struct r5conf *conf = sh->raid_conf; 2252 int level = conf->level; 2253 2254 if (rcw) { 2255 /* if we are not expanding this is a proper write request, and 2256 * there will be bios with new data to be drained into the 2257 * stripe cache 2258 */ 2259 if (!expand) { 2260 sh->reconstruct_state = reconstruct_state_drain_run; 2261 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2262 } else 2263 sh->reconstruct_state = reconstruct_state_run; 2264 2265 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2266 2267 for (i = disks; i--; ) { 2268 struct r5dev *dev = &sh->dev[i]; 2269 2270 if (dev->towrite) { 2271 
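				/* New data is queued for this device: lock it and
				 * flag it for drain so the bio payload is copied
				 * into the stripe cache before the new parity is
				 * computed.
				 */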
set_bit(R5_LOCKED, &dev->flags); 2272 set_bit(R5_Wantdrain, &dev->flags); 2273 if (!expand) 2274 clear_bit(R5_UPTODATE, &dev->flags); 2275 s->locked++; 2276 } 2277 } 2278 if (s->locked + conf->max_degraded == disks) 2279 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2280 atomic_inc(&conf->pending_full_writes); 2281 } else { 2282 BUG_ON(level == 6); 2283 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2284 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2285 2286 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2287 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2288 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2289 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2290 2291 for (i = disks; i--; ) { 2292 struct r5dev *dev = &sh->dev[i]; 2293 if (i == pd_idx) 2294 continue; 2295 2296 if (dev->towrite && 2297 (test_bit(R5_UPTODATE, &dev->flags) || 2298 test_bit(R5_Wantcompute, &dev->flags))) { 2299 set_bit(R5_Wantdrain, &dev->flags); 2300 set_bit(R5_LOCKED, &dev->flags); 2301 clear_bit(R5_UPTODATE, &dev->flags); 2302 s->locked++; 2303 } 2304 } 2305 } 2306 2307 /* keep the parity disk(s) locked while asynchronous operations 2308 * are in flight 2309 */ 2310 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2311 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2312 s->locked++; 2313 2314 if (level == 6) { 2315 int qd_idx = sh->qd_idx; 2316 struct r5dev *dev = &sh->dev[qd_idx]; 2317 2318 set_bit(R5_LOCKED, &dev->flags); 2319 clear_bit(R5_UPTODATE, &dev->flags); 2320 s->locked++; 2321 } 2322 2323 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2324 __func__, (unsigned long long)sh->sector, 2325 s->locked, s->ops_request); 2326 } 2327 2328 /* 2329 * Each stripe/dev can have one or more bion attached. 2330 * toread/towrite point to the first in a chain. 2331 * The bi_next chain must be in order. 
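 * add_stripe_bio() below links a new bio into the appropriate chain,
 * keeping it sorted by bi_sector; it returns 1 on success, or 0 when the
 * bio overlaps one already queued, in which case R5_Overlap is set and
 * the caller is expected to wait on wait_for_overlap and retry.  A chain
 * is typically walked with r5_next_bio(), e.g.
 *
 *	for (bio = dev->toread; bio; bio = r5_next_bio(bio, dev->sector))
 *		...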
2332 */ 2333 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2334 { 2335 struct bio **bip; 2336 struct r5conf *conf = sh->raid_conf; 2337 int firstwrite=0; 2338 2339 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2340 (unsigned long long)bi->bi_sector, 2341 (unsigned long long)sh->sector); 2342 2343 2344 spin_lock_irq(&conf->device_lock); 2345 if (forwrite) { 2346 bip = &sh->dev[dd_idx].towrite; 2347 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2348 firstwrite = 1; 2349 } else 2350 bip = &sh->dev[dd_idx].toread; 2351 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2352 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2353 goto overlap; 2354 bip = & (*bip)->bi_next; 2355 } 2356 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2357 goto overlap; 2358 2359 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2360 if (*bip) 2361 bi->bi_next = *bip; 2362 *bip = bi; 2363 bi->bi_phys_segments++; 2364 2365 if (forwrite) { 2366 /* check if page is covered */ 2367 sector_t sector = sh->dev[dd_idx].sector; 2368 for (bi=sh->dev[dd_idx].towrite; 2369 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2370 bi && bi->bi_sector <= sector; 2371 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2372 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2373 sector = bi->bi_sector + (bi->bi_size>>9); 2374 } 2375 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2376 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2377 } 2378 spin_unlock_irq(&conf->device_lock); 2379 2380 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2381 (unsigned long long)(*bip)->bi_sector, 2382 (unsigned long long)sh->sector, dd_idx); 2383 2384 if (conf->mddev->bitmap && firstwrite) { 2385 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2386 STRIPE_SECTORS, 0); 2387 sh->bm_seq = conf->seq_flush+1; 2388 set_bit(STRIPE_BIT_DELAY, &sh->state); 2389 } 2390 return 1; 2391 2392 overlap: 2393 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2394 spin_unlock_irq(&conf->device_lock); 2395 return 0; 2396 } 2397 2398 static void end_reshape(struct r5conf *conf); 2399 2400 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2401 struct stripe_head *sh) 2402 { 2403 int sectors_per_chunk = 2404 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2405 int dd_idx; 2406 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2407 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2408 2409 raid5_compute_sector(conf, 2410 stripe * (disks - conf->max_degraded) 2411 *sectors_per_chunk + chunk_offset, 2412 previous, 2413 &dd_idx, sh); 2414 } 2415 2416 static void 2417 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2418 struct stripe_head_state *s, int disks, 2419 struct bio **return_bi) 2420 { 2421 int i; 2422 for (i = disks; i--; ) { 2423 struct bio *bi; 2424 int bitmap_end = 0; 2425 2426 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2427 struct md_rdev *rdev; 2428 rcu_read_lock(); 2429 rdev = rcu_dereference(conf->disks[i].rdev); 2430 if (rdev && test_bit(In_sync, &rdev->flags)) 2431 atomic_inc(&rdev->nr_pending); 2432 else 2433 rdev = NULL; 2434 rcu_read_unlock(); 2435 if (rdev) { 2436 if (!rdev_set_badblocks( 2437 rdev, 2438 sh->sector, 2439 STRIPE_SECTORS, 0)) 2440 md_error(conf->mddev, rdev); 2441 rdev_dec_pending(rdev, conf->mddev); 2442 } 2443 } 2444 spin_lock_irq(&conf->device_lock); 2445 /* fail all writes first */ 2446 bi = sh->dev[i].towrite; 2447 sh->dev[i].towrite = NULL; 2448 if (bi) { 2449 s->to_write--; 2450 bitmap_end = 1; 2451 } 2452 2453 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2454 wake_up(&conf->wait_for_overlap); 2455 2456 while (bi && bi->bi_sector < 2457 sh->dev[i].sector + STRIPE_SECTORS) { 2458 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2459 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2460 if (!raid5_dec_bi_phys_segments(bi)) { 2461 md_write_end(conf->mddev); 2462 bi->bi_next = *return_bi; 2463 *return_bi = bi; 2464 } 2465 bi = nextbi; 2466 } 2467 /* and fail all 'written' */ 2468 bi = sh->dev[i].written; 2469 sh->dev[i].written = NULL; 2470 if (bi) bitmap_end = 1; 2471 while (bi && bi->bi_sector < 2472 sh->dev[i].sector + STRIPE_SECTORS) { 2473 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2474 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2475 if (!raid5_dec_bi_phys_segments(bi)) { 2476 md_write_end(conf->mddev); 2477 bi->bi_next = *return_bi; 2478 *return_bi = bi; 2479 } 2480 bi = bi2; 2481 } 2482 2483 /* fail any reads if this device is non-operational and 2484 * the data has not reached the cache yet. 
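		 * (a device counts as non-operational here when it is not
		 * flagged R5_Insync or has seen a read error; reads already
		 * being filled from cached data, R5_Wantfill, are left alone)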
2485 */ 2486 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2487 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2488 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2489 bi = sh->dev[i].toread; 2490 sh->dev[i].toread = NULL; 2491 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2492 wake_up(&conf->wait_for_overlap); 2493 if (bi) s->to_read--; 2494 while (bi && bi->bi_sector < 2495 sh->dev[i].sector + STRIPE_SECTORS) { 2496 struct bio *nextbi = 2497 r5_next_bio(bi, sh->dev[i].sector); 2498 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2499 if (!raid5_dec_bi_phys_segments(bi)) { 2500 bi->bi_next = *return_bi; 2501 *return_bi = bi; 2502 } 2503 bi = nextbi; 2504 } 2505 } 2506 spin_unlock_irq(&conf->device_lock); 2507 if (bitmap_end) 2508 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2509 STRIPE_SECTORS, 0, 0); 2510 /* If we were in the middle of a write the parity block might 2511 * still be locked - so just clear all R5_LOCKED flags 2512 */ 2513 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2514 } 2515 2516 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2517 if (atomic_dec_and_test(&conf->pending_full_writes)) 2518 md_wakeup_thread(conf->mddev->thread); 2519 } 2520 2521 static void 2522 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2523 struct stripe_head_state *s) 2524 { 2525 int abort = 0; 2526 int i; 2527 2528 clear_bit(STRIPE_SYNCING, &sh->state); 2529 s->syncing = 0; 2530 s->replacing = 0; 2531 /* There is nothing more to do for sync/check/repair. 2532 * Don't even need to abort as that is handled elsewhere 2533 * if needed, and not always wanted e.g. if there is a known 2534 * bad block here. 2535 * For recover/replace we need to record a bad block on all 2536 * non-sync devices, or abort the recovery 2537 */ 2538 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2539 /* During recovery devices cannot be removed, so 2540 * locking and refcounting of rdevs is not needed 2541 */ 2542 for (i = 0; i < conf->raid_disks; i++) { 2543 struct md_rdev *rdev = conf->disks[i].rdev; 2544 if (rdev 2545 && !test_bit(Faulty, &rdev->flags) 2546 && !test_bit(In_sync, &rdev->flags) 2547 && !rdev_set_badblocks(rdev, sh->sector, 2548 STRIPE_SECTORS, 0)) 2549 abort = 1; 2550 rdev = conf->disks[i].replacement; 2551 if (rdev 2552 && !test_bit(Faulty, &rdev->flags) 2553 && !test_bit(In_sync, &rdev->flags) 2554 && !rdev_set_badblocks(rdev, sh->sector, 2555 STRIPE_SECTORS, 0)) 2556 abort = 1; 2557 } 2558 if (abort) 2559 conf->recovery_disabled = 2560 conf->mddev->recovery_disabled; 2561 } 2562 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2563 } 2564 2565 static int want_replace(struct stripe_head *sh, int disk_idx) 2566 { 2567 struct md_rdev *rdev; 2568 int rv = 0; 2569 /* Doing recovery so rcu locking not required */ 2570 rdev = sh->raid_conf->disks[disk_idx].replacement; 2571 if (rdev 2572 && !test_bit(Faulty, &rdev->flags) 2573 && !test_bit(In_sync, &rdev->flags) 2574 && (rdev->recovery_offset <= sh->sector 2575 || rdev->mddev->recovery_cp <= sh->sector)) 2576 rv = 1; 2577 2578 return rv; 2579 } 2580 2581 /* fetch_block - checks the given member device to see if its data needs 2582 * to be read or computed to satisfy a request. 
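 * The block is computed when enough of the other blocks are up to date
 * (one missing block for RAID4/5, up to two for RAID6); otherwise a read
 * is issued, provided the backing device is in sync.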
2583 * 2584 * Returns 1 when no more member devices need to be checked, otherwise returns 2585 * 0 to tell the loop in handle_stripe_fill to continue 2586 */ 2587 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2588 int disk_idx, int disks) 2589 { 2590 struct r5dev *dev = &sh->dev[disk_idx]; 2591 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2592 &sh->dev[s->failed_num[1]] }; 2593 2594 /* is the data in this block needed, and can we get it? */ 2595 if (!test_bit(R5_LOCKED, &dev->flags) && 2596 !test_bit(R5_UPTODATE, &dev->flags) && 2597 (dev->toread || 2598 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2599 s->syncing || s->expanding || 2600 (s->replacing && want_replace(sh, disk_idx)) || 2601 (s->failed >= 1 && fdev[0]->toread) || 2602 (s->failed >= 2 && fdev[1]->toread) || 2603 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2604 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2605 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2606 /* we would like to get this block, possibly by computing it, 2607 * otherwise read it if the backing disk is insync 2608 */ 2609 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2610 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2611 if ((s->uptodate == disks - 1) && 2612 (s->failed && (disk_idx == s->failed_num[0] || 2613 disk_idx == s->failed_num[1]))) { 2614 /* have disk failed, and we're requested to fetch it; 2615 * do compute it 2616 */ 2617 pr_debug("Computing stripe %llu block %d\n", 2618 (unsigned long long)sh->sector, disk_idx); 2619 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2620 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2621 set_bit(R5_Wantcompute, &dev->flags); 2622 sh->ops.target = disk_idx; 2623 sh->ops.target2 = -1; /* no 2nd target */ 2624 s->req_compute = 1; 2625 /* Careful: from this point on 'uptodate' is in the eye 2626 * of raid_run_ops which services 'compute' operations 2627 * before writes. R5_Wantcompute flags a block that will 2628 * be R5_UPTODATE by the time it is needed for a 2629 * subsequent operation. 2630 */ 2631 s->uptodate++; 2632 return 1; 2633 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2634 /* Computing 2-failure is *very* expensive; only 2635 * do it if failed >= 2 2636 */ 2637 int other; 2638 for (other = disks; other--; ) { 2639 if (other == disk_idx) 2640 continue; 2641 if (!test_bit(R5_UPTODATE, 2642 &sh->dev[other].flags)) 2643 break; 2644 } 2645 BUG_ON(other < 0); 2646 pr_debug("Computing stripe %llu blocks %d,%d\n", 2647 (unsigned long long)sh->sector, 2648 disk_idx, other); 2649 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2650 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2651 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2652 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2653 sh->ops.target = disk_idx; 2654 sh->ops.target2 = other; 2655 s->uptodate += 2; 2656 s->req_compute = 1; 2657 return 1; 2658 } else if (test_bit(R5_Insync, &dev->flags)) { 2659 set_bit(R5_LOCKED, &dev->flags); 2660 set_bit(R5_Wantread, &dev->flags); 2661 s->locked++; 2662 pr_debug("Reading block %d (sync=%d)\n", 2663 disk_idx, s->syncing); 2664 } 2665 } 2666 2667 return 0; 2668 } 2669 2670 /** 2671 * handle_stripe_fill - read or compute data to satisfy pending requests. 
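 *
 * Walks every device in the stripe and lets fetch_block() decide whether
 * each block should be read or computed; the walk stops early once
 * fetch_block() reports that no further devices need to be examined.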
2672 */ 2673 static void handle_stripe_fill(struct stripe_head *sh, 2674 struct stripe_head_state *s, 2675 int disks) 2676 { 2677 int i; 2678 2679 /* look for blocks to read/compute, skip this if a compute 2680 * is already in flight, or if the stripe contents are in the 2681 * midst of changing due to a write 2682 */ 2683 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2684 !sh->reconstruct_state) 2685 for (i = disks; i--; ) 2686 if (fetch_block(sh, s, i, disks)) 2687 break; 2688 set_bit(STRIPE_HANDLE, &sh->state); 2689 } 2690 2691 2692 /* handle_stripe_clean_event 2693 * any written block on an uptodate or failed drive can be returned. 2694 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2695 * never LOCKED, so we don't need to test 'failed' directly. 2696 */ 2697 static void handle_stripe_clean_event(struct r5conf *conf, 2698 struct stripe_head *sh, int disks, struct bio **return_bi) 2699 { 2700 int i; 2701 struct r5dev *dev; 2702 2703 for (i = disks; i--; ) 2704 if (sh->dev[i].written) { 2705 dev = &sh->dev[i]; 2706 if (!test_bit(R5_LOCKED, &dev->flags) && 2707 test_bit(R5_UPTODATE, &dev->flags)) { 2708 /* We can return any write requests */ 2709 struct bio *wbi, *wbi2; 2710 int bitmap_end = 0; 2711 pr_debug("Return write for disc %d\n", i); 2712 spin_lock_irq(&conf->device_lock); 2713 wbi = dev->written; 2714 dev->written = NULL; 2715 while (wbi && wbi->bi_sector < 2716 dev->sector + STRIPE_SECTORS) { 2717 wbi2 = r5_next_bio(wbi, dev->sector); 2718 if (!raid5_dec_bi_phys_segments(wbi)) { 2719 md_write_end(conf->mddev); 2720 wbi->bi_next = *return_bi; 2721 *return_bi = wbi; 2722 } 2723 wbi = wbi2; 2724 } 2725 if (dev->towrite == NULL) 2726 bitmap_end = 1; 2727 spin_unlock_irq(&conf->device_lock); 2728 if (bitmap_end) 2729 bitmap_endwrite(conf->mddev->bitmap, 2730 sh->sector, 2731 STRIPE_SECTORS, 2732 !test_bit(STRIPE_DEGRADED, &sh->state), 2733 0); 2734 } 2735 } 2736 2737 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2738 if (atomic_dec_and_test(&conf->pending_full_writes)) 2739 md_wakeup_thread(conf->mddev->thread); 2740 } 2741 2742 static void handle_stripe_dirtying(struct r5conf *conf, 2743 struct stripe_head *sh, 2744 struct stripe_head_state *s, 2745 int disks) 2746 { 2747 int rmw = 0, rcw = 0, i; 2748 if (conf->max_degraded == 2) { 2749 /* RAID6 requires 'rcw' in current implementation 2750 * Calculate the real rcw later - for now fake it 2751 * look like rcw is cheaper 2752 */ 2753 rcw = 1; rmw = 2; 2754 } else for (i = disks; i--; ) { 2755 /* would I have to read this buffer for read_modify_write */ 2756 struct r5dev *dev = &sh->dev[i]; 2757 if ((dev->towrite || i == sh->pd_idx) && 2758 !test_bit(R5_LOCKED, &dev->flags) && 2759 !(test_bit(R5_UPTODATE, &dev->flags) || 2760 test_bit(R5_Wantcompute, &dev->flags))) { 2761 if (test_bit(R5_Insync, &dev->flags)) 2762 rmw++; 2763 else 2764 rmw += 2*disks; /* cannot read it */ 2765 } 2766 /* Would I have to read this buffer for reconstruct_write */ 2767 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2768 !test_bit(R5_LOCKED, &dev->flags) && 2769 !(test_bit(R5_UPTODATE, &dev->flags) || 2770 test_bit(R5_Wantcompute, &dev->flags))) { 2771 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2772 else 2773 rcw += 2*disks; 2774 } 2775 } 2776 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2777 (unsigned long long)sh->sector, rmw, rcw); 2778 set_bit(STRIPE_HANDLE, &sh->state); 2779 if (rmw < rcw && rmw > 0) 2780 /* prefer read-modify-write, but need to get some data */ 2781 for (i = 
disks; i--; ) { 2782 struct r5dev *dev = &sh->dev[i]; 2783 if ((dev->towrite || i == sh->pd_idx) && 2784 !test_bit(R5_LOCKED, &dev->flags) && 2785 !(test_bit(R5_UPTODATE, &dev->flags) || 2786 test_bit(R5_Wantcompute, &dev->flags)) && 2787 test_bit(R5_Insync, &dev->flags)) { 2788 if ( 2789 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2790 pr_debug("Read_old block " 2791 "%d for r-m-w\n", i); 2792 set_bit(R5_LOCKED, &dev->flags); 2793 set_bit(R5_Wantread, &dev->flags); 2794 s->locked++; 2795 } else { 2796 set_bit(STRIPE_DELAYED, &sh->state); 2797 set_bit(STRIPE_HANDLE, &sh->state); 2798 } 2799 } 2800 } 2801 if (rcw <= rmw && rcw > 0) { 2802 /* want reconstruct write, but need to get some data */ 2803 rcw = 0; 2804 for (i = disks; i--; ) { 2805 struct r5dev *dev = &sh->dev[i]; 2806 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2807 i != sh->pd_idx && i != sh->qd_idx && 2808 !test_bit(R5_LOCKED, &dev->flags) && 2809 !(test_bit(R5_UPTODATE, &dev->flags) || 2810 test_bit(R5_Wantcompute, &dev->flags))) { 2811 rcw++; 2812 if (!test_bit(R5_Insync, &dev->flags)) 2813 continue; /* it's a failed drive */ 2814 if ( 2815 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2816 pr_debug("Read_old block " 2817 "%d for Reconstruct\n", i); 2818 set_bit(R5_LOCKED, &dev->flags); 2819 set_bit(R5_Wantread, &dev->flags); 2820 s->locked++; 2821 } else { 2822 set_bit(STRIPE_DELAYED, &sh->state); 2823 set_bit(STRIPE_HANDLE, &sh->state); 2824 } 2825 } 2826 } 2827 } 2828 /* now if nothing is locked, and if we have enough data, 2829 * we can start a write request 2830 */ 2831 /* since handle_stripe can be called at any time we need to handle the 2832 * case where a compute block operation has been submitted and then a 2833 * subsequent call wants to start a write request. raid_run_ops only 2834 * handles the case where compute block and reconstruct are requested 2835 * simultaneously. If this is not the case then new writes need to be 2836 * held off until the compute completes. 
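	 * The test below enforces exactly that: proceed only if a compute
	 * was requested in this pass (req_compute, so raid_run_ops will
	 * order it ahead of the write) or no compute is currently running
	 * on the stripe.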
2837 */ 2838 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2839 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2840 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2841 schedule_reconstruction(sh, s, rcw == 0, 0); 2842 } 2843 2844 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2845 struct stripe_head_state *s, int disks) 2846 { 2847 struct r5dev *dev = NULL; 2848 2849 set_bit(STRIPE_HANDLE, &sh->state); 2850 2851 switch (sh->check_state) { 2852 case check_state_idle: 2853 /* start a new check operation if there are no failures */ 2854 if (s->failed == 0) { 2855 BUG_ON(s->uptodate != disks); 2856 sh->check_state = check_state_run; 2857 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2858 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2859 s->uptodate--; 2860 break; 2861 } 2862 dev = &sh->dev[s->failed_num[0]]; 2863 /* fall through */ 2864 case check_state_compute_result: 2865 sh->check_state = check_state_idle; 2866 if (!dev) 2867 dev = &sh->dev[sh->pd_idx]; 2868 2869 /* check that a write has not made the stripe insync */ 2870 if (test_bit(STRIPE_INSYNC, &sh->state)) 2871 break; 2872 2873 /* either failed parity check, or recovery is happening */ 2874 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2875 BUG_ON(s->uptodate != disks); 2876 2877 set_bit(R5_LOCKED, &dev->flags); 2878 s->locked++; 2879 set_bit(R5_Wantwrite, &dev->flags); 2880 2881 clear_bit(STRIPE_DEGRADED, &sh->state); 2882 set_bit(STRIPE_INSYNC, &sh->state); 2883 break; 2884 case check_state_run: 2885 break; /* we will be called again upon completion */ 2886 case check_state_check_result: 2887 sh->check_state = check_state_idle; 2888 2889 /* if a failure occurred during the check operation, leave 2890 * STRIPE_INSYNC not set and let the stripe be handled again 2891 */ 2892 if (s->failed) 2893 break; 2894 2895 /* handle a successful check operation, if parity is correct 2896 * we are done. Otherwise update the mismatch count and repair 2897 * parity if !MD_RECOVERY_CHECK 2898 */ 2899 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2900 /* parity is correct (on disc, 2901 * not in buffer any more) 2902 */ 2903 set_bit(STRIPE_INSYNC, &sh->state); 2904 else { 2905 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2906 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2907 /* don't try to repair!! */ 2908 set_bit(STRIPE_INSYNC, &sh->state); 2909 else { 2910 sh->check_state = check_state_compute_run; 2911 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2912 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2913 set_bit(R5_Wantcompute, 2914 &sh->dev[sh->pd_idx].flags); 2915 sh->ops.target = sh->pd_idx; 2916 sh->ops.target2 = -1; 2917 s->uptodate++; 2918 } 2919 } 2920 break; 2921 case check_state_compute_run: 2922 break; 2923 default: 2924 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2925 __func__, sh->check_state, 2926 (unsigned long long) sh->sector); 2927 BUG(); 2928 } 2929 } 2930 2931 2932 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 2933 struct stripe_head_state *s, 2934 int disks) 2935 { 2936 int pd_idx = sh->pd_idx; 2937 int qd_idx = sh->qd_idx; 2938 struct r5dev *dev; 2939 2940 set_bit(STRIPE_HANDLE, &sh->state); 2941 2942 BUG_ON(s->failed > 2); 2943 2944 /* Want to check and possibly repair P and Q. 
2945 * However there could be one 'failed' device, in which 2946 * case we can only check one of them, possibly using the 2947 * other to generate missing data 2948 */ 2949 2950 switch (sh->check_state) { 2951 case check_state_idle: 2952 /* start a new check operation if there are < 2 failures */ 2953 if (s->failed == s->q_failed) { 2954 /* The only possible failed device holds Q, so it 2955 * makes sense to check P (If anything else were failed, 2956 * we would have used P to recreate it). 2957 */ 2958 sh->check_state = check_state_run; 2959 } 2960 if (!s->q_failed && s->failed < 2) { 2961 /* Q is not failed, and we didn't use it to generate 2962 * anything, so it makes sense to check it 2963 */ 2964 if (sh->check_state == check_state_run) 2965 sh->check_state = check_state_run_pq; 2966 else 2967 sh->check_state = check_state_run_q; 2968 } 2969 2970 /* discard potentially stale zero_sum_result */ 2971 sh->ops.zero_sum_result = 0; 2972 2973 if (sh->check_state == check_state_run) { 2974 /* async_xor_zero_sum destroys the contents of P */ 2975 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2976 s->uptodate--; 2977 } 2978 if (sh->check_state >= check_state_run && 2979 sh->check_state <= check_state_run_pq) { 2980 /* async_syndrome_zero_sum preserves P and Q, so 2981 * no need to mark them !uptodate here 2982 */ 2983 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2984 break; 2985 } 2986 2987 /* we have 2-disk failure */ 2988 BUG_ON(s->failed != 2); 2989 /* fall through */ 2990 case check_state_compute_result: 2991 sh->check_state = check_state_idle; 2992 2993 /* check that a write has not made the stripe insync */ 2994 if (test_bit(STRIPE_INSYNC, &sh->state)) 2995 break; 2996 2997 /* now write out any block on a failed drive, 2998 * or P or Q if they were recomputed 2999 */ 3000 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3001 if (s->failed == 2) { 3002 dev = &sh->dev[s->failed_num[1]]; 3003 s->locked++; 3004 set_bit(R5_LOCKED, &dev->flags); 3005 set_bit(R5_Wantwrite, &dev->flags); 3006 } 3007 if (s->failed >= 1) { 3008 dev = &sh->dev[s->failed_num[0]]; 3009 s->locked++; 3010 set_bit(R5_LOCKED, &dev->flags); 3011 set_bit(R5_Wantwrite, &dev->flags); 3012 } 3013 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3014 dev = &sh->dev[pd_idx]; 3015 s->locked++; 3016 set_bit(R5_LOCKED, &dev->flags); 3017 set_bit(R5_Wantwrite, &dev->flags); 3018 } 3019 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3020 dev = &sh->dev[qd_idx]; 3021 s->locked++; 3022 set_bit(R5_LOCKED, &dev->flags); 3023 set_bit(R5_Wantwrite, &dev->flags); 3024 } 3025 clear_bit(STRIPE_DEGRADED, &sh->state); 3026 3027 set_bit(STRIPE_INSYNC, &sh->state); 3028 break; 3029 case check_state_run: 3030 case check_state_run_q: 3031 case check_state_run_pq: 3032 break; /* we will be called again upon completion */ 3033 case check_state_check_result: 3034 sh->check_state = check_state_idle; 3035 3036 /* handle a successful check operation, if parity is correct 3037 * we are done. 
Otherwise update the mismatch count and repair 3038 * parity if !MD_RECOVERY_CHECK 3039 */ 3040 if (sh->ops.zero_sum_result == 0) { 3041 /* both parities are correct */ 3042 if (!s->failed) 3043 set_bit(STRIPE_INSYNC, &sh->state); 3044 else { 3045 /* in contrast to the raid5 case we can validate 3046 * parity, but still have a failure to write 3047 * back 3048 */ 3049 sh->check_state = check_state_compute_result; 3050 /* Returning at this point means that we may go 3051 * off and bring p and/or q uptodate again so 3052 * we make sure to check zero_sum_result again 3053 * to verify if p or q need writeback 3054 */ 3055 } 3056 } else { 3057 conf->mddev->resync_mismatches += STRIPE_SECTORS; 3058 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3059 /* don't try to repair!! */ 3060 set_bit(STRIPE_INSYNC, &sh->state); 3061 else { 3062 int *target = &sh->ops.target; 3063 3064 sh->ops.target = -1; 3065 sh->ops.target2 = -1; 3066 sh->check_state = check_state_compute_run; 3067 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3068 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3069 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3070 set_bit(R5_Wantcompute, 3071 &sh->dev[pd_idx].flags); 3072 *target = pd_idx; 3073 target = &sh->ops.target2; 3074 s->uptodate++; 3075 } 3076 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3077 set_bit(R5_Wantcompute, 3078 &sh->dev[qd_idx].flags); 3079 *target = qd_idx; 3080 s->uptodate++; 3081 } 3082 } 3083 } 3084 break; 3085 case check_state_compute_run: 3086 break; 3087 default: 3088 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3089 __func__, sh->check_state, 3090 (unsigned long long) sh->sector); 3091 BUG(); 3092 } 3093 } 3094 3095 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3096 { 3097 int i; 3098 3099 /* We have read all the blocks in this stripe and now we need to 3100 * copy some of them into a target stripe for expand. 3101 */ 3102 struct dma_async_tx_descriptor *tx = NULL; 3103 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3104 for (i = 0; i < sh->disks; i++) 3105 if (i != sh->pd_idx && i != sh->qd_idx) { 3106 int dd_idx, j; 3107 struct stripe_head *sh2; 3108 struct async_submit_ctl submit; 3109 3110 sector_t bn = compute_blocknr(sh, i, 1); 3111 sector_t s = raid5_compute_sector(conf, bn, 0, 3112 &dd_idx, NULL); 3113 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3114 if (sh2 == NULL) 3115 /* so far only the early blocks of this stripe 3116 * have been requested. 
When later blocks 3117 * get requested, we will try again 3118 */ 3119 continue; 3120 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3121 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3122 /* must have already done this block */ 3123 release_stripe(sh2); 3124 continue; 3125 } 3126 3127 /* place all the copies on one channel */ 3128 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3129 tx = async_memcpy(sh2->dev[dd_idx].page, 3130 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3131 &submit); 3132 3133 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3134 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3135 for (j = 0; j < conf->raid_disks; j++) 3136 if (j != sh2->pd_idx && 3137 j != sh2->qd_idx && 3138 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3139 break; 3140 if (j == conf->raid_disks) { 3141 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3142 set_bit(STRIPE_HANDLE, &sh2->state); 3143 } 3144 release_stripe(sh2); 3145 3146 } 3147 /* done submitting copies, wait for them to complete */ 3148 if (tx) { 3149 async_tx_ack(tx); 3150 dma_wait_for_async_tx(tx); 3151 } 3152 } 3153 3154 /* 3155 * handle_stripe - do things to a stripe. 3156 * 3157 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3158 * state of various bits to see what needs to be done. 3159 * Possible results: 3160 * return some read requests which now have data 3161 * return some write requests which are safely on storage 3162 * schedule a read on some buffers 3163 * schedule a write of some buffers 3164 * return confirmation of parity correctness 3165 * 3166 */ 3167 3168 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3169 { 3170 struct r5conf *conf = sh->raid_conf; 3171 int disks = sh->disks; 3172 struct r5dev *dev; 3173 int i; 3174 int do_recovery = 0; 3175 3176 memset(s, 0, sizeof(*s)); 3177 3178 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3179 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3180 s->failed_num[0] = -1; 3181 s->failed_num[1] = -1; 3182 3183 /* Now to look around and see what can be done */ 3184 rcu_read_lock(); 3185 spin_lock_irq(&conf->device_lock); 3186 for (i=disks; i--; ) { 3187 struct md_rdev *rdev; 3188 sector_t first_bad; 3189 int bad_sectors; 3190 int is_bad = 0; 3191 3192 dev = &sh->dev[i]; 3193 3194 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3195 i, dev->flags, 3196 dev->toread, dev->towrite, dev->written); 3197 /* maybe we can reply to a read 3198 * 3199 * new wantfill requests are only permitted while 3200 * ops_complete_biofill is guaranteed to be inactive 3201 */ 3202 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3203 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3204 set_bit(R5_Wantfill, &dev->flags); 3205 3206 /* now count some things */ 3207 if (test_bit(R5_LOCKED, &dev->flags)) 3208 s->locked++; 3209 if (test_bit(R5_UPTODATE, &dev->flags)) 3210 s->uptodate++; 3211 if (test_bit(R5_Wantcompute, &dev->flags)) { 3212 s->compute++; 3213 BUG_ON(s->compute > 2); 3214 } 3215 3216 if (test_bit(R5_Wantfill, &dev->flags)) 3217 s->to_fill++; 3218 else if (dev->toread) 3219 s->to_read++; 3220 if (dev->towrite) { 3221 s->to_write++; 3222 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3223 s->non_overwrite++; 3224 } 3225 if (dev->written) 3226 s->written++; 3227 /* Prefer to use the replacement for reads, but only 3228 * if it is recovered enough and has no bad blocks. 
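		 * Otherwise fall back to the main device and remember, via
		 * R5_NeedReplace, that this sector still has to be written
		 * out to the replacement once it is up to date.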
3229 */ 3230 rdev = rcu_dereference(conf->disks[i].replacement); 3231 if (rdev && !test_bit(Faulty, &rdev->flags) && 3232 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3233 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3234 &first_bad, &bad_sectors)) 3235 set_bit(R5_ReadRepl, &dev->flags); 3236 else { 3237 if (rdev) 3238 set_bit(R5_NeedReplace, &dev->flags); 3239 rdev = rcu_dereference(conf->disks[i].rdev); 3240 clear_bit(R5_ReadRepl, &dev->flags); 3241 } 3242 if (rdev && test_bit(Faulty, &rdev->flags)) 3243 rdev = NULL; 3244 if (rdev) { 3245 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3246 &first_bad, &bad_sectors); 3247 if (s->blocked_rdev == NULL 3248 && (test_bit(Blocked, &rdev->flags) 3249 || is_bad < 0)) { 3250 if (is_bad < 0) 3251 set_bit(BlockedBadBlocks, 3252 &rdev->flags); 3253 s->blocked_rdev = rdev; 3254 atomic_inc(&rdev->nr_pending); 3255 } 3256 } 3257 clear_bit(R5_Insync, &dev->flags); 3258 if (!rdev) 3259 /* Not in-sync */; 3260 else if (is_bad) { 3261 /* also not in-sync */ 3262 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3263 test_bit(R5_UPTODATE, &dev->flags)) { 3264 /* treat as in-sync, but with a read error 3265 * which we can now try to correct 3266 */ 3267 set_bit(R5_Insync, &dev->flags); 3268 set_bit(R5_ReadError, &dev->flags); 3269 } 3270 } else if (test_bit(In_sync, &rdev->flags)) 3271 set_bit(R5_Insync, &dev->flags); 3272 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3273 /* in sync if before recovery_offset */ 3274 set_bit(R5_Insync, &dev->flags); 3275 else if (test_bit(R5_UPTODATE, &dev->flags) && 3276 test_bit(R5_Expanded, &dev->flags)) 3277 /* If we've reshaped into here, we assume it is Insync. 3278 * We will shortly update recovery_offset to make 3279 * it official. 3280 */ 3281 set_bit(R5_Insync, &dev->flags); 3282 3283 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3284 /* This flag does not apply to '.replacement' 3285 * only to .rdev, so make sure to check that*/ 3286 struct md_rdev *rdev2 = rcu_dereference( 3287 conf->disks[i].rdev); 3288 if (rdev2 == rdev) 3289 clear_bit(R5_Insync, &dev->flags); 3290 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3291 s->handle_bad_blocks = 1; 3292 atomic_inc(&rdev2->nr_pending); 3293 } else 3294 clear_bit(R5_WriteError, &dev->flags); 3295 } 3296 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3297 /* This flag does not apply to '.replacement' 3298 * only to .rdev, so make sure to check that*/ 3299 struct md_rdev *rdev2 = rcu_dereference( 3300 conf->disks[i].rdev); 3301 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3302 s->handle_bad_blocks = 1; 3303 atomic_inc(&rdev2->nr_pending); 3304 } else 3305 clear_bit(R5_MadeGood, &dev->flags); 3306 } 3307 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3308 struct md_rdev *rdev2 = rcu_dereference( 3309 conf->disks[i].replacement); 3310 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3311 s->handle_bad_blocks = 1; 3312 atomic_inc(&rdev2->nr_pending); 3313 } else 3314 clear_bit(R5_MadeGoodRepl, &dev->flags); 3315 } 3316 if (!test_bit(R5_Insync, &dev->flags)) { 3317 /* The ReadError flag will just be confusing now */ 3318 clear_bit(R5_ReadError, &dev->flags); 3319 clear_bit(R5_ReWrite, &dev->flags); 3320 } 3321 if (test_bit(R5_ReadError, &dev->flags)) 3322 clear_bit(R5_Insync, &dev->flags); 3323 if (!test_bit(R5_Insync, &dev->flags)) { 3324 if (s->failed < 2) 3325 s->failed_num[s->failed] = i; 3326 s->failed++; 3327 if (rdev && !test_bit(Faulty, &rdev->flags)) 3328 do_recovery = 1; 3329 } 3330 } 3331 
spin_unlock_irq(&conf->device_lock); 3332 if (test_bit(STRIPE_SYNCING, &sh->state)) { 3333 /* If there is a failed device being replaced, 3334 * we must be recovering. 3335 * else if we are after recovery_cp, we must be syncing 3336 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 3337 * else we can only be replacing 3338 * sync and recovery both need to read all devices, and so 3339 * use the same flag. 3340 */ 3341 if (do_recovery || 3342 sh->sector >= conf->mddev->recovery_cp || 3343 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 3344 s->syncing = 1; 3345 else 3346 s->replacing = 1; 3347 } 3348 rcu_read_unlock(); 3349 } 3350 3351 static void handle_stripe(struct stripe_head *sh) 3352 { 3353 struct stripe_head_state s; 3354 struct r5conf *conf = sh->raid_conf; 3355 int i; 3356 int prexor; 3357 int disks = sh->disks; 3358 struct r5dev *pdev, *qdev; 3359 3360 clear_bit(STRIPE_HANDLE, &sh->state); 3361 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3362 /* already being handled, ensure it gets handled 3363 * again when current action finishes */ 3364 set_bit(STRIPE_HANDLE, &sh->state); 3365 return; 3366 } 3367 3368 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3369 set_bit(STRIPE_SYNCING, &sh->state); 3370 clear_bit(STRIPE_INSYNC, &sh->state); 3371 } 3372 clear_bit(STRIPE_DELAYED, &sh->state); 3373 3374 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3375 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3376 (unsigned long long)sh->sector, sh->state, 3377 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3378 sh->check_state, sh->reconstruct_state); 3379 3380 analyse_stripe(sh, &s); 3381 3382 if (s.handle_bad_blocks) { 3383 set_bit(STRIPE_HANDLE, &sh->state); 3384 goto finish; 3385 } 3386 3387 if (unlikely(s.blocked_rdev)) { 3388 if (s.syncing || s.expanding || s.expanded || 3389 s.replacing || s.to_write || s.written) { 3390 set_bit(STRIPE_HANDLE, &sh->state); 3391 goto finish; 3392 } 3393 /* There is nothing for the blocked_rdev to block */ 3394 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3395 s.blocked_rdev = NULL; 3396 } 3397 3398 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3399 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3400 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3401 } 3402 3403 pr_debug("locked=%d uptodate=%d to_read=%d" 3404 " to_write=%d failed=%d failed_num=%d,%d\n", 3405 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3406 s.failed_num[0], s.failed_num[1]); 3407 /* check if the array has lost more than max_degraded devices and, 3408 * if so, some requests might need to be failed. 
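	 * (max_degraded is 1 for RAID4/5 and 2 for RAID6, so this triggers
	 * on a double failure for RAID4/5 and a triple failure for RAID6;
	 * pending reads, writes and any sync or replacement activity on
	 * this stripe are then aborted)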
3409 */ 3410 if (s.failed > conf->max_degraded) { 3411 sh->check_state = 0; 3412 sh->reconstruct_state = 0; 3413 if (s.to_read+s.to_write+s.written) 3414 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3415 if (s.syncing + s.replacing) 3416 handle_failed_sync(conf, sh, &s); 3417 } 3418 3419 /* 3420 * might be able to return some write requests if the parity blocks 3421 * are safe, or on a failed drive 3422 */ 3423 pdev = &sh->dev[sh->pd_idx]; 3424 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3425 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3426 qdev = &sh->dev[sh->qd_idx]; 3427 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3428 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3429 || conf->level < 6; 3430 3431 if (s.written && 3432 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3433 && !test_bit(R5_LOCKED, &pdev->flags) 3434 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3435 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3436 && !test_bit(R5_LOCKED, &qdev->flags) 3437 && test_bit(R5_UPTODATE, &qdev->flags))))) 3438 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3439 3440 /* Now we might consider reading some blocks, either to check/generate 3441 * parity, or to satisfy requests 3442 * or to load a block that is being partially written. 3443 */ 3444 if (s.to_read || s.non_overwrite 3445 || (conf->level == 6 && s.to_write && s.failed) 3446 || (s.syncing && (s.uptodate + s.compute < disks)) 3447 || s.replacing 3448 || s.expanding) 3449 handle_stripe_fill(sh, &s, disks); 3450 3451 /* Now we check to see if any write operations have recently 3452 * completed 3453 */ 3454 prexor = 0; 3455 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3456 prexor = 1; 3457 if (sh->reconstruct_state == reconstruct_state_drain_result || 3458 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3459 sh->reconstruct_state = reconstruct_state_idle; 3460 3461 /* All the 'written' buffers and the parity block are ready to 3462 * be written back to disk 3463 */ 3464 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3465 BUG_ON(sh->qd_idx >= 0 && 3466 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); 3467 for (i = disks; i--; ) { 3468 struct r5dev *dev = &sh->dev[i]; 3469 if (test_bit(R5_LOCKED, &dev->flags) && 3470 (i == sh->pd_idx || i == sh->qd_idx || 3471 dev->written)) { 3472 pr_debug("Writing block %d\n", i); 3473 set_bit(R5_Wantwrite, &dev->flags); 3474 if (prexor) 3475 continue; 3476 if (!test_bit(R5_Insync, &dev->flags) || 3477 ((i == sh->pd_idx || i == sh->qd_idx) && 3478 s.failed == 0)) 3479 set_bit(STRIPE_INSYNC, &sh->state); 3480 } 3481 } 3482 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3483 s.dec_preread_active = 1; 3484 } 3485 3486 /* Now to consider new write requests and what else, if anything 3487 * should be read. We do not handle new writes when: 3488 * 1/ A 'write' operation (copy+xor) is already in flight. 3489 * 2/ A 'check' operation is in flight, as it may clobber the parity 3490 * block. 3491 */ 3492 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3493 handle_stripe_dirtying(conf, sh, &s, disks); 3494 3495 /* maybe we need to check and possibly fix the parity for this stripe 3496 * Any reads will already have been scheduled, so we just see if enough 3497 * data is available. The parity check is held off while parity 3498 * dependent operations are in flight. 
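	 * (concretely: a new check is only started when no blocks in the
	 * stripe are locked for I/O and no compute operation is running,
	 * which is what the condition below tests)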
3499 */ 3500 if (sh->check_state || 3501 (s.syncing && s.locked == 0 && 3502 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3503 !test_bit(STRIPE_INSYNC, &sh->state))) { 3504 if (conf->level == 6) 3505 handle_parity_checks6(conf, sh, &s, disks); 3506 else 3507 handle_parity_checks5(conf, sh, &s, disks); 3508 } 3509 3510 if (s.replacing && s.locked == 0 3511 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3512 /* Write out to replacement devices where possible */ 3513 for (i = 0; i < conf->raid_disks; i++) 3514 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && 3515 test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3516 set_bit(R5_WantReplace, &sh->dev[i].flags); 3517 set_bit(R5_LOCKED, &sh->dev[i].flags); 3518 s.locked++; 3519 } 3520 set_bit(STRIPE_INSYNC, &sh->state); 3521 } 3522 if ((s.syncing || s.replacing) && s.locked == 0 && 3523 test_bit(STRIPE_INSYNC, &sh->state)) { 3524 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3525 clear_bit(STRIPE_SYNCING, &sh->state); 3526 } 3527 3528 /* If the failed drives are just a ReadError, then we might need 3529 * to progress the repair/check process 3530 */ 3531 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3532 for (i = 0; i < s.failed; i++) { 3533 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3534 if (test_bit(R5_ReadError, &dev->flags) 3535 && !test_bit(R5_LOCKED, &dev->flags) 3536 && test_bit(R5_UPTODATE, &dev->flags) 3537 ) { 3538 if (!test_bit(R5_ReWrite, &dev->flags)) { 3539 set_bit(R5_Wantwrite, &dev->flags); 3540 set_bit(R5_ReWrite, &dev->flags); 3541 set_bit(R5_LOCKED, &dev->flags); 3542 s.locked++; 3543 } else { 3544 /* let's read it back */ 3545 set_bit(R5_Wantread, &dev->flags); 3546 set_bit(R5_LOCKED, &dev->flags); 3547 s.locked++; 3548 } 3549 } 3550 } 3551 3552 3553 /* Finish reconstruct operations initiated by the expansion process */ 3554 if (sh->reconstruct_state == reconstruct_state_result) { 3555 struct stripe_head *sh_src 3556 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3557 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3558 /* sh cannot be written until sh_src has been read. 
3559 * so arrange for sh to be delayed a little 3560 */ 3561 set_bit(STRIPE_DELAYED, &sh->state); 3562 set_bit(STRIPE_HANDLE, &sh->state); 3563 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3564 &sh_src->state)) 3565 atomic_inc(&conf->preread_active_stripes); 3566 release_stripe(sh_src); 3567 goto finish; 3568 } 3569 if (sh_src) 3570 release_stripe(sh_src); 3571 3572 sh->reconstruct_state = reconstruct_state_idle; 3573 clear_bit(STRIPE_EXPANDING, &sh->state); 3574 for (i = conf->raid_disks; i--; ) { 3575 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3576 set_bit(R5_LOCKED, &sh->dev[i].flags); 3577 s.locked++; 3578 } 3579 } 3580 3581 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3582 !sh->reconstruct_state) { 3583 /* Need to write out all blocks after computing parity */ 3584 sh->disks = conf->raid_disks; 3585 stripe_set_idx(sh->sector, conf, 0, sh); 3586 schedule_reconstruction(sh, &s, 1, 1); 3587 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3588 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3589 atomic_dec(&conf->reshape_stripes); 3590 wake_up(&conf->wait_for_overlap); 3591 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3592 } 3593 3594 if (s.expanding && s.locked == 0 && 3595 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3596 handle_stripe_expansion(conf, sh); 3597 3598 finish: 3599 /* wait for this device to become unblocked */ 3600 if (unlikely(s.blocked_rdev)) { 3601 if (conf->mddev->external) 3602 md_wait_for_blocked_rdev(s.blocked_rdev, 3603 conf->mddev); 3604 else 3605 /* Internal metadata will immediately 3606 * be written by raid5d, so we don't 3607 * need to wait here. 3608 */ 3609 rdev_dec_pending(s.blocked_rdev, 3610 conf->mddev); 3611 } 3612 3613 if (s.handle_bad_blocks) 3614 for (i = disks; i--; ) { 3615 struct md_rdev *rdev; 3616 struct r5dev *dev = &sh->dev[i]; 3617 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3618 /* We own a safe reference to the rdev */ 3619 rdev = conf->disks[i].rdev; 3620 if (!rdev_set_badblocks(rdev, sh->sector, 3621 STRIPE_SECTORS, 0)) 3622 md_error(conf->mddev, rdev); 3623 rdev_dec_pending(rdev, conf->mddev); 3624 } 3625 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3626 rdev = conf->disks[i].rdev; 3627 rdev_clear_badblocks(rdev, sh->sector, 3628 STRIPE_SECTORS, 0); 3629 rdev_dec_pending(rdev, conf->mddev); 3630 } 3631 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3632 rdev = conf->disks[i].replacement; 3633 if (!rdev) 3634 /* rdev have been moved down */ 3635 rdev = conf->disks[i].rdev; 3636 rdev_clear_badblocks(rdev, sh->sector, 3637 STRIPE_SECTORS, 0); 3638 rdev_dec_pending(rdev, conf->mddev); 3639 } 3640 } 3641 3642 if (s.ops_request) 3643 raid_run_ops(sh, s.ops_request); 3644 3645 ops_run_io(sh, &s); 3646 3647 if (s.dec_preread_active) { 3648 /* We delay this until after ops_run_io so that if make_request 3649 * is waiting on a flush, it won't continue until the writes 3650 * have actually been submitted. 
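		 * Once the count drops below IO_THRESHOLD, raid5d is woken
		 * so that any delayed stripes can be activated again.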
3651 */ 3652 atomic_dec(&conf->preread_active_stripes); 3653 if (atomic_read(&conf->preread_active_stripes) < 3654 IO_THRESHOLD) 3655 md_wakeup_thread(conf->mddev->thread); 3656 } 3657 3658 return_io(s.return_bi); 3659 3660 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3661 } 3662 3663 static void raid5_activate_delayed(struct r5conf *conf) 3664 { 3665 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3666 while (!list_empty(&conf->delayed_list)) { 3667 struct list_head *l = conf->delayed_list.next; 3668 struct stripe_head *sh; 3669 sh = list_entry(l, struct stripe_head, lru); 3670 list_del_init(l); 3671 clear_bit(STRIPE_DELAYED, &sh->state); 3672 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3673 atomic_inc(&conf->preread_active_stripes); 3674 list_add_tail(&sh->lru, &conf->hold_list); 3675 } 3676 } 3677 } 3678 3679 static void activate_bit_delay(struct r5conf *conf) 3680 { 3681 /* device_lock is held */ 3682 struct list_head head; 3683 list_add(&head, &conf->bitmap_list); 3684 list_del_init(&conf->bitmap_list); 3685 while (!list_empty(&head)) { 3686 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3687 list_del_init(&sh->lru); 3688 atomic_inc(&sh->count); 3689 __release_stripe(conf, sh); 3690 } 3691 } 3692 3693 int md_raid5_congested(struct mddev *mddev, int bits) 3694 { 3695 struct r5conf *conf = mddev->private; 3696 3697 /* No difference between reads and writes. Just check 3698 * how busy the stripe_cache is 3699 */ 3700 3701 if (conf->inactive_blocked) 3702 return 1; 3703 if (conf->quiesce) 3704 return 1; 3705 if (list_empty_careful(&conf->inactive_list)) 3706 return 1; 3707 3708 return 0; 3709 } 3710 EXPORT_SYMBOL_GPL(md_raid5_congested); 3711 3712 static int raid5_congested(void *data, int bits) 3713 { 3714 struct mddev *mddev = data; 3715 3716 return mddev_congested(mddev, bits) || 3717 md_raid5_congested(mddev, bits); 3718 } 3719 3720 /* We want read requests to align with chunks where possible, 3721 * but write requests don't need to. 3722 */ 3723 static int raid5_mergeable_bvec(struct request_queue *q, 3724 struct bvec_merge_data *bvm, 3725 struct bio_vec *biovec) 3726 { 3727 struct mddev *mddev = q->queuedata; 3728 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3729 int max; 3730 unsigned int chunk_sectors = mddev->chunk_sectors; 3731 unsigned int bio_sectors = bvm->bi_size >> 9; 3732 3733 if ((bvm->bi_rw & 1) == WRITE) 3734 return biovec->bv_len; /* always allow writes to be mergeable */ 3735 3736 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3737 chunk_sectors = mddev->new_chunk_sectors; 3738 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3739 if (max < 0) max = 0; 3740 if (max <= biovec->bv_len && bio_sectors == 0) 3741 return biovec->bv_len; 3742 else 3743 return max; 3744 } 3745 3746 3747 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3748 { 3749 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3750 unsigned int chunk_sectors = mddev->chunk_sectors; 3751 unsigned int bio_sectors = bio->bi_size >> 9; 3752 3753 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3754 chunk_sectors = mddev->new_chunk_sectors; 3755 return chunk_sectors >= 3756 ((sector & (chunk_sectors - 1)) + bio_sectors); 3757 } 3758 3759 /* 3760 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3761 * later sampled by raid5d. 
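 * The list is singly linked through bi_next and is drained one bio at a
 * time by remove_bio_from_retry(), which also resets bi_phys_segments to
 * a single active-stripe reference before the bio is re-processed.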
3762 */ 3763 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3764 { 3765 unsigned long flags; 3766 3767 spin_lock_irqsave(&conf->device_lock, flags); 3768 3769 bi->bi_next = conf->retry_read_aligned_list; 3770 conf->retry_read_aligned_list = bi; 3771 3772 spin_unlock_irqrestore(&conf->device_lock, flags); 3773 md_wakeup_thread(conf->mddev->thread); 3774 } 3775 3776 3777 static struct bio *remove_bio_from_retry(struct r5conf *conf) 3778 { 3779 struct bio *bi; 3780 3781 bi = conf->retry_read_aligned; 3782 if (bi) { 3783 conf->retry_read_aligned = NULL; 3784 return bi; 3785 } 3786 bi = conf->retry_read_aligned_list; 3787 if(bi) { 3788 conf->retry_read_aligned_list = bi->bi_next; 3789 bi->bi_next = NULL; 3790 /* 3791 * this sets the active stripe count to 1 and the processed 3792 * stripe count to zero (upper 16 bits) 3793 */ 3794 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3795 } 3796 3797 return bi; 3798 } 3799 3800 3801 /* 3802 * raid5_align_endio checks whether the read succeeded and, if it did, 3803 * calls bio_endio on the original bio (having bio_put the new bio 3804 * first). 3805 * If the read failed, the original bio is handed back for a retry through the stripe cache. 3806 */ 3807 static void raid5_align_endio(struct bio *bi, int error) 3808 { 3809 struct bio* raid_bi = bi->bi_private; 3810 struct mddev *mddev; 3811 struct r5conf *conf; 3812 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3813 struct md_rdev *rdev; 3814 3815 bio_put(bi); 3816 3817 rdev = (void*)raid_bi->bi_next; 3818 raid_bi->bi_next = NULL; 3819 mddev = rdev->mddev; 3820 conf = mddev->private; 3821 3822 rdev_dec_pending(rdev, conf->mddev); 3823 3824 if (!error && uptodate) { 3825 bio_endio(raid_bi, 0); 3826 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3827 wake_up(&conf->wait_for_stripe); 3828 return; 3829 } 3830 3831 3832 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3833 3834 add_bio_to_retry(raid_bi, conf); 3835 } 3836 3837 static int bio_fits_rdev(struct bio *bi) 3838 { 3839 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3840 3841 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3842 return 0; 3843 blk_recount_segments(q, bi); 3844 if (bi->bi_phys_segments > queue_max_segments(q)) 3845 return 0; 3846 3847 if (q->merge_bvec_fn) 3848 /* it's too hard to apply the merge_bvec_fn at this stage, 3849 * just give up 3850 */ 3851 return 0; 3852 3853 return 1; 3854 } 3855 3856 3857 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3858 { 3859 struct r5conf *conf = mddev->private; 3860 int dd_idx; 3861 struct bio* align_bi; 3862 struct md_rdev *rdev; 3863 sector_t end_sector; 3864 3865 if (!in_chunk_boundary(mddev, raid_bio)) { 3866 pr_debug("chunk_aligned_read : non aligned\n"); 3867 return 0; 3868 } 3869 /* 3870 * use bio_clone_mddev to make a copy of the bio 3871 */ 3872 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3873 if (!align_bi) 3874 return 0; 3875 /* 3876 * set bi_end_io to a new function, and set bi_private to the 3877 * original bio.
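 * The rdev chosen below is stashed in the original bio's bi_next (with its
 * nr_pending count raised), so raid5_align_endio() can recover it, drop the
 * reference and, on a failed read, requeue the original bio for a retry
 * through the stripe cache.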
3878 */ 3879 align_bi->bi_end_io = raid5_align_endio; 3880 align_bi->bi_private = raid_bio; 3881 /* 3882 * compute position 3883 */ 3884 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3885 0, 3886 &dd_idx, NULL); 3887 3888 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3889 rcu_read_lock(); 3890 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3891 if (!rdev || test_bit(Faulty, &rdev->flags) || 3892 rdev->recovery_offset < end_sector) { 3893 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3894 if (rdev && 3895 (test_bit(Faulty, &rdev->flags) || 3896 !(test_bit(In_sync, &rdev->flags) || 3897 rdev->recovery_offset >= end_sector))) 3898 rdev = NULL; 3899 } 3900 if (rdev) { 3901 sector_t first_bad; 3902 int bad_sectors; 3903 3904 atomic_inc(&rdev->nr_pending); 3905 rcu_read_unlock(); 3906 raid_bio->bi_next = (void*)rdev; 3907 align_bi->bi_bdev = rdev->bdev; 3908 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3909 3910 if (!bio_fits_rdev(align_bi) || 3911 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3912 &first_bad, &bad_sectors)) { 3913 /* too big in some way, or has a known bad block */ 3914 bio_put(align_bi); 3915 rdev_dec_pending(rdev, mddev); 3916 return 0; 3917 } 3918 3919 /* No reshape active, so we can trust rdev->data_offset */ 3920 align_bi->bi_sector += rdev->data_offset; 3921 3922 spin_lock_irq(&conf->device_lock); 3923 wait_event_lock_irq(conf->wait_for_stripe, 3924 conf->quiesce == 0, 3925 conf->device_lock, /* nothing */); 3926 atomic_inc(&conf->active_aligned_reads); 3927 spin_unlock_irq(&conf->device_lock); 3928 3929 generic_make_request(align_bi); 3930 return 1; 3931 } else { 3932 rcu_read_unlock(); 3933 bio_put(align_bi); 3934 return 0; 3935 } 3936 } 3937 3938 /* __get_priority_stripe - get the next stripe to process 3939 * 3940 * Full stripe writes are allowed to pass preread active stripes up until 3941 * the bypass_threshold is exceeded. In general the bypass_count 3942 * increments when the handle_list is handled before the hold_list; however, it 3943 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3944 * stripe with in flight i/o. The bypass_count will be reset when the 3945 * head of the hold_list has changed, i.e. the head was promoted to the 3946 * handle_list. 3947 */ 3948 static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 3949 { 3950 struct stripe_head *sh; 3951 3952 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3953 __func__, 3954 list_empty(&conf->handle_list) ? "empty" : "busy", 3955 list_empty(&conf->hold_list) ? 
"empty" : "busy", 3956 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3957 3958 if (!list_empty(&conf->handle_list)) { 3959 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3960 3961 if (list_empty(&conf->hold_list)) 3962 conf->bypass_count = 0; 3963 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3964 if (conf->hold_list.next == conf->last_hold) 3965 conf->bypass_count++; 3966 else { 3967 conf->last_hold = conf->hold_list.next; 3968 conf->bypass_count -= conf->bypass_threshold; 3969 if (conf->bypass_count < 0) 3970 conf->bypass_count = 0; 3971 } 3972 } 3973 } else if (!list_empty(&conf->hold_list) && 3974 ((conf->bypass_threshold && 3975 conf->bypass_count > conf->bypass_threshold) || 3976 atomic_read(&conf->pending_full_writes) == 0)) { 3977 sh = list_entry(conf->hold_list.next, 3978 typeof(*sh), lru); 3979 conf->bypass_count -= conf->bypass_threshold; 3980 if (conf->bypass_count < 0) 3981 conf->bypass_count = 0; 3982 } else 3983 return NULL; 3984 3985 list_del_init(&sh->lru); 3986 atomic_inc(&sh->count); 3987 BUG_ON(atomic_read(&sh->count) != 1); 3988 return sh; 3989 } 3990 3991 static void make_request(struct mddev *mddev, struct bio * bi) 3992 { 3993 struct r5conf *conf = mddev->private; 3994 int dd_idx; 3995 sector_t new_sector; 3996 sector_t logical_sector, last_sector; 3997 struct stripe_head *sh; 3998 const int rw = bio_data_dir(bi); 3999 int remaining; 4000 4001 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4002 md_flush_request(mddev, bi); 4003 return; 4004 } 4005 4006 md_write_start(mddev, bi); 4007 4008 if (rw == READ && 4009 mddev->reshape_position == MaxSector && 4010 chunk_aligned_read(mddev,bi)) 4011 return; 4012 4013 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4014 last_sector = bi->bi_sector + (bi->bi_size>>9); 4015 bi->bi_next = NULL; 4016 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4017 4018 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4019 DEFINE_WAIT(w); 4020 int previous; 4021 4022 retry: 4023 previous = 0; 4024 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4025 if (unlikely(conf->reshape_progress != MaxSector)) { 4026 /* spinlock is needed as reshape_progress may be 4027 * 64bit on a 32bit platform, and so it might be 4028 * possible to see a half-updated value 4029 * Of course reshape_progress could change after 4030 * the lock is dropped, so once we get a reference 4031 * to the stripe that we think it is, we will have 4032 * to check again. 4033 */ 4034 spin_lock_irq(&conf->device_lock); 4035 if (mddev->reshape_backwards 4036 ? logical_sector < conf->reshape_progress 4037 : logical_sector >= conf->reshape_progress) { 4038 previous = 1; 4039 } else { 4040 if (mddev->reshape_backwards 4041 ? logical_sector < conf->reshape_safe 4042 : logical_sector >= conf->reshape_safe) { 4043 spin_unlock_irq(&conf->device_lock); 4044 schedule(); 4045 goto retry; 4046 } 4047 } 4048 spin_unlock_irq(&conf->device_lock); 4049 } 4050 4051 new_sector = raid5_compute_sector(conf, logical_sector, 4052 previous, 4053 &dd_idx, NULL); 4054 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4055 (unsigned long long)new_sector, 4056 (unsigned long long)logical_sector); 4057 4058 sh = get_active_stripe(conf, new_sector, previous, 4059 (bi->bi_rw&RWA_MASK), 0); 4060 if (sh) { 4061 if (unlikely(previous)) { 4062 /* expansion might have moved on while waiting for a 4063 * stripe, so we must do the range check again. 
4064 * Expansion could still move past after this 4065 * test, but as we are holding a reference to 4066 * 'sh', we know that if that happens, 4067 * STRIPE_EXPANDING will get set and the expansion 4068 * won't proceed until we finish with the stripe. 4069 */ 4070 int must_retry = 0; 4071 spin_lock_irq(&conf->device_lock); 4072 if (mddev->reshape_backwards 4073 ? logical_sector >= conf->reshape_progress 4074 : logical_sector < conf->reshape_progress) 4075 /* mismatch, need to try again */ 4076 must_retry = 1; 4077 spin_unlock_irq(&conf->device_lock); 4078 if (must_retry) { 4079 release_stripe(sh); 4080 schedule(); 4081 goto retry; 4082 } 4083 } 4084 4085 if (rw == WRITE && 4086 logical_sector >= mddev->suspend_lo && 4087 logical_sector < mddev->suspend_hi) { 4088 release_stripe(sh); 4089 /* As the suspend_* range is controlled by 4090 * userspace, we want an interruptible 4091 * wait. 4092 */ 4093 flush_signals(current); 4094 prepare_to_wait(&conf->wait_for_overlap, 4095 &w, TASK_INTERRUPTIBLE); 4096 if (logical_sector >= mddev->suspend_lo && 4097 logical_sector < mddev->suspend_hi) 4098 schedule(); 4099 goto retry; 4100 } 4101 4102 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4103 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4104 /* Stripe is busy expanding or 4105 * add failed due to overlap. Flush everything 4106 * and wait a while 4107 */ 4108 md_wakeup_thread(mddev->thread); 4109 release_stripe(sh); 4110 schedule(); 4111 goto retry; 4112 } 4113 finish_wait(&conf->wait_for_overlap, &w); 4114 set_bit(STRIPE_HANDLE, &sh->state); 4115 clear_bit(STRIPE_DELAYED, &sh->state); 4116 if ((bi->bi_rw & REQ_SYNC) && 4117 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4118 atomic_inc(&conf->preread_active_stripes); 4119 mddev_check_plugged(mddev); 4120 release_stripe(sh); 4121 } else { 4122 /* cannot get stripe for read-ahead, just give-up */ 4123 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4124 finish_wait(&conf->wait_for_overlap, &w); 4125 break; 4126 } 4127 } 4128 4129 spin_lock_irq(&conf->device_lock); 4130 remaining = raid5_dec_bi_phys_segments(bi); 4131 spin_unlock_irq(&conf->device_lock); 4132 if (remaining == 0) { 4133 4134 if ( rw == WRITE ) 4135 md_write_end(mddev); 4136 4137 bio_endio(bi, 0); 4138 } 4139 } 4140 4141 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4142 4143 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4144 { 4145 /* reshaping is quite different to recovery/resync so it is 4146 * handled quite separately ... here. 4147 * 4148 * On each call to sync_request, we gather one chunk worth of 4149 * destination stripes and flag them as expanding. 4150 * Then we find all the source stripes and request reads. 4151 * As the reads complete, handle_stripe will copy the data 4152 * into the destination stripe and release that stripe. 
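 * The return value is the number of device sectors dealt with by this
 * call (normally 'reshape_sectors', i.e. one chunk of the larger of the
 * old and new chunk sizes), which md_do_sync uses to advance its position.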
4153 */ 4154 struct r5conf *conf = mddev->private; 4155 struct stripe_head *sh; 4156 sector_t first_sector, last_sector; 4157 int raid_disks = conf->previous_raid_disks; 4158 int data_disks = raid_disks - conf->max_degraded; 4159 int new_data_disks = conf->raid_disks - conf->max_degraded; 4160 int i; 4161 int dd_idx; 4162 sector_t writepos, readpos, safepos; 4163 sector_t stripe_addr; 4164 int reshape_sectors; 4165 struct list_head stripes; 4166 4167 if (sector_nr == 0) { 4168 /* If restarting in the middle, skip the initial sectors */ 4169 if (mddev->reshape_backwards && 4170 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4171 sector_nr = raid5_size(mddev, 0, 0) 4172 - conf->reshape_progress; 4173 } else if (!mddev->reshape_backwards && 4174 conf->reshape_progress > 0) 4175 sector_nr = conf->reshape_progress; 4176 sector_div(sector_nr, new_data_disks); 4177 if (sector_nr) { 4178 mddev->curr_resync_completed = sector_nr; 4179 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4180 *skipped = 1; 4181 return sector_nr; 4182 } 4183 } 4184 4185 /* We need to process a full chunk at a time. 4186 * If old and new chunk sizes differ, we need to process the 4187 * largest of these 4188 */ 4189 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4190 reshape_sectors = mddev->new_chunk_sectors; 4191 else 4192 reshape_sectors = mddev->chunk_sectors; 4193 4194 /* We update the metadata at least every 10 seconds, or when 4195 * the data about to be copied would over-write the source of 4196 * the data at the front of the range. i.e. one new_stripe 4197 * along from reshape_progress new_maps to after where 4198 * reshape_safe old_maps to 4199 */ 4200 writepos = conf->reshape_progress; 4201 sector_div(writepos, new_data_disks); 4202 readpos = conf->reshape_progress; 4203 sector_div(readpos, data_disks); 4204 safepos = conf->reshape_safe; 4205 sector_div(safepos, data_disks); 4206 if (mddev->reshape_backwards) { 4207 writepos -= min_t(sector_t, reshape_sectors, writepos); 4208 readpos += reshape_sectors; 4209 safepos += reshape_sectors; 4210 } else { 4211 writepos += reshape_sectors; 4212 readpos -= min_t(sector_t, reshape_sectors, readpos); 4213 safepos -= min_t(sector_t, reshape_sectors, safepos); 4214 } 4215 4216 /* Having calculated the 'writepos' possibly use it 4217 * to set 'stripe_addr' which is where we will write to. 4218 */ 4219 if (mddev->reshape_backwards) { 4220 BUG_ON(conf->reshape_progress == 0); 4221 stripe_addr = writepos; 4222 BUG_ON((mddev->dev_sectors & 4223 ~((sector_t)reshape_sectors - 1)) 4224 - reshape_sectors - stripe_addr 4225 != sector_nr); 4226 } else { 4227 BUG_ON(writepos != sector_nr + reshape_sectors); 4228 stripe_addr = sector_nr; 4229 } 4230 4231 /* 'writepos' is the most advanced device address we might write. 4232 * 'readpos' is the least advanced device address we might read. 4233 * 'safepos' is the least address recorded in the metadata as having 4234 * been reshaped. 4235 * If there is a min_offset_diff, these are adjusted either by 4236 * increasing the safepos/readpos if diff is negative, or 4237 * increasing writepos if diff is positive. 4238 * If 'readpos' is then behind 'writepos', there is no way that we can 4239 * ensure safety in the face of a crash - that must be done by userspace 4240 * making a backup of the data. So in that case there is no particular 4241 * rush to update metadata. 
4242 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4243 * update the metadata to advance 'safepos' to match 'readpos' so that 4244 * we can be safe in the event of a crash. 4245 * So we insist on updating metadata if safepos is behind writepos and 4246 * readpos is beyond writepos. 4247 * In any case, update the metadata every 10 seconds. 4248 * Maybe that number should be configurable, but I'm not sure it is 4249 * worth it.... maybe it could be a multiple of safemode_delay??? 4250 */ 4251 if (conf->min_offset_diff < 0) { 4252 safepos += -conf->min_offset_diff; 4253 readpos += -conf->min_offset_diff; 4254 } else 4255 writepos += conf->min_offset_diff; 4256 4257 if ((mddev->reshape_backwards 4258 ? (safepos > writepos && readpos < writepos) 4259 : (safepos < writepos && readpos > writepos)) || 4260 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4261 /* Cannot proceed until we've updated the superblock... */ 4262 wait_event(conf->wait_for_overlap, 4263 atomic_read(&conf->reshape_stripes)==0); 4264 mddev->reshape_position = conf->reshape_progress; 4265 mddev->curr_resync_completed = sector_nr; 4266 conf->reshape_checkpoint = jiffies; 4267 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4268 md_wakeup_thread(mddev->thread); 4269 wait_event(mddev->sb_wait, mddev->flags == 0 || 4270 kthread_should_stop()); 4271 spin_lock_irq(&conf->device_lock); 4272 conf->reshape_safe = mddev->reshape_position; 4273 spin_unlock_irq(&conf->device_lock); 4274 wake_up(&conf->wait_for_overlap); 4275 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4276 } 4277 4278 INIT_LIST_HEAD(&stripes); 4279 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4280 int j; 4281 int skipped_disk = 0; 4282 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4283 set_bit(STRIPE_EXPANDING, &sh->state); 4284 atomic_inc(&conf->reshape_stripes); 4285 /* If any of this stripe is beyond the end of the old 4286 * array, then we need to zero those blocks 4287 */ 4288 for (j=sh->disks; j--;) { 4289 sector_t s; 4290 if (j == sh->pd_idx) 4291 continue; 4292 if (conf->level == 6 && 4293 j == sh->qd_idx) 4294 continue; 4295 s = compute_blocknr(sh, j, 0); 4296 if (s < raid5_size(mddev, 0, 0)) { 4297 skipped_disk = 1; 4298 continue; 4299 } 4300 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4301 set_bit(R5_Expanded, &sh->dev[j].flags); 4302 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4303 } 4304 if (!skipped_disk) { 4305 set_bit(STRIPE_EXPAND_READY, &sh->state); 4306 set_bit(STRIPE_HANDLE, &sh->state); 4307 } 4308 list_add(&sh->lru, &stripes); 4309 } 4310 spin_lock_irq(&conf->device_lock); 4311 if (mddev->reshape_backwards) 4312 conf->reshape_progress -= reshape_sectors * new_data_disks; 4313 else 4314 conf->reshape_progress += reshape_sectors * new_data_disks; 4315 spin_unlock_irq(&conf->device_lock); 4316 /* Ok, those stripe are ready. We can start scheduling 4317 * reads on the source stripes. 4318 * The source stripes are determined by mapping the first and last 4319 * block on the destination stripes. 
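 * As an illustration only: growing from 4 to 5 devices (so 3 old and 4 new
 * data disks) with 64-sector chunks, the destination range starting at
 * stripe_addr covers logical sectors stripe_addr*4 up to
 * (stripe_addr+64)*4 - 1; feeding the first and last of those through
 * raid5_compute_sector() with 'previous' set gives the matching range in
 * the old 3-data-disk layout, which is then walked a stripe at a time.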
4320 */ 4321 first_sector = 4322 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4323 1, &dd_idx, NULL); 4324 last_sector = 4325 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4326 * new_data_disks - 1), 4327 1, &dd_idx, NULL); 4328 if (last_sector >= mddev->dev_sectors) 4329 last_sector = mddev->dev_sectors - 1; 4330 while (first_sector <= last_sector) { 4331 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4332 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4333 set_bit(STRIPE_HANDLE, &sh->state); 4334 release_stripe(sh); 4335 first_sector += STRIPE_SECTORS; 4336 } 4337 /* Now that the sources are clearly marked, we can release 4338 * the destination stripes 4339 */ 4340 while (!list_empty(&stripes)) { 4341 sh = list_entry(stripes.next, struct stripe_head, lru); 4342 list_del_init(&sh->lru); 4343 release_stripe(sh); 4344 } 4345 /* If this takes us to the resync_max point where we have to pause, 4346 * then we need to write out the superblock. 4347 */ 4348 sector_nr += reshape_sectors; 4349 if ((sector_nr - mddev->curr_resync_completed) * 2 4350 >= mddev->resync_max - mddev->curr_resync_completed) { 4351 /* Cannot proceed until we've updated the superblock... */ 4352 wait_event(conf->wait_for_overlap, 4353 atomic_read(&conf->reshape_stripes) == 0); 4354 mddev->reshape_position = conf->reshape_progress; 4355 mddev->curr_resync_completed = sector_nr; 4356 conf->reshape_checkpoint = jiffies; 4357 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4358 md_wakeup_thread(mddev->thread); 4359 wait_event(mddev->sb_wait, 4360 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4361 || kthread_should_stop()); 4362 spin_lock_irq(&conf->device_lock); 4363 conf->reshape_safe = mddev->reshape_position; 4364 spin_unlock_irq(&conf->device_lock); 4365 wake_up(&conf->wait_for_overlap); 4366 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4367 } 4368 return reshape_sectors; 4369 } 4370 4371 /* FIXME go_faster isn't used */ 4372 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4373 { 4374 struct r5conf *conf = mddev->private; 4375 struct stripe_head *sh; 4376 sector_t max_sector = mddev->dev_sectors; 4377 sector_t sync_blocks; 4378 int still_degraded = 0; 4379 int i; 4380 4381 if (sector_nr >= max_sector) { 4382 /* just being told to finish up .. nothing much to do */ 4383 4384 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4385 end_reshape(conf); 4386 return 0; 4387 } 4388 4389 if (mddev->curr_resync < max_sector) /* aborted */ 4390 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4391 &sync_blocks, 1); 4392 else /* completed sync */ 4393 conf->fullsync = 0; 4394 bitmap_close_sync(mddev->bitmap); 4395 4396 return 0; 4397 } 4398 4399 /* Allow raid5_quiesce to complete */ 4400 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4401 4402 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4403 return reshape_request(mddev, sector_nr, skipped); 4404 4405 /* No need to check resync_max as we never do more than one 4406 * stripe, and as resync_max will always be on a chunk boundary, 4407 * if the check in md_do_sync didn't fire, there is no chance 4408 * of overstepping resync_max here 4409 */ 4410 4411 /* if there is too many failed drives and we are trying 4412 * to resync, then assert that we are finished, because there is 4413 * nothing we can do. 
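 * We do that by telling md_do_sync the rest is already done: set *skipped
 * and return the remaining (dev_sectors - sector_nr) sectors.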
4414 */ 4415 if (mddev->degraded >= conf->max_degraded && 4416 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4417 sector_t rv = mddev->dev_sectors - sector_nr; 4418 *skipped = 1; 4419 return rv; 4420 } 4421 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4422 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4423 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4424 /* we can skip this block, and probably more */ 4425 sync_blocks /= STRIPE_SECTORS; 4426 *skipped = 1; 4427 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4428 } 4429 4430 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4431 4432 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4433 if (sh == NULL) { 4434 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4435 /* make sure we don't swamp the stripe cache if someone else 4436 * is trying to get access 4437 */ 4438 schedule_timeout_uninterruptible(1); 4439 } 4440 /* Need to check if array will still be degraded after recovery/resync 4441 * We don't need to check the 'failed' flag as when that gets set, 4442 * recovery aborts. 4443 */ 4444 for (i = 0; i < conf->raid_disks; i++) 4445 if (conf->disks[i].rdev == NULL) 4446 still_degraded = 1; 4447 4448 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4449 4450 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4451 4452 handle_stripe(sh); 4453 release_stripe(sh); 4454 4455 return STRIPE_SECTORS; 4456 } 4457 4458 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4459 { 4460 /* We may not be able to submit a whole bio at once as there 4461 * may not be enough stripe_heads available. 4462 * We cannot pre-allocate enough stripe_heads as we may need 4463 * more than exist in the cache (if we allow ever large chunks). 4464 * So we do one stripe head at a time and record in 4465 * ->bi_hw_segments how many have been done. 4466 * 4467 * We *know* that this entire raid_bio is in one chunk, so 4468 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
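 * The bio is walked STRIPE_SECTORS at a time below; if a stripe_head cannot
 * be obtained, the number of stripes already handled is saved with
 * raid5_set_bi_hw_segments() and the bio is parked in
 * conf->retry_read_aligned so a later pass resumes where this one stopped.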
4469 */ 4470 struct stripe_head *sh; 4471 int dd_idx; 4472 sector_t sector, logical_sector, last_sector; 4473 int scnt = 0; 4474 int remaining; 4475 int handled = 0; 4476 4477 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4478 sector = raid5_compute_sector(conf, logical_sector, 4479 0, &dd_idx, NULL); 4480 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4481 4482 for (; logical_sector < last_sector; 4483 logical_sector += STRIPE_SECTORS, 4484 sector += STRIPE_SECTORS, 4485 scnt++) { 4486 4487 if (scnt < raid5_bi_hw_segments(raid_bio)) 4488 /* already done this stripe */ 4489 continue; 4490 4491 sh = get_active_stripe(conf, sector, 0, 1, 0); 4492 4493 if (!sh) { 4494 /* failed to get a stripe - must wait */ 4495 raid5_set_bi_hw_segments(raid_bio, scnt); 4496 conf->retry_read_aligned = raid_bio; 4497 return handled; 4498 } 4499 4500 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4501 release_stripe(sh); 4502 raid5_set_bi_hw_segments(raid_bio, scnt); 4503 conf->retry_read_aligned = raid_bio; 4504 return handled; 4505 } 4506 4507 handle_stripe(sh); 4508 release_stripe(sh); 4509 handled++; 4510 } 4511 spin_lock_irq(&conf->device_lock); 4512 remaining = raid5_dec_bi_phys_segments(raid_bio); 4513 spin_unlock_irq(&conf->device_lock); 4514 if (remaining == 0) 4515 bio_endio(raid_bio, 0); 4516 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4517 wake_up(&conf->wait_for_stripe); 4518 return handled; 4519 } 4520 4521 4522 /* 4523 * This is our raid5 kernel thread. 4524 * 4525 * We scan the hash table for stripes which can be handled now. 4526 * During the scan, completed stripes are saved for us by the interrupt 4527 * handler, so that they will not have to wait for our next wakeup. 4528 */ 4529 static void raid5d(struct mddev *mddev) 4530 { 4531 struct stripe_head *sh; 4532 struct r5conf *conf = mddev->private; 4533 int handled; 4534 struct blk_plug plug; 4535 4536 pr_debug("+++ raid5d active\n"); 4537 4538 md_check_recovery(mddev); 4539 4540 blk_start_plug(&plug); 4541 handled = 0; 4542 spin_lock_irq(&conf->device_lock); 4543 while (1) { 4544 struct bio *bio; 4545 4546 if (atomic_read(&mddev->plug_cnt) == 0 && 4547 !list_empty(&conf->bitmap_list)) { 4548 /* Now is a good time to flush some bitmap updates */ 4549 conf->seq_flush++; 4550 spin_unlock_irq(&conf->device_lock); 4551 bitmap_unplug(mddev->bitmap); 4552 spin_lock_irq(&conf->device_lock); 4553 conf->seq_write = conf->seq_flush; 4554 activate_bit_delay(conf); 4555 } 4556 if (atomic_read(&mddev->plug_cnt) == 0) 4557 raid5_activate_delayed(conf); 4558 4559 while ((bio = remove_bio_from_retry(conf))) { 4560 int ok; 4561 spin_unlock_irq(&conf->device_lock); 4562 ok = retry_aligned_read(conf, bio); 4563 spin_lock_irq(&conf->device_lock); 4564 if (!ok) 4565 break; 4566 handled++; 4567 } 4568 4569 sh = __get_priority_stripe(conf); 4570 4571 if (!sh) 4572 break; 4573 spin_unlock_irq(&conf->device_lock); 4574 4575 handled++; 4576 handle_stripe(sh); 4577 release_stripe(sh); 4578 cond_resched(); 4579 4580 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 4581 md_check_recovery(mddev); 4582 4583 spin_lock_irq(&conf->device_lock); 4584 } 4585 pr_debug("%d stripes handled\n", handled); 4586 4587 spin_unlock_irq(&conf->device_lock); 4588 4589 async_tx_issue_pending_all(); 4590 blk_finish_plug(&plug); 4591 4592 pr_debug("--- raid5d inactive\n"); 4593 } 4594 4595 static ssize_t 4596 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4597 { 4598 struct r5conf *conf = mddev->private; 4599 if (conf) 4600 return 
sprintf(page, "%d\n", conf->max_nr_stripes); 4601 else 4602 return 0; 4603 } 4604 4605 int 4606 raid5_set_cache_size(struct mddev *mddev, int size) 4607 { 4608 struct r5conf *conf = mddev->private; 4609 int err; 4610 4611 if (size <= 16 || size > 32768) 4612 return -EINVAL; 4613 while (size < conf->max_nr_stripes) { 4614 if (drop_one_stripe(conf)) 4615 conf->max_nr_stripes--; 4616 else 4617 break; 4618 } 4619 err = md_allow_write(mddev); 4620 if (err) 4621 return err; 4622 while (size > conf->max_nr_stripes) { 4623 if (grow_one_stripe(conf)) 4624 conf->max_nr_stripes++; 4625 else break; 4626 } 4627 return 0; 4628 } 4629 EXPORT_SYMBOL(raid5_set_cache_size); 4630 4631 static ssize_t 4632 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4633 { 4634 struct r5conf *conf = mddev->private; 4635 unsigned long new; 4636 int err; 4637 4638 if (len >= PAGE_SIZE) 4639 return -EINVAL; 4640 if (!conf) 4641 return -ENODEV; 4642 4643 if (strict_strtoul(page, 10, &new)) 4644 return -EINVAL; 4645 err = raid5_set_cache_size(mddev, new); 4646 if (err) 4647 return err; 4648 return len; 4649 } 4650 4651 static struct md_sysfs_entry 4652 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4653 raid5_show_stripe_cache_size, 4654 raid5_store_stripe_cache_size); 4655 4656 static ssize_t 4657 raid5_show_preread_threshold(struct mddev *mddev, char *page) 4658 { 4659 struct r5conf *conf = mddev->private; 4660 if (conf) 4661 return sprintf(page, "%d\n", conf->bypass_threshold); 4662 else 4663 return 0; 4664 } 4665 4666 static ssize_t 4667 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4668 { 4669 struct r5conf *conf = mddev->private; 4670 unsigned long new; 4671 if (len >= PAGE_SIZE) 4672 return -EINVAL; 4673 if (!conf) 4674 return -ENODEV; 4675 4676 if (strict_strtoul(page, 10, &new)) 4677 return -EINVAL; 4678 if (new > conf->max_nr_stripes) 4679 return -EINVAL; 4680 conf->bypass_threshold = new; 4681 return len; 4682 } 4683 4684 static struct md_sysfs_entry 4685 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4686 S_IRUGO | S_IWUSR, 4687 raid5_show_preread_threshold, 4688 raid5_store_preread_threshold); 4689 4690 static ssize_t 4691 stripe_cache_active_show(struct mddev *mddev, char *page) 4692 { 4693 struct r5conf *conf = mddev->private; 4694 if (conf) 4695 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4696 else 4697 return 0; 4698 } 4699 4700 static struct md_sysfs_entry 4701 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4702 4703 static struct attribute *raid5_attrs[] = { 4704 &raid5_stripecache_size.attr, 4705 &raid5_stripecache_active.attr, 4706 &raid5_preread_bypass_threshold.attr, 4707 NULL, 4708 }; 4709 static struct attribute_group raid5_attrs_group = { 4710 .name = NULL, 4711 .attrs = raid5_attrs, 4712 }; 4713 4714 static sector_t 4715 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4716 { 4717 struct r5conf *conf = mddev->private; 4718 4719 if (!sectors) 4720 sectors = mddev->dev_sectors; 4721 if (!raid_disks) 4722 /* size is defined by the smallest of previous and new size */ 4723 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4724 4725 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4726 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4727 return sectors * (raid_disks - conf->max_degraded); 4728 } 4729 4730 static void raid5_free_percpu(struct r5conf *conf) 4731 { 4732 struct raid5_percpu *percpu; 4733 unsigned long cpu; 4734 4735 
if (!conf->percpu) 4736 return; 4737 4738 get_online_cpus(); 4739 for_each_possible_cpu(cpu) { 4740 percpu = per_cpu_ptr(conf->percpu, cpu); 4741 safe_put_page(percpu->spare_page); 4742 kfree(percpu->scribble); 4743 } 4744 #ifdef CONFIG_HOTPLUG_CPU 4745 unregister_cpu_notifier(&conf->cpu_notify); 4746 #endif 4747 put_online_cpus(); 4748 4749 free_percpu(conf->percpu); 4750 } 4751 4752 static void free_conf(struct r5conf *conf) 4753 { 4754 shrink_stripes(conf); 4755 raid5_free_percpu(conf); 4756 kfree(conf->disks); 4757 kfree(conf->stripe_hashtbl); 4758 kfree(conf); 4759 } 4760 4761 #ifdef CONFIG_HOTPLUG_CPU 4762 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4763 void *hcpu) 4764 { 4765 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 4766 long cpu = (long)hcpu; 4767 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4768 4769 switch (action) { 4770 case CPU_UP_PREPARE: 4771 case CPU_UP_PREPARE_FROZEN: 4772 if (conf->level == 6 && !percpu->spare_page) 4773 percpu->spare_page = alloc_page(GFP_KERNEL); 4774 if (!percpu->scribble) 4775 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4776 4777 if (!percpu->scribble || 4778 (conf->level == 6 && !percpu->spare_page)) { 4779 safe_put_page(percpu->spare_page); 4780 kfree(percpu->scribble); 4781 pr_err("%s: failed memory allocation for cpu%ld\n", 4782 __func__, cpu); 4783 return notifier_from_errno(-ENOMEM); 4784 } 4785 break; 4786 case CPU_DEAD: 4787 case CPU_DEAD_FROZEN: 4788 safe_put_page(percpu->spare_page); 4789 kfree(percpu->scribble); 4790 percpu->spare_page = NULL; 4791 percpu->scribble = NULL; 4792 break; 4793 default: 4794 break; 4795 } 4796 return NOTIFY_OK; 4797 } 4798 #endif 4799 4800 static int raid5_alloc_percpu(struct r5conf *conf) 4801 { 4802 unsigned long cpu; 4803 struct page *spare_page; 4804 struct raid5_percpu __percpu *allcpus; 4805 void *scribble; 4806 int err; 4807 4808 allcpus = alloc_percpu(struct raid5_percpu); 4809 if (!allcpus) 4810 return -ENOMEM; 4811 conf->percpu = allcpus; 4812 4813 get_online_cpus(); 4814 err = 0; 4815 for_each_present_cpu(cpu) { 4816 if (conf->level == 6) { 4817 spare_page = alloc_page(GFP_KERNEL); 4818 if (!spare_page) { 4819 err = -ENOMEM; 4820 break; 4821 } 4822 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4823 } 4824 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4825 if (!scribble) { 4826 err = -ENOMEM; 4827 break; 4828 } 4829 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4830 } 4831 #ifdef CONFIG_HOTPLUG_CPU 4832 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4833 conf->cpu_notify.priority = 0; 4834 if (err == 0) 4835 err = register_cpu_notifier(&conf->cpu_notify); 4836 #endif 4837 put_online_cpus(); 4838 4839 return err; 4840 } 4841 4842 static struct r5conf *setup_conf(struct mddev *mddev) 4843 { 4844 struct r5conf *conf; 4845 int raid_disk, memory, max_disks; 4846 struct md_rdev *rdev; 4847 struct disk_info *disk; 4848 char pers_name[6]; 4849 4850 if (mddev->new_level != 5 4851 && mddev->new_level != 4 4852 && mddev->new_level != 6) { 4853 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4854 mdname(mddev), mddev->new_level); 4855 return ERR_PTR(-EIO); 4856 } 4857 if ((mddev->new_level == 5 4858 && !algorithm_valid_raid5(mddev->new_layout)) || 4859 (mddev->new_level == 6 4860 && !algorithm_valid_raid6(mddev->new_layout))) { 4861 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4862 mdname(mddev), mddev->new_layout); 4863 return ERR_PTR(-EIO); 4864 } 4865 if 
(mddev->new_level == 6 && mddev->raid_disks < 4) { 4866 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4867 mdname(mddev), mddev->raid_disks); 4868 return ERR_PTR(-EINVAL); 4869 } 4870 4871 if (!mddev->new_chunk_sectors || 4872 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4873 !is_power_of_2(mddev->new_chunk_sectors)) { 4874 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4875 mdname(mddev), mddev->new_chunk_sectors << 9); 4876 return ERR_PTR(-EINVAL); 4877 } 4878 4879 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 4880 if (conf == NULL) 4881 goto abort; 4882 spin_lock_init(&conf->device_lock); 4883 init_waitqueue_head(&conf->wait_for_stripe); 4884 init_waitqueue_head(&conf->wait_for_overlap); 4885 INIT_LIST_HEAD(&conf->handle_list); 4886 INIT_LIST_HEAD(&conf->hold_list); 4887 INIT_LIST_HEAD(&conf->delayed_list); 4888 INIT_LIST_HEAD(&conf->bitmap_list); 4889 INIT_LIST_HEAD(&conf->inactive_list); 4890 atomic_set(&conf->active_stripes, 0); 4891 atomic_set(&conf->preread_active_stripes, 0); 4892 atomic_set(&conf->active_aligned_reads, 0); 4893 conf->bypass_threshold = BYPASS_THRESHOLD; 4894 conf->recovery_disabled = mddev->recovery_disabled - 1; 4895 4896 conf->raid_disks = mddev->raid_disks; 4897 if (mddev->reshape_position == MaxSector) 4898 conf->previous_raid_disks = mddev->raid_disks; 4899 else 4900 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4901 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 4902 conf->scribble_len = scribble_len(max_disks); 4903 4904 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 4905 GFP_KERNEL); 4906 if (!conf->disks) 4907 goto abort; 4908 4909 conf->mddev = mddev; 4910 4911 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4912 goto abort; 4913 4914 conf->level = mddev->new_level; 4915 if (raid5_alloc_percpu(conf) != 0) 4916 goto abort; 4917 4918 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4919 4920 rdev_for_each(rdev, mddev) { 4921 raid_disk = rdev->raid_disk; 4922 if (raid_disk >= max_disks 4923 || raid_disk < 0) 4924 continue; 4925 disk = conf->disks + raid_disk; 4926 4927 if (test_bit(Replacement, &rdev->flags)) { 4928 if (disk->replacement) 4929 goto abort; 4930 disk->replacement = rdev; 4931 } else { 4932 if (disk->rdev) 4933 goto abort; 4934 disk->rdev = rdev; 4935 } 4936 4937 if (test_bit(In_sync, &rdev->flags)) { 4938 char b[BDEVNAME_SIZE]; 4939 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4940 " disk %d\n", 4941 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4942 } else if (rdev->saved_raid_disk != raid_disk) 4943 /* Cannot rely on bitmap to complete recovery */ 4944 conf->fullsync = 1; 4945 } 4946 4947 conf->chunk_sectors = mddev->new_chunk_sectors; 4948 conf->level = mddev->new_level; 4949 if (conf->level == 6) 4950 conf->max_degraded = 2; 4951 else 4952 conf->max_degraded = 1; 4953 conf->algorithm = mddev->new_layout; 4954 conf->max_nr_stripes = NR_STRIPES; 4955 conf->reshape_progress = mddev->reshape_position; 4956 if (conf->reshape_progress != MaxSector) { 4957 conf->prev_chunk_sectors = mddev->chunk_sectors; 4958 conf->prev_algo = mddev->layout; 4959 } 4960 4961 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4962 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4963 if (grow_stripes(conf, conf->max_nr_stripes)) { 4964 printk(KERN_ERR 4965 "md/raid:%s: couldn't allocate %dkB for buffers\n", 4966 mdname(mddev), memory); 4967 goto abort; 4968 } else 4969 printk(KERN_INFO "md/raid:%s: 
allocated %dkB\n", 4970 mdname(mddev), memory); 4971 4972 sprintf(pers_name, "raid%d", mddev->new_level); 4973 conf->thread = md_register_thread(raid5d, mddev, pers_name); 4974 if (!conf->thread) { 4975 printk(KERN_ERR 4976 "md/raid:%s: couldn't allocate thread.\n", 4977 mdname(mddev)); 4978 goto abort; 4979 } 4980 4981 return conf; 4982 4983 abort: 4984 if (conf) { 4985 free_conf(conf); 4986 return ERR_PTR(-EIO); 4987 } else 4988 return ERR_PTR(-ENOMEM); 4989 } 4990 4991 4992 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 4993 { 4994 switch (algo) { 4995 case ALGORITHM_PARITY_0: 4996 if (raid_disk < max_degraded) 4997 return 1; 4998 break; 4999 case ALGORITHM_PARITY_N: 5000 if (raid_disk >= raid_disks - max_degraded) 5001 return 1; 5002 break; 5003 case ALGORITHM_PARITY_0_6: 5004 if (raid_disk == 0 || 5005 raid_disk == raid_disks - 1) 5006 return 1; 5007 break; 5008 case ALGORITHM_LEFT_ASYMMETRIC_6: 5009 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5010 case ALGORITHM_LEFT_SYMMETRIC_6: 5011 case ALGORITHM_RIGHT_SYMMETRIC_6: 5012 if (raid_disk == raid_disks - 1) 5013 return 1; 5014 } 5015 return 0; 5016 } 5017 5018 static int run(struct mddev *mddev) 5019 { 5020 struct r5conf *conf; 5021 int working_disks = 0; 5022 int dirty_parity_disks = 0; 5023 struct md_rdev *rdev; 5024 sector_t reshape_offset = 0; 5025 int i; 5026 long long min_offset_diff = 0; 5027 int first = 1; 5028 5029 if (mddev->recovery_cp != MaxSector) 5030 printk(KERN_NOTICE "md/raid:%s: not clean" 5031 " -- starting background reconstruction\n", 5032 mdname(mddev)); 5033 5034 rdev_for_each(rdev, mddev) { 5035 long long diff; 5036 if (rdev->raid_disk < 0) 5037 continue; 5038 diff = (rdev->new_data_offset - rdev->data_offset); 5039 if (first) { 5040 min_offset_diff = diff; 5041 first = 0; 5042 } else if (mddev->reshape_backwards && 5043 diff < min_offset_diff) 5044 min_offset_diff = diff; 5045 else if (!mddev->reshape_backwards && 5046 diff > min_offset_diff) 5047 min_offset_diff = diff; 5048 } 5049 5050 if (mddev->reshape_position != MaxSector) { 5051 /* Check that we can continue the reshape. 5052 * Difficulties arise if the stripe we would write to 5053 * next is at or after the stripe we would read from next. 5054 * For a reshape that changes the number of devices, this 5055 * is only possible for a very short time, and mdadm makes 5056 * sure that time appears to have passed before assembling 5057 * the array. So we fail if that time hasn't passed. 5058 * For a reshape that keeps the number of devices the same 5059 * mdadm must be monitoring the reshape and keeping the 5060 * critical areas read-only and backed up. It will start 5061 * the array in read-only mode, so we check for that. 5062 */ 5063 sector_t here_new, here_old; 5064 int old_disks; 5065 int max_degraded = (mddev->level == 6 ? 2 : 1); 5066 5067 if (mddev->new_level != mddev->level) { 5068 printk(KERN_ERR "md/raid:%s: unsupported reshape " 5069 "required - aborting.\n", 5070 mdname(mddev)); 5071 return -EINVAL; 5072 } 5073 old_disks = mddev->raid_disks - mddev->delta_disks; 5074 /* reshape_position must be on a new-stripe boundary, and one 5075 * further up in new geometry must map after here in old 5076 * geometry.
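 * As an illustration only: growing a 4-device RAID5 with 64-sector chunks
 * to 5 devices, a full stripe is 4*64 = 256 sectors in the new geometry and
 * 3*64 = 192 sectors in the old one, so the divisions below give
 * here_new = reshape_position/256 and here_old = reshape_position/192.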
5077 */ 5078 here_new = mddev->reshape_position; 5079 if (sector_div(here_new, mddev->new_chunk_sectors * 5080 (mddev->raid_disks - max_degraded))) { 5081 printk(KERN_ERR "md/raid:%s: reshape_position not " 5082 "on a stripe boundary\n", mdname(mddev)); 5083 return -EINVAL; 5084 } 5085 reshape_offset = here_new * mddev->new_chunk_sectors; 5086 /* here_new is the stripe we will write to */ 5087 here_old = mddev->reshape_position; 5088 sector_div(here_old, mddev->chunk_sectors * 5089 (old_disks-max_degraded)); 5090 /* here_old is the first stripe that we might need to read 5091 * from */ 5092 if (mddev->delta_disks == 0) { 5093 if ((here_new * mddev->new_chunk_sectors != 5094 here_old * mddev->chunk_sectors)) { 5095 printk(KERN_ERR "md/raid:%s: reshape position is" 5096 " confused - aborting\n", mdname(mddev)); 5097 return -EINVAL; 5098 } 5099 /* We cannot be sure it is safe to start an in-place 5100 * reshape. It is only safe if user-space is monitoring 5101 * and taking constant backups. 5102 * mdadm always starts a situation like this in 5103 * readonly mode so it can take control before 5104 * allowing any writes. So just check for that. 5105 */ 5106 if (abs(min_offset_diff) >= mddev->chunk_sectors && 5107 abs(min_offset_diff) >= mddev->new_chunk_sectors) 5108 /* not really in-place - so OK */; 5109 else if (mddev->ro == 0) { 5110 printk(KERN_ERR "md/raid:%s: in-place reshape " 5111 "must be started in read-only mode " 5112 "- aborting\n", 5113 mdname(mddev)); 5114 return -EINVAL; 5115 } 5116 } else if (mddev->reshape_backwards 5117 ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= 5118 here_old * mddev->chunk_sectors) 5119 : (here_new * mddev->new_chunk_sectors >= 5120 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5121 /* Reading from the same stripe as writing to - bad */ 5122 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5123 "auto-recovery - aborting.\n", 5124 mdname(mddev)); 5125 return -EINVAL; 5126 } 5127 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5128 mdname(mddev)); 5129 /* OK, we should be able to continue; */ 5130 } else { 5131 BUG_ON(mddev->level != mddev->new_level); 5132 BUG_ON(mddev->layout != mddev->new_layout); 5133 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5134 BUG_ON(mddev->delta_disks != 0); 5135 } 5136 5137 if (mddev->private == NULL) 5138 conf = setup_conf(mddev); 5139 else 5140 conf = mddev->private; 5141 5142 if (IS_ERR(conf)) 5143 return PTR_ERR(conf); 5144 5145 conf->min_offset_diff = min_offset_diff; 5146 mddev->thread = conf->thread; 5147 conf->thread = NULL; 5148 mddev->private = conf; 5149 5150 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5151 i++) { 5152 rdev = conf->disks[i].rdev; 5153 if (!rdev && conf->disks[i].replacement) { 5154 /* The replacement is all we have yet */ 5155 rdev = conf->disks[i].replacement; 5156 conf->disks[i].replacement = NULL; 5157 clear_bit(Replacement, &rdev->flags); 5158 conf->disks[i].rdev = rdev; 5159 } 5160 if (!rdev) 5161 continue; 5162 if (conf->disks[i].replacement && 5163 conf->reshape_progress != MaxSector) { 5164 /* replacements and reshape simply do not mix. */ 5165 printk(KERN_ERR "md: cannot handle concurrent " 5166 "replacement and reshape.\n"); 5167 goto abort; 5168 } 5169 if (test_bit(In_sync, &rdev->flags)) { 5170 working_disks++; 5171 continue; 5172 } 5173 /* This disc is not fully in-sync. 
However if it 5174 * just stored parity (beyond the recovery_offset), 5175 * when we don't need to be concerned about the 5176 * array being dirty. 5177 * When reshape goes 'backwards', we never have 5178 * partially completed devices, so we only need 5179 * to worry about reshape going forwards. 5180 */ 5181 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5182 if (mddev->major_version == 0 && 5183 mddev->minor_version > 90) 5184 rdev->recovery_offset = reshape_offset; 5185 5186 if (rdev->recovery_offset < reshape_offset) { 5187 /* We need to check old and new layout */ 5188 if (!only_parity(rdev->raid_disk, 5189 conf->algorithm, 5190 conf->raid_disks, 5191 conf->max_degraded)) 5192 continue; 5193 } 5194 if (!only_parity(rdev->raid_disk, 5195 conf->prev_algo, 5196 conf->previous_raid_disks, 5197 conf->max_degraded)) 5198 continue; 5199 dirty_parity_disks++; 5200 } 5201 5202 /* 5203 * 0 for a fully functional array, 1 or 2 for a degraded array. 5204 */ 5205 mddev->degraded = calc_degraded(conf); 5206 5207 if (has_failed(conf)) { 5208 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5209 " (%d/%d failed)\n", 5210 mdname(mddev), mddev->degraded, conf->raid_disks); 5211 goto abort; 5212 } 5213 5214 /* device size must be a multiple of chunk size */ 5215 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5216 mddev->resync_max_sectors = mddev->dev_sectors; 5217 5218 if (mddev->degraded > dirty_parity_disks && 5219 mddev->recovery_cp != MaxSector) { 5220 if (mddev->ok_start_degraded) 5221 printk(KERN_WARNING 5222 "md/raid:%s: starting dirty degraded array" 5223 " - data corruption possible.\n", 5224 mdname(mddev)); 5225 else { 5226 printk(KERN_ERR 5227 "md/raid:%s: cannot start dirty degraded array.\n", 5228 mdname(mddev)); 5229 goto abort; 5230 } 5231 } 5232 5233 if (mddev->degraded == 0) 5234 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5235 " devices, algorithm %d\n", mdname(mddev), conf->level, 5236 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5237 mddev->new_layout); 5238 else 5239 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5240 " out of %d devices, algorithm %d\n", 5241 mdname(mddev), conf->level, 5242 mddev->raid_disks - mddev->degraded, 5243 mddev->raid_disks, mddev->new_layout); 5244 5245 print_raid5_conf(conf); 5246 5247 if (conf->reshape_progress != MaxSector) { 5248 conf->reshape_safe = conf->reshape_progress; 5249 atomic_set(&conf->reshape_stripes, 0); 5250 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5251 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5252 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5253 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5254 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5255 "reshape"); 5256 } 5257 5258 5259 /* Ok, everything is just fine now */ 5260 if (mddev->to_remove == &raid5_attrs_group) 5261 mddev->to_remove = NULL; 5262 else if (mddev->kobj.sd && 5263 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5264 printk(KERN_WARNING 5265 "raid5: failed to create sysfs attributes for %s\n", 5266 mdname(mddev)); 5267 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5268 5269 if (mddev->queue) { 5270 int chunk_size; 5271 /* read-ahead size must cover two whole stripes, which 5272 * is 2 * (datadisks) * chunksize where 'n' is the 5273 * number of raid devices 5274 */ 5275 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5276 int stripe = data_disks * 5277 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5278 if 
(mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5279 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5280 5281 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5282 5283 mddev->queue->backing_dev_info.congested_data = mddev; 5284 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5285 5286 chunk_size = mddev->chunk_sectors << 9; 5287 blk_queue_io_min(mddev->queue, chunk_size); 5288 blk_queue_io_opt(mddev->queue, chunk_size * 5289 (conf->raid_disks - conf->max_degraded)); 5290 5291 rdev_for_each(rdev, mddev) { 5292 disk_stack_limits(mddev->gendisk, rdev->bdev, 5293 rdev->data_offset << 9); 5294 disk_stack_limits(mddev->gendisk, rdev->bdev, 5295 rdev->new_data_offset << 9); 5296 } 5297 } 5298 5299 return 0; 5300 abort: 5301 md_unregister_thread(&mddev->thread); 5302 print_raid5_conf(conf); 5303 free_conf(conf); 5304 mddev->private = NULL; 5305 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5306 return -EIO; 5307 } 5308 5309 static int stop(struct mddev *mddev) 5310 { 5311 struct r5conf *conf = mddev->private; 5312 5313 md_unregister_thread(&mddev->thread); 5314 if (mddev->queue) 5315 mddev->queue->backing_dev_info.congested_fn = NULL; 5316 free_conf(conf); 5317 mddev->private = NULL; 5318 mddev->to_remove = &raid5_attrs_group; 5319 return 0; 5320 } 5321 5322 static void status(struct seq_file *seq, struct mddev *mddev) 5323 { 5324 struct r5conf *conf = mddev->private; 5325 int i; 5326 5327 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5328 mddev->chunk_sectors / 2, mddev->layout); 5329 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5330 for (i = 0; i < conf->raid_disks; i++) 5331 seq_printf (seq, "%s", 5332 conf->disks[i].rdev && 5333 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 5334 seq_printf (seq, "]"); 5335 } 5336 5337 static void print_raid5_conf (struct r5conf *conf) 5338 { 5339 int i; 5340 struct disk_info *tmp; 5341 5342 printk(KERN_DEBUG "RAID conf printout:\n"); 5343 if (!conf) { 5344 printk("(conf==NULL)\n"); 5345 return; 5346 } 5347 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5348 conf->raid_disks, 5349 conf->raid_disks - conf->mddev->degraded); 5350 5351 for (i = 0; i < conf->raid_disks; i++) { 5352 char b[BDEVNAME_SIZE]; 5353 tmp = conf->disks + i; 5354 if (tmp->rdev) 5355 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5356 i, !test_bit(Faulty, &tmp->rdev->flags), 5357 bdevname(tmp->rdev->bdev, b)); 5358 } 5359 } 5360 5361 static int raid5_spare_active(struct mddev *mddev) 5362 { 5363 int i; 5364 struct r5conf *conf = mddev->private; 5365 struct disk_info *tmp; 5366 int count = 0; 5367 unsigned long flags; 5368 5369 for (i = 0; i < conf->raid_disks; i++) { 5370 tmp = conf->disks + i; 5371 if (tmp->replacement 5372 && tmp->replacement->recovery_offset == MaxSector 5373 && !test_bit(Faulty, &tmp->replacement->flags) 5374 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 5375 /* Replacement has just become active. */ 5376 if (!tmp->rdev 5377 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 5378 count++; 5379 if (tmp->rdev) { 5380 /* Replaced device not technically faulty, 5381 * but we need to be sure it gets removed 5382 * and never re-added. 
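 * Marking the replaced device Faulty below makes sure the normal
 * hot-remove path takes it out instead of it ever being treated as
 * in-sync again.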
5383 */ 5384 set_bit(Faulty, &tmp->rdev->flags); 5385 sysfs_notify_dirent_safe( 5386 tmp->rdev->sysfs_state); 5387 } 5388 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 5389 } else if (tmp->rdev 5390 && tmp->rdev->recovery_offset == MaxSector 5391 && !test_bit(Faulty, &tmp->rdev->flags) 5392 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5393 count++; 5394 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5395 } 5396 } 5397 spin_lock_irqsave(&conf->device_lock, flags); 5398 mddev->degraded = calc_degraded(conf); 5399 spin_unlock_irqrestore(&conf->device_lock, flags); 5400 print_raid5_conf(conf); 5401 return count; 5402 } 5403 5404 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5405 { 5406 struct r5conf *conf = mddev->private; 5407 int err = 0; 5408 int number = rdev->raid_disk; 5409 struct md_rdev **rdevp; 5410 struct disk_info *p = conf->disks + number; 5411 5412 print_raid5_conf(conf); 5413 if (rdev == p->rdev) 5414 rdevp = &p->rdev; 5415 else if (rdev == p->replacement) 5416 rdevp = &p->replacement; 5417 else 5418 return 0; 5419 5420 if (number >= conf->raid_disks && 5421 conf->reshape_progress == MaxSector) 5422 clear_bit(In_sync, &rdev->flags); 5423 5424 if (test_bit(In_sync, &rdev->flags) || 5425 atomic_read(&rdev->nr_pending)) { 5426 err = -EBUSY; 5427 goto abort; 5428 } 5429 /* Only remove non-faulty devices if recovery 5430 * isn't possible. 5431 */ 5432 if (!test_bit(Faulty, &rdev->flags) && 5433 mddev->recovery_disabled != conf->recovery_disabled && 5434 !has_failed(conf) && 5435 (!p->replacement || p->replacement == rdev) && 5436 number < conf->raid_disks) { 5437 err = -EBUSY; 5438 goto abort; 5439 } 5440 *rdevp = NULL; 5441 synchronize_rcu(); 5442 if (atomic_read(&rdev->nr_pending)) { 5443 /* lost the race, try later */ 5444 err = -EBUSY; 5445 *rdevp = rdev; 5446 } else if (p->replacement) { 5447 /* We must have just cleared 'rdev' */ 5448 p->rdev = p->replacement; 5449 clear_bit(Replacement, &p->replacement->flags); 5450 smp_mb(); /* Make sure other CPUs may see both as identical 5451 * but will never see neither - if they are careful 5452 */ 5453 p->replacement = NULL; 5454 clear_bit(WantReplacement, &rdev->flags); 5455 } else 5456 /* We might have just removed the Replacement as faulty- 5457 * clear the bit just in case 5458 */ 5459 clear_bit(WantReplacement, &rdev->flags); 5460 abort: 5461 5462 print_raid5_conf(conf); 5463 return err; 5464 } 5465 5466 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 5467 { 5468 struct r5conf *conf = mddev->private; 5469 int err = -EEXIST; 5470 int disk; 5471 struct disk_info *p; 5472 int first = 0; 5473 int last = conf->raid_disks - 1; 5474 5475 if (mddev->recovery_disabled == conf->recovery_disabled) 5476 return -EBUSY; 5477 5478 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 5479 /* no point adding a device */ 5480 return -EINVAL; 5481 5482 if (rdev->raid_disk >= 0) 5483 first = last = rdev->raid_disk; 5484 5485 /* 5486 * find the disk ... but prefer rdev->saved_raid_disk 5487 * if possible. 
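 * Re-using the saved slot lets recovery be driven by the bitmap; if the
 * device lands in any other slot, conf->fullsync is set below and the
 * bitmap optimisation is not trusted.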
5488 */ 5489 if (rdev->saved_raid_disk >= 0 && 5490 rdev->saved_raid_disk >= first && 5491 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5492 first = rdev->saved_raid_disk; 5493 5494 for (disk = first; disk <= last; disk++) { 5495 p = conf->disks + disk; 5496 if (p->rdev == NULL) { 5497 clear_bit(In_sync, &rdev->flags); 5498 rdev->raid_disk = disk; 5499 err = 0; 5500 if (rdev->saved_raid_disk != disk) 5501 conf->fullsync = 1; 5502 rcu_assign_pointer(p->rdev, rdev); 5503 goto out; 5504 } 5505 } 5506 for (disk = first; disk <= last; disk++) { 5507 p = conf->disks + disk; 5508 if (test_bit(WantReplacement, &p->rdev->flags) && 5509 p->replacement == NULL) { 5510 clear_bit(In_sync, &rdev->flags); 5511 set_bit(Replacement, &rdev->flags); 5512 rdev->raid_disk = disk; 5513 err = 0; 5514 conf->fullsync = 1; 5515 rcu_assign_pointer(p->replacement, rdev); 5516 break; 5517 } 5518 } 5519 out: 5520 print_raid5_conf(conf); 5521 return err; 5522 } 5523 5524 static int raid5_resize(struct mddev *mddev, sector_t sectors) 5525 { 5526 /* no resync is happening, and there is enough space 5527 * on all devices, so we can resize. 5528 * We need to make sure resync covers any new space. 5529 * If the array is shrinking we should possibly wait until 5530 * any io in the removed space completes, but it hardly seems 5531 * worth it. 5532 */ 5533 sector_t newsize; 5534 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5535 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 5536 if (mddev->external_size && 5537 mddev->array_sectors > newsize) 5538 return -EINVAL; 5539 if (mddev->bitmap) { 5540 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 5541 if (ret) 5542 return ret; 5543 } 5544 md_set_array_sectors(mddev, newsize); 5545 set_capacity(mddev->gendisk, mddev->array_sectors); 5546 revalidate_disk(mddev->gendisk); 5547 if (sectors > mddev->dev_sectors && 5548 mddev->recovery_cp > mddev->dev_sectors) { 5549 mddev->recovery_cp = mddev->dev_sectors; 5550 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5551 } 5552 mddev->dev_sectors = sectors; 5553 mddev->resync_max_sectors = sectors; 5554 return 0; 5555 } 5556 5557 static int check_stripe_cache(struct mddev *mddev) 5558 { 5559 /* Can only proceed if there are plenty of stripe_heads. 5560 * We need a minimum of one full stripe,, and for sensible progress 5561 * it is best to have about 4 times that. 5562 * If we require 4 times, then the default 256 4K stripe_heads will 5563 * allow for chunk sizes up to 256K, which is probably OK. 5564 * If the chunk size is greater, user-space should request more 5565 * stripe_heads first. 5566 */ 5567 struct r5conf *conf = mddev->private; 5568 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5569 > conf->max_nr_stripes || 5570 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5571 > conf->max_nr_stripes) { 5572 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5573 mdname(mddev), 5574 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5575 / STRIPE_SIZE)*4); 5576 return 0; 5577 } 5578 return 1; 5579 } 5580 5581 static int check_reshape(struct mddev *mddev) 5582 { 5583 struct r5conf *conf = mddev->private; 5584 5585 if (mddev->delta_disks == 0 && 5586 mddev->new_layout == mddev->layout && 5587 mddev->new_chunk_sectors == mddev->chunk_sectors) 5588 return 0; /* nothing to do */ 5589 if (has_failed(conf)) 5590 return -EINVAL; 5591 if (mddev->delta_disks < 0) { 5592 /* We might be able to shrink, but the devices must 5593 * be made bigger first. 5594 * For raid6, 4 is the minimum size. 

static int check_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (mddev->delta_disks == 0 &&
	    mddev->new_layout == mddev->layout &&
	    mddev->new_chunk_sectors == mddev->chunk_sectors)
		return 0; /* nothing to do */
	if (has_failed(conf))
		return -EINVAL;
	if (mddev->delta_disks < 0) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum number of devices;
		 * otherwise 2 is the minimum.
		 */
		int min = 2;
		if (mddev->level == 6)
			min = 4;
		if (mddev->raid_disks + mddev->delta_disks < min)
			return -EINVAL;
	}

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
}

static int raid5_start_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	unsigned long flags;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (has_failed(conf))
		return -EINVAL;

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
	}

	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size
		 */
		return -EINVAL;

	/* Refuse to reduce size of the array. Any reductions in
	 * array size must be through explicit setting of the array_size
	 * attribute.
	 */
	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
	    < mddev->array_sectors) {
		printk(KERN_ERR "md/raid:%s: array size must be reduced "
		       "before number of disks\n", mdname(mddev));
		return -EINVAL;
	}

	atomic_set(&conf->reshape_stripes, 0);
	spin_lock_irq(&conf->device_lock);
	conf->previous_raid_disks = conf->raid_disks;
	conf->raid_disks += mddev->delta_disks;
	conf->prev_chunk_sectors = conf->chunk_sectors;
	conf->chunk_sectors = mddev->new_chunk_sectors;
	conf->prev_algo = conf->algorithm;
	conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier is needed.
	 */
	smp_mb();
	if (mddev->reshape_backwards)
		conf->reshape_progress = raid5_size(mddev, 0, 0);
	else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	spin_unlock_irq(&conf->device_lock);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array. This is because it is not possible
	 * to correctly record the "partially reconstructed" state of
	 * such devices during the reshape and confusion could result.
	 */
	if (mddev->delta_disks >= 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid5_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk
					    >= conf->previous_raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
				}
			} else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{

	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
		struct md_rdev *rdev;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		rdev_for_each(rdev, conf->mddev)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which
		 * is 2 * (data disks) * chunk size.
		 */
		if (conf->mddev->queue) {
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
		}
	}
}
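
/*
 * Illustrative sketch (hypothetical helper, not called above): the
 * read-ahead chosen in end_reshape() is two full stripes worth of pages,
 * 2 * data_disks * chunk_bytes / PAGE_SIZE.  For example, 4 data disks
 * with a 512K chunk give 2 * 4 * 128 = 1024 ra_pages (4 MiB).
 */
static inline unsigned long reshape_ra_pages(int data_disks, int chunk_sectors)
{
	return 2UL * data_disks * ((chunk_sectors << 9) / PAGE_SIZE);
}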

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch(state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		spin_lock_irq(&conf->device_lock);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_lock_irq(conf->wait_for_stripe,
				    atomic_read(&conf->active_stripes) == 0 &&
				    atomic_read(&conf->active_aligned_reads) == 0,
				    conf->device_lock, /* nothing */);
		conf->quiesce = 1;
		spin_unlock_irq(&conf->device_lock);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		spin_lock_irq(&conf->device_lock);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		spin_unlock_irq(&conf->device_lock);
		break;
	}
}


static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}
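
/*
 * Illustrative sketch (hypothetical helper, not called above): the
 * per-device size taken over from a single-zone raid0 is simply the
 * zone end divided by the number of member devices, which is what
 * raid45_takeover_raid0() stores in mddev->dev_sectors.
 */
static inline sector_t raid0_zone0_dev_sectors(struct r0conf *raid0_conf)
{
	sector_t sectors = raid0_conf->strip_zone[0].zone_end;

	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	return sectors;
}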
*/ 5872 5873 chunksect = 64*2; /* 64K by default */ 5874 5875 /* The array must be an exact multiple of chunksize */ 5876 while (chunksect && (mddev->array_sectors & (chunksect-1))) 5877 chunksect >>= 1; 5878 5879 if ((chunksect<<9) < STRIPE_SIZE) 5880 /* array size does not allow a suitable chunk size */ 5881 return ERR_PTR(-EINVAL); 5882 5883 mddev->new_level = 5; 5884 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 5885 mddev->new_chunk_sectors = chunksect; 5886 5887 return setup_conf(mddev); 5888 } 5889 5890 static void *raid5_takeover_raid6(struct mddev *mddev) 5891 { 5892 int new_layout; 5893 5894 switch (mddev->layout) { 5895 case ALGORITHM_LEFT_ASYMMETRIC_6: 5896 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 5897 break; 5898 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5899 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 5900 break; 5901 case ALGORITHM_LEFT_SYMMETRIC_6: 5902 new_layout = ALGORITHM_LEFT_SYMMETRIC; 5903 break; 5904 case ALGORITHM_RIGHT_SYMMETRIC_6: 5905 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 5906 break; 5907 case ALGORITHM_PARITY_0_6: 5908 new_layout = ALGORITHM_PARITY_0; 5909 break; 5910 case ALGORITHM_PARITY_N: 5911 new_layout = ALGORITHM_PARITY_N; 5912 break; 5913 default: 5914 return ERR_PTR(-EINVAL); 5915 } 5916 mddev->new_level = 5; 5917 mddev->new_layout = new_layout; 5918 mddev->delta_disks = -1; 5919 mddev->raid_disks -= 1; 5920 return setup_conf(mddev); 5921 } 5922 5923 5924 static int raid5_check_reshape(struct mddev *mddev) 5925 { 5926 /* For a 2-drive array, the layout and chunk size can be changed 5927 * immediately as not restriping is needed. 5928 * For larger arrays we record the new value - after validation 5929 * to be used by a reshape pass. 5930 */ 5931 struct r5conf *conf = mddev->private; 5932 int new_chunk = mddev->new_chunk_sectors; 5933 5934 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 5935 return -EINVAL; 5936 if (new_chunk > 0) { 5937 if (!is_power_of_2(new_chunk)) 5938 return -EINVAL; 5939 if (new_chunk < (PAGE_SIZE>>9)) 5940 return -EINVAL; 5941 if (mddev->array_sectors & (new_chunk-1)) 5942 /* not factor of array size */ 5943 return -EINVAL; 5944 } 5945 5946 /* They look valid */ 5947 5948 if (mddev->raid_disks == 2) { 5949 /* can make the change immediately */ 5950 if (mddev->new_layout >= 0) { 5951 conf->algorithm = mddev->new_layout; 5952 mddev->layout = mddev->new_layout; 5953 } 5954 if (new_chunk > 0) { 5955 conf->chunk_sectors = new_chunk ; 5956 mddev->chunk_sectors = new_chunk; 5957 } 5958 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5959 md_wakeup_thread(mddev->thread); 5960 } 5961 return check_reshape(mddev); 5962 } 5963 5964 static int raid6_check_reshape(struct mddev *mddev) 5965 { 5966 int new_chunk = mddev->new_chunk_sectors; 5967 5968 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 5969 return -EINVAL; 5970 if (new_chunk > 0) { 5971 if (!is_power_of_2(new_chunk)) 5972 return -EINVAL; 5973 if (new_chunk < (PAGE_SIZE >> 9)) 5974 return -EINVAL; 5975 if (mddev->array_sectors & (new_chunk-1)) 5976 /* not factor of array size */ 5977 return -EINVAL; 5978 } 5979 5980 /* They look valid */ 5981 return check_reshape(mddev); 5982 } 5983 5984 static void *raid5_takeover(struct mddev *mddev) 5985 { 5986 /* raid5 can take over: 5987 * raid0 - if there is only one strip zone - make it a raid4 layout 5988 * raid1 - if there are two drives. We need to know the chunk size 5989 * raid4 - trivial - just use a raid4 layout. 

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives. We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5. We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}


static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
};
static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

static int __init raid5_init(void)
{
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");