1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include "md.h" 57 #include "raid5.h" 58 #include "raid0.h" 59 #include "bitmap.h" 60 61 /* 62 * Stripe cache 63 */ 64 65 #define NR_STRIPES 256 66 #define STRIPE_SIZE PAGE_SIZE 67 #define STRIPE_SHIFT (PAGE_SHIFT - 9) 68 #define STRIPE_SECTORS (STRIPE_SIZE>>9) 69 #define IO_THRESHOLD 1 70 #define BYPASS_THRESHOLD 1 71 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 72 #define HASH_MASK (NR_HASH - 1) 73 74 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 75 { 76 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 77 return &conf->stripe_hashtbl[hash]; 78 } 79 80 /* bio's attached to a stripe+device for I/O are linked together in bi_sector 81 * order without overlap. There may be several bio's per stripe+device, and 82 * a bio could span several devices. 83 * When walking this list for a particular stripe+device, we must never proceed 84 * beyond a bio that extends past this device, as the next bio might no longer 85 * be valid. 
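 *
 * As a worked example (illustrative numbers only, assuming 4 KiB pages so
 * STRIPE_SECTORS is 8): if the current stripe+device covers sectors
 * 1000..1007 and a bio in the list starts at sector 1004 with a bi_size of
 * 8 sectors, it ends at sector 1012, past this device's part of the
 * stripe, so the walk must stop there instead of following bi_next.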
 * This function is used to determine the 'next' bio in the list, given
 * the sector of the current stripe+device
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio->bi_size >> 9;
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
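 *
 * A small illustrative example (values chosen for this comment only):
 * for a 5-device md-layout stripe with pd_idx == 3 and qd_idx == 4,
 * raid6_d0() is 0 and walking devices 0,1,2,3,4 maps the data devices
 * to slots 0,1,2, P to slot 3 (syndrome_disks) and Q to slot 4
 * (syndrome_disks + 1).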
160 */ 161 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 162 int *count, int syndrome_disks) 163 { 164 int slot = *count; 165 166 if (sh->ddf_layout) 167 (*count)++; 168 if (idx == sh->pd_idx) 169 return syndrome_disks; 170 if (idx == sh->qd_idx) 171 return syndrome_disks + 1; 172 if (!sh->ddf_layout) 173 (*count)++; 174 return slot; 175 } 176 177 static void return_io(struct bio *return_bi) 178 { 179 struct bio *bi = return_bi; 180 while (bi) { 181 182 return_bi = bi->bi_next; 183 bi->bi_next = NULL; 184 bi->bi_size = 0; 185 bio_endio(bi, 0); 186 bi = return_bi; 187 } 188 } 189 190 static void print_raid5_conf (struct r5conf *conf); 191 192 static int stripe_operations_active(struct stripe_head *sh) 193 { 194 return sh->check_state || sh->reconstruct_state || 195 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 196 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 197 } 198 199 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 200 { 201 BUG_ON(!list_empty(&sh->lru)); 202 BUG_ON(atomic_read(&conf->active_stripes)==0); 203 if (test_bit(STRIPE_HANDLE, &sh->state)) { 204 if (test_bit(STRIPE_DELAYED, &sh->state) && 205 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 206 list_add_tail(&sh->lru, &conf->delayed_list); 207 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 208 sh->bm_seq - conf->seq_write > 0) 209 list_add_tail(&sh->lru, &conf->bitmap_list); 210 else { 211 clear_bit(STRIPE_DELAYED, &sh->state); 212 clear_bit(STRIPE_BIT_DELAY, &sh->state); 213 list_add_tail(&sh->lru, &conf->handle_list); 214 } 215 md_wakeup_thread(conf->mddev->thread); 216 } else { 217 BUG_ON(stripe_operations_active(sh)); 218 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 219 if (atomic_dec_return(&conf->preread_active_stripes) 220 < IO_THRESHOLD) 221 md_wakeup_thread(conf->mddev->thread); 222 atomic_dec(&conf->active_stripes); 223 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 224 list_add_tail(&sh->lru, &conf->inactive_list); 225 wake_up(&conf->wait_for_stripe); 226 if (conf->retry_read_aligned) 227 md_wakeup_thread(conf->mddev->thread); 228 } 229 } 230 } 231 232 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 233 { 234 if (atomic_dec_and_test(&sh->count)) 235 do_release_stripe(conf, sh); 236 } 237 238 static void release_stripe(struct stripe_head *sh) 239 { 240 struct r5conf *conf = sh->raid_conf; 241 unsigned long flags; 242 243 local_irq_save(flags); 244 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 245 do_release_stripe(conf, sh); 246 spin_unlock(&conf->device_lock); 247 } 248 local_irq_restore(flags); 249 } 250 251 static inline void remove_hash(struct stripe_head *sh) 252 { 253 pr_debug("remove_hash(), stripe %llu\n", 254 (unsigned long long)sh->sector); 255 256 hlist_del_init(&sh->hash); 257 } 258 259 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 260 { 261 struct hlist_head *hp = stripe_hash(conf, sh->sector); 262 263 pr_debug("insert_hash(), stripe %llu\n", 264 (unsigned long long)sh->sector); 265 266 hlist_add_head(&sh->hash, hp); 267 } 268 269 270 /* find an idle stripe, make sure it is unhashed, and return it. 
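 * A stripe taken from the inactive list may still be hashed under the
 * sector it last described; it is unhashed here so that init_stripe()
 * can re-hash it once the new sector and parity indices are set.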
*/ 271 static struct stripe_head *get_free_stripe(struct r5conf *conf) 272 { 273 struct stripe_head *sh = NULL; 274 struct list_head *first; 275 276 if (list_empty(&conf->inactive_list)) 277 goto out; 278 first = conf->inactive_list.next; 279 sh = list_entry(first, struct stripe_head, lru); 280 list_del_init(first); 281 remove_hash(sh); 282 atomic_inc(&conf->active_stripes); 283 out: 284 return sh; 285 } 286 287 static void shrink_buffers(struct stripe_head *sh) 288 { 289 struct page *p; 290 int i; 291 int num = sh->raid_conf->pool_size; 292 293 for (i = 0; i < num ; i++) { 294 p = sh->dev[i].page; 295 if (!p) 296 continue; 297 sh->dev[i].page = NULL; 298 put_page(p); 299 } 300 } 301 302 static int grow_buffers(struct stripe_head *sh) 303 { 304 int i; 305 int num = sh->raid_conf->pool_size; 306 307 for (i = 0; i < num; i++) { 308 struct page *page; 309 310 if (!(page = alloc_page(GFP_KERNEL))) { 311 return 1; 312 } 313 sh->dev[i].page = page; 314 } 315 return 0; 316 } 317 318 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 319 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 320 struct stripe_head *sh); 321 322 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 323 { 324 struct r5conf *conf = sh->raid_conf; 325 int i; 326 327 BUG_ON(atomic_read(&sh->count) != 0); 328 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 329 BUG_ON(stripe_operations_active(sh)); 330 331 pr_debug("init_stripe called, stripe %llu\n", 332 (unsigned long long)sh->sector); 333 334 remove_hash(sh); 335 336 sh->generation = conf->generation - previous; 337 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 338 sh->sector = sector; 339 stripe_set_idx(sector, conf, previous, sh); 340 sh->state = 0; 341 342 343 for (i = sh->disks; i--; ) { 344 struct r5dev *dev = &sh->dev[i]; 345 346 if (dev->toread || dev->read || dev->towrite || dev->written || 347 test_bit(R5_LOCKED, &dev->flags)) { 348 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 349 (unsigned long long)sh->sector, i, dev->toread, 350 dev->read, dev->towrite, dev->written, 351 test_bit(R5_LOCKED, &dev->flags)); 352 WARN_ON(1); 353 } 354 dev->flags = 0; 355 raid5_build_block(sh, i, previous); 356 } 357 insert_hash(conf, sh); 358 } 359 360 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 361 short generation) 362 { 363 struct stripe_head *sh; 364 struct hlist_node *hn; 365 366 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 367 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 368 if (sh->sector == sector && sh->generation == generation) 369 return sh; 370 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 371 return NULL; 372 } 373 374 /* 375 * Need to check if array has failed when deciding whether to: 376 * - start an array 377 * - remove non-faulty devices 378 * - add a spare 379 * - allow a reshape 380 * This determination is simple when no reshape is happening. 381 * However if there is a reshape, we need to carefully check 382 * both the before and after sections. 383 * This is because some failed devices may only affect one 384 * of the two sections, and some non-in_sync devices may 385 * be insync in the section most affected by failed devices. 
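 *
 * For example (a hypothetical layout): while growing a 4-device RAID5
 * to 5 devices, a device that is not yet in_sync counts as failed for
 * the 'previous' 4-device section, but not for the new 5-device
 * section, because any part of the array already using the new layout
 * was written out (and so recovered) by the reshape itself.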
386 */ 387 static int calc_degraded(struct r5conf *conf) 388 { 389 int degraded, degraded2; 390 int i; 391 392 rcu_read_lock(); 393 degraded = 0; 394 for (i = 0; i < conf->previous_raid_disks; i++) { 395 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 396 if (rdev && test_bit(Faulty, &rdev->flags)) 397 rdev = rcu_dereference(conf->disks[i].replacement); 398 if (!rdev || test_bit(Faulty, &rdev->flags)) 399 degraded++; 400 else if (test_bit(In_sync, &rdev->flags)) 401 ; 402 else 403 /* not in-sync or faulty. 404 * If the reshape increases the number of devices, 405 * this is being recovered by the reshape, so 406 * this 'previous' section is not in_sync. 407 * If the number of devices is being reduced however, 408 * the device can only be part of the array if 409 * we are reverting a reshape, so this section will 410 * be in-sync. 411 */ 412 if (conf->raid_disks >= conf->previous_raid_disks) 413 degraded++; 414 } 415 rcu_read_unlock(); 416 if (conf->raid_disks == conf->previous_raid_disks) 417 return degraded; 418 rcu_read_lock(); 419 degraded2 = 0; 420 for (i = 0; i < conf->raid_disks; i++) { 421 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 422 if (rdev && test_bit(Faulty, &rdev->flags)) 423 rdev = rcu_dereference(conf->disks[i].replacement); 424 if (!rdev || test_bit(Faulty, &rdev->flags)) 425 degraded2++; 426 else if (test_bit(In_sync, &rdev->flags)) 427 ; 428 else 429 /* not in-sync or faulty. 430 * If reshape increases the number of devices, this 431 * section has already been recovered, else it 432 * almost certainly hasn't. 433 */ 434 if (conf->raid_disks <= conf->previous_raid_disks) 435 degraded2++; 436 } 437 rcu_read_unlock(); 438 if (degraded2 > degraded) 439 return degraded2; 440 return degraded; 441 } 442 443 static int has_failed(struct r5conf *conf) 444 { 445 int degraded; 446 447 if (conf->mddev->reshape_position == MaxSector) 448 return conf->mddev->degraded > conf->max_degraded; 449 450 degraded = calc_degraded(conf); 451 if (degraded > conf->max_degraded) 452 return 1; 453 return 0; 454 } 455 456 static struct stripe_head * 457 get_active_stripe(struct r5conf *conf, sector_t sector, 458 int previous, int noblock, int noquiesce) 459 { 460 struct stripe_head *sh; 461 462 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 463 464 spin_lock_irq(&conf->device_lock); 465 466 do { 467 wait_event_lock_irq(conf->wait_for_stripe, 468 conf->quiesce == 0 || noquiesce, 469 conf->device_lock, /* nothing */); 470 sh = __find_stripe(conf, sector, conf->generation - previous); 471 if (!sh) { 472 if (!conf->inactive_blocked) 473 sh = get_free_stripe(conf); 474 if (noblock && sh == NULL) 475 break; 476 if (!sh) { 477 conf->inactive_blocked = 1; 478 wait_event_lock_irq(conf->wait_for_stripe, 479 !list_empty(&conf->inactive_list) && 480 (atomic_read(&conf->active_stripes) 481 < (conf->max_nr_stripes *3/4) 482 || !conf->inactive_blocked), 483 conf->device_lock, 484 ); 485 conf->inactive_blocked = 0; 486 } else 487 init_stripe(sh, sector, previous); 488 } else { 489 if (atomic_read(&sh->count)) { 490 BUG_ON(!list_empty(&sh->lru) 491 && !test_bit(STRIPE_EXPANDING, &sh->state) 492 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); 493 } else { 494 if (!test_bit(STRIPE_HANDLE, &sh->state)) 495 atomic_inc(&conf->active_stripes); 496 if (list_empty(&sh->lru) && 497 !test_bit(STRIPE_EXPANDING, &sh->state)) 498 BUG(); 499 list_del_init(&sh->lru); 500 } 501 } 502 } while (sh == NULL); 503 504 if (sh) 505 atomic_inc(&sh->count); 506 507 
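	/* The caller now holds a reference: sh->count was incremented above,
	 * under device_lock, and must eventually be dropped with
	 * release_stripe().
	 */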
spin_unlock_irq(&conf->device_lock); 508 return sh; 509 } 510 511 /* Determine if 'data_offset' or 'new_data_offset' should be used 512 * in this stripe_head. 513 */ 514 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 515 { 516 sector_t progress = conf->reshape_progress; 517 /* Need a memory barrier to make sure we see the value 518 * of conf->generation, or ->data_offset that was set before 519 * reshape_progress was updated. 520 */ 521 smp_rmb(); 522 if (progress == MaxSector) 523 return 0; 524 if (sh->generation == conf->generation - 1) 525 return 0; 526 /* We are in a reshape, and this is a new-generation stripe, 527 * so use new_data_offset. 528 */ 529 return 1; 530 } 531 532 static void 533 raid5_end_read_request(struct bio *bi, int error); 534 static void 535 raid5_end_write_request(struct bio *bi, int error); 536 537 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 538 { 539 struct r5conf *conf = sh->raid_conf; 540 int i, disks = sh->disks; 541 542 might_sleep(); 543 544 for (i = disks; i--; ) { 545 int rw; 546 int replace_only = 0; 547 struct bio *bi, *rbi; 548 struct md_rdev *rdev, *rrdev = NULL; 549 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 550 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 551 rw = WRITE_FUA; 552 else 553 rw = WRITE; 554 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 555 rw = READ; 556 else if (test_and_clear_bit(R5_WantReplace, 557 &sh->dev[i].flags)) { 558 rw = WRITE; 559 replace_only = 1; 560 } else 561 continue; 562 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 563 rw |= REQ_SYNC; 564 565 bi = &sh->dev[i].req; 566 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 567 568 bi->bi_rw = rw; 569 rbi->bi_rw = rw; 570 if (rw & WRITE) { 571 bi->bi_end_io = raid5_end_write_request; 572 rbi->bi_end_io = raid5_end_write_request; 573 } else 574 bi->bi_end_io = raid5_end_read_request; 575 576 rcu_read_lock(); 577 rrdev = rcu_dereference(conf->disks[i].replacement); 578 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 579 rdev = rcu_dereference(conf->disks[i].rdev); 580 if (!rdev) { 581 rdev = rrdev; 582 rrdev = NULL; 583 } 584 if (rw & WRITE) { 585 if (replace_only) 586 rdev = NULL; 587 if (rdev == rrdev) 588 /* We raced and saw duplicates */ 589 rrdev = NULL; 590 } else { 591 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) 592 rdev = rrdev; 593 rrdev = NULL; 594 } 595 596 if (rdev && test_bit(Faulty, &rdev->flags)) 597 rdev = NULL; 598 if (rdev) 599 atomic_inc(&rdev->nr_pending); 600 if (rrdev && test_bit(Faulty, &rrdev->flags)) 601 rrdev = NULL; 602 if (rrdev) 603 atomic_inc(&rrdev->nr_pending); 604 rcu_read_unlock(); 605 606 /* We have already checked bad blocks for reads. Now 607 * need to check for writes. We never accept write errors 608 * on the replacement, so we don't to check rrdev. 609 */ 610 while ((rw & WRITE) && rdev && 611 test_bit(WriteErrorSeen, &rdev->flags)) { 612 sector_t first_bad; 613 int bad_sectors; 614 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 615 &first_bad, &bad_sectors); 616 if (!bad) 617 break; 618 619 if (bad < 0) { 620 set_bit(BlockedBadBlocks, &rdev->flags); 621 if (!conf->mddev->external && 622 conf->mddev->flags) { 623 /* It is very unlikely, but we might 624 * still need to write out the 625 * bad block log - better give it 626 * a chance*/ 627 md_check_recovery(conf->mddev); 628 } 629 /* 630 * Because md_wait_for_blocked_rdev 631 * will dec nr_pending, we must 632 * increment it first. 
633 */ 634 atomic_inc(&rdev->nr_pending); 635 md_wait_for_blocked_rdev(rdev, conf->mddev); 636 } else { 637 /* Acknowledged bad block - skip the write */ 638 rdev_dec_pending(rdev, conf->mddev); 639 rdev = NULL; 640 } 641 } 642 643 if (rdev) { 644 if (s->syncing || s->expanding || s->expanded 645 || s->replacing) 646 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 647 648 set_bit(STRIPE_IO_STARTED, &sh->state); 649 650 bi->bi_bdev = rdev->bdev; 651 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 652 __func__, (unsigned long long)sh->sector, 653 bi->bi_rw, i); 654 atomic_inc(&sh->count); 655 if (use_new_offset(conf, sh)) 656 bi->bi_sector = (sh->sector 657 + rdev->new_data_offset); 658 else 659 bi->bi_sector = (sh->sector 660 + rdev->data_offset); 661 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 662 bi->bi_rw |= REQ_FLUSH; 663 664 bi->bi_flags = 1 << BIO_UPTODATE; 665 bi->bi_idx = 0; 666 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 667 bi->bi_io_vec[0].bv_offset = 0; 668 bi->bi_size = STRIPE_SIZE; 669 bi->bi_next = NULL; 670 if (rrdev) 671 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 672 generic_make_request(bi); 673 } 674 if (rrdev) { 675 if (s->syncing || s->expanding || s->expanded 676 || s->replacing) 677 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 678 679 set_bit(STRIPE_IO_STARTED, &sh->state); 680 681 rbi->bi_bdev = rrdev->bdev; 682 pr_debug("%s: for %llu schedule op %ld on " 683 "replacement disc %d\n", 684 __func__, (unsigned long long)sh->sector, 685 rbi->bi_rw, i); 686 atomic_inc(&sh->count); 687 if (use_new_offset(conf, sh)) 688 rbi->bi_sector = (sh->sector 689 + rrdev->new_data_offset); 690 else 691 rbi->bi_sector = (sh->sector 692 + rrdev->data_offset); 693 rbi->bi_flags = 1 << BIO_UPTODATE; 694 rbi->bi_idx = 0; 695 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 696 rbi->bi_io_vec[0].bv_offset = 0; 697 rbi->bi_size = STRIPE_SIZE; 698 rbi->bi_next = NULL; 699 generic_make_request(rbi); 700 } 701 if (!rdev && !rrdev) { 702 if (rw & WRITE) 703 set_bit(STRIPE_DEGRADED, &sh->state); 704 pr_debug("skip op %ld on disc %d for sector %llu\n", 705 bi->bi_rw, i, (unsigned long long)sh->sector); 706 clear_bit(R5_LOCKED, &sh->dev[i].flags); 707 set_bit(STRIPE_HANDLE, &sh->state); 708 } 709 } 710 } 711 712 static struct dma_async_tx_descriptor * 713 async_copy_data(int frombio, struct bio *bio, struct page *page, 714 sector_t sector, struct dma_async_tx_descriptor *tx) 715 { 716 struct bio_vec *bvl; 717 struct page *bio_page; 718 int i; 719 int page_offset; 720 struct async_submit_ctl submit; 721 enum async_tx_flags flags = 0; 722 723 if (bio->bi_sector >= sector) 724 page_offset = (signed)(bio->bi_sector - sector) * 512; 725 else 726 page_offset = (signed)(sector - bio->bi_sector) * -512; 727 728 if (frombio) 729 flags |= ASYNC_TX_FENCE; 730 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 731 732 bio_for_each_segment(bvl, bio, i) { 733 int len = bvl->bv_len; 734 int clen; 735 int b_offset = 0; 736 737 if (page_offset < 0) { 738 b_offset = -page_offset; 739 page_offset += b_offset; 740 len -= b_offset; 741 } 742 743 if (len > 0 && page_offset + len > STRIPE_SIZE) 744 clen = STRIPE_SIZE - page_offset; 745 else 746 clen = len; 747 748 if (clen > 0) { 749 b_offset += bvl->bv_offset; 750 bio_page = bvl->bv_page; 751 if (frombio) 752 tx = async_memcpy(page, bio_page, page_offset, 753 b_offset, clen, &submit); 754 else 755 tx = async_memcpy(bio_page, page, b_offset, 756 page_offset, clen, &submit); 757 } 758 /* chain the operations */ 759 submit.depend_tx = tx; 760 761 if (clen < len) /* hit end 
of page */ 762 break; 763 page_offset += len; 764 } 765 766 return tx; 767 } 768 769 static void ops_complete_biofill(void *stripe_head_ref) 770 { 771 struct stripe_head *sh = stripe_head_ref; 772 struct bio *return_bi = NULL; 773 int i; 774 775 pr_debug("%s: stripe %llu\n", __func__, 776 (unsigned long long)sh->sector); 777 778 /* clear completed biofills */ 779 for (i = sh->disks; i--; ) { 780 struct r5dev *dev = &sh->dev[i]; 781 782 /* acknowledge completion of a biofill operation */ 783 /* and check if we need to reply to a read request, 784 * new R5_Wantfill requests are held off until 785 * !STRIPE_BIOFILL_RUN 786 */ 787 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 788 struct bio *rbi, *rbi2; 789 790 BUG_ON(!dev->read); 791 rbi = dev->read; 792 dev->read = NULL; 793 while (rbi && rbi->bi_sector < 794 dev->sector + STRIPE_SECTORS) { 795 rbi2 = r5_next_bio(rbi, dev->sector); 796 if (!raid5_dec_bi_active_stripes(rbi)) { 797 rbi->bi_next = return_bi; 798 return_bi = rbi; 799 } 800 rbi = rbi2; 801 } 802 } 803 } 804 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 805 806 return_io(return_bi); 807 808 set_bit(STRIPE_HANDLE, &sh->state); 809 release_stripe(sh); 810 } 811 812 static void ops_run_biofill(struct stripe_head *sh) 813 { 814 struct dma_async_tx_descriptor *tx = NULL; 815 struct async_submit_ctl submit; 816 int i; 817 818 pr_debug("%s: stripe %llu\n", __func__, 819 (unsigned long long)sh->sector); 820 821 for (i = sh->disks; i--; ) { 822 struct r5dev *dev = &sh->dev[i]; 823 if (test_bit(R5_Wantfill, &dev->flags)) { 824 struct bio *rbi; 825 spin_lock_irq(&sh->stripe_lock); 826 dev->read = rbi = dev->toread; 827 dev->toread = NULL; 828 spin_unlock_irq(&sh->stripe_lock); 829 while (rbi && rbi->bi_sector < 830 dev->sector + STRIPE_SECTORS) { 831 tx = async_copy_data(0, rbi, dev->page, 832 dev->sector, tx); 833 rbi = r5_next_bio(rbi, dev->sector); 834 } 835 } 836 } 837 838 atomic_inc(&sh->count); 839 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 840 async_trigger_callback(&submit); 841 } 842 843 static void mark_target_uptodate(struct stripe_head *sh, int target) 844 { 845 struct r5dev *tgt; 846 847 if (target < 0) 848 return; 849 850 tgt = &sh->dev[target]; 851 set_bit(R5_UPTODATE, &tgt->flags); 852 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 853 clear_bit(R5_Wantcompute, &tgt->flags); 854 } 855 856 static void ops_complete_compute(void *stripe_head_ref) 857 { 858 struct stripe_head *sh = stripe_head_ref; 859 860 pr_debug("%s: stripe %llu\n", __func__, 861 (unsigned long long)sh->sector); 862 863 /* mark the computed target(s) as uptodate */ 864 mark_target_uptodate(sh, sh->ops.target); 865 mark_target_uptodate(sh, sh->ops.target2); 866 867 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 868 if (sh->check_state == check_state_compute_run) 869 sh->check_state = check_state_compute_result; 870 set_bit(STRIPE_HANDLE, &sh->state); 871 release_stripe(sh); 872 } 873 874 /* return a pointer to the address conversion region of the scribble buffer */ 875 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 876 struct raid5_percpu *percpu) 877 { 878 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 879 } 880 881 static struct dma_async_tx_descriptor * 882 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 883 { 884 int disks = sh->disks; 885 struct page **xor_srcs = percpu->scribble; 886 int target = sh->ops.target; 887 struct r5dev *tgt = &sh->dev[target]; 888 struct page *xor_dest = tgt->page; 889 int count = 0; 890 
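	/* The per-cpu scribble region used here is laid out (per
	 * scribble_len() and to_addr_conv()) roughly as:
	 *
	 *	struct page *[disks + 2]  <- xor_srcs / blocks arrays
	 *	addr_conv_t  [disks + 2]  <- space returned by to_addr_conv()
	 *
	 * so one allocation provides both the source list and the
	 * address-conversion space that async_tx needs.
	 */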
struct dma_async_tx_descriptor *tx; 891 struct async_submit_ctl submit; 892 int i; 893 894 pr_debug("%s: stripe %llu block: %d\n", 895 __func__, (unsigned long long)sh->sector, target); 896 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 897 898 for (i = disks; i--; ) 899 if (i != target) 900 xor_srcs[count++] = sh->dev[i].page; 901 902 atomic_inc(&sh->count); 903 904 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 905 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 906 if (unlikely(count == 1)) 907 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 908 else 909 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 910 911 return tx; 912 } 913 914 /* set_syndrome_sources - populate source buffers for gen_syndrome 915 * @srcs - (struct page *) array of size sh->disks 916 * @sh - stripe_head to parse 917 * 918 * Populates srcs in proper layout order for the stripe and returns the 919 * 'count' of sources to be used in a call to async_gen_syndrome. The P 920 * destination buffer is recorded in srcs[count] and the Q destination 921 * is recorded in srcs[count+1]]. 922 */ 923 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 924 { 925 int disks = sh->disks; 926 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 927 int d0_idx = raid6_d0(sh); 928 int count; 929 int i; 930 931 for (i = 0; i < disks; i++) 932 srcs[i] = NULL; 933 934 count = 0; 935 i = d0_idx; 936 do { 937 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 938 939 srcs[slot] = sh->dev[i].page; 940 i = raid6_next_disk(i, disks); 941 } while (i != d0_idx); 942 943 return syndrome_disks; 944 } 945 946 static struct dma_async_tx_descriptor * 947 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 948 { 949 int disks = sh->disks; 950 struct page **blocks = percpu->scribble; 951 int target; 952 int qd_idx = sh->qd_idx; 953 struct dma_async_tx_descriptor *tx; 954 struct async_submit_ctl submit; 955 struct r5dev *tgt; 956 struct page *dest; 957 int i; 958 int count; 959 960 if (sh->ops.target < 0) 961 target = sh->ops.target2; 962 else if (sh->ops.target2 < 0) 963 target = sh->ops.target; 964 else 965 /* we should only have one valid target */ 966 BUG(); 967 BUG_ON(target < 0); 968 pr_debug("%s: stripe %llu block: %d\n", 969 __func__, (unsigned long long)sh->sector, target); 970 971 tgt = &sh->dev[target]; 972 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 973 dest = tgt->page; 974 975 atomic_inc(&sh->count); 976 977 if (target == qd_idx) { 978 count = set_syndrome_sources(blocks, sh); 979 blocks[count] = NULL; /* regenerating p is not necessary */ 980 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 981 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 982 ops_complete_compute, sh, 983 to_addr_conv(sh, percpu)); 984 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 985 } else { 986 /* Compute any data- or p-drive using XOR */ 987 count = 0; 988 for (i = disks; i-- ; ) { 989 if (i == target || i == qd_idx) 990 continue; 991 blocks[count++] = sh->dev[i].page; 992 } 993 994 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 995 NULL, ops_complete_compute, sh, 996 to_addr_conv(sh, percpu)); 997 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 998 } 999 1000 return tx; 1001 } 1002 1003 static struct dma_async_tx_descriptor * 1004 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1005 { 1006 int i, count, disks = sh->disks; 1007 int 
syndrome_disks = sh->ddf_layout ? disks : disks-2; 1008 int d0_idx = raid6_d0(sh); 1009 int faila = -1, failb = -1; 1010 int target = sh->ops.target; 1011 int target2 = sh->ops.target2; 1012 struct r5dev *tgt = &sh->dev[target]; 1013 struct r5dev *tgt2 = &sh->dev[target2]; 1014 struct dma_async_tx_descriptor *tx; 1015 struct page **blocks = percpu->scribble; 1016 struct async_submit_ctl submit; 1017 1018 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1019 __func__, (unsigned long long)sh->sector, target, target2); 1020 BUG_ON(target < 0 || target2 < 0); 1021 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1022 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1023 1024 /* we need to open-code set_syndrome_sources to handle the 1025 * slot number conversion for 'faila' and 'failb' 1026 */ 1027 for (i = 0; i < disks ; i++) 1028 blocks[i] = NULL; 1029 count = 0; 1030 i = d0_idx; 1031 do { 1032 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1033 1034 blocks[slot] = sh->dev[i].page; 1035 1036 if (i == target) 1037 faila = slot; 1038 if (i == target2) 1039 failb = slot; 1040 i = raid6_next_disk(i, disks); 1041 } while (i != d0_idx); 1042 1043 BUG_ON(faila == failb); 1044 if (failb < faila) 1045 swap(faila, failb); 1046 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1047 __func__, (unsigned long long)sh->sector, faila, failb); 1048 1049 atomic_inc(&sh->count); 1050 1051 if (failb == syndrome_disks+1) { 1052 /* Q disk is one of the missing disks */ 1053 if (faila == syndrome_disks) { 1054 /* Missing P+Q, just recompute */ 1055 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1056 ops_complete_compute, sh, 1057 to_addr_conv(sh, percpu)); 1058 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1059 STRIPE_SIZE, &submit); 1060 } else { 1061 struct page *dest; 1062 int data_target; 1063 int qd_idx = sh->qd_idx; 1064 1065 /* Missing D+Q: recompute D from P, then recompute Q */ 1066 if (target == qd_idx) 1067 data_target = target2; 1068 else 1069 data_target = target; 1070 1071 count = 0; 1072 for (i = disks; i-- ; ) { 1073 if (i == data_target || i == qd_idx) 1074 continue; 1075 blocks[count++] = sh->dev[i].page; 1076 } 1077 dest = sh->dev[data_target].page; 1078 init_async_submit(&submit, 1079 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1080 NULL, NULL, NULL, 1081 to_addr_conv(sh, percpu)); 1082 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1083 &submit); 1084 1085 count = set_syndrome_sources(blocks, sh); 1086 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1087 ops_complete_compute, sh, 1088 to_addr_conv(sh, percpu)); 1089 return async_gen_syndrome(blocks, 0, count+2, 1090 STRIPE_SIZE, &submit); 1091 } 1092 } else { 1093 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1094 ops_complete_compute, sh, 1095 to_addr_conv(sh, percpu)); 1096 if (failb == syndrome_disks) { 1097 /* We're missing D+P. */ 1098 return async_raid6_datap_recov(syndrome_disks+2, 1099 STRIPE_SIZE, faila, 1100 blocks, &submit); 1101 } else { 1102 /* We're missing D+D. 
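 * Both can be rebuilt from P and Q: folding the surviving data into
 * P' and Q' leaves two equations in the two unknowns Da and Db
 * (GF(2^8) arithmetic, the usual RAID-6 recovery sketch):
 *
 *	Da +       Db       = P + P'
 *	g^a * Da + g^b * Db = Q + Q'
 *
 * which is what async_raid6_2data_recov() solves for slots faila/failb.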
*/ 1103 return async_raid6_2data_recov(syndrome_disks+2, 1104 STRIPE_SIZE, faila, failb, 1105 blocks, &submit); 1106 } 1107 } 1108 } 1109 1110 1111 static void ops_complete_prexor(void *stripe_head_ref) 1112 { 1113 struct stripe_head *sh = stripe_head_ref; 1114 1115 pr_debug("%s: stripe %llu\n", __func__, 1116 (unsigned long long)sh->sector); 1117 } 1118 1119 static struct dma_async_tx_descriptor * 1120 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 1121 struct dma_async_tx_descriptor *tx) 1122 { 1123 int disks = sh->disks; 1124 struct page **xor_srcs = percpu->scribble; 1125 int count = 0, pd_idx = sh->pd_idx, i; 1126 struct async_submit_ctl submit; 1127 1128 /* existing parity data subtracted */ 1129 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1130 1131 pr_debug("%s: stripe %llu\n", __func__, 1132 (unsigned long long)sh->sector); 1133 1134 for (i = disks; i--; ) { 1135 struct r5dev *dev = &sh->dev[i]; 1136 /* Only process blocks that are known to be uptodate */ 1137 if (test_bit(R5_Wantdrain, &dev->flags)) 1138 xor_srcs[count++] = dev->page; 1139 } 1140 1141 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1142 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1143 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1144 1145 return tx; 1146 } 1147 1148 static struct dma_async_tx_descriptor * 1149 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1150 { 1151 int disks = sh->disks; 1152 int i; 1153 1154 pr_debug("%s: stripe %llu\n", __func__, 1155 (unsigned long long)sh->sector); 1156 1157 for (i = disks; i--; ) { 1158 struct r5dev *dev = &sh->dev[i]; 1159 struct bio *chosen; 1160 1161 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1162 struct bio *wbi; 1163 1164 spin_lock_irq(&sh->stripe_lock); 1165 chosen = dev->towrite; 1166 dev->towrite = NULL; 1167 BUG_ON(dev->written); 1168 wbi = dev->written = chosen; 1169 spin_unlock_irq(&sh->stripe_lock); 1170 1171 while (wbi && wbi->bi_sector < 1172 dev->sector + STRIPE_SECTORS) { 1173 if (wbi->bi_rw & REQ_FUA) 1174 set_bit(R5_WantFUA, &dev->flags); 1175 if (wbi->bi_rw & REQ_SYNC) 1176 set_bit(R5_SyncIO, &dev->flags); 1177 tx = async_copy_data(1, wbi, dev->page, 1178 dev->sector, tx); 1179 wbi = r5_next_bio(wbi, dev->sector); 1180 } 1181 } 1182 } 1183 1184 return tx; 1185 } 1186 1187 static void ops_complete_reconstruct(void *stripe_head_ref) 1188 { 1189 struct stripe_head *sh = stripe_head_ref; 1190 int disks = sh->disks; 1191 int pd_idx = sh->pd_idx; 1192 int qd_idx = sh->qd_idx; 1193 int i; 1194 bool fua = false, sync = false; 1195 1196 pr_debug("%s: stripe %llu\n", __func__, 1197 (unsigned long long)sh->sector); 1198 1199 for (i = disks; i--; ) { 1200 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1201 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1202 } 1203 1204 for (i = disks; i--; ) { 1205 struct r5dev *dev = &sh->dev[i]; 1206 1207 if (dev->written || i == pd_idx || i == qd_idx) { 1208 set_bit(R5_UPTODATE, &dev->flags); 1209 if (fua) 1210 set_bit(R5_WantFUA, &dev->flags); 1211 if (sync) 1212 set_bit(R5_SyncIO, &dev->flags); 1213 } 1214 } 1215 1216 if (sh->reconstruct_state == reconstruct_state_drain_run) 1217 sh->reconstruct_state = reconstruct_state_drain_result; 1218 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1219 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1220 else { 1221 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1222 sh->reconstruct_state = 
reconstruct_state_result; 1223 } 1224 1225 set_bit(STRIPE_HANDLE, &sh->state); 1226 release_stripe(sh); 1227 } 1228 1229 static void 1230 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1231 struct dma_async_tx_descriptor *tx) 1232 { 1233 int disks = sh->disks; 1234 struct page **xor_srcs = percpu->scribble; 1235 struct async_submit_ctl submit; 1236 int count = 0, pd_idx = sh->pd_idx, i; 1237 struct page *xor_dest; 1238 int prexor = 0; 1239 unsigned long flags; 1240 1241 pr_debug("%s: stripe %llu\n", __func__, 1242 (unsigned long long)sh->sector); 1243 1244 /* check if prexor is active which means only process blocks 1245 * that are part of a read-modify-write (written) 1246 */ 1247 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1248 prexor = 1; 1249 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1250 for (i = disks; i--; ) { 1251 struct r5dev *dev = &sh->dev[i]; 1252 if (dev->written) 1253 xor_srcs[count++] = dev->page; 1254 } 1255 } else { 1256 xor_dest = sh->dev[pd_idx].page; 1257 for (i = disks; i--; ) { 1258 struct r5dev *dev = &sh->dev[i]; 1259 if (i != pd_idx) 1260 xor_srcs[count++] = dev->page; 1261 } 1262 } 1263 1264 /* 1/ if we prexor'd then the dest is reused as a source 1265 * 2/ if we did not prexor then we are redoing the parity 1266 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1267 * for the synchronous xor case 1268 */ 1269 flags = ASYNC_TX_ACK | 1270 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1271 1272 atomic_inc(&sh->count); 1273 1274 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1275 to_addr_conv(sh, percpu)); 1276 if (unlikely(count == 1)) 1277 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1278 else 1279 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1280 } 1281 1282 static void 1283 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1284 struct dma_async_tx_descriptor *tx) 1285 { 1286 struct async_submit_ctl submit; 1287 struct page **blocks = percpu->scribble; 1288 int count; 1289 1290 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1291 1292 count = set_syndrome_sources(blocks, sh); 1293 1294 atomic_inc(&sh->count); 1295 1296 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1297 sh, to_addr_conv(sh, percpu)); 1298 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1299 } 1300 1301 static void ops_complete_check(void *stripe_head_ref) 1302 { 1303 struct stripe_head *sh = stripe_head_ref; 1304 1305 pr_debug("%s: stripe %llu\n", __func__, 1306 (unsigned long long)sh->sector); 1307 1308 sh->check_state = check_state_check_result; 1309 set_bit(STRIPE_HANDLE, &sh->state); 1310 release_stripe(sh); 1311 } 1312 1313 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1314 { 1315 int disks = sh->disks; 1316 int pd_idx = sh->pd_idx; 1317 int qd_idx = sh->qd_idx; 1318 struct page *xor_dest; 1319 struct page **xor_srcs = percpu->scribble; 1320 struct dma_async_tx_descriptor *tx; 1321 struct async_submit_ctl submit; 1322 int count; 1323 int i; 1324 1325 pr_debug("%s: stripe %llu\n", __func__, 1326 (unsigned long long)sh->sector); 1327 1328 count = 0; 1329 xor_dest = sh->dev[pd_idx].page; 1330 xor_srcs[count++] = xor_dest; 1331 for (i = disks; i--; ) { 1332 if (i == pd_idx || i == qd_idx) 1333 continue; 1334 xor_srcs[count++] = sh->dev[i].page; 1335 } 1336 1337 init_async_submit(&submit, 0, NULL, NULL, NULL, 1338 to_addr_conv(sh, 
percpu)); 1339 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1340 &sh->ops.zero_sum_result, &submit); 1341 1342 atomic_inc(&sh->count); 1343 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1344 tx = async_trigger_callback(&submit); 1345 } 1346 1347 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1348 { 1349 struct page **srcs = percpu->scribble; 1350 struct async_submit_ctl submit; 1351 int count; 1352 1353 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1354 (unsigned long long)sh->sector, checkp); 1355 1356 count = set_syndrome_sources(srcs, sh); 1357 if (!checkp) 1358 srcs[count] = NULL; 1359 1360 atomic_inc(&sh->count); 1361 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1362 sh, to_addr_conv(sh, percpu)); 1363 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1364 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1365 } 1366 1367 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1368 { 1369 int overlap_clear = 0, i, disks = sh->disks; 1370 struct dma_async_tx_descriptor *tx = NULL; 1371 struct r5conf *conf = sh->raid_conf; 1372 int level = conf->level; 1373 struct raid5_percpu *percpu; 1374 unsigned long cpu; 1375 1376 cpu = get_cpu(); 1377 percpu = per_cpu_ptr(conf->percpu, cpu); 1378 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1379 ops_run_biofill(sh); 1380 overlap_clear++; 1381 } 1382 1383 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1384 if (level < 6) 1385 tx = ops_run_compute5(sh, percpu); 1386 else { 1387 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1388 tx = ops_run_compute6_1(sh, percpu); 1389 else 1390 tx = ops_run_compute6_2(sh, percpu); 1391 } 1392 /* terminate the chain if reconstruct is not set to be run */ 1393 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1394 async_tx_ack(tx); 1395 } 1396 1397 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1398 tx = ops_run_prexor(sh, percpu, tx); 1399 1400 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1401 tx = ops_run_biodrain(sh, tx); 1402 overlap_clear++; 1403 } 1404 1405 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1406 if (level < 6) 1407 ops_run_reconstruct5(sh, percpu, tx); 1408 else 1409 ops_run_reconstruct6(sh, percpu, tx); 1410 } 1411 1412 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1413 if (sh->check_state == check_state_run) 1414 ops_run_check_p(sh, percpu); 1415 else if (sh->check_state == check_state_run_q) 1416 ops_run_check_pq(sh, percpu, 0); 1417 else if (sh->check_state == check_state_run_pq) 1418 ops_run_check_pq(sh, percpu, 1); 1419 else 1420 BUG(); 1421 } 1422 1423 if (overlap_clear) 1424 for (i = disks; i--; ) { 1425 struct r5dev *dev = &sh->dev[i]; 1426 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1427 wake_up(&sh->raid_conf->wait_for_overlap); 1428 } 1429 put_cpu(); 1430 } 1431 1432 #ifdef CONFIG_MULTICORE_RAID456 1433 static void async_run_ops(void *param, async_cookie_t cookie) 1434 { 1435 struct stripe_head *sh = param; 1436 unsigned long ops_request = sh->ops.request; 1437 1438 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); 1439 wake_up(&sh->ops.wait_for_ops); 1440 1441 __raid_run_ops(sh, ops_request); 1442 release_stripe(sh); 1443 } 1444 1445 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1446 { 1447 /* since handle_stripe can be called outside of raid5d context 1448 * we need to ensure sh->ops.request is de-staged before another 1449 * request arrives 1450 */ 1451 
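	/* sh->ops.request is a single-slot mailbox: the submitter waits
	 * until it wins STRIPE_OPS_REQ_PENDING via test_and_set_bit_lock(),
	 * stores its request, and async_run_ops() releases the bit (waking
	 * any waiter) only after copying the request out.
	 */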
	wait_event(sh->ops.wait_for_ops,
		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
	sh->ops.request = ops_request;

	atomic_inc(&sh->count);
	async_schedule(async_run_ops, sh);
}
#else
#define raid_run_ops __raid_run_ops
#endif

static int grow_one_stripe(struct r5conf *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;

	sh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
#endif

	spin_lock_init(&sh->stripe_lock);

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(struct r5conf *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}

static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	unsigned long cpu;
	int err;
	struct kmem_cache *sc;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		nsh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
		init_waitqueue_head(&nsh->ops.wait_for_ops);
#endif
		spin_lock_init(&nsh->stripe_lock);

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(nsh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock,
				    );
		osh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);
		atomic_set(&nsh->count, 1);
		for(i=0; i<conf->pool_size; i++)
			nsh->dev[i].page = osh->dev[i].page;
		for( ; i<newsize; i++)
			nsh->dev[i].page = NULL;
		kmem_cache_free(conf->slab_cache, osh);
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
1630 * At this point, we are holding all the stripes so the array 1631 * is completely stalled, so now is a good time to resize 1632 * conf->disks and the scribble region 1633 */ 1634 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1635 if (ndisks) { 1636 for (i=0; i<conf->raid_disks; i++) 1637 ndisks[i] = conf->disks[i]; 1638 kfree(conf->disks); 1639 conf->disks = ndisks; 1640 } else 1641 err = -ENOMEM; 1642 1643 get_online_cpus(); 1644 conf->scribble_len = scribble_len(newsize); 1645 for_each_present_cpu(cpu) { 1646 struct raid5_percpu *percpu; 1647 void *scribble; 1648 1649 percpu = per_cpu_ptr(conf->percpu, cpu); 1650 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1651 1652 if (scribble) { 1653 kfree(percpu->scribble); 1654 percpu->scribble = scribble; 1655 } else { 1656 err = -ENOMEM; 1657 break; 1658 } 1659 } 1660 put_online_cpus(); 1661 1662 /* Step 4, return new stripes to service */ 1663 while(!list_empty(&newstripes)) { 1664 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1665 list_del_init(&nsh->lru); 1666 1667 for (i=conf->raid_disks; i < newsize; i++) 1668 if (nsh->dev[i].page == NULL) { 1669 struct page *p = alloc_page(GFP_NOIO); 1670 nsh->dev[i].page = p; 1671 if (!p) 1672 err = -ENOMEM; 1673 } 1674 release_stripe(nsh); 1675 } 1676 /* critical section pass, GFP_NOIO no longer needed */ 1677 1678 conf->slab_cache = sc; 1679 conf->active_name = 1-conf->active_name; 1680 conf->pool_size = newsize; 1681 return err; 1682 } 1683 1684 static int drop_one_stripe(struct r5conf *conf) 1685 { 1686 struct stripe_head *sh; 1687 1688 spin_lock_irq(&conf->device_lock); 1689 sh = get_free_stripe(conf); 1690 spin_unlock_irq(&conf->device_lock); 1691 if (!sh) 1692 return 0; 1693 BUG_ON(atomic_read(&sh->count)); 1694 shrink_buffers(sh); 1695 kmem_cache_free(conf->slab_cache, sh); 1696 atomic_dec(&conf->active_stripes); 1697 return 1; 1698 } 1699 1700 static void shrink_stripes(struct r5conf *conf) 1701 { 1702 while (drop_one_stripe(conf)) 1703 ; 1704 1705 if (conf->slab_cache) 1706 kmem_cache_destroy(conf->slab_cache); 1707 conf->slab_cache = NULL; 1708 } 1709 1710 static void raid5_end_read_request(struct bio * bi, int error) 1711 { 1712 struct stripe_head *sh = bi->bi_private; 1713 struct r5conf *conf = sh->raid_conf; 1714 int disks = sh->disks, i; 1715 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1716 char b[BDEVNAME_SIZE]; 1717 struct md_rdev *rdev = NULL; 1718 sector_t s; 1719 1720 for (i=0 ; i<disks; i++) 1721 if (bi == &sh->dev[i].req) 1722 break; 1723 1724 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1725 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1726 uptodate); 1727 if (i == disks) { 1728 BUG(); 1729 return; 1730 } 1731 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1732 /* If replacement finished while this request was outstanding, 1733 * 'replacement' might be NULL already. 1734 * In that case it moved down to 'rdev'. 1735 * rdev is not removed until all requests are finished. 1736 */ 1737 rdev = conf->disks[i].replacement; 1738 if (!rdev) 1739 rdev = conf->disks[i].rdev; 1740 1741 if (use_new_offset(conf, sh)) 1742 s = sh->sector + rdev->new_data_offset; 1743 else 1744 s = sh->sector + rdev->data_offset; 1745 if (uptodate) { 1746 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1747 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1748 /* Note that this cannot happen on a 1749 * replacement device. 
We just fail those on 1750 * any error 1751 */ 1752 printk_ratelimited( 1753 KERN_INFO 1754 "md/raid:%s: read error corrected" 1755 " (%lu sectors at %llu on %s)\n", 1756 mdname(conf->mddev), STRIPE_SECTORS, 1757 (unsigned long long)s, 1758 bdevname(rdev->bdev, b)); 1759 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1760 clear_bit(R5_ReadError, &sh->dev[i].flags); 1761 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1762 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1763 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1764 1765 if (atomic_read(&rdev->read_errors)) 1766 atomic_set(&rdev->read_errors, 0); 1767 } else { 1768 const char *bdn = bdevname(rdev->bdev, b); 1769 int retry = 0; 1770 int set_bad = 0; 1771 1772 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1773 atomic_inc(&rdev->read_errors); 1774 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1775 printk_ratelimited( 1776 KERN_WARNING 1777 "md/raid:%s: read error on replacement device " 1778 "(sector %llu on %s).\n", 1779 mdname(conf->mddev), 1780 (unsigned long long)s, 1781 bdn); 1782 else if (conf->mddev->degraded >= conf->max_degraded) { 1783 set_bad = 1; 1784 printk_ratelimited( 1785 KERN_WARNING 1786 "md/raid:%s: read error not correctable " 1787 "(sector %llu on %s).\n", 1788 mdname(conf->mddev), 1789 (unsigned long long)s, 1790 bdn); 1791 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 1792 /* Oh, no!!! */ 1793 set_bad = 1; 1794 printk_ratelimited( 1795 KERN_WARNING 1796 "md/raid:%s: read error NOT corrected!! " 1797 "(sector %llu on %s).\n", 1798 mdname(conf->mddev), 1799 (unsigned long long)s, 1800 bdn); 1801 } else if (atomic_read(&rdev->read_errors) 1802 > conf->max_nr_stripes) 1803 printk(KERN_WARNING 1804 "md/raid:%s: Too many read errors, failing device %s.\n", 1805 mdname(conf->mddev), bdn); 1806 else 1807 retry = 1; 1808 if (retry) 1809 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 1810 set_bit(R5_ReadError, &sh->dev[i].flags); 1811 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1812 } else 1813 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1814 else { 1815 clear_bit(R5_ReadError, &sh->dev[i].flags); 1816 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1817 if (!(set_bad 1818 && test_bit(In_sync, &rdev->flags) 1819 && rdev_set_badblocks( 1820 rdev, sh->sector, STRIPE_SECTORS, 0))) 1821 md_error(conf->mddev, rdev); 1822 } 1823 } 1824 rdev_dec_pending(rdev, conf->mddev); 1825 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1826 set_bit(STRIPE_HANDLE, &sh->state); 1827 release_stripe(sh); 1828 } 1829 1830 static void raid5_end_write_request(struct bio *bi, int error) 1831 { 1832 struct stripe_head *sh = bi->bi_private; 1833 struct r5conf *conf = sh->raid_conf; 1834 int disks = sh->disks, i; 1835 struct md_rdev *uninitialized_var(rdev); 1836 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1837 sector_t first_bad; 1838 int bad_sectors; 1839 int replacement = 0; 1840 1841 for (i = 0 ; i < disks; i++) { 1842 if (bi == &sh->dev[i].req) { 1843 rdev = conf->disks[i].rdev; 1844 break; 1845 } 1846 if (bi == &sh->dev[i].rreq) { 1847 rdev = conf->disks[i].replacement; 1848 if (rdev) 1849 replacement = 1; 1850 else 1851 /* rdev was removed and 'replacement' 1852 * replaced it. rdev is not removed 1853 * until all requests are finished. 
1854 */ 1855 rdev = conf->disks[i].rdev; 1856 break; 1857 } 1858 } 1859 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1860 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1861 uptodate); 1862 if (i == disks) { 1863 BUG(); 1864 return; 1865 } 1866 1867 if (replacement) { 1868 if (!uptodate) 1869 md_error(conf->mddev, rdev); 1870 else if (is_badblock(rdev, sh->sector, 1871 STRIPE_SECTORS, 1872 &first_bad, &bad_sectors)) 1873 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1874 } else { 1875 if (!uptodate) { 1876 set_bit(WriteErrorSeen, &rdev->flags); 1877 set_bit(R5_WriteError, &sh->dev[i].flags); 1878 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1879 set_bit(MD_RECOVERY_NEEDED, 1880 &rdev->mddev->recovery); 1881 } else if (is_badblock(rdev, sh->sector, 1882 STRIPE_SECTORS, 1883 &first_bad, &bad_sectors)) 1884 set_bit(R5_MadeGood, &sh->dev[i].flags); 1885 } 1886 rdev_dec_pending(rdev, conf->mddev); 1887 1888 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1889 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1890 set_bit(STRIPE_HANDLE, &sh->state); 1891 release_stripe(sh); 1892 } 1893 1894 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1895 1896 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1897 { 1898 struct r5dev *dev = &sh->dev[i]; 1899 1900 bio_init(&dev->req); 1901 dev->req.bi_io_vec = &dev->vec; 1902 dev->req.bi_vcnt++; 1903 dev->req.bi_max_vecs++; 1904 dev->req.bi_private = sh; 1905 dev->vec.bv_page = dev->page; 1906 1907 bio_init(&dev->rreq); 1908 dev->rreq.bi_io_vec = &dev->rvec; 1909 dev->rreq.bi_vcnt++; 1910 dev->rreq.bi_max_vecs++; 1911 dev->rreq.bi_private = sh; 1912 dev->rvec.bv_page = dev->page; 1913 1914 dev->flags = 0; 1915 dev->sector = compute_blocknr(sh, i, previous); 1916 } 1917 1918 static void error(struct mddev *mddev, struct md_rdev *rdev) 1919 { 1920 char b[BDEVNAME_SIZE]; 1921 struct r5conf *conf = mddev->private; 1922 unsigned long flags; 1923 pr_debug("raid456: error called\n"); 1924 1925 spin_lock_irqsave(&conf->device_lock, flags); 1926 clear_bit(In_sync, &rdev->flags); 1927 mddev->degraded = calc_degraded(conf); 1928 spin_unlock_irqrestore(&conf->device_lock, flags); 1929 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1930 1931 set_bit(Blocked, &rdev->flags); 1932 set_bit(Faulty, &rdev->flags); 1933 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1934 printk(KERN_ALERT 1935 "md/raid:%s: Disk failure on %s, disabling device.\n" 1936 "md/raid:%s: Operation continuing on %d devices.\n", 1937 mdname(mddev), 1938 bdevname(rdev->bdev, b), 1939 mdname(mddev), 1940 conf->raid_disks - mddev->degraded); 1941 } 1942 1943 /* 1944 * Input: a 'big' sector number, 1945 * Output: index of the data and parity disk, and the sector # in them. 1946 */ 1947 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1948 int previous, int *dd_idx, 1949 struct stripe_head *sh) 1950 { 1951 sector_t stripe, stripe2; 1952 sector_t chunk_number; 1953 unsigned int chunk_offset; 1954 int pd_idx, qd_idx; 1955 int ddf_layout = 0; 1956 sector_t new_sector; 1957 int algorithm = previous ? conf->prev_algo 1958 : conf->algorithm; 1959 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1960 : conf->chunk_sectors; 1961 int raid_disks = previous ? 
conf->previous_raid_disks 1962 : conf->raid_disks; 1963 int data_disks = raid_disks - conf->max_degraded; 1964 1965 /* First compute the information on this sector */ 1966 1967 /* 1968 * Compute the chunk number and the sector offset inside the chunk 1969 */ 1970 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1971 chunk_number = r_sector; 1972 1973 /* 1974 * Compute the stripe number 1975 */ 1976 stripe = chunk_number; 1977 *dd_idx = sector_div(stripe, data_disks); 1978 stripe2 = stripe; 1979 /* 1980 * Select the parity disk based on the user selected algorithm. 1981 */ 1982 pd_idx = qd_idx = -1; 1983 switch(conf->level) { 1984 case 4: 1985 pd_idx = data_disks; 1986 break; 1987 case 5: 1988 switch (algorithm) { 1989 case ALGORITHM_LEFT_ASYMMETRIC: 1990 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1991 if (*dd_idx >= pd_idx) 1992 (*dd_idx)++; 1993 break; 1994 case ALGORITHM_RIGHT_ASYMMETRIC: 1995 pd_idx = sector_div(stripe2, raid_disks); 1996 if (*dd_idx >= pd_idx) 1997 (*dd_idx)++; 1998 break; 1999 case ALGORITHM_LEFT_SYMMETRIC: 2000 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2001 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2002 break; 2003 case ALGORITHM_RIGHT_SYMMETRIC: 2004 pd_idx = sector_div(stripe2, raid_disks); 2005 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2006 break; 2007 case ALGORITHM_PARITY_0: 2008 pd_idx = 0; 2009 (*dd_idx)++; 2010 break; 2011 case ALGORITHM_PARITY_N: 2012 pd_idx = data_disks; 2013 break; 2014 default: 2015 BUG(); 2016 } 2017 break; 2018 case 6: 2019 2020 switch (algorithm) { 2021 case ALGORITHM_LEFT_ASYMMETRIC: 2022 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2023 qd_idx = pd_idx + 1; 2024 if (pd_idx == raid_disks-1) { 2025 (*dd_idx)++; /* Q D D D P */ 2026 qd_idx = 0; 2027 } else if (*dd_idx >= pd_idx) 2028 (*dd_idx) += 2; /* D D P Q D */ 2029 break; 2030 case ALGORITHM_RIGHT_ASYMMETRIC: 2031 pd_idx = sector_div(stripe2, raid_disks); 2032 qd_idx = pd_idx + 1; 2033 if (pd_idx == raid_disks-1) { 2034 (*dd_idx)++; /* Q D D D P */ 2035 qd_idx = 0; 2036 } else if (*dd_idx >= pd_idx) 2037 (*dd_idx) += 2; /* D D P Q D */ 2038 break; 2039 case ALGORITHM_LEFT_SYMMETRIC: 2040 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2041 qd_idx = (pd_idx + 1) % raid_disks; 2042 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2043 break; 2044 case ALGORITHM_RIGHT_SYMMETRIC: 2045 pd_idx = sector_div(stripe2, raid_disks); 2046 qd_idx = (pd_idx + 1) % raid_disks; 2047 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2048 break; 2049 2050 case ALGORITHM_PARITY_0: 2051 pd_idx = 0; 2052 qd_idx = 1; 2053 (*dd_idx) += 2; 2054 break; 2055 case ALGORITHM_PARITY_N: 2056 pd_idx = data_disks; 2057 qd_idx = data_disks + 1; 2058 break; 2059 2060 case ALGORITHM_ROTATING_ZERO_RESTART: 2061 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2062 * of blocks for computing Q is different. 
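 * (That is, the order in which data blocks are fed into the Q
 * computation differs; the placement of P and Q itself is the same.)
 *
 * Worked example, added for illustration only: on a hypothetical
 * 5-device array this placement rotates as
 *
 *	stripe 0:  P  Q  D0 D1 D2
 *	stripe 1:  D0 P  Q  D1 D2
 *	stripe 2:  D0 D1 P  Q  D2
 *	stripe 3:  D0 D1 D2 P  Q
 *	stripe 4:  Q  D0 D1 D2 P
 *
 * but because this is a DDF-defined layout the Q syndrome is built from
 * the blocks in plain device-slot order, which is why ddf_layout is set
 * below.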
2063 */ 2064 pd_idx = sector_div(stripe2, raid_disks); 2065 qd_idx = pd_idx + 1; 2066 if (pd_idx == raid_disks-1) { 2067 (*dd_idx)++; /* Q D D D P */ 2068 qd_idx = 0; 2069 } else if (*dd_idx >= pd_idx) 2070 (*dd_idx) += 2; /* D D P Q D */ 2071 ddf_layout = 1; 2072 break; 2073 2074 case ALGORITHM_ROTATING_N_RESTART: 2075 /* Same a left_asymmetric, by first stripe is 2076 * D D D P Q rather than 2077 * Q D D D P 2078 */ 2079 stripe2 += 1; 2080 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2081 qd_idx = pd_idx + 1; 2082 if (pd_idx == raid_disks-1) { 2083 (*dd_idx)++; /* Q D D D P */ 2084 qd_idx = 0; 2085 } else if (*dd_idx >= pd_idx) 2086 (*dd_idx) += 2; /* D D P Q D */ 2087 ddf_layout = 1; 2088 break; 2089 2090 case ALGORITHM_ROTATING_N_CONTINUE: 2091 /* Same as left_symmetric but Q is before P */ 2092 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2093 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2094 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2095 ddf_layout = 1; 2096 break; 2097 2098 case ALGORITHM_LEFT_ASYMMETRIC_6: 2099 /* RAID5 left_asymmetric, with Q on last device */ 2100 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2101 if (*dd_idx >= pd_idx) 2102 (*dd_idx)++; 2103 qd_idx = raid_disks - 1; 2104 break; 2105 2106 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2107 pd_idx = sector_div(stripe2, raid_disks-1); 2108 if (*dd_idx >= pd_idx) 2109 (*dd_idx)++; 2110 qd_idx = raid_disks - 1; 2111 break; 2112 2113 case ALGORITHM_LEFT_SYMMETRIC_6: 2114 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2115 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2116 qd_idx = raid_disks - 1; 2117 break; 2118 2119 case ALGORITHM_RIGHT_SYMMETRIC_6: 2120 pd_idx = sector_div(stripe2, raid_disks-1); 2121 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2122 qd_idx = raid_disks - 1; 2123 break; 2124 2125 case ALGORITHM_PARITY_0_6: 2126 pd_idx = 0; 2127 (*dd_idx)++; 2128 qd_idx = raid_disks - 1; 2129 break; 2130 2131 default: 2132 BUG(); 2133 } 2134 break; 2135 } 2136 2137 if (sh) { 2138 sh->pd_idx = pd_idx; 2139 sh->qd_idx = qd_idx; 2140 sh->ddf_layout = ddf_layout; 2141 } 2142 /* 2143 * Finally, compute the new sector number 2144 */ 2145 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2146 return new_sector; 2147 } 2148 2149 2150 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2151 { 2152 struct r5conf *conf = sh->raid_conf; 2153 int raid_disks = sh->disks; 2154 int data_disks = raid_disks - conf->max_degraded; 2155 sector_t new_sector = sh->sector, check; 2156 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2157 : conf->chunk_sectors; 2158 int algorithm = previous ? 
conf->prev_algo 2159 : conf->algorithm; 2160 sector_t stripe; 2161 int chunk_offset; 2162 sector_t chunk_number; 2163 int dummy1, dd_idx = i; 2164 sector_t r_sector; 2165 struct stripe_head sh2; 2166 2167 2168 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2169 stripe = new_sector; 2170 2171 if (i == sh->pd_idx) 2172 return 0; 2173 switch(conf->level) { 2174 case 4: break; 2175 case 5: 2176 switch (algorithm) { 2177 case ALGORITHM_LEFT_ASYMMETRIC: 2178 case ALGORITHM_RIGHT_ASYMMETRIC: 2179 if (i > sh->pd_idx) 2180 i--; 2181 break; 2182 case ALGORITHM_LEFT_SYMMETRIC: 2183 case ALGORITHM_RIGHT_SYMMETRIC: 2184 if (i < sh->pd_idx) 2185 i += raid_disks; 2186 i -= (sh->pd_idx + 1); 2187 break; 2188 case ALGORITHM_PARITY_0: 2189 i -= 1; 2190 break; 2191 case ALGORITHM_PARITY_N: 2192 break; 2193 default: 2194 BUG(); 2195 } 2196 break; 2197 case 6: 2198 if (i == sh->qd_idx) 2199 return 0; /* It is the Q disk */ 2200 switch (algorithm) { 2201 case ALGORITHM_LEFT_ASYMMETRIC: 2202 case ALGORITHM_RIGHT_ASYMMETRIC: 2203 case ALGORITHM_ROTATING_ZERO_RESTART: 2204 case ALGORITHM_ROTATING_N_RESTART: 2205 if (sh->pd_idx == raid_disks-1) 2206 i--; /* Q D D D P */ 2207 else if (i > sh->pd_idx) 2208 i -= 2; /* D D P Q D */ 2209 break; 2210 case ALGORITHM_LEFT_SYMMETRIC: 2211 case ALGORITHM_RIGHT_SYMMETRIC: 2212 if (sh->pd_idx == raid_disks-1) 2213 i--; /* Q D D D P */ 2214 else { 2215 /* D D P Q D */ 2216 if (i < sh->pd_idx) 2217 i += raid_disks; 2218 i -= (sh->pd_idx + 2); 2219 } 2220 break; 2221 case ALGORITHM_PARITY_0: 2222 i -= 2; 2223 break; 2224 case ALGORITHM_PARITY_N: 2225 break; 2226 case ALGORITHM_ROTATING_N_CONTINUE: 2227 /* Like left_symmetric, but P is before Q */ 2228 if (sh->pd_idx == 0) 2229 i--; /* P D D D Q */ 2230 else { 2231 /* D D Q P D */ 2232 if (i < sh->pd_idx) 2233 i += raid_disks; 2234 i -= (sh->pd_idx + 1); 2235 } 2236 break; 2237 case ALGORITHM_LEFT_ASYMMETRIC_6: 2238 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2239 if (i > sh->pd_idx) 2240 i--; 2241 break; 2242 case ALGORITHM_LEFT_SYMMETRIC_6: 2243 case ALGORITHM_RIGHT_SYMMETRIC_6: 2244 if (i < sh->pd_idx) 2245 i += data_disks + 1; 2246 i -= (sh->pd_idx + 1); 2247 break; 2248 case ALGORITHM_PARITY_0_6: 2249 i -= 1; 2250 break; 2251 default: 2252 BUG(); 2253 } 2254 break; 2255 } 2256 2257 chunk_number = stripe * data_disks + i; 2258 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2259 2260 check = raid5_compute_sector(conf, r_sector, 2261 previous, &dummy1, &sh2); 2262 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2263 || sh2.qd_idx != sh->qd_idx) { 2264 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2265 mdname(conf->mddev)); 2266 return 0; 2267 } 2268 return r_sector; 2269 } 2270 2271 2272 static void 2273 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2274 int rcw, int expand) 2275 { 2276 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2277 struct r5conf *conf = sh->raid_conf; 2278 int level = conf->level; 2279 2280 if (rcw) { 2281 /* if we are not expanding this is a proper write request, and 2282 * there will be bios with new data to be drained into the 2283 * stripe cache 2284 */ 2285 if (!expand) { 2286 sh->reconstruct_state = reconstruct_state_drain_run; 2287 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2288 } else 2289 sh->reconstruct_state = reconstruct_state_run; 2290 2291 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2292 2293 for (i = disks; i--; ) { 2294 struct r5dev *dev = &sh->dev[i]; 2295 2296 if (dev->towrite) { 2297 
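			/* New data will be drained into this block: lock it
			 * and flag it for the drain pass.  Unless we are
			 * expanding (where the cached data itself is what gets
			 * written out and so stays valid), the in-memory copy
			 * stops being up to date once the bio data lands.
			 */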
set_bit(R5_LOCKED, &dev->flags); 2298 set_bit(R5_Wantdrain, &dev->flags); 2299 if (!expand) 2300 clear_bit(R5_UPTODATE, &dev->flags); 2301 s->locked++; 2302 } 2303 } 2304 if (s->locked + conf->max_degraded == disks) 2305 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2306 atomic_inc(&conf->pending_full_writes); 2307 } else { 2308 BUG_ON(level == 6); 2309 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2310 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2311 2312 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2313 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2314 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2315 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2316 2317 for (i = disks; i--; ) { 2318 struct r5dev *dev = &sh->dev[i]; 2319 if (i == pd_idx) 2320 continue; 2321 2322 if (dev->towrite && 2323 (test_bit(R5_UPTODATE, &dev->flags) || 2324 test_bit(R5_Wantcompute, &dev->flags))) { 2325 set_bit(R5_Wantdrain, &dev->flags); 2326 set_bit(R5_LOCKED, &dev->flags); 2327 clear_bit(R5_UPTODATE, &dev->flags); 2328 s->locked++; 2329 } 2330 } 2331 } 2332 2333 /* keep the parity disk(s) locked while asynchronous operations 2334 * are in flight 2335 */ 2336 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2337 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2338 s->locked++; 2339 2340 if (level == 6) { 2341 int qd_idx = sh->qd_idx; 2342 struct r5dev *dev = &sh->dev[qd_idx]; 2343 2344 set_bit(R5_LOCKED, &dev->flags); 2345 clear_bit(R5_UPTODATE, &dev->flags); 2346 s->locked++; 2347 } 2348 2349 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2350 __func__, (unsigned long long)sh->sector, 2351 s->locked, s->ops_request); 2352 } 2353 2354 /* 2355 * Each stripe/dev can have one or more bion attached. 2356 * toread/towrite point to the first in a chain. 2357 * The bi_next chain must be in order. 2358 */ 2359 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2360 { 2361 struct bio **bip; 2362 struct r5conf *conf = sh->raid_conf; 2363 int firstwrite=0; 2364 2365 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2366 (unsigned long long)bi->bi_sector, 2367 (unsigned long long)sh->sector); 2368 2369 /* 2370 * If several bio share a stripe. The bio bi_phys_segments acts as a 2371 * reference count to avoid race. The reference count should already be 2372 * increased before this function is called (for example, in 2373 * make_request()), so other bio sharing this stripe will not free the 2374 * stripe. If a stripe is owned by one stripe, the stripe lock will 2375 * protect it. 
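 *
 * Illustrative lifecycle of that counter (a sketch added here, assuming
 * the caller holds one initial reference as make_request() does): for a
 * write bio spanning three stripes
 *
 *	caller takes its reference:     count = 1
 *	add_stripe_bio() three times:   count = 2, 3, 4
 *	caller drops its reference:     count = 3
 *	each stripe completes:          count = 2, 1, 0
 *
 * and only the decrement that reaches zero (see the
 * raid5_dec_bi_active_stripes() callers elsewhere in this file) may
 * complete the bio and hand it back to the caller.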
2376 */ 2377 spin_lock_irq(&sh->stripe_lock); 2378 if (forwrite) { 2379 bip = &sh->dev[dd_idx].towrite; 2380 if (*bip == NULL) 2381 firstwrite = 1; 2382 } else 2383 bip = &sh->dev[dd_idx].toread; 2384 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2385 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2386 goto overlap; 2387 bip = & (*bip)->bi_next; 2388 } 2389 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2390 goto overlap; 2391 2392 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2393 if (*bip) 2394 bi->bi_next = *bip; 2395 *bip = bi; 2396 raid5_inc_bi_active_stripes(bi); 2397 2398 if (forwrite) { 2399 /* check if page is covered */ 2400 sector_t sector = sh->dev[dd_idx].sector; 2401 for (bi=sh->dev[dd_idx].towrite; 2402 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2403 bi && bi->bi_sector <= sector; 2404 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2405 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2406 sector = bi->bi_sector + (bi->bi_size>>9); 2407 } 2408 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2409 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2410 } 2411 spin_unlock_irq(&sh->stripe_lock); 2412 2413 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2414 (unsigned long long)(*bip)->bi_sector, 2415 (unsigned long long)sh->sector, dd_idx); 2416 2417 if (conf->mddev->bitmap && firstwrite) { 2418 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2419 STRIPE_SECTORS, 0); 2420 sh->bm_seq = conf->seq_flush+1; 2421 set_bit(STRIPE_BIT_DELAY, &sh->state); 2422 } 2423 return 1; 2424 2425 overlap: 2426 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2427 spin_unlock_irq(&sh->stripe_lock); 2428 return 0; 2429 } 2430 2431 static void end_reshape(struct r5conf *conf); 2432 2433 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2434 struct stripe_head *sh) 2435 { 2436 int sectors_per_chunk = 2437 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2438 int dd_idx; 2439 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2440 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2441 2442 raid5_compute_sector(conf, 2443 stripe * (disks - conf->max_degraded) 2444 *sectors_per_chunk + chunk_offset, 2445 previous, 2446 &dd_idx, sh); 2447 } 2448 2449 static void 2450 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2451 struct stripe_head_state *s, int disks, 2452 struct bio **return_bi) 2453 { 2454 int i; 2455 for (i = disks; i--; ) { 2456 struct bio *bi; 2457 int bitmap_end = 0; 2458 2459 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2460 struct md_rdev *rdev; 2461 rcu_read_lock(); 2462 rdev = rcu_dereference(conf->disks[i].rdev); 2463 if (rdev && test_bit(In_sync, &rdev->flags)) 2464 atomic_inc(&rdev->nr_pending); 2465 else 2466 rdev = NULL; 2467 rcu_read_unlock(); 2468 if (rdev) { 2469 if (!rdev_set_badblocks( 2470 rdev, 2471 sh->sector, 2472 STRIPE_SECTORS, 0)) 2473 md_error(conf->mddev, rdev); 2474 rdev_dec_pending(rdev, conf->mddev); 2475 } 2476 } 2477 spin_lock_irq(&sh->stripe_lock); 2478 /* fail all writes first */ 2479 bi = sh->dev[i].towrite; 2480 sh->dev[i].towrite = NULL; 2481 spin_unlock_irq(&sh->stripe_lock); 2482 if (bi) { 2483 s->to_write--; 2484 bitmap_end = 1; 2485 } 2486 2487 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2488 wake_up(&conf->wait_for_overlap); 2489 2490 while (bi && bi->bi_sector < 2491 sh->dev[i].sector + STRIPE_SECTORS) { 2492 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2493 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2494 if (!raid5_dec_bi_active_stripes(bi)) { 2495 md_write_end(conf->mddev); 2496 bi->bi_next = *return_bi; 2497 *return_bi = bi; 2498 } 2499 bi = nextbi; 2500 } 2501 if (bitmap_end) 2502 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2503 STRIPE_SECTORS, 0, 0); 2504 bitmap_end = 0; 2505 /* and fail all 'written' */ 2506 bi = sh->dev[i].written; 2507 sh->dev[i].written = NULL; 2508 if (bi) bitmap_end = 1; 2509 while (bi && bi->bi_sector < 2510 sh->dev[i].sector + STRIPE_SECTORS) { 2511 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2512 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2513 if (!raid5_dec_bi_active_stripes(bi)) { 2514 md_write_end(conf->mddev); 2515 bi->bi_next = *return_bi; 2516 *return_bi = bi; 2517 } 2518 bi = bi2; 2519 } 2520 2521 /* fail any reads if this device is non-operational and 2522 * the data has not reached the cache yet. 
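 * (Clarification added here: a read that is already being serviced from
 *  the stripe cache (R5_Wantfill) or whose device is still usable
 *  (R5_Insync and no pending read error) is left alone; only the
 *  remaining toread chain is failed back to the caller.)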
2523 */ 2524 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2525 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2526 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2527 bi = sh->dev[i].toread; 2528 sh->dev[i].toread = NULL; 2529 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2530 wake_up(&conf->wait_for_overlap); 2531 if (bi) s->to_read--; 2532 while (bi && bi->bi_sector < 2533 sh->dev[i].sector + STRIPE_SECTORS) { 2534 struct bio *nextbi = 2535 r5_next_bio(bi, sh->dev[i].sector); 2536 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2537 if (!raid5_dec_bi_active_stripes(bi)) { 2538 bi->bi_next = *return_bi; 2539 *return_bi = bi; 2540 } 2541 bi = nextbi; 2542 } 2543 } 2544 if (bitmap_end) 2545 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2546 STRIPE_SECTORS, 0, 0); 2547 /* If we were in the middle of a write the parity block might 2548 * still be locked - so just clear all R5_LOCKED flags 2549 */ 2550 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2551 } 2552 2553 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2554 if (atomic_dec_and_test(&conf->pending_full_writes)) 2555 md_wakeup_thread(conf->mddev->thread); 2556 } 2557 2558 static void 2559 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2560 struct stripe_head_state *s) 2561 { 2562 int abort = 0; 2563 int i; 2564 2565 clear_bit(STRIPE_SYNCING, &sh->state); 2566 s->syncing = 0; 2567 s->replacing = 0; 2568 /* There is nothing more to do for sync/check/repair. 2569 * Don't even need to abort as that is handled elsewhere 2570 * if needed, and not always wanted e.g. if there is a known 2571 * bad block here. 2572 * For recover/replace we need to record a bad block on all 2573 * non-sync devices, or abort the recovery 2574 */ 2575 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2576 /* During recovery devices cannot be removed, so 2577 * locking and refcounting of rdevs is not needed 2578 */ 2579 for (i = 0; i < conf->raid_disks; i++) { 2580 struct md_rdev *rdev = conf->disks[i].rdev; 2581 if (rdev 2582 && !test_bit(Faulty, &rdev->flags) 2583 && !test_bit(In_sync, &rdev->flags) 2584 && !rdev_set_badblocks(rdev, sh->sector, 2585 STRIPE_SECTORS, 0)) 2586 abort = 1; 2587 rdev = conf->disks[i].replacement; 2588 if (rdev 2589 && !test_bit(Faulty, &rdev->flags) 2590 && !test_bit(In_sync, &rdev->flags) 2591 && !rdev_set_badblocks(rdev, sh->sector, 2592 STRIPE_SECTORS, 0)) 2593 abort = 1; 2594 } 2595 if (abort) 2596 conf->recovery_disabled = 2597 conf->mddev->recovery_disabled; 2598 } 2599 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2600 } 2601 2602 static int want_replace(struct stripe_head *sh, int disk_idx) 2603 { 2604 struct md_rdev *rdev; 2605 int rv = 0; 2606 /* Doing recovery so rcu locking not required */ 2607 rdev = sh->raid_conf->disks[disk_idx].replacement; 2608 if (rdev 2609 && !test_bit(Faulty, &rdev->flags) 2610 && !test_bit(In_sync, &rdev->flags) 2611 && (rdev->recovery_offset <= sh->sector 2612 || rdev->mddev->recovery_cp <= sh->sector)) 2613 rv = 1; 2614 2615 return rv; 2616 } 2617 2618 /* fetch_block - checks the given member device to see if its data needs 2619 * to be read or computed to satisfy a request. 
2620 * 2621 * Returns 1 when no more member devices need to be checked, otherwise returns 2622 * 0 to tell the loop in handle_stripe_fill to continue 2623 */ 2624 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2625 int disk_idx, int disks) 2626 { 2627 struct r5dev *dev = &sh->dev[disk_idx]; 2628 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2629 &sh->dev[s->failed_num[1]] }; 2630 2631 /* is the data in this block needed, and can we get it? */ 2632 if (!test_bit(R5_LOCKED, &dev->flags) && 2633 !test_bit(R5_UPTODATE, &dev->flags) && 2634 (dev->toread || 2635 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2636 s->syncing || s->expanding || 2637 (s->replacing && want_replace(sh, disk_idx)) || 2638 (s->failed >= 1 && fdev[0]->toread) || 2639 (s->failed >= 2 && fdev[1]->toread) || 2640 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2641 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2642 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2643 /* we would like to get this block, possibly by computing it, 2644 * otherwise read it if the backing disk is insync 2645 */ 2646 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2647 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2648 if ((s->uptodate == disks - 1) && 2649 (s->failed && (disk_idx == s->failed_num[0] || 2650 disk_idx == s->failed_num[1]))) { 2651 /* have disk failed, and we're requested to fetch it; 2652 * do compute it 2653 */ 2654 pr_debug("Computing stripe %llu block %d\n", 2655 (unsigned long long)sh->sector, disk_idx); 2656 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2657 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2658 set_bit(R5_Wantcompute, &dev->flags); 2659 sh->ops.target = disk_idx; 2660 sh->ops.target2 = -1; /* no 2nd target */ 2661 s->req_compute = 1; 2662 /* Careful: from this point on 'uptodate' is in the eye 2663 * of raid_run_ops which services 'compute' operations 2664 * before writes. R5_Wantcompute flags a block that will 2665 * be R5_UPTODATE by the time it is needed for a 2666 * subsequent operation. 2667 */ 2668 s->uptodate++; 2669 return 1; 2670 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2671 /* Computing 2-failure is *very* expensive; only 2672 * do it if failed >= 2 2673 */ 2674 int other; 2675 for (other = disks; other--; ) { 2676 if (other == disk_idx) 2677 continue; 2678 if (!test_bit(R5_UPTODATE, 2679 &sh->dev[other].flags)) 2680 break; 2681 } 2682 BUG_ON(other < 0); 2683 pr_debug("Computing stripe %llu blocks %d,%d\n", 2684 (unsigned long long)sh->sector, 2685 disk_idx, other); 2686 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2687 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2688 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2689 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2690 sh->ops.target = disk_idx; 2691 sh->ops.target2 = other; 2692 s->uptodate += 2; 2693 s->req_compute = 1; 2694 return 1; 2695 } else if (test_bit(R5_Insync, &dev->flags)) { 2696 set_bit(R5_LOCKED, &dev->flags); 2697 set_bit(R5_Wantread, &dev->flags); 2698 s->locked++; 2699 pr_debug("Reading block %d (sync=%d)\n", 2700 disk_idx, s->syncing); 2701 } 2702 } 2703 2704 return 0; 2705 } 2706 2707 /** 2708 * handle_stripe_fill - read or compute data to satisfy pending requests. 
2709 */ 2710 static void handle_stripe_fill(struct stripe_head *sh, 2711 struct stripe_head_state *s, 2712 int disks) 2713 { 2714 int i; 2715 2716 /* look for blocks to read/compute, skip this if a compute 2717 * is already in flight, or if the stripe contents are in the 2718 * midst of changing due to a write 2719 */ 2720 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2721 !sh->reconstruct_state) 2722 for (i = disks; i--; ) 2723 if (fetch_block(sh, s, i, disks)) 2724 break; 2725 set_bit(STRIPE_HANDLE, &sh->state); 2726 } 2727 2728 2729 /* handle_stripe_clean_event 2730 * any written block on an uptodate or failed drive can be returned. 2731 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2732 * never LOCKED, so we don't need to test 'failed' directly. 2733 */ 2734 static void handle_stripe_clean_event(struct r5conf *conf, 2735 struct stripe_head *sh, int disks, struct bio **return_bi) 2736 { 2737 int i; 2738 struct r5dev *dev; 2739 2740 for (i = disks; i--; ) 2741 if (sh->dev[i].written) { 2742 dev = &sh->dev[i]; 2743 if (!test_bit(R5_LOCKED, &dev->flags) && 2744 test_bit(R5_UPTODATE, &dev->flags)) { 2745 /* We can return any write requests */ 2746 struct bio *wbi, *wbi2; 2747 pr_debug("Return write for disc %d\n", i); 2748 wbi = dev->written; 2749 dev->written = NULL; 2750 while (wbi && wbi->bi_sector < 2751 dev->sector + STRIPE_SECTORS) { 2752 wbi2 = r5_next_bio(wbi, dev->sector); 2753 if (!raid5_dec_bi_active_stripes(wbi)) { 2754 md_write_end(conf->mddev); 2755 wbi->bi_next = *return_bi; 2756 *return_bi = wbi; 2757 } 2758 wbi = wbi2; 2759 } 2760 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2761 STRIPE_SECTORS, 2762 !test_bit(STRIPE_DEGRADED, &sh->state), 2763 0); 2764 } 2765 } 2766 2767 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2768 if (atomic_dec_and_test(&conf->pending_full_writes)) 2769 md_wakeup_thread(conf->mddev->thread); 2770 } 2771 2772 static void handle_stripe_dirtying(struct r5conf *conf, 2773 struct stripe_head *sh, 2774 struct stripe_head_state *s, 2775 int disks) 2776 { 2777 int rmw = 0, rcw = 0, i; 2778 if (conf->max_degraded == 2) { 2779 /* RAID6 requires 'rcw' in current implementation 2780 * Calculate the real rcw later - for now fake it 2781 * look like rcw is cheaper 2782 */ 2783 rcw = 1; rmw = 2; 2784 } else for (i = disks; i--; ) { 2785 /* would I have to read this buffer for read_modify_write */ 2786 struct r5dev *dev = &sh->dev[i]; 2787 if ((dev->towrite || i == sh->pd_idx) && 2788 !test_bit(R5_LOCKED, &dev->flags) && 2789 !(test_bit(R5_UPTODATE, &dev->flags) || 2790 test_bit(R5_Wantcompute, &dev->flags))) { 2791 if (test_bit(R5_Insync, &dev->flags)) 2792 rmw++; 2793 else 2794 rmw += 2*disks; /* cannot read it */ 2795 } 2796 /* Would I have to read this buffer for reconstruct_write */ 2797 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2798 !test_bit(R5_LOCKED, &dev->flags) && 2799 !(test_bit(R5_UPTODATE, &dev->flags) || 2800 test_bit(R5_Wantcompute, &dev->flags))) { 2801 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2802 else 2803 rcw += 2*disks; 2804 } 2805 } 2806 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2807 (unsigned long long)sh->sector, rmw, rcw); 2808 set_bit(STRIPE_HANDLE, &sh->state); 2809 if (rmw < rcw && rmw > 0) 2810 /* prefer read-modify-write, but need to get some data */ 2811 for (i = disks; i--; ) { 2812 struct r5dev *dev = &sh->dev[i]; 2813 if ((dev->towrite || i == sh->pd_idx) && 2814 !test_bit(R5_LOCKED, &dev->flags) && 2815 !(test_bit(R5_UPTODATE, &dev->flags) || 
2816 test_bit(R5_Wantcompute, &dev->flags)) && 2817 test_bit(R5_Insync, &dev->flags)) { 2818 if ( 2819 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2820 pr_debug("Read_old block " 2821 "%d for r-m-w\n", i); 2822 set_bit(R5_LOCKED, &dev->flags); 2823 set_bit(R5_Wantread, &dev->flags); 2824 s->locked++; 2825 } else { 2826 set_bit(STRIPE_DELAYED, &sh->state); 2827 set_bit(STRIPE_HANDLE, &sh->state); 2828 } 2829 } 2830 } 2831 if (rcw <= rmw && rcw > 0) { 2832 /* want reconstruct write, but need to get some data */ 2833 rcw = 0; 2834 for (i = disks; i--; ) { 2835 struct r5dev *dev = &sh->dev[i]; 2836 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2837 i != sh->pd_idx && i != sh->qd_idx && 2838 !test_bit(R5_LOCKED, &dev->flags) && 2839 !(test_bit(R5_UPTODATE, &dev->flags) || 2840 test_bit(R5_Wantcompute, &dev->flags))) { 2841 rcw++; 2842 if (!test_bit(R5_Insync, &dev->flags)) 2843 continue; /* it's a failed drive */ 2844 if ( 2845 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2846 pr_debug("Read_old block " 2847 "%d for Reconstruct\n", i); 2848 set_bit(R5_LOCKED, &dev->flags); 2849 set_bit(R5_Wantread, &dev->flags); 2850 s->locked++; 2851 } else { 2852 set_bit(STRIPE_DELAYED, &sh->state); 2853 set_bit(STRIPE_HANDLE, &sh->state); 2854 } 2855 } 2856 } 2857 } 2858 /* now if nothing is locked, and if we have enough data, 2859 * we can start a write request 2860 */ 2861 /* since handle_stripe can be called at any time we need to handle the 2862 * case where a compute block operation has been submitted and then a 2863 * subsequent call wants to start a write request. raid_run_ops only 2864 * handles the case where compute block and reconstruct are requested 2865 * simultaneously. If this is not the case then new writes need to be 2866 * held off until the compute completes. 
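 *
 * For reference, a worked example of the rmw/rcw costs counted above
 * (illustrative numbers only, assuming none of the blocks are cached
 * yet): on a 5-device RAID5 (four data + parity), overwriting a single
 * data block needs the old data and the old parity for read-modify-write
 * (rmw = 2) but the three untouched data blocks for reconstruct-write
 * (rcw = 3), so the rmw path is preferred; overwriting three of the four
 * data blocks gives rmw = 4 against rcw = 1, so reconstruct-write wins.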
2867 */ 2868 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2869 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2870 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2871 schedule_reconstruction(sh, s, rcw == 0, 0); 2872 } 2873 2874 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2875 struct stripe_head_state *s, int disks) 2876 { 2877 struct r5dev *dev = NULL; 2878 2879 set_bit(STRIPE_HANDLE, &sh->state); 2880 2881 switch (sh->check_state) { 2882 case check_state_idle: 2883 /* start a new check operation if there are no failures */ 2884 if (s->failed == 0) { 2885 BUG_ON(s->uptodate != disks); 2886 sh->check_state = check_state_run; 2887 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2888 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2889 s->uptodate--; 2890 break; 2891 } 2892 dev = &sh->dev[s->failed_num[0]]; 2893 /* fall through */ 2894 case check_state_compute_result: 2895 sh->check_state = check_state_idle; 2896 if (!dev) 2897 dev = &sh->dev[sh->pd_idx]; 2898 2899 /* check that a write has not made the stripe insync */ 2900 if (test_bit(STRIPE_INSYNC, &sh->state)) 2901 break; 2902 2903 /* either failed parity check, or recovery is happening */ 2904 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2905 BUG_ON(s->uptodate != disks); 2906 2907 set_bit(R5_LOCKED, &dev->flags); 2908 s->locked++; 2909 set_bit(R5_Wantwrite, &dev->flags); 2910 2911 clear_bit(STRIPE_DEGRADED, &sh->state); 2912 set_bit(STRIPE_INSYNC, &sh->state); 2913 break; 2914 case check_state_run: 2915 break; /* we will be called again upon completion */ 2916 case check_state_check_result: 2917 sh->check_state = check_state_idle; 2918 2919 /* if a failure occurred during the check operation, leave 2920 * STRIPE_INSYNC not set and let the stripe be handled again 2921 */ 2922 if (s->failed) 2923 break; 2924 2925 /* handle a successful check operation, if parity is correct 2926 * we are done. Otherwise update the mismatch count and repair 2927 * parity if !MD_RECOVERY_CHECK 2928 */ 2929 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2930 /* parity is correct (on disc, 2931 * not in buffer any more) 2932 */ 2933 set_bit(STRIPE_INSYNC, &sh->state); 2934 else { 2935 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2936 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2937 /* don't try to repair!! */ 2938 set_bit(STRIPE_INSYNC, &sh->state); 2939 else { 2940 sh->check_state = check_state_compute_run; 2941 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2942 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2943 set_bit(R5_Wantcompute, 2944 &sh->dev[sh->pd_idx].flags); 2945 sh->ops.target = sh->pd_idx; 2946 sh->ops.target2 = -1; 2947 s->uptodate++; 2948 } 2949 } 2950 break; 2951 case check_state_compute_run: 2952 break; 2953 default: 2954 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2955 __func__, sh->check_state, 2956 (unsigned long long) sh->sector); 2957 BUG(); 2958 } 2959 } 2960 2961 2962 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 2963 struct stripe_head_state *s, 2964 int disks) 2965 { 2966 int pd_idx = sh->pd_idx; 2967 int qd_idx = sh->qd_idx; 2968 struct r5dev *dev; 2969 2970 set_bit(STRIPE_HANDLE, &sh->state); 2971 2972 BUG_ON(s->failed > 2); 2973 2974 /* Want to check and possibly repair P and Q. 
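 * As an added summary of how check_state is chosen in the idle case
 * below (q_failed meaning the missing device is the one holding Q):
 *	failed == 0            -> check_state_run_pq, verify both P and Q
 *	failed == 1, q_failed  -> check_state_run,    verify P only
 *	failed == 1, !q_failed -> check_state_run_q,  verify Q only
 *	failed == 2            -> no check is possible, go straight to
 *				  writing back the recomputed blocks.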
2975 * However there could be one 'failed' device, in which 2976 * case we can only check one of them, possibly using the 2977 * other to generate missing data 2978 */ 2979 2980 switch (sh->check_state) { 2981 case check_state_idle: 2982 /* start a new check operation if there are < 2 failures */ 2983 if (s->failed == s->q_failed) { 2984 /* The only possible failed device holds Q, so it 2985 * makes sense to check P (If anything else were failed, 2986 * we would have used P to recreate it). 2987 */ 2988 sh->check_state = check_state_run; 2989 } 2990 if (!s->q_failed && s->failed < 2) { 2991 /* Q is not failed, and we didn't use it to generate 2992 * anything, so it makes sense to check it 2993 */ 2994 if (sh->check_state == check_state_run) 2995 sh->check_state = check_state_run_pq; 2996 else 2997 sh->check_state = check_state_run_q; 2998 } 2999 3000 /* discard potentially stale zero_sum_result */ 3001 sh->ops.zero_sum_result = 0; 3002 3003 if (sh->check_state == check_state_run) { 3004 /* async_xor_zero_sum destroys the contents of P */ 3005 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3006 s->uptodate--; 3007 } 3008 if (sh->check_state >= check_state_run && 3009 sh->check_state <= check_state_run_pq) { 3010 /* async_syndrome_zero_sum preserves P and Q, so 3011 * no need to mark them !uptodate here 3012 */ 3013 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3014 break; 3015 } 3016 3017 /* we have 2-disk failure */ 3018 BUG_ON(s->failed != 2); 3019 /* fall through */ 3020 case check_state_compute_result: 3021 sh->check_state = check_state_idle; 3022 3023 /* check that a write has not made the stripe insync */ 3024 if (test_bit(STRIPE_INSYNC, &sh->state)) 3025 break; 3026 3027 /* now write out any block on a failed drive, 3028 * or P or Q if they were recomputed 3029 */ 3030 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3031 if (s->failed == 2) { 3032 dev = &sh->dev[s->failed_num[1]]; 3033 s->locked++; 3034 set_bit(R5_LOCKED, &dev->flags); 3035 set_bit(R5_Wantwrite, &dev->flags); 3036 } 3037 if (s->failed >= 1) { 3038 dev = &sh->dev[s->failed_num[0]]; 3039 s->locked++; 3040 set_bit(R5_LOCKED, &dev->flags); 3041 set_bit(R5_Wantwrite, &dev->flags); 3042 } 3043 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3044 dev = &sh->dev[pd_idx]; 3045 s->locked++; 3046 set_bit(R5_LOCKED, &dev->flags); 3047 set_bit(R5_Wantwrite, &dev->flags); 3048 } 3049 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3050 dev = &sh->dev[qd_idx]; 3051 s->locked++; 3052 set_bit(R5_LOCKED, &dev->flags); 3053 set_bit(R5_Wantwrite, &dev->flags); 3054 } 3055 clear_bit(STRIPE_DEGRADED, &sh->state); 3056 3057 set_bit(STRIPE_INSYNC, &sh->state); 3058 break; 3059 case check_state_run: 3060 case check_state_run_q: 3061 case check_state_run_pq: 3062 break; /* we will be called again upon completion */ 3063 case check_state_check_result: 3064 sh->check_state = check_state_idle; 3065 3066 /* handle a successful check operation, if parity is correct 3067 * we are done. 
Otherwise update the mismatch count and repair 3068 * parity if !MD_RECOVERY_CHECK 3069 */ 3070 if (sh->ops.zero_sum_result == 0) { 3071 /* both parities are correct */ 3072 if (!s->failed) 3073 set_bit(STRIPE_INSYNC, &sh->state); 3074 else { 3075 /* in contrast to the raid5 case we can validate 3076 * parity, but still have a failure to write 3077 * back 3078 */ 3079 sh->check_state = check_state_compute_result; 3080 /* Returning at this point means that we may go 3081 * off and bring p and/or q uptodate again so 3082 * we make sure to check zero_sum_result again 3083 * to verify if p or q need writeback 3084 */ 3085 } 3086 } else { 3087 conf->mddev->resync_mismatches += STRIPE_SECTORS; 3088 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3089 /* don't try to repair!! */ 3090 set_bit(STRIPE_INSYNC, &sh->state); 3091 else { 3092 int *target = &sh->ops.target; 3093 3094 sh->ops.target = -1; 3095 sh->ops.target2 = -1; 3096 sh->check_state = check_state_compute_run; 3097 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3098 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3099 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3100 set_bit(R5_Wantcompute, 3101 &sh->dev[pd_idx].flags); 3102 *target = pd_idx; 3103 target = &sh->ops.target2; 3104 s->uptodate++; 3105 } 3106 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3107 set_bit(R5_Wantcompute, 3108 &sh->dev[qd_idx].flags); 3109 *target = qd_idx; 3110 s->uptodate++; 3111 } 3112 } 3113 } 3114 break; 3115 case check_state_compute_run: 3116 break; 3117 default: 3118 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3119 __func__, sh->check_state, 3120 (unsigned long long) sh->sector); 3121 BUG(); 3122 } 3123 } 3124 3125 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3126 { 3127 int i; 3128 3129 /* We have read all the blocks in this stripe and now we need to 3130 * copy some of them into a target stripe for expand. 3131 */ 3132 struct dma_async_tx_descriptor *tx = NULL; 3133 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3134 for (i = 0; i < sh->disks; i++) 3135 if (i != sh->pd_idx && i != sh->qd_idx) { 3136 int dd_idx, j; 3137 struct stripe_head *sh2; 3138 struct async_submit_ctl submit; 3139 3140 sector_t bn = compute_blocknr(sh, i, 1); 3141 sector_t s = raid5_compute_sector(conf, bn, 0, 3142 &dd_idx, NULL); 3143 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3144 if (sh2 == NULL) 3145 /* so far only the early blocks of this stripe 3146 * have been requested. 
When later blocks 3147 * get requested, we will try again 3148 */ 3149 continue; 3150 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3151 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3152 /* must have already done this block */ 3153 release_stripe(sh2); 3154 continue; 3155 } 3156 3157 /* place all the copies on one channel */ 3158 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3159 tx = async_memcpy(sh2->dev[dd_idx].page, 3160 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3161 &submit); 3162 3163 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3164 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3165 for (j = 0; j < conf->raid_disks; j++) 3166 if (j != sh2->pd_idx && 3167 j != sh2->qd_idx && 3168 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3169 break; 3170 if (j == conf->raid_disks) { 3171 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3172 set_bit(STRIPE_HANDLE, &sh2->state); 3173 } 3174 release_stripe(sh2); 3175 3176 } 3177 /* done submitting copies, wait for them to complete */ 3178 if (tx) { 3179 async_tx_ack(tx); 3180 dma_wait_for_async_tx(tx); 3181 } 3182 } 3183 3184 /* 3185 * handle_stripe - do things to a stripe. 3186 * 3187 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3188 * state of various bits to see what needs to be done. 3189 * Possible results: 3190 * return some read requests which now have data 3191 * return some write requests which are safely on storage 3192 * schedule a read on some buffers 3193 * schedule a write of some buffers 3194 * return confirmation of parity correctness 3195 * 3196 */ 3197 3198 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3199 { 3200 struct r5conf *conf = sh->raid_conf; 3201 int disks = sh->disks; 3202 struct r5dev *dev; 3203 int i; 3204 int do_recovery = 0; 3205 3206 memset(s, 0, sizeof(*s)); 3207 3208 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3209 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3210 s->failed_num[0] = -1; 3211 s->failed_num[1] = -1; 3212 3213 /* Now to look around and see what can be done */ 3214 rcu_read_lock(); 3215 for (i=disks; i--; ) { 3216 struct md_rdev *rdev; 3217 sector_t first_bad; 3218 int bad_sectors; 3219 int is_bad = 0; 3220 3221 dev = &sh->dev[i]; 3222 3223 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3224 i, dev->flags, 3225 dev->toread, dev->towrite, dev->written); 3226 /* maybe we can reply to a read 3227 * 3228 * new wantfill requests are only permitted while 3229 * ops_complete_biofill is guaranteed to be inactive 3230 */ 3231 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3232 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3233 set_bit(R5_Wantfill, &dev->flags); 3234 3235 /* now count some things */ 3236 if (test_bit(R5_LOCKED, &dev->flags)) 3237 s->locked++; 3238 if (test_bit(R5_UPTODATE, &dev->flags)) 3239 s->uptodate++; 3240 if (test_bit(R5_Wantcompute, &dev->flags)) { 3241 s->compute++; 3242 BUG_ON(s->compute > 2); 3243 } 3244 3245 if (test_bit(R5_Wantfill, &dev->flags)) 3246 s->to_fill++; 3247 else if (dev->toread) 3248 s->to_read++; 3249 if (dev->towrite) { 3250 s->to_write++; 3251 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3252 s->non_overwrite++; 3253 } 3254 if (dev->written) 3255 s->written++; 3256 /* Prefer to use the replacement for reads, but only 3257 * if it is recovered enough and has no bad blocks. 
3258 */ 3259 rdev = rcu_dereference(conf->disks[i].replacement); 3260 if (rdev && !test_bit(Faulty, &rdev->flags) && 3261 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3262 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3263 &first_bad, &bad_sectors)) 3264 set_bit(R5_ReadRepl, &dev->flags); 3265 else { 3266 if (rdev) 3267 set_bit(R5_NeedReplace, &dev->flags); 3268 rdev = rcu_dereference(conf->disks[i].rdev); 3269 clear_bit(R5_ReadRepl, &dev->flags); 3270 } 3271 if (rdev && test_bit(Faulty, &rdev->flags)) 3272 rdev = NULL; 3273 if (rdev) { 3274 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3275 &first_bad, &bad_sectors); 3276 if (s->blocked_rdev == NULL 3277 && (test_bit(Blocked, &rdev->flags) 3278 || is_bad < 0)) { 3279 if (is_bad < 0) 3280 set_bit(BlockedBadBlocks, 3281 &rdev->flags); 3282 s->blocked_rdev = rdev; 3283 atomic_inc(&rdev->nr_pending); 3284 } 3285 } 3286 clear_bit(R5_Insync, &dev->flags); 3287 if (!rdev) 3288 /* Not in-sync */; 3289 else if (is_bad) { 3290 /* also not in-sync */ 3291 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3292 test_bit(R5_UPTODATE, &dev->flags)) { 3293 /* treat as in-sync, but with a read error 3294 * which we can now try to correct 3295 */ 3296 set_bit(R5_Insync, &dev->flags); 3297 set_bit(R5_ReadError, &dev->flags); 3298 } 3299 } else if (test_bit(In_sync, &rdev->flags)) 3300 set_bit(R5_Insync, &dev->flags); 3301 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3302 /* in sync if before recovery_offset */ 3303 set_bit(R5_Insync, &dev->flags); 3304 else if (test_bit(R5_UPTODATE, &dev->flags) && 3305 test_bit(R5_Expanded, &dev->flags)) 3306 /* If we've reshaped into here, we assume it is Insync. 3307 * We will shortly update recovery_offset to make 3308 * it official. 3309 */ 3310 set_bit(R5_Insync, &dev->flags); 3311 3312 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3313 /* This flag does not apply to '.replacement' 3314 * only to .rdev, so make sure to check that*/ 3315 struct md_rdev *rdev2 = rcu_dereference( 3316 conf->disks[i].rdev); 3317 if (rdev2 == rdev) 3318 clear_bit(R5_Insync, &dev->flags); 3319 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3320 s->handle_bad_blocks = 1; 3321 atomic_inc(&rdev2->nr_pending); 3322 } else 3323 clear_bit(R5_WriteError, &dev->flags); 3324 } 3325 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3326 /* This flag does not apply to '.replacement' 3327 * only to .rdev, so make sure to check that*/ 3328 struct md_rdev *rdev2 = rcu_dereference( 3329 conf->disks[i].rdev); 3330 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3331 s->handle_bad_blocks = 1; 3332 atomic_inc(&rdev2->nr_pending); 3333 } else 3334 clear_bit(R5_MadeGood, &dev->flags); 3335 } 3336 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3337 struct md_rdev *rdev2 = rcu_dereference( 3338 conf->disks[i].replacement); 3339 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3340 s->handle_bad_blocks = 1; 3341 atomic_inc(&rdev2->nr_pending); 3342 } else 3343 clear_bit(R5_MadeGoodRepl, &dev->flags); 3344 } 3345 if (!test_bit(R5_Insync, &dev->flags)) { 3346 /* The ReadError flag will just be confusing now */ 3347 clear_bit(R5_ReadError, &dev->flags); 3348 clear_bit(R5_ReWrite, &dev->flags); 3349 } 3350 if (test_bit(R5_ReadError, &dev->flags)) 3351 clear_bit(R5_Insync, &dev->flags); 3352 if (!test_bit(R5_Insync, &dev->flags)) { 3353 if (s->failed < 2) 3354 s->failed_num[s->failed] = i; 3355 s->failed++; 3356 if (rdev && !test_bit(Faulty, &rdev->flags)) 3357 do_recovery = 1; 3358 } 3359 } 3360 if (test_bit(STRIPE_SYNCING, 
&sh->state)) { 3361 /* If there is a failed device being replaced, 3362 * we must be recovering. 3363 * else if we are after recovery_cp, we must be syncing 3364 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 3365 * else we can only be replacing 3366 * sync and recovery both need to read all devices, and so 3367 * use the same flag. 3368 */ 3369 if (do_recovery || 3370 sh->sector >= conf->mddev->recovery_cp || 3371 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 3372 s->syncing = 1; 3373 else 3374 s->replacing = 1; 3375 } 3376 rcu_read_unlock(); 3377 } 3378 3379 static void handle_stripe(struct stripe_head *sh) 3380 { 3381 struct stripe_head_state s; 3382 struct r5conf *conf = sh->raid_conf; 3383 int i; 3384 int prexor; 3385 int disks = sh->disks; 3386 struct r5dev *pdev, *qdev; 3387 3388 clear_bit(STRIPE_HANDLE, &sh->state); 3389 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3390 /* already being handled, ensure it gets handled 3391 * again when current action finishes */ 3392 set_bit(STRIPE_HANDLE, &sh->state); 3393 return; 3394 } 3395 3396 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3397 set_bit(STRIPE_SYNCING, &sh->state); 3398 clear_bit(STRIPE_INSYNC, &sh->state); 3399 } 3400 clear_bit(STRIPE_DELAYED, &sh->state); 3401 3402 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3403 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3404 (unsigned long long)sh->sector, sh->state, 3405 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3406 sh->check_state, sh->reconstruct_state); 3407 3408 analyse_stripe(sh, &s); 3409 3410 if (s.handle_bad_blocks) { 3411 set_bit(STRIPE_HANDLE, &sh->state); 3412 goto finish; 3413 } 3414 3415 if (unlikely(s.blocked_rdev)) { 3416 if (s.syncing || s.expanding || s.expanded || 3417 s.replacing || s.to_write || s.written) { 3418 set_bit(STRIPE_HANDLE, &sh->state); 3419 goto finish; 3420 } 3421 /* There is nothing for the blocked_rdev to block */ 3422 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3423 s.blocked_rdev = NULL; 3424 } 3425 3426 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3427 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3428 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3429 } 3430 3431 pr_debug("locked=%d uptodate=%d to_read=%d" 3432 " to_write=%d failed=%d failed_num=%d,%d\n", 3433 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3434 s.failed_num[0], s.failed_num[1]); 3435 /* check if the array has lost more than max_degraded devices and, 3436 * if so, some requests might need to be failed. 
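 * (Illustrative numbers: max_degraded is 1 for RAID4/5 and 2 for RAID6,
 *  so a RAID5 stripe that has lost two devices, or a RAID6 stripe that
 *  has lost three, can no longer be reconstructed and its pending reads,
 *  writes and sync requests are failed below.)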
3437 */ 3438 if (s.failed > conf->max_degraded) { 3439 sh->check_state = 0; 3440 sh->reconstruct_state = 0; 3441 if (s.to_read+s.to_write+s.written) 3442 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3443 if (s.syncing + s.replacing) 3444 handle_failed_sync(conf, sh, &s); 3445 } 3446 3447 /* 3448 * might be able to return some write requests if the parity blocks 3449 * are safe, or on a failed drive 3450 */ 3451 pdev = &sh->dev[sh->pd_idx]; 3452 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3453 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3454 qdev = &sh->dev[sh->qd_idx]; 3455 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3456 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3457 || conf->level < 6; 3458 3459 if (s.written && 3460 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3461 && !test_bit(R5_LOCKED, &pdev->flags) 3462 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3463 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3464 && !test_bit(R5_LOCKED, &qdev->flags) 3465 && test_bit(R5_UPTODATE, &qdev->flags))))) 3466 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3467 3468 /* Now we might consider reading some blocks, either to check/generate 3469 * parity, or to satisfy requests 3470 * or to load a block that is being partially written. 3471 */ 3472 if (s.to_read || s.non_overwrite 3473 || (conf->level == 6 && s.to_write && s.failed) 3474 || (s.syncing && (s.uptodate + s.compute < disks)) 3475 || s.replacing 3476 || s.expanding) 3477 handle_stripe_fill(sh, &s, disks); 3478 3479 /* Now we check to see if any write operations have recently 3480 * completed 3481 */ 3482 prexor = 0; 3483 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3484 prexor = 1; 3485 if (sh->reconstruct_state == reconstruct_state_drain_result || 3486 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3487 sh->reconstruct_state = reconstruct_state_idle; 3488 3489 /* All the 'written' buffers and the parity block are ready to 3490 * be written back to disk 3491 */ 3492 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3493 BUG_ON(sh->qd_idx >= 0 && 3494 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); 3495 for (i = disks; i--; ) { 3496 struct r5dev *dev = &sh->dev[i]; 3497 if (test_bit(R5_LOCKED, &dev->flags) && 3498 (i == sh->pd_idx || i == sh->qd_idx || 3499 dev->written)) { 3500 pr_debug("Writing block %d\n", i); 3501 set_bit(R5_Wantwrite, &dev->flags); 3502 if (prexor) 3503 continue; 3504 if (!test_bit(R5_Insync, &dev->flags) || 3505 ((i == sh->pd_idx || i == sh->qd_idx) && 3506 s.failed == 0)) 3507 set_bit(STRIPE_INSYNC, &sh->state); 3508 } 3509 } 3510 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3511 s.dec_preread_active = 1; 3512 } 3513 3514 /* Now to consider new write requests and what else, if anything 3515 * should be read. We do not handle new writes when: 3516 * 1/ A 'write' operation (copy+xor) is already in flight. 3517 * 2/ A 'check' operation is in flight, as it may clobber the parity 3518 * block. 3519 */ 3520 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3521 handle_stripe_dirtying(conf, sh, &s, disks); 3522 3523 /* maybe we need to check and possibly fix the parity for this stripe 3524 * Any reads will already have been scheduled, so we just see if enough 3525 * data is available. The parity check is held off while parity 3526 * dependent operations are in flight. 
3527 */ 3528 if (sh->check_state || 3529 (s.syncing && s.locked == 0 && 3530 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3531 !test_bit(STRIPE_INSYNC, &sh->state))) { 3532 if (conf->level == 6) 3533 handle_parity_checks6(conf, sh, &s, disks); 3534 else 3535 handle_parity_checks5(conf, sh, &s, disks); 3536 } 3537 3538 if (s.replacing && s.locked == 0 3539 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3540 /* Write out to replacement devices where possible */ 3541 for (i = 0; i < conf->raid_disks; i++) 3542 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && 3543 test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3544 set_bit(R5_WantReplace, &sh->dev[i].flags); 3545 set_bit(R5_LOCKED, &sh->dev[i].flags); 3546 s.locked++; 3547 } 3548 set_bit(STRIPE_INSYNC, &sh->state); 3549 } 3550 if ((s.syncing || s.replacing) && s.locked == 0 && 3551 test_bit(STRIPE_INSYNC, &sh->state)) { 3552 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3553 clear_bit(STRIPE_SYNCING, &sh->state); 3554 } 3555 3556 /* If the failed drives are just a ReadError, then we might need 3557 * to progress the repair/check process 3558 */ 3559 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3560 for (i = 0; i < s.failed; i++) { 3561 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3562 if (test_bit(R5_ReadError, &dev->flags) 3563 && !test_bit(R5_LOCKED, &dev->flags) 3564 && test_bit(R5_UPTODATE, &dev->flags) 3565 ) { 3566 if (!test_bit(R5_ReWrite, &dev->flags)) { 3567 set_bit(R5_Wantwrite, &dev->flags); 3568 set_bit(R5_ReWrite, &dev->flags); 3569 set_bit(R5_LOCKED, &dev->flags); 3570 s.locked++; 3571 } else { 3572 /* let's read it back */ 3573 set_bit(R5_Wantread, &dev->flags); 3574 set_bit(R5_LOCKED, &dev->flags); 3575 s.locked++; 3576 } 3577 } 3578 } 3579 3580 3581 /* Finish reconstruct operations initiated by the expansion process */ 3582 if (sh->reconstruct_state == reconstruct_state_result) { 3583 struct stripe_head *sh_src 3584 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3585 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3586 /* sh cannot be written until sh_src has been read. 
3587 * so arrange for sh to be delayed a little 3588 */ 3589 set_bit(STRIPE_DELAYED, &sh->state); 3590 set_bit(STRIPE_HANDLE, &sh->state); 3591 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3592 &sh_src->state)) 3593 atomic_inc(&conf->preread_active_stripes); 3594 release_stripe(sh_src); 3595 goto finish; 3596 } 3597 if (sh_src) 3598 release_stripe(sh_src); 3599 3600 sh->reconstruct_state = reconstruct_state_idle; 3601 clear_bit(STRIPE_EXPANDING, &sh->state); 3602 for (i = conf->raid_disks; i--; ) { 3603 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3604 set_bit(R5_LOCKED, &sh->dev[i].flags); 3605 s.locked++; 3606 } 3607 } 3608 3609 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3610 !sh->reconstruct_state) { 3611 /* Need to write out all blocks after computing parity */ 3612 sh->disks = conf->raid_disks; 3613 stripe_set_idx(sh->sector, conf, 0, sh); 3614 schedule_reconstruction(sh, &s, 1, 1); 3615 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3616 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3617 atomic_dec(&conf->reshape_stripes); 3618 wake_up(&conf->wait_for_overlap); 3619 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3620 } 3621 3622 if (s.expanding && s.locked == 0 && 3623 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3624 handle_stripe_expansion(conf, sh); 3625 3626 finish: 3627 /* wait for this device to become unblocked */ 3628 if (unlikely(s.blocked_rdev)) { 3629 if (conf->mddev->external) 3630 md_wait_for_blocked_rdev(s.blocked_rdev, 3631 conf->mddev); 3632 else 3633 /* Internal metadata will immediately 3634 * be written by raid5d, so we don't 3635 * need to wait here. 3636 */ 3637 rdev_dec_pending(s.blocked_rdev, 3638 conf->mddev); 3639 } 3640 3641 if (s.handle_bad_blocks) 3642 for (i = disks; i--; ) { 3643 struct md_rdev *rdev; 3644 struct r5dev *dev = &sh->dev[i]; 3645 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3646 /* We own a safe reference to the rdev */ 3647 rdev = conf->disks[i].rdev; 3648 if (!rdev_set_badblocks(rdev, sh->sector, 3649 STRIPE_SECTORS, 0)) 3650 md_error(conf->mddev, rdev); 3651 rdev_dec_pending(rdev, conf->mddev); 3652 } 3653 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3654 rdev = conf->disks[i].rdev; 3655 rdev_clear_badblocks(rdev, sh->sector, 3656 STRIPE_SECTORS, 0); 3657 rdev_dec_pending(rdev, conf->mddev); 3658 } 3659 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3660 rdev = conf->disks[i].replacement; 3661 if (!rdev) 3662 /* rdev have been moved down */ 3663 rdev = conf->disks[i].rdev; 3664 rdev_clear_badblocks(rdev, sh->sector, 3665 STRIPE_SECTORS, 0); 3666 rdev_dec_pending(rdev, conf->mddev); 3667 } 3668 } 3669 3670 if (s.ops_request) 3671 raid_run_ops(sh, s.ops_request); 3672 3673 ops_run_io(sh, &s); 3674 3675 if (s.dec_preread_active) { 3676 /* We delay this until after ops_run_io so that if make_request 3677 * is waiting on a flush, it won't continue until the writes 3678 * have actually been submitted. 
3679 */ 3680 atomic_dec(&conf->preread_active_stripes); 3681 if (atomic_read(&conf->preread_active_stripes) < 3682 IO_THRESHOLD) 3683 md_wakeup_thread(conf->mddev->thread); 3684 } 3685 3686 return_io(s.return_bi); 3687 3688 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3689 } 3690 3691 static void raid5_activate_delayed(struct r5conf *conf) 3692 { 3693 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3694 while (!list_empty(&conf->delayed_list)) { 3695 struct list_head *l = conf->delayed_list.next; 3696 struct stripe_head *sh; 3697 sh = list_entry(l, struct stripe_head, lru); 3698 list_del_init(l); 3699 clear_bit(STRIPE_DELAYED, &sh->state); 3700 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3701 atomic_inc(&conf->preread_active_stripes); 3702 list_add_tail(&sh->lru, &conf->hold_list); 3703 } 3704 } 3705 } 3706 3707 static void activate_bit_delay(struct r5conf *conf) 3708 { 3709 /* device_lock is held */ 3710 struct list_head head; 3711 list_add(&head, &conf->bitmap_list); 3712 list_del_init(&conf->bitmap_list); 3713 while (!list_empty(&head)) { 3714 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3715 list_del_init(&sh->lru); 3716 atomic_inc(&sh->count); 3717 __release_stripe(conf, sh); 3718 } 3719 } 3720 3721 int md_raid5_congested(struct mddev *mddev, int bits) 3722 { 3723 struct r5conf *conf = mddev->private; 3724 3725 /* No difference between reads and writes. Just check 3726 * how busy the stripe_cache is 3727 */ 3728 3729 if (conf->inactive_blocked) 3730 return 1; 3731 if (conf->quiesce) 3732 return 1; 3733 if (list_empty_careful(&conf->inactive_list)) 3734 return 1; 3735 3736 return 0; 3737 } 3738 EXPORT_SYMBOL_GPL(md_raid5_congested); 3739 3740 static int raid5_congested(void *data, int bits) 3741 { 3742 struct mddev *mddev = data; 3743 3744 return mddev_congested(mddev, bits) || 3745 md_raid5_congested(mddev, bits); 3746 } 3747 3748 /* We want read requests to align with chunks where possible, 3749 * but write requests don't need to. 3750 */ 3751 static int raid5_mergeable_bvec(struct request_queue *q, 3752 struct bvec_merge_data *bvm, 3753 struct bio_vec *biovec) 3754 { 3755 struct mddev *mddev = q->queuedata; 3756 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3757 int max; 3758 unsigned int chunk_sectors = mddev->chunk_sectors; 3759 unsigned int bio_sectors = bvm->bi_size >> 9; 3760 3761 if ((bvm->bi_rw & 1) == WRITE) 3762 return biovec->bv_len; /* always allow writes to be mergeable */ 3763 3764 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3765 chunk_sectors = mddev->new_chunk_sectors; 3766 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3767 if (max < 0) max = 0; 3768 if (max <= biovec->bv_len && bio_sectors == 0) 3769 return biovec->bv_len; 3770 else 3771 return max; 3772 } 3773 3774 3775 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3776 { 3777 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3778 unsigned int chunk_sectors = mddev->chunk_sectors; 3779 unsigned int bio_sectors = bio->bi_size >> 9; 3780 3781 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3782 chunk_sectors = mddev->new_chunk_sectors; 3783 return chunk_sectors >= 3784 ((sector & (chunk_sectors - 1)) + bio_sectors); 3785 } 3786 3787 /* 3788 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3789 * later sampled by raid5d. 
3790 */
3791 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
3792 {
3793 unsigned long flags;
3794 
3795 spin_lock_irqsave(&conf->device_lock, flags);
3796 
3797 bi->bi_next = conf->retry_read_aligned_list;
3798 conf->retry_read_aligned_list = bi;
3799 
3800 spin_unlock_irqrestore(&conf->device_lock, flags);
3801 md_wakeup_thread(conf->mddev->thread);
3802 }
3803 
3804 
3805 static struct bio *remove_bio_from_retry(struct r5conf *conf)
3806 {
3807 struct bio *bi;
3808 
3809 bi = conf->retry_read_aligned;
3810 if (bi) {
3811 conf->retry_read_aligned = NULL;
3812 return bi;
3813 }
3814 bi = conf->retry_read_aligned_list;
3815 if(bi) {
3816 conf->retry_read_aligned_list = bi->bi_next;
3817 bi->bi_next = NULL;
3818 /*
3819 * this sets the active stripe count to 1 and the processed
3820 * stripe count to zero (upper 16 bits)
3821 */
3822 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
3823 }
3824 
3825 return bi;
3826 }
3827 
3828 
3829 /*
3830 * The "raid5_align_endio" should check if the read succeeded and if it
3831 * did, call bio_endio on the original bio (having bio_put the new bio
3832 * first).
3833 * If the read failed, queue the original bio for a retry via add_bio_to_retry().
3834 */
3835 static void raid5_align_endio(struct bio *bi, int error)
3836 {
3837 struct bio* raid_bi = bi->bi_private;
3838 struct mddev *mddev;
3839 struct r5conf *conf;
3840 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3841 struct md_rdev *rdev;
3842 
3843 bio_put(bi);
3844 
3845 rdev = (void*)raid_bi->bi_next;
3846 raid_bi->bi_next = NULL;
3847 mddev = rdev->mddev;
3848 conf = mddev->private;
3849 
3850 rdev_dec_pending(rdev, conf->mddev);
3851 
3852 if (!error && uptodate) {
3853 bio_endio(raid_bi, 0);
3854 if (atomic_dec_and_test(&conf->active_aligned_reads))
3855 wake_up(&conf->wait_for_stripe);
3856 return;
3857 }
3858 
3859 
3860 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3861 
3862 add_bio_to_retry(raid_bi, conf);
3863 }
3864 
3865 static int bio_fits_rdev(struct bio *bi)
3866 {
3867 struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3868 
3869 if ((bi->bi_size>>9) > queue_max_sectors(q))
3870 return 0;
3871 blk_recount_segments(q, bi);
3872 if (bi->bi_phys_segments > queue_max_segments(q))
3873 return 0;
3874 
3875 if (q->merge_bvec_fn)
3876 /* it's too hard to apply the merge_bvec_fn at this stage,
3877 * just give up
3878 */
3879 return 0;
3880 
3881 return 1;
3882 }
3883 
3884 
3885 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3886 {
3887 struct r5conf *conf = mddev->private;
3888 int dd_idx;
3889 struct bio* align_bi;
3890 struct md_rdev *rdev;
3891 sector_t end_sector;
3892 
3893 if (!in_chunk_boundary(mddev, raid_bio)) {
3894 pr_debug("chunk_aligned_read : non aligned\n");
3895 return 0;
3896 }
3897 /*
3898 * use bio_clone_mddev to make a copy of the bio
3899 */
3900 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
3901 if (!align_bi)
3902 return 0;
3903 /*
3904 * set bi_end_io to a new function, and set bi_private to the
3905 * original bio.
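 * so that raid5_align_endio() can find the original request and either
 * complete it or put it on the retry list if the read failed.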
3906 */ 3907 align_bi->bi_end_io = raid5_align_endio; 3908 align_bi->bi_private = raid_bio; 3909 /* 3910 * compute position 3911 */ 3912 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3913 0, 3914 &dd_idx, NULL); 3915 3916 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3917 rcu_read_lock(); 3918 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3919 if (!rdev || test_bit(Faulty, &rdev->flags) || 3920 rdev->recovery_offset < end_sector) { 3921 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3922 if (rdev && 3923 (test_bit(Faulty, &rdev->flags) || 3924 !(test_bit(In_sync, &rdev->flags) || 3925 rdev->recovery_offset >= end_sector))) 3926 rdev = NULL; 3927 } 3928 if (rdev) { 3929 sector_t first_bad; 3930 int bad_sectors; 3931 3932 atomic_inc(&rdev->nr_pending); 3933 rcu_read_unlock(); 3934 raid_bio->bi_next = (void*)rdev; 3935 align_bi->bi_bdev = rdev->bdev; 3936 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3937 3938 if (!bio_fits_rdev(align_bi) || 3939 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3940 &first_bad, &bad_sectors)) { 3941 /* too big in some way, or has a known bad block */ 3942 bio_put(align_bi); 3943 rdev_dec_pending(rdev, mddev); 3944 return 0; 3945 } 3946 3947 /* No reshape active, so we can trust rdev->data_offset */ 3948 align_bi->bi_sector += rdev->data_offset; 3949 3950 spin_lock_irq(&conf->device_lock); 3951 wait_event_lock_irq(conf->wait_for_stripe, 3952 conf->quiesce == 0, 3953 conf->device_lock, /* nothing */); 3954 atomic_inc(&conf->active_aligned_reads); 3955 spin_unlock_irq(&conf->device_lock); 3956 3957 generic_make_request(align_bi); 3958 return 1; 3959 } else { 3960 rcu_read_unlock(); 3961 bio_put(align_bi); 3962 return 0; 3963 } 3964 } 3965 3966 /* __get_priority_stripe - get the next stripe to process 3967 * 3968 * Full stripe writes are allowed to pass preread active stripes up until 3969 * the bypass_threshold is exceeded. In general the bypass_count 3970 * increments when the handle_list is handled before the hold_list; however, it 3971 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3972 * stripe with in flight i/o. The bypass_count will be reset when the 3973 * head of the hold_list has changed, i.e. the head was promoted to the 3974 * handle_list. 3975 */ 3976 static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 3977 { 3978 struct stripe_head *sh; 3979 3980 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3981 __func__, 3982 list_empty(&conf->handle_list) ? "empty" : "busy", 3983 list_empty(&conf->hold_list) ? 
"empty" : "busy", 3984 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3985 3986 if (!list_empty(&conf->handle_list)) { 3987 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3988 3989 if (list_empty(&conf->hold_list)) 3990 conf->bypass_count = 0; 3991 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3992 if (conf->hold_list.next == conf->last_hold) 3993 conf->bypass_count++; 3994 else { 3995 conf->last_hold = conf->hold_list.next; 3996 conf->bypass_count -= conf->bypass_threshold; 3997 if (conf->bypass_count < 0) 3998 conf->bypass_count = 0; 3999 } 4000 } 4001 } else if (!list_empty(&conf->hold_list) && 4002 ((conf->bypass_threshold && 4003 conf->bypass_count > conf->bypass_threshold) || 4004 atomic_read(&conf->pending_full_writes) == 0)) { 4005 sh = list_entry(conf->hold_list.next, 4006 typeof(*sh), lru); 4007 conf->bypass_count -= conf->bypass_threshold; 4008 if (conf->bypass_count < 0) 4009 conf->bypass_count = 0; 4010 } else 4011 return NULL; 4012 4013 list_del_init(&sh->lru); 4014 atomic_inc(&sh->count); 4015 BUG_ON(atomic_read(&sh->count) != 1); 4016 return sh; 4017 } 4018 4019 struct raid5_plug_cb { 4020 struct blk_plug_cb cb; 4021 struct list_head list; 4022 }; 4023 4024 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4025 { 4026 struct raid5_plug_cb *cb = container_of( 4027 blk_cb, struct raid5_plug_cb, cb); 4028 struct stripe_head *sh; 4029 struct mddev *mddev = cb->cb.data; 4030 struct r5conf *conf = mddev->private; 4031 4032 if (cb->list.next && !list_empty(&cb->list)) { 4033 spin_lock_irq(&conf->device_lock); 4034 while (!list_empty(&cb->list)) { 4035 sh = list_first_entry(&cb->list, struct stripe_head, lru); 4036 list_del_init(&sh->lru); 4037 /* 4038 * avoid race release_stripe_plug() sees 4039 * STRIPE_ON_UNPLUG_LIST clear but the stripe 4040 * is still in our list 4041 */ 4042 smp_mb__before_clear_bit(); 4043 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4044 __release_stripe(conf, sh); 4045 } 4046 spin_unlock_irq(&conf->device_lock); 4047 } 4048 kfree(cb); 4049 } 4050 4051 static void release_stripe_plug(struct mddev *mddev, 4052 struct stripe_head *sh) 4053 { 4054 struct blk_plug_cb *blk_cb = blk_check_plugged( 4055 raid5_unplug, mddev, 4056 sizeof(struct raid5_plug_cb)); 4057 struct raid5_plug_cb *cb; 4058 4059 if (!blk_cb) { 4060 release_stripe(sh); 4061 return; 4062 } 4063 4064 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4065 4066 if (cb->list.next == NULL) 4067 INIT_LIST_HEAD(&cb->list); 4068 4069 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4070 list_add_tail(&sh->lru, &cb->list); 4071 else 4072 release_stripe(sh); 4073 } 4074 4075 static void make_request(struct mddev *mddev, struct bio * bi) 4076 { 4077 struct r5conf *conf = mddev->private; 4078 int dd_idx; 4079 sector_t new_sector; 4080 sector_t logical_sector, last_sector; 4081 struct stripe_head *sh; 4082 const int rw = bio_data_dir(bi); 4083 int remaining; 4084 4085 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4086 md_flush_request(mddev, bi); 4087 return; 4088 } 4089 4090 md_write_start(mddev, bi); 4091 4092 if (rw == READ && 4093 mddev->reshape_position == MaxSector && 4094 chunk_aligned_read(mddev,bi)) 4095 return; 4096 4097 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4098 last_sector = bi->bi_sector + (bi->bi_size>>9); 4099 bi->bi_next = NULL; 4100 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4101 4102 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4103 DEFINE_WAIT(w); 
4104 int previous; 4105 4106 retry: 4107 previous = 0; 4108 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4109 if (unlikely(conf->reshape_progress != MaxSector)) { 4110 /* spinlock is needed as reshape_progress may be 4111 * 64bit on a 32bit platform, and so it might be 4112 * possible to see a half-updated value 4113 * Of course reshape_progress could change after 4114 * the lock is dropped, so once we get a reference 4115 * to the stripe that we think it is, we will have 4116 * to check again. 4117 */ 4118 spin_lock_irq(&conf->device_lock); 4119 if (mddev->reshape_backwards 4120 ? logical_sector < conf->reshape_progress 4121 : logical_sector >= conf->reshape_progress) { 4122 previous = 1; 4123 } else { 4124 if (mddev->reshape_backwards 4125 ? logical_sector < conf->reshape_safe 4126 : logical_sector >= conf->reshape_safe) { 4127 spin_unlock_irq(&conf->device_lock); 4128 schedule(); 4129 goto retry; 4130 } 4131 } 4132 spin_unlock_irq(&conf->device_lock); 4133 } 4134 4135 new_sector = raid5_compute_sector(conf, logical_sector, 4136 previous, 4137 &dd_idx, NULL); 4138 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4139 (unsigned long long)new_sector, 4140 (unsigned long long)logical_sector); 4141 4142 sh = get_active_stripe(conf, new_sector, previous, 4143 (bi->bi_rw&RWA_MASK), 0); 4144 if (sh) { 4145 if (unlikely(previous)) { 4146 /* expansion might have moved on while waiting for a 4147 * stripe, so we must do the range check again. 4148 * Expansion could still move past after this 4149 * test, but as we are holding a reference to 4150 * 'sh', we know that if that happens, 4151 * STRIPE_EXPANDING will get set and the expansion 4152 * won't proceed until we finish with the stripe. 4153 */ 4154 int must_retry = 0; 4155 spin_lock_irq(&conf->device_lock); 4156 if (mddev->reshape_backwards 4157 ? logical_sector >= conf->reshape_progress 4158 : logical_sector < conf->reshape_progress) 4159 /* mismatch, need to try again */ 4160 must_retry = 1; 4161 spin_unlock_irq(&conf->device_lock); 4162 if (must_retry) { 4163 release_stripe(sh); 4164 schedule(); 4165 goto retry; 4166 } 4167 } 4168 4169 if (rw == WRITE && 4170 logical_sector >= mddev->suspend_lo && 4171 logical_sector < mddev->suspend_hi) { 4172 release_stripe(sh); 4173 /* As the suspend_* range is controlled by 4174 * userspace, we want an interruptible 4175 * wait. 4176 */ 4177 flush_signals(current); 4178 prepare_to_wait(&conf->wait_for_overlap, 4179 &w, TASK_INTERRUPTIBLE); 4180 if (logical_sector >= mddev->suspend_lo && 4181 logical_sector < mddev->suspend_hi) 4182 schedule(); 4183 goto retry; 4184 } 4185 4186 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4187 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4188 /* Stripe is busy expanding or 4189 * add failed due to overlap. 
Flush everything 4190 * and wait a while 4191 */ 4192 md_wakeup_thread(mddev->thread); 4193 release_stripe(sh); 4194 schedule(); 4195 goto retry; 4196 } 4197 finish_wait(&conf->wait_for_overlap, &w); 4198 set_bit(STRIPE_HANDLE, &sh->state); 4199 clear_bit(STRIPE_DELAYED, &sh->state); 4200 if ((bi->bi_rw & REQ_SYNC) && 4201 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4202 atomic_inc(&conf->preread_active_stripes); 4203 release_stripe_plug(mddev, sh); 4204 } else { 4205 /* cannot get stripe for read-ahead, just give-up */ 4206 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4207 finish_wait(&conf->wait_for_overlap, &w); 4208 break; 4209 } 4210 } 4211 4212 remaining = raid5_dec_bi_active_stripes(bi); 4213 if (remaining == 0) { 4214 4215 if ( rw == WRITE ) 4216 md_write_end(mddev); 4217 4218 bio_endio(bi, 0); 4219 } 4220 } 4221 4222 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4223 4224 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4225 { 4226 /* reshaping is quite different to recovery/resync so it is 4227 * handled quite separately ... here. 4228 * 4229 * On each call to sync_request, we gather one chunk worth of 4230 * destination stripes and flag them as expanding. 4231 * Then we find all the source stripes and request reads. 4232 * As the reads complete, handle_stripe will copy the data 4233 * into the destination stripe and release that stripe. 4234 */ 4235 struct r5conf *conf = mddev->private; 4236 struct stripe_head *sh; 4237 sector_t first_sector, last_sector; 4238 int raid_disks = conf->previous_raid_disks; 4239 int data_disks = raid_disks - conf->max_degraded; 4240 int new_data_disks = conf->raid_disks - conf->max_degraded; 4241 int i; 4242 int dd_idx; 4243 sector_t writepos, readpos, safepos; 4244 sector_t stripe_addr; 4245 int reshape_sectors; 4246 struct list_head stripes; 4247 4248 if (sector_nr == 0) { 4249 /* If restarting in the middle, skip the initial sectors */ 4250 if (mddev->reshape_backwards && 4251 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4252 sector_nr = raid5_size(mddev, 0, 0) 4253 - conf->reshape_progress; 4254 } else if (!mddev->reshape_backwards && 4255 conf->reshape_progress > 0) 4256 sector_nr = conf->reshape_progress; 4257 sector_div(sector_nr, new_data_disks); 4258 if (sector_nr) { 4259 mddev->curr_resync_completed = sector_nr; 4260 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4261 *skipped = 1; 4262 return sector_nr; 4263 } 4264 } 4265 4266 /* We need to process a full chunk at a time. 4267 * If old and new chunk sizes differ, we need to process the 4268 * largest of these 4269 */ 4270 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4271 reshape_sectors = mddev->new_chunk_sectors; 4272 else 4273 reshape_sectors = mddev->chunk_sectors; 4274 4275 /* We update the metadata at least every 10 seconds, or when 4276 * the data about to be copied would over-write the source of 4277 * the data at the front of the range. i.e. 
one new_stripe 4278 * along from reshape_progress new_maps to after where 4279 * reshape_safe old_maps to 4280 */ 4281 writepos = conf->reshape_progress; 4282 sector_div(writepos, new_data_disks); 4283 readpos = conf->reshape_progress; 4284 sector_div(readpos, data_disks); 4285 safepos = conf->reshape_safe; 4286 sector_div(safepos, data_disks); 4287 if (mddev->reshape_backwards) { 4288 writepos -= min_t(sector_t, reshape_sectors, writepos); 4289 readpos += reshape_sectors; 4290 safepos += reshape_sectors; 4291 } else { 4292 writepos += reshape_sectors; 4293 readpos -= min_t(sector_t, reshape_sectors, readpos); 4294 safepos -= min_t(sector_t, reshape_sectors, safepos); 4295 } 4296 4297 /* Having calculated the 'writepos' possibly use it 4298 * to set 'stripe_addr' which is where we will write to. 4299 */ 4300 if (mddev->reshape_backwards) { 4301 BUG_ON(conf->reshape_progress == 0); 4302 stripe_addr = writepos; 4303 BUG_ON((mddev->dev_sectors & 4304 ~((sector_t)reshape_sectors - 1)) 4305 - reshape_sectors - stripe_addr 4306 != sector_nr); 4307 } else { 4308 BUG_ON(writepos != sector_nr + reshape_sectors); 4309 stripe_addr = sector_nr; 4310 } 4311 4312 /* 'writepos' is the most advanced device address we might write. 4313 * 'readpos' is the least advanced device address we might read. 4314 * 'safepos' is the least address recorded in the metadata as having 4315 * been reshaped. 4316 * If there is a min_offset_diff, these are adjusted either by 4317 * increasing the safepos/readpos if diff is negative, or 4318 * increasing writepos if diff is positive. 4319 * If 'readpos' is then behind 'writepos', there is no way that we can 4320 * ensure safety in the face of a crash - that must be done by userspace 4321 * making a backup of the data. So in that case there is no particular 4322 * rush to update metadata. 4323 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4324 * update the metadata to advance 'safepos' to match 'readpos' so that 4325 * we can be safe in the event of a crash. 4326 * So we insist on updating metadata if safepos is behind writepos and 4327 * readpos is beyond writepos. 4328 * In any case, update the metadata every 10 seconds. 4329 * Maybe that number should be configurable, but I'm not sure it is 4330 * worth it.... maybe it could be a multiple of safemode_delay??? 4331 */ 4332 if (conf->min_offset_diff < 0) { 4333 safepos += -conf->min_offset_diff; 4334 readpos += -conf->min_offset_diff; 4335 } else 4336 writepos += conf->min_offset_diff; 4337 4338 if ((mddev->reshape_backwards 4339 ? (safepos > writepos && readpos < writepos) 4340 : (safepos < writepos && readpos > writepos)) || 4341 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4342 /* Cannot proceed until we've updated the superblock... 
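 * so wait below for the in-flight reshape stripes to drain, record
 * reshape_progress in the superblock, and only advance reshape_safe once
 * md has written it out.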
*/ 4343 wait_event(conf->wait_for_overlap, 4344 atomic_read(&conf->reshape_stripes)==0); 4345 mddev->reshape_position = conf->reshape_progress; 4346 mddev->curr_resync_completed = sector_nr; 4347 conf->reshape_checkpoint = jiffies; 4348 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4349 md_wakeup_thread(mddev->thread); 4350 wait_event(mddev->sb_wait, mddev->flags == 0 || 4351 kthread_should_stop()); 4352 spin_lock_irq(&conf->device_lock); 4353 conf->reshape_safe = mddev->reshape_position; 4354 spin_unlock_irq(&conf->device_lock); 4355 wake_up(&conf->wait_for_overlap); 4356 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4357 } 4358 4359 INIT_LIST_HEAD(&stripes); 4360 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4361 int j; 4362 int skipped_disk = 0; 4363 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4364 set_bit(STRIPE_EXPANDING, &sh->state); 4365 atomic_inc(&conf->reshape_stripes); 4366 /* If any of this stripe is beyond the end of the old 4367 * array, then we need to zero those blocks 4368 */ 4369 for (j=sh->disks; j--;) { 4370 sector_t s; 4371 if (j == sh->pd_idx) 4372 continue; 4373 if (conf->level == 6 && 4374 j == sh->qd_idx) 4375 continue; 4376 s = compute_blocknr(sh, j, 0); 4377 if (s < raid5_size(mddev, 0, 0)) { 4378 skipped_disk = 1; 4379 continue; 4380 } 4381 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4382 set_bit(R5_Expanded, &sh->dev[j].flags); 4383 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4384 } 4385 if (!skipped_disk) { 4386 set_bit(STRIPE_EXPAND_READY, &sh->state); 4387 set_bit(STRIPE_HANDLE, &sh->state); 4388 } 4389 list_add(&sh->lru, &stripes); 4390 } 4391 spin_lock_irq(&conf->device_lock); 4392 if (mddev->reshape_backwards) 4393 conf->reshape_progress -= reshape_sectors * new_data_disks; 4394 else 4395 conf->reshape_progress += reshape_sectors * new_data_disks; 4396 spin_unlock_irq(&conf->device_lock); 4397 /* Ok, those stripe are ready. We can start scheduling 4398 * reads on the source stripes. 4399 * The source stripes are determined by mapping the first and last 4400 * block on the destination stripes. 4401 */ 4402 first_sector = 4403 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4404 1, &dd_idx, NULL); 4405 last_sector = 4406 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4407 * new_data_disks - 1), 4408 1, &dd_idx, NULL); 4409 if (last_sector >= mddev->dev_sectors) 4410 last_sector = mddev->dev_sectors - 1; 4411 while (first_sector <= last_sector) { 4412 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4413 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4414 set_bit(STRIPE_HANDLE, &sh->state); 4415 release_stripe(sh); 4416 first_sector += STRIPE_SECTORS; 4417 } 4418 /* Now that the sources are clearly marked, we can release 4419 * the destination stripes 4420 */ 4421 while (!list_empty(&stripes)) { 4422 sh = list_entry(stripes.next, struct stripe_head, lru); 4423 list_del_init(&sh->lru); 4424 release_stripe(sh); 4425 } 4426 /* If this takes us to the resync_max point where we have to pause, 4427 * then we need to write out the superblock. 4428 */ 4429 sector_nr += reshape_sectors; 4430 if ((sector_nr - mddev->curr_resync_completed) * 2 4431 >= mddev->resync_max - mddev->curr_resync_completed) { 4432 /* Cannot proceed until we've updated the superblock... 
*/ 4433 wait_event(conf->wait_for_overlap, 4434 atomic_read(&conf->reshape_stripes) == 0); 4435 mddev->reshape_position = conf->reshape_progress; 4436 mddev->curr_resync_completed = sector_nr; 4437 conf->reshape_checkpoint = jiffies; 4438 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4439 md_wakeup_thread(mddev->thread); 4440 wait_event(mddev->sb_wait, 4441 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4442 || kthread_should_stop()); 4443 spin_lock_irq(&conf->device_lock); 4444 conf->reshape_safe = mddev->reshape_position; 4445 spin_unlock_irq(&conf->device_lock); 4446 wake_up(&conf->wait_for_overlap); 4447 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4448 } 4449 return reshape_sectors; 4450 } 4451 4452 /* FIXME go_faster isn't used */ 4453 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4454 { 4455 struct r5conf *conf = mddev->private; 4456 struct stripe_head *sh; 4457 sector_t max_sector = mddev->dev_sectors; 4458 sector_t sync_blocks; 4459 int still_degraded = 0; 4460 int i; 4461 4462 if (sector_nr >= max_sector) { 4463 /* just being told to finish up .. nothing much to do */ 4464 4465 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4466 end_reshape(conf); 4467 return 0; 4468 } 4469 4470 if (mddev->curr_resync < max_sector) /* aborted */ 4471 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4472 &sync_blocks, 1); 4473 else /* completed sync */ 4474 conf->fullsync = 0; 4475 bitmap_close_sync(mddev->bitmap); 4476 4477 return 0; 4478 } 4479 4480 /* Allow raid5_quiesce to complete */ 4481 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4482 4483 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4484 return reshape_request(mddev, sector_nr, skipped); 4485 4486 /* No need to check resync_max as we never do more than one 4487 * stripe, and as resync_max will always be on a chunk boundary, 4488 * if the check in md_do_sync didn't fire, there is no chance 4489 * of overstepping resync_max here 4490 */ 4491 4492 /* if there is too many failed drives and we are trying 4493 * to resync, then assert that we are finished, because there is 4494 * nothing we can do. 4495 */ 4496 if (mddev->degraded >= conf->max_degraded && 4497 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4498 sector_t rv = mddev->dev_sectors - sector_nr; 4499 *skipped = 1; 4500 return rv; 4501 } 4502 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4503 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4504 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4505 /* we can skip this block, and probably more */ 4506 sync_blocks /= STRIPE_SECTORS; 4507 *skipped = 1; 4508 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4509 } 4510 4511 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4512 4513 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4514 if (sh == NULL) { 4515 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4516 /* make sure we don't swamp the stripe cache if someone else 4517 * is trying to get access 4518 */ 4519 schedule_timeout_uninterruptible(1); 4520 } 4521 /* Need to check if array will still be degraded after recovery/resync 4522 * We don't need to check the 'failed' flag as when that gets set, 4523 * recovery aborts. 
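 * If any slot has no device at all, the array is still degraded and
 * bitmap_start_sync() must be told, so that bits needed by a later
 * recovery are not cleared.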
4524 */
4525 for (i = 0; i < conf->raid_disks; i++)
4526 if (conf->disks[i].rdev == NULL)
4527 still_degraded = 1;
4528 
4529 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4530 
4531 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4532 
4533 handle_stripe(sh);
4534 release_stripe(sh);
4535 
4536 return STRIPE_SECTORS;
4537 }
4538 
4539 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4540 {
4541 /* We may not be able to submit a whole bio at once as there
4542 * may not be enough stripe_heads available.
4543 * We cannot pre-allocate enough stripe_heads as we may need
4544 * more than exist in the cache (if we allow ever larger chunks).
4545 * So we do one stripe head at a time and record in
4546 * ->bi_phys_segments how many have been done.
4547 *
4548 * We *know* that this entire raid_bio is in one chunk, so
4549 * there will be only one 'dd_idx' and we only need one call to raid5_compute_sector.
4550 */
4551 struct stripe_head *sh;
4552 int dd_idx;
4553 sector_t sector, logical_sector, last_sector;
4554 int scnt = 0;
4555 int remaining;
4556 int handled = 0;
4557 
4558 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4559 sector = raid5_compute_sector(conf, logical_sector,
4560 0, &dd_idx, NULL);
4561 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
4562 
4563 for (; logical_sector < last_sector;
4564 logical_sector += STRIPE_SECTORS,
4565 sector += STRIPE_SECTORS,
4566 scnt++) {
4567 
4568 if (scnt < raid5_bi_processed_stripes(raid_bio))
4569 /* already done this stripe */
4570 continue;
4571 
4572 sh = get_active_stripe(conf, sector, 0, 1, 0);
4573 
4574 if (!sh) {
4575 /* failed to get a stripe - must wait */
4576 raid5_set_bi_processed_stripes(raid_bio, scnt);
4577 conf->retry_read_aligned = raid_bio;
4578 return handled;
4579 }
4580 
4581 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4582 release_stripe(sh);
4583 raid5_set_bi_processed_stripes(raid_bio, scnt);
4584 conf->retry_read_aligned = raid_bio;
4585 return handled;
4586 }
4587 
4588 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
4589 handle_stripe(sh);
4590 release_stripe(sh);
4591 handled++;
4592 }
4593 remaining = raid5_dec_bi_active_stripes(raid_bio);
4594 if (remaining == 0)
4595 bio_endio(raid_bio, 0);
4596 if (atomic_dec_and_test(&conf->active_aligned_reads))
4597 wake_up(&conf->wait_for_stripe);
4598 return handled;
4599 }
4600 
4601 #define MAX_STRIPE_BATCH 8
4602 static int handle_active_stripes(struct r5conf *conf)
4603 {
4604 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4605 int i, batch_size = 0;
4606 
4607 while (batch_size < MAX_STRIPE_BATCH &&
4608 (sh = __get_priority_stripe(conf)) != NULL)
4609 batch[batch_size++] = sh;
4610 
4611 if (batch_size == 0)
4612 return batch_size;
4613 spin_unlock_irq(&conf->device_lock);
4614 
4615 for (i = 0; i < batch_size; i++)
4616 handle_stripe(batch[i]);
4617 
4618 cond_resched();
4619 
4620 spin_lock_irq(&conf->device_lock);
4621 for (i = 0; i < batch_size; i++)
4622 __release_stripe(conf, batch[i]);
4623 return batch_size;
4624 }
4625 
4626 /*
4627 * This is our raid5 kernel thread.
4628 *
4629 * We scan the hash table for stripes which can be handled now.
4630 * During the scan, completed stripes are saved for us by the interrupt
4631 * handler, so that they will not have to wait for our next wakeup.
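 * Stripes are collected under device_lock in batches of up to
 * MAX_STRIPE_BATCH and handled with the lock dropped; see
 * handle_active_stripes() above.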
4632 */ 4633 static void raid5d(struct mddev *mddev) 4634 { 4635 struct r5conf *conf = mddev->private; 4636 int handled; 4637 struct blk_plug plug; 4638 4639 pr_debug("+++ raid5d active\n"); 4640 4641 md_check_recovery(mddev); 4642 4643 blk_start_plug(&plug); 4644 handled = 0; 4645 spin_lock_irq(&conf->device_lock); 4646 while (1) { 4647 struct bio *bio; 4648 int batch_size; 4649 4650 if ( 4651 !list_empty(&conf->bitmap_list)) { 4652 /* Now is a good time to flush some bitmap updates */ 4653 conf->seq_flush++; 4654 spin_unlock_irq(&conf->device_lock); 4655 bitmap_unplug(mddev->bitmap); 4656 spin_lock_irq(&conf->device_lock); 4657 conf->seq_write = conf->seq_flush; 4658 activate_bit_delay(conf); 4659 } 4660 raid5_activate_delayed(conf); 4661 4662 while ((bio = remove_bio_from_retry(conf))) { 4663 int ok; 4664 spin_unlock_irq(&conf->device_lock); 4665 ok = retry_aligned_read(conf, bio); 4666 spin_lock_irq(&conf->device_lock); 4667 if (!ok) 4668 break; 4669 handled++; 4670 } 4671 4672 batch_size = handle_active_stripes(conf); 4673 if (!batch_size) 4674 break; 4675 handled += batch_size; 4676 4677 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 4678 spin_unlock_irq(&conf->device_lock); 4679 md_check_recovery(mddev); 4680 spin_lock_irq(&conf->device_lock); 4681 } 4682 } 4683 pr_debug("%d stripes handled\n", handled); 4684 4685 spin_unlock_irq(&conf->device_lock); 4686 4687 async_tx_issue_pending_all(); 4688 blk_finish_plug(&plug); 4689 4690 pr_debug("--- raid5d inactive\n"); 4691 } 4692 4693 static ssize_t 4694 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4695 { 4696 struct r5conf *conf = mddev->private; 4697 if (conf) 4698 return sprintf(page, "%d\n", conf->max_nr_stripes); 4699 else 4700 return 0; 4701 } 4702 4703 int 4704 raid5_set_cache_size(struct mddev *mddev, int size) 4705 { 4706 struct r5conf *conf = mddev->private; 4707 int err; 4708 4709 if (size <= 16 || size > 32768) 4710 return -EINVAL; 4711 while (size < conf->max_nr_stripes) { 4712 if (drop_one_stripe(conf)) 4713 conf->max_nr_stripes--; 4714 else 4715 break; 4716 } 4717 err = md_allow_write(mddev); 4718 if (err) 4719 return err; 4720 while (size > conf->max_nr_stripes) { 4721 if (grow_one_stripe(conf)) 4722 conf->max_nr_stripes++; 4723 else break; 4724 } 4725 return 0; 4726 } 4727 EXPORT_SYMBOL(raid5_set_cache_size); 4728 4729 static ssize_t 4730 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4731 { 4732 struct r5conf *conf = mddev->private; 4733 unsigned long new; 4734 int err; 4735 4736 if (len >= PAGE_SIZE) 4737 return -EINVAL; 4738 if (!conf) 4739 return -ENODEV; 4740 4741 if (strict_strtoul(page, 10, &new)) 4742 return -EINVAL; 4743 err = raid5_set_cache_size(mddev, new); 4744 if (err) 4745 return err; 4746 return len; 4747 } 4748 4749 static struct md_sysfs_entry 4750 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4751 raid5_show_stripe_cache_size, 4752 raid5_store_stripe_cache_size); 4753 4754 static ssize_t 4755 raid5_show_preread_threshold(struct mddev *mddev, char *page) 4756 { 4757 struct r5conf *conf = mddev->private; 4758 if (conf) 4759 return sprintf(page, "%d\n", conf->bypass_threshold); 4760 else 4761 return 0; 4762 } 4763 4764 static ssize_t 4765 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4766 { 4767 struct r5conf *conf = mddev->private; 4768 unsigned long new; 4769 if (len >= PAGE_SIZE) 4770 return -EINVAL; 4771 if (!conf) 4772 return -ENODEV; 4773 4774 if (strict_strtoul(page, 10, &new)) 4775 return 
-EINVAL; 4776 if (new > conf->max_nr_stripes) 4777 return -EINVAL; 4778 conf->bypass_threshold = new; 4779 return len; 4780 } 4781 4782 static struct md_sysfs_entry 4783 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4784 S_IRUGO | S_IWUSR, 4785 raid5_show_preread_threshold, 4786 raid5_store_preread_threshold); 4787 4788 static ssize_t 4789 stripe_cache_active_show(struct mddev *mddev, char *page) 4790 { 4791 struct r5conf *conf = mddev->private; 4792 if (conf) 4793 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4794 else 4795 return 0; 4796 } 4797 4798 static struct md_sysfs_entry 4799 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4800 4801 static struct attribute *raid5_attrs[] = { 4802 &raid5_stripecache_size.attr, 4803 &raid5_stripecache_active.attr, 4804 &raid5_preread_bypass_threshold.attr, 4805 NULL, 4806 }; 4807 static struct attribute_group raid5_attrs_group = { 4808 .name = NULL, 4809 .attrs = raid5_attrs, 4810 }; 4811 4812 static sector_t 4813 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4814 { 4815 struct r5conf *conf = mddev->private; 4816 4817 if (!sectors) 4818 sectors = mddev->dev_sectors; 4819 if (!raid_disks) 4820 /* size is defined by the smallest of previous and new size */ 4821 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4822 4823 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4824 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4825 return sectors * (raid_disks - conf->max_degraded); 4826 } 4827 4828 static void raid5_free_percpu(struct r5conf *conf) 4829 { 4830 struct raid5_percpu *percpu; 4831 unsigned long cpu; 4832 4833 if (!conf->percpu) 4834 return; 4835 4836 get_online_cpus(); 4837 for_each_possible_cpu(cpu) { 4838 percpu = per_cpu_ptr(conf->percpu, cpu); 4839 safe_put_page(percpu->spare_page); 4840 kfree(percpu->scribble); 4841 } 4842 #ifdef CONFIG_HOTPLUG_CPU 4843 unregister_cpu_notifier(&conf->cpu_notify); 4844 #endif 4845 put_online_cpus(); 4846 4847 free_percpu(conf->percpu); 4848 } 4849 4850 static void free_conf(struct r5conf *conf) 4851 { 4852 shrink_stripes(conf); 4853 raid5_free_percpu(conf); 4854 kfree(conf->disks); 4855 kfree(conf->stripe_hashtbl); 4856 kfree(conf); 4857 } 4858 4859 #ifdef CONFIG_HOTPLUG_CPU 4860 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4861 void *hcpu) 4862 { 4863 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 4864 long cpu = (long)hcpu; 4865 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4866 4867 switch (action) { 4868 case CPU_UP_PREPARE: 4869 case CPU_UP_PREPARE_FROZEN: 4870 if (conf->level == 6 && !percpu->spare_page) 4871 percpu->spare_page = alloc_page(GFP_KERNEL); 4872 if (!percpu->scribble) 4873 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4874 4875 if (!percpu->scribble || 4876 (conf->level == 6 && !percpu->spare_page)) { 4877 safe_put_page(percpu->spare_page); 4878 kfree(percpu->scribble); 4879 pr_err("%s: failed memory allocation for cpu%ld\n", 4880 __func__, cpu); 4881 return notifier_from_errno(-ENOMEM); 4882 } 4883 break; 4884 case CPU_DEAD: 4885 case CPU_DEAD_FROZEN: 4886 safe_put_page(percpu->spare_page); 4887 kfree(percpu->scribble); 4888 percpu->spare_page = NULL; 4889 percpu->scribble = NULL; 4890 break; 4891 default: 4892 break; 4893 } 4894 return NOTIFY_OK; 4895 } 4896 #endif 4897 4898 static int raid5_alloc_percpu(struct r5conf *conf) 4899 { 4900 unsigned long cpu; 4901 struct page *spare_page; 4902 struct raid5_percpu 
__percpu *allcpus; 4903 void *scribble; 4904 int err; 4905 4906 allcpus = alloc_percpu(struct raid5_percpu); 4907 if (!allcpus) 4908 return -ENOMEM; 4909 conf->percpu = allcpus; 4910 4911 get_online_cpus(); 4912 err = 0; 4913 for_each_present_cpu(cpu) { 4914 if (conf->level == 6) { 4915 spare_page = alloc_page(GFP_KERNEL); 4916 if (!spare_page) { 4917 err = -ENOMEM; 4918 break; 4919 } 4920 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4921 } 4922 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4923 if (!scribble) { 4924 err = -ENOMEM; 4925 break; 4926 } 4927 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4928 } 4929 #ifdef CONFIG_HOTPLUG_CPU 4930 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4931 conf->cpu_notify.priority = 0; 4932 if (err == 0) 4933 err = register_cpu_notifier(&conf->cpu_notify); 4934 #endif 4935 put_online_cpus(); 4936 4937 return err; 4938 } 4939 4940 static struct r5conf *setup_conf(struct mddev *mddev) 4941 { 4942 struct r5conf *conf; 4943 int raid_disk, memory, max_disks; 4944 struct md_rdev *rdev; 4945 struct disk_info *disk; 4946 char pers_name[6]; 4947 4948 if (mddev->new_level != 5 4949 && mddev->new_level != 4 4950 && mddev->new_level != 6) { 4951 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4952 mdname(mddev), mddev->new_level); 4953 return ERR_PTR(-EIO); 4954 } 4955 if ((mddev->new_level == 5 4956 && !algorithm_valid_raid5(mddev->new_layout)) || 4957 (mddev->new_level == 6 4958 && !algorithm_valid_raid6(mddev->new_layout))) { 4959 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4960 mdname(mddev), mddev->new_layout); 4961 return ERR_PTR(-EIO); 4962 } 4963 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4964 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4965 mdname(mddev), mddev->raid_disks); 4966 return ERR_PTR(-EINVAL); 4967 } 4968 4969 if (!mddev->new_chunk_sectors || 4970 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4971 !is_power_of_2(mddev->new_chunk_sectors)) { 4972 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4973 mdname(mddev), mddev->new_chunk_sectors << 9); 4974 return ERR_PTR(-EINVAL); 4975 } 4976 4977 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 4978 if (conf == NULL) 4979 goto abort; 4980 spin_lock_init(&conf->device_lock); 4981 init_waitqueue_head(&conf->wait_for_stripe); 4982 init_waitqueue_head(&conf->wait_for_overlap); 4983 INIT_LIST_HEAD(&conf->handle_list); 4984 INIT_LIST_HEAD(&conf->hold_list); 4985 INIT_LIST_HEAD(&conf->delayed_list); 4986 INIT_LIST_HEAD(&conf->bitmap_list); 4987 INIT_LIST_HEAD(&conf->inactive_list); 4988 atomic_set(&conf->active_stripes, 0); 4989 atomic_set(&conf->preread_active_stripes, 0); 4990 atomic_set(&conf->active_aligned_reads, 0); 4991 conf->bypass_threshold = BYPASS_THRESHOLD; 4992 conf->recovery_disabled = mddev->recovery_disabled - 1; 4993 4994 conf->raid_disks = mddev->raid_disks; 4995 if (mddev->reshape_position == MaxSector) 4996 conf->previous_raid_disks = mddev->raid_disks; 4997 else 4998 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4999 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 5000 conf->scribble_len = scribble_len(max_disks); 5001 5002 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 5003 GFP_KERNEL); 5004 if (!conf->disks) 5005 goto abort; 5006 5007 conf->mddev = mddev; 5008 5009 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5010 goto abort; 5011 5012 conf->level = mddev->new_level; 5013 if 
(raid5_alloc_percpu(conf) != 0) 5014 goto abort; 5015 5016 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 5017 5018 rdev_for_each(rdev, mddev) { 5019 raid_disk = rdev->raid_disk; 5020 if (raid_disk >= max_disks 5021 || raid_disk < 0) 5022 continue; 5023 disk = conf->disks + raid_disk; 5024 5025 if (test_bit(Replacement, &rdev->flags)) { 5026 if (disk->replacement) 5027 goto abort; 5028 disk->replacement = rdev; 5029 } else { 5030 if (disk->rdev) 5031 goto abort; 5032 disk->rdev = rdev; 5033 } 5034 5035 if (test_bit(In_sync, &rdev->flags)) { 5036 char b[BDEVNAME_SIZE]; 5037 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 5038 " disk %d\n", 5039 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 5040 } else if (rdev->saved_raid_disk != raid_disk) 5041 /* Cannot rely on bitmap to complete recovery */ 5042 conf->fullsync = 1; 5043 } 5044 5045 conf->chunk_sectors = mddev->new_chunk_sectors; 5046 conf->level = mddev->new_level; 5047 if (conf->level == 6) 5048 conf->max_degraded = 2; 5049 else 5050 conf->max_degraded = 1; 5051 conf->algorithm = mddev->new_layout; 5052 conf->max_nr_stripes = NR_STRIPES; 5053 conf->reshape_progress = mddev->reshape_position; 5054 if (conf->reshape_progress != MaxSector) { 5055 conf->prev_chunk_sectors = mddev->chunk_sectors; 5056 conf->prev_algo = mddev->layout; 5057 } 5058 5059 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5060 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5061 if (grow_stripes(conf, conf->max_nr_stripes)) { 5062 printk(KERN_ERR 5063 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5064 mdname(mddev), memory); 5065 goto abort; 5066 } else 5067 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 5068 mdname(mddev), memory); 5069 5070 sprintf(pers_name, "raid%d", mddev->new_level); 5071 conf->thread = md_register_thread(raid5d, mddev, pers_name); 5072 if (!conf->thread) { 5073 printk(KERN_ERR 5074 "md/raid:%s: couldn't allocate thread.\n", 5075 mdname(mddev)); 5076 goto abort; 5077 } 5078 5079 return conf; 5080 5081 abort: 5082 if (conf) { 5083 free_conf(conf); 5084 return ERR_PTR(-EIO); 5085 } else 5086 return ERR_PTR(-ENOMEM); 5087 } 5088 5089 5090 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 5091 { 5092 switch (algo) { 5093 case ALGORITHM_PARITY_0: 5094 if (raid_disk < max_degraded) 5095 return 1; 5096 break; 5097 case ALGORITHM_PARITY_N: 5098 if (raid_disk >= raid_disks - max_degraded) 5099 return 1; 5100 break; 5101 case ALGORITHM_PARITY_0_6: 5102 if (raid_disk == 0 || 5103 raid_disk == raid_disks - 1) 5104 return 1; 5105 break; 5106 case ALGORITHM_LEFT_ASYMMETRIC_6: 5107 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5108 case ALGORITHM_LEFT_SYMMETRIC_6: 5109 case ALGORITHM_RIGHT_SYMMETRIC_6: 5110 if (raid_disk == raid_disks - 1) 5111 return 1; 5112 } 5113 return 0; 5114 } 5115 5116 static int run(struct mddev *mddev) 5117 { 5118 struct r5conf *conf; 5119 int working_disks = 0; 5120 int dirty_parity_disks = 0; 5121 struct md_rdev *rdev; 5122 sector_t reshape_offset = 0; 5123 int i; 5124 long long min_offset_diff = 0; 5125 int first = 1; 5126 5127 if (mddev->recovery_cp != MaxSector) 5128 printk(KERN_NOTICE "md/raid:%s: not clean" 5129 " -- starting background reconstruction\n", 5130 mdname(mddev)); 5131 5132 rdev_for_each(rdev, mddev) { 5133 long long diff; 5134 if (rdev->raid_disk < 0) 5135 continue; 5136 diff = (rdev->new_data_offset - rdev->data_offset); 5137 if (first) { 5138 min_offset_diff = diff; 5139 first = 0; 5140 } else if (mddev->reshape_backwards && 5141 diff 
< min_offset_diff)
5142 min_offset_diff = diff;
5143 else if (!mddev->reshape_backwards &&
5144 diff > min_offset_diff)
5145 min_offset_diff = diff;
5146 }
5147 
5148 if (mddev->reshape_position != MaxSector) {
5149 /* Check that we can continue the reshape.
5150 * Difficulties arise if the stripe we would write to
5151 * next is at or after the stripe we would read from next.
5152 * For a reshape that changes the number of devices, this
5153 * is only possible for a very short time, and mdadm makes
5154 * sure that time appears to have passed before assembling
5155 * the array. So we fail if that time hasn't passed.
5156 * For a reshape that keeps the number of devices the same
5157 * mdadm must be monitoring the reshape and keeping the
5158 * critical areas read-only and backed up. It will start
5159 * the array in read-only mode, so we check for that.
5160 */
5161 sector_t here_new, here_old;
5162 int old_disks;
5163 int max_degraded = (mddev->level == 6 ? 2 : 1);
5164 
5165 if (mddev->new_level != mddev->level) {
5166 printk(KERN_ERR "md/raid:%s: unsupported reshape "
5167 "required - aborting.\n",
5168 mdname(mddev));
5169 return -EINVAL;
5170 }
5171 old_disks = mddev->raid_disks - mddev->delta_disks;
5172 /* reshape_position must be on a new-stripe boundary, and one
5173 * further up in new geometry must map after here in old
5174 * geometry.
5175 */
5176 here_new = mddev->reshape_position;
5177 if (sector_div(here_new, mddev->new_chunk_sectors *
5178 (mddev->raid_disks - max_degraded))) {
5179 printk(KERN_ERR "md/raid:%s: reshape_position not "
5180 "on a stripe boundary\n", mdname(mddev));
5181 return -EINVAL;
5182 }
5183 reshape_offset = here_new * mddev->new_chunk_sectors;
5184 /* here_new is the stripe we will write to */
5185 here_old = mddev->reshape_position;
5186 sector_div(here_old, mddev->chunk_sectors *
5187 (old_disks-max_degraded));
5188 /* here_old is the first stripe that we might need to read
5189 * from */
5190 if (mddev->delta_disks == 0) {
5191 if ((here_new * mddev->new_chunk_sectors !=
5192 here_old * mddev->chunk_sectors)) {
5193 printk(KERN_ERR "md/raid:%s: reshape position is"
5194 " confused - aborting\n", mdname(mddev));
5195 return -EINVAL;
5196 }
5197 /* We cannot be sure it is safe to start an in-place
5198 * reshape. It is only safe if user-space is monitoring
5199 * and taking constant backups.
5200 * mdadm always starts a situation like this in
5201 * readonly mode so it can take control before
5202 * allowing any writes. So just check for that.
5203 */
5204 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
5205 abs(min_offset_diff) >= mddev->new_chunk_sectors)
5206 /* not really in-place - so OK */;
5207 else if (mddev->ro == 0) {
5208 printk(KERN_ERR "md/raid:%s: in-place reshape "
5209 "must be started in read-only mode "
5210 "- aborting\n",
5211 mdname(mddev));
5212 return -EINVAL;
5213 }
5214 } else if (mddev->reshape_backwards
5215 ?
(here_new * mddev->new_chunk_sectors + min_offset_diff <= 5216 here_old * mddev->chunk_sectors) 5217 : (here_new * mddev->new_chunk_sectors >= 5218 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5219 /* Reading from the same stripe as writing to - bad */ 5220 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5221 "auto-recovery - aborting.\n", 5222 mdname(mddev)); 5223 return -EINVAL; 5224 } 5225 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5226 mdname(mddev)); 5227 /* OK, we should be able to continue; */ 5228 } else { 5229 BUG_ON(mddev->level != mddev->new_level); 5230 BUG_ON(mddev->layout != mddev->new_layout); 5231 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5232 BUG_ON(mddev->delta_disks != 0); 5233 } 5234 5235 if (mddev->private == NULL) 5236 conf = setup_conf(mddev); 5237 else 5238 conf = mddev->private; 5239 5240 if (IS_ERR(conf)) 5241 return PTR_ERR(conf); 5242 5243 conf->min_offset_diff = min_offset_diff; 5244 mddev->thread = conf->thread; 5245 conf->thread = NULL; 5246 mddev->private = conf; 5247 5248 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5249 i++) { 5250 rdev = conf->disks[i].rdev; 5251 if (!rdev && conf->disks[i].replacement) { 5252 /* The replacement is all we have yet */ 5253 rdev = conf->disks[i].replacement; 5254 conf->disks[i].replacement = NULL; 5255 clear_bit(Replacement, &rdev->flags); 5256 conf->disks[i].rdev = rdev; 5257 } 5258 if (!rdev) 5259 continue; 5260 if (conf->disks[i].replacement && 5261 conf->reshape_progress != MaxSector) { 5262 /* replacements and reshape simply do not mix. */ 5263 printk(KERN_ERR "md: cannot handle concurrent " 5264 "replacement and reshape.\n"); 5265 goto abort; 5266 } 5267 if (test_bit(In_sync, &rdev->flags)) { 5268 working_disks++; 5269 continue; 5270 } 5271 /* This disc is not fully in-sync. However if it 5272 * just stored parity (beyond the recovery_offset), 5273 * when we don't need to be concerned about the 5274 * array being dirty. 5275 * When reshape goes 'backwards', we never have 5276 * partially completed devices, so we only need 5277 * to worry about reshape going forwards. 5278 */ 5279 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5280 if (mddev->major_version == 0 && 5281 mddev->minor_version > 90) 5282 rdev->recovery_offset = reshape_offset; 5283 5284 if (rdev->recovery_offset < reshape_offset) { 5285 /* We need to check old and new layout */ 5286 if (!only_parity(rdev->raid_disk, 5287 conf->algorithm, 5288 conf->raid_disks, 5289 conf->max_degraded)) 5290 continue; 5291 } 5292 if (!only_parity(rdev->raid_disk, 5293 conf->prev_algo, 5294 conf->previous_raid_disks, 5295 conf->max_degraded)) 5296 continue; 5297 dirty_parity_disks++; 5298 } 5299 5300 /* 5301 * 0 for a fully functional array, 1 or 2 for a degraded array. 
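 * More failed devices than max_degraded means the array cannot be
 * started; that is what the has_failed() check below rejects.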
5302 */ 5303 mddev->degraded = calc_degraded(conf); 5304 5305 if (has_failed(conf)) { 5306 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5307 " (%d/%d failed)\n", 5308 mdname(mddev), mddev->degraded, conf->raid_disks); 5309 goto abort; 5310 } 5311 5312 /* device size must be a multiple of chunk size */ 5313 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5314 mddev->resync_max_sectors = mddev->dev_sectors; 5315 5316 if (mddev->degraded > dirty_parity_disks && 5317 mddev->recovery_cp != MaxSector) { 5318 if (mddev->ok_start_degraded) 5319 printk(KERN_WARNING 5320 "md/raid:%s: starting dirty degraded array" 5321 " - data corruption possible.\n", 5322 mdname(mddev)); 5323 else { 5324 printk(KERN_ERR 5325 "md/raid:%s: cannot start dirty degraded array.\n", 5326 mdname(mddev)); 5327 goto abort; 5328 } 5329 } 5330 5331 if (mddev->degraded == 0) 5332 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5333 " devices, algorithm %d\n", mdname(mddev), conf->level, 5334 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5335 mddev->new_layout); 5336 else 5337 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5338 " out of %d devices, algorithm %d\n", 5339 mdname(mddev), conf->level, 5340 mddev->raid_disks - mddev->degraded, 5341 mddev->raid_disks, mddev->new_layout); 5342 5343 print_raid5_conf(conf); 5344 5345 if (conf->reshape_progress != MaxSector) { 5346 conf->reshape_safe = conf->reshape_progress; 5347 atomic_set(&conf->reshape_stripes, 0); 5348 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5349 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5350 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5351 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5352 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5353 "reshape"); 5354 } 5355 5356 5357 /* Ok, everything is just fine now */ 5358 if (mddev->to_remove == &raid5_attrs_group) 5359 mddev->to_remove = NULL; 5360 else if (mddev->kobj.sd && 5361 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5362 printk(KERN_WARNING 5363 "raid5: failed to create sysfs attributes for %s\n", 5364 mdname(mddev)); 5365 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5366 5367 if (mddev->queue) { 5368 int chunk_size; 5369 /* read-ahead size must cover two whole stripes, which 5370 * is 2 * (datadisks) * chunksize where 'n' is the 5371 * number of raid devices 5372 */ 5373 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5374 int stripe = data_disks * 5375 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5376 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5377 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5378 5379 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5380 5381 mddev->queue->backing_dev_info.congested_data = mddev; 5382 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5383 5384 chunk_size = mddev->chunk_sectors << 9; 5385 blk_queue_io_min(mddev->queue, chunk_size); 5386 blk_queue_io_opt(mddev->queue, chunk_size * 5387 (conf->raid_disks - conf->max_degraded)); 5388 5389 rdev_for_each(rdev, mddev) { 5390 disk_stack_limits(mddev->gendisk, rdev->bdev, 5391 rdev->data_offset << 9); 5392 disk_stack_limits(mddev->gendisk, rdev->bdev, 5393 rdev->new_data_offset << 9); 5394 } 5395 } 5396 5397 return 0; 5398 abort: 5399 md_unregister_thread(&mddev->thread); 5400 print_raid5_conf(conf); 5401 free_conf(conf); 5402 mddev->private = NULL; 5403 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5404 return -EIO; 5405 } 5406 5407 
static int stop(struct mddev *mddev) 5408 { 5409 struct r5conf *conf = mddev->private; 5410 5411 md_unregister_thread(&mddev->thread); 5412 if (mddev->queue) 5413 mddev->queue->backing_dev_info.congested_fn = NULL; 5414 free_conf(conf); 5415 mddev->private = NULL; 5416 mddev->to_remove = &raid5_attrs_group; 5417 return 0; 5418 } 5419 5420 static void status(struct seq_file *seq, struct mddev *mddev) 5421 { 5422 struct r5conf *conf = mddev->private; 5423 int i; 5424 5425 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5426 mddev->chunk_sectors / 2, mddev->layout); 5427 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5428 for (i = 0; i < conf->raid_disks; i++) 5429 seq_printf (seq, "%s", 5430 conf->disks[i].rdev && 5431 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 5432 seq_printf (seq, "]"); 5433 } 5434 5435 static void print_raid5_conf (struct r5conf *conf) 5436 { 5437 int i; 5438 struct disk_info *tmp; 5439 5440 printk(KERN_DEBUG "RAID conf printout:\n"); 5441 if (!conf) { 5442 printk("(conf==NULL)\n"); 5443 return; 5444 } 5445 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5446 conf->raid_disks, 5447 conf->raid_disks - conf->mddev->degraded); 5448 5449 for (i = 0; i < conf->raid_disks; i++) { 5450 char b[BDEVNAME_SIZE]; 5451 tmp = conf->disks + i; 5452 if (tmp->rdev) 5453 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5454 i, !test_bit(Faulty, &tmp->rdev->flags), 5455 bdevname(tmp->rdev->bdev, b)); 5456 } 5457 } 5458 5459 static int raid5_spare_active(struct mddev *mddev) 5460 { 5461 int i; 5462 struct r5conf *conf = mddev->private; 5463 struct disk_info *tmp; 5464 int count = 0; 5465 unsigned long flags; 5466 5467 for (i = 0; i < conf->raid_disks; i++) { 5468 tmp = conf->disks + i; 5469 if (tmp->replacement 5470 && tmp->replacement->recovery_offset == MaxSector 5471 && !test_bit(Faulty, &tmp->replacement->flags) 5472 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 5473 /* Replacement has just become active. */ 5474 if (!tmp->rdev 5475 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 5476 count++; 5477 if (tmp->rdev) { 5478 /* Replaced device not technically faulty, 5479 * but we need to be sure it gets removed 5480 * and never re-added. 
5481 */ 5482 set_bit(Faulty, &tmp->rdev->flags); 5483 sysfs_notify_dirent_safe( 5484 tmp->rdev->sysfs_state); 5485 } 5486 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 5487 } else if (tmp->rdev 5488 && tmp->rdev->recovery_offset == MaxSector 5489 && !test_bit(Faulty, &tmp->rdev->flags) 5490 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5491 count++; 5492 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5493 } 5494 } 5495 spin_lock_irqsave(&conf->device_lock, flags); 5496 mddev->degraded = calc_degraded(conf); 5497 spin_unlock_irqrestore(&conf->device_lock, flags); 5498 print_raid5_conf(conf); 5499 return count; 5500 } 5501 5502 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5503 { 5504 struct r5conf *conf = mddev->private; 5505 int err = 0; 5506 int number = rdev->raid_disk; 5507 struct md_rdev **rdevp; 5508 struct disk_info *p = conf->disks + number; 5509 5510 print_raid5_conf(conf); 5511 if (rdev == p->rdev) 5512 rdevp = &p->rdev; 5513 else if (rdev == p->replacement) 5514 rdevp = &p->replacement; 5515 else 5516 return 0; 5517 5518 if (number >= conf->raid_disks && 5519 conf->reshape_progress == MaxSector) 5520 clear_bit(In_sync, &rdev->flags); 5521 5522 if (test_bit(In_sync, &rdev->flags) || 5523 atomic_read(&rdev->nr_pending)) { 5524 err = -EBUSY; 5525 goto abort; 5526 } 5527 /* Only remove non-faulty devices if recovery 5528 * isn't possible. 5529 */ 5530 if (!test_bit(Faulty, &rdev->flags) && 5531 mddev->recovery_disabled != conf->recovery_disabled && 5532 !has_failed(conf) && 5533 (!p->replacement || p->replacement == rdev) && 5534 number < conf->raid_disks) { 5535 err = -EBUSY; 5536 goto abort; 5537 } 5538 *rdevp = NULL; 5539 synchronize_rcu(); 5540 if (atomic_read(&rdev->nr_pending)) { 5541 /* lost the race, try later */ 5542 err = -EBUSY; 5543 *rdevp = rdev; 5544 } else if (p->replacement) { 5545 /* We must have just cleared 'rdev' */ 5546 p->rdev = p->replacement; 5547 clear_bit(Replacement, &p->replacement->flags); 5548 smp_mb(); /* Make sure other CPUs may see both as identical 5549 * but will never see neither - if they are careful 5550 */ 5551 p->replacement = NULL; 5552 clear_bit(WantReplacement, &rdev->flags); 5553 } else 5554 /* We might have just removed the Replacement as faulty- 5555 * clear the bit just in case 5556 */ 5557 clear_bit(WantReplacement, &rdev->flags); 5558 abort: 5559 5560 print_raid5_conf(conf); 5561 return err; 5562 } 5563 5564 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 5565 { 5566 struct r5conf *conf = mddev->private; 5567 int err = -EEXIST; 5568 int disk; 5569 struct disk_info *p; 5570 int first = 0; 5571 int last = conf->raid_disks - 1; 5572 5573 if (mddev->recovery_disabled == conf->recovery_disabled) 5574 return -EBUSY; 5575 5576 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 5577 /* no point adding a device */ 5578 return -EINVAL; 5579 5580 if (rdev->raid_disk >= 0) 5581 first = last = rdev->raid_disk; 5582 5583 /* 5584 * find the disk ... but prefer rdev->saved_raid_disk 5585 * if possible. 
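 * A device going back into the slot it previously occupied can resume an
 * interrupted recovery from the bitmap; any other placement forces a full
 * resync (fullsync is set below).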
5586 */ 5587 if (rdev->saved_raid_disk >= 0 && 5588 rdev->saved_raid_disk >= first && 5589 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5590 first = rdev->saved_raid_disk; 5591 5592 for (disk = first; disk <= last; disk++) { 5593 p = conf->disks + disk; 5594 if (p->rdev == NULL) { 5595 clear_bit(In_sync, &rdev->flags); 5596 rdev->raid_disk = disk; 5597 err = 0; 5598 if (rdev->saved_raid_disk != disk) 5599 conf->fullsync = 1; 5600 rcu_assign_pointer(p->rdev, rdev); 5601 goto out; 5602 } 5603 } 5604 for (disk = first; disk <= last; disk++) { 5605 p = conf->disks + disk; 5606 if (test_bit(WantReplacement, &p->rdev->flags) && 5607 p->replacement == NULL) { 5608 clear_bit(In_sync, &rdev->flags); 5609 set_bit(Replacement, &rdev->flags); 5610 rdev->raid_disk = disk; 5611 err = 0; 5612 conf->fullsync = 1; 5613 rcu_assign_pointer(p->replacement, rdev); 5614 break; 5615 } 5616 } 5617 out: 5618 print_raid5_conf(conf); 5619 return err; 5620 } 5621 5622 static int raid5_resize(struct mddev *mddev, sector_t sectors) 5623 { 5624 /* no resync is happening, and there is enough space 5625 * on all devices, so we can resize. 5626 * We need to make sure resync covers any new space. 5627 * If the array is shrinking we should possibly wait until 5628 * any io in the removed space completes, but it hardly seems 5629 * worth it. 5630 */ 5631 sector_t newsize; 5632 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5633 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 5634 if (mddev->external_size && 5635 mddev->array_sectors > newsize) 5636 return -EINVAL; 5637 if (mddev->bitmap) { 5638 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 5639 if (ret) 5640 return ret; 5641 } 5642 md_set_array_sectors(mddev, newsize); 5643 set_capacity(mddev->gendisk, mddev->array_sectors); 5644 revalidate_disk(mddev->gendisk); 5645 if (sectors > mddev->dev_sectors && 5646 mddev->recovery_cp > mddev->dev_sectors) { 5647 mddev->recovery_cp = mddev->dev_sectors; 5648 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5649 } 5650 mddev->dev_sectors = sectors; 5651 mddev->resync_max_sectors = sectors; 5652 return 0; 5653 } 5654 5655 static int check_stripe_cache(struct mddev *mddev) 5656 { 5657 /* Can only proceed if there are plenty of stripe_heads. 5658 * We need a minimum of one full stripe,, and for sensible progress 5659 * it is best to have about 4 times that. 5660 * If we require 4 times, then the default 256 4K stripe_heads will 5661 * allow for chunk sizes up to 256K, which is probably OK. 5662 * If the chunk size is greater, user-space should request more 5663 * stripe_heads first. 5664 */ 5665 struct r5conf *conf = mddev->private; 5666 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5667 > conf->max_nr_stripes || 5668 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5669 > conf->max_nr_stripes) { 5670 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5671 mdname(mddev), 5672 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5673 / STRIPE_SIZE)*4); 5674 return 0; 5675 } 5676 return 1; 5677 } 5678 5679 static int check_reshape(struct mddev *mddev) 5680 { 5681 struct r5conf *conf = mddev->private; 5682 5683 if (mddev->delta_disks == 0 && 5684 mddev->new_layout == mddev->layout && 5685 mddev->new_chunk_sectors == mddev->chunk_sectors) 5686 return 0; /* nothing to do */ 5687 if (has_failed(conf)) 5688 return -EINVAL; 5689 if (mddev->delta_disks < 0) { 5690 /* We might be able to shrink, but the devices must 5691 * be made bigger first. 5692 * For raid6, 4 is the minimum size. 
static int check_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (mddev->delta_disks == 0 &&
	    mddev->new_layout == mddev->layout &&
	    mddev->new_chunk_sectors == mddev->chunk_sectors)
		return 0; /* nothing to do */
	if (has_failed(conf))
		return -EINVAL;
	if (mddev->delta_disks < 0) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
		int min = 2;
		if (mddev->level == 6)
			min = 4;
		if (mddev->raid_disks + mddev->delta_disks < min)
			return -EINVAL;
	}

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
}
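/*
 * Note (added commentary, not from the original source): on success
 * check_reshape() hands the new disk count to resize_stripes(), which is
 * expected to reallocate every stripe_head so it can reference the larger
 * number of devices before a reshape pass is started.
 */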
static int raid5_start_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	unsigned long flags;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (has_failed(conf))
		return -EINVAL;

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
	}

	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size
		 */
		return -EINVAL;

	/* Refuse to reduce size of the array.  Any reductions in
	 * array size must be through explicit setting of array_size
	 * attribute.
	 */
	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
	    < mddev->array_sectors) {
		printk(KERN_ERR "md/raid:%s: array size must be reduced "
		       "before number of disks\n", mdname(mddev));
		return -EINVAL;
	}

	atomic_set(&conf->reshape_stripes, 0);
	spin_lock_irq(&conf->device_lock);
	conf->previous_raid_disks = conf->raid_disks;
	conf->raid_disks += mddev->delta_disks;
	conf->prev_chunk_sectors = conf->chunk_sectors;
	conf->chunk_sectors = mddev->new_chunk_sectors;
	conf->prev_algo = conf->algorithm;
	conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier needed.
	 */
	smp_mb();
	if (mddev->reshape_backwards)
		conf->reshape_progress = raid5_size(mddev, 0, 0);
	else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	spin_unlock_irq(&conf->device_lock);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "partially reconstructed" state of
	 * such devices during the reshape and confusion could result.
	 */
	if (mddev->delta_disks >= 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid5_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk
					    >= conf->previous_raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
				}
			} else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{

	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
		struct md_rdev *rdev;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		rdev_for_each(rdev, conf->mddev)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (data disks) * chunksize
		 */
		if (conf->mddev->queue) {
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
		}
	}
}
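/*
 * Worked example for the read-ahead sizing above (added for illustration,
 * not from the original source): a 6-drive raid5 has 5 data disks, so with
 * a 128K chunk (chunk_sectors = 256) one stripe is
 * 5 * (256 << 9) / PAGE_SIZE = 160 pages on 4K pages, and ra_pages is
 * raised to at least 320 pages, i.e. 1280K of read-ahead.
 */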
/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks ;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch(state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		spin_lock_irq(&conf->device_lock);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_lock_irq(conf->wait_for_stripe,
				    atomic_read(&conf->active_stripes) == 0 &&
				    atomic_read(&conf->active_aligned_reads) == 0,
				    conf->device_lock, /* nothing */);
		conf->quiesce = 1;
		spin_unlock_irq(&conf->device_lock);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		spin_lock_irq(&conf->device_lock);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		spin_unlock_irq(&conf->device_lock);
		break;
	}
}


static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}


static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}
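/*
 * Worked example for the chunk selection above (added for illustration,
 * not from the original source): a raid1 of 983040 sectors (480M) is a
 * multiple of 128 sectors, so the default 64K chunk is kept; an array of
 * 983048 sectors is only a multiple of 8 sectors, so chunksect halves down
 * to 8 (4K), which still satisfies the STRIPE_SIZE check on 4K pages.
 */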
static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}


static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}
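/*
 * Example of the chunk validation above (added for illustration, not from
 * the original source): on 4K pages PAGE_SIZE >> 9 is 8, so a requested
 * chunk of 8, 16, 32, ... sectors is acceptable provided it is a power of
 * two and divides array_sectors evenly; a request for 96 sectors (48K)
 * fails the is_power_of_2() test and the change is rejected with -EINVAL.
 */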
static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - provided it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}
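/*
 * Note (added commentary, not from the original source): apart from .name
 * and .level, the three md_personality structures below differ only in
 * their check_reshape and takeover methods - raid6 validates layouts with
 * algorithm_valid_raid6() and has its own takeover path, while raid4 and
 * raid5 share raid5_check_reshape() but keep separate takeover helpers.
 */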
static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
};
static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

static int __init raid5_init(void)
{
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");