/*
 * raid5.c : Multiple Devices driver for Linux
 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 * Copyright (C) 1999, 2000 Ingo Molnar
 * Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function determines the 'next' bio in the list, given the sector
 * of the current stripe+device.
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio->bi_size >> 9;
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
162 */ 163 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 164 int *count, int syndrome_disks) 165 { 166 int slot = *count; 167 168 if (sh->ddf_layout) 169 (*count)++; 170 if (idx == sh->pd_idx) 171 return syndrome_disks; 172 if (idx == sh->qd_idx) 173 return syndrome_disks + 1; 174 if (!sh->ddf_layout) 175 (*count)++; 176 return slot; 177 } 178 179 static void return_io(struct bio *return_bi) 180 { 181 struct bio *bi = return_bi; 182 while (bi) { 183 184 return_bi = bi->bi_next; 185 bi->bi_next = NULL; 186 bi->bi_size = 0; 187 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 188 bi, 0); 189 bio_endio(bi, 0); 190 bi = return_bi; 191 } 192 } 193 194 static void print_raid5_conf (struct r5conf *conf); 195 196 static int stripe_operations_active(struct stripe_head *sh) 197 { 198 return sh->check_state || sh->reconstruct_state || 199 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 200 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 201 } 202 203 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 204 { 205 BUG_ON(!list_empty(&sh->lru)); 206 BUG_ON(atomic_read(&conf->active_stripes)==0); 207 if (test_bit(STRIPE_HANDLE, &sh->state)) { 208 if (test_bit(STRIPE_DELAYED, &sh->state) && 209 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 210 list_add_tail(&sh->lru, &conf->delayed_list); 211 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 212 sh->bm_seq - conf->seq_write > 0) 213 list_add_tail(&sh->lru, &conf->bitmap_list); 214 else { 215 clear_bit(STRIPE_DELAYED, &sh->state); 216 clear_bit(STRIPE_BIT_DELAY, &sh->state); 217 list_add_tail(&sh->lru, &conf->handle_list); 218 } 219 md_wakeup_thread(conf->mddev->thread); 220 } else { 221 BUG_ON(stripe_operations_active(sh)); 222 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 223 if (atomic_dec_return(&conf->preread_active_stripes) 224 < IO_THRESHOLD) 225 md_wakeup_thread(conf->mddev->thread); 226 atomic_dec(&conf->active_stripes); 227 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 228 list_add_tail(&sh->lru, &conf->inactive_list); 229 wake_up(&conf->wait_for_stripe); 230 if (conf->retry_read_aligned) 231 md_wakeup_thread(conf->mddev->thread); 232 } 233 } 234 } 235 236 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 237 { 238 if (atomic_dec_and_test(&sh->count)) 239 do_release_stripe(conf, sh); 240 } 241 242 static void release_stripe(struct stripe_head *sh) 243 { 244 struct r5conf *conf = sh->raid_conf; 245 unsigned long flags; 246 247 local_irq_save(flags); 248 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 249 do_release_stripe(conf, sh); 250 spin_unlock(&conf->device_lock); 251 } 252 local_irq_restore(flags); 253 } 254 255 static inline void remove_hash(struct stripe_head *sh) 256 { 257 pr_debug("remove_hash(), stripe %llu\n", 258 (unsigned long long)sh->sector); 259 260 hlist_del_init(&sh->hash); 261 } 262 263 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 264 { 265 struct hlist_head *hp = stripe_hash(conf, sh->sector); 266 267 pr_debug("insert_hash(), stripe %llu\n", 268 (unsigned long long)sh->sector); 269 270 hlist_add_head(&sh->hash, hp); 271 } 272 273 274 /* find an idle stripe, make sure it is unhashed, and return it. 
*/ 275 static struct stripe_head *get_free_stripe(struct r5conf *conf) 276 { 277 struct stripe_head *sh = NULL; 278 struct list_head *first; 279 280 if (list_empty(&conf->inactive_list)) 281 goto out; 282 first = conf->inactive_list.next; 283 sh = list_entry(first, struct stripe_head, lru); 284 list_del_init(first); 285 remove_hash(sh); 286 atomic_inc(&conf->active_stripes); 287 out: 288 return sh; 289 } 290 291 static void shrink_buffers(struct stripe_head *sh) 292 { 293 struct page *p; 294 int i; 295 int num = sh->raid_conf->pool_size; 296 297 for (i = 0; i < num ; i++) { 298 p = sh->dev[i].page; 299 if (!p) 300 continue; 301 sh->dev[i].page = NULL; 302 put_page(p); 303 } 304 } 305 306 static int grow_buffers(struct stripe_head *sh) 307 { 308 int i; 309 int num = sh->raid_conf->pool_size; 310 311 for (i = 0; i < num; i++) { 312 struct page *page; 313 314 if (!(page = alloc_page(GFP_KERNEL))) { 315 return 1; 316 } 317 sh->dev[i].page = page; 318 } 319 return 0; 320 } 321 322 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 323 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 324 struct stripe_head *sh); 325 326 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 327 { 328 struct r5conf *conf = sh->raid_conf; 329 int i; 330 331 BUG_ON(atomic_read(&sh->count) != 0); 332 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 333 BUG_ON(stripe_operations_active(sh)); 334 335 pr_debug("init_stripe called, stripe %llu\n", 336 (unsigned long long)sh->sector); 337 338 remove_hash(sh); 339 340 sh->generation = conf->generation - previous; 341 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 342 sh->sector = sector; 343 stripe_set_idx(sector, conf, previous, sh); 344 sh->state = 0; 345 346 347 for (i = sh->disks; i--; ) { 348 struct r5dev *dev = &sh->dev[i]; 349 350 if (dev->toread || dev->read || dev->towrite || dev->written || 351 test_bit(R5_LOCKED, &dev->flags)) { 352 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 353 (unsigned long long)sh->sector, i, dev->toread, 354 dev->read, dev->towrite, dev->written, 355 test_bit(R5_LOCKED, &dev->flags)); 356 WARN_ON(1); 357 } 358 dev->flags = 0; 359 raid5_build_block(sh, i, previous); 360 } 361 insert_hash(conf, sh); 362 } 363 364 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 365 short generation) 366 { 367 struct stripe_head *sh; 368 369 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 370 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 371 if (sh->sector == sector && sh->generation == generation) 372 return sh; 373 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 374 return NULL; 375 } 376 377 /* 378 * Need to check if array has failed when deciding whether to: 379 * - start an array 380 * - remove non-faulty devices 381 * - add a spare 382 * - allow a reshape 383 * This determination is simple when no reshape is happening. 384 * However if there is a reshape, we need to carefully check 385 * both the before and after sections. 386 * This is because some failed devices may only affect one 387 * of the two sections, and some non-in_sync devices may 388 * be insync in the section most affected by failed devices. 
389 */ 390 static int calc_degraded(struct r5conf *conf) 391 { 392 int degraded, degraded2; 393 int i; 394 395 rcu_read_lock(); 396 degraded = 0; 397 for (i = 0; i < conf->previous_raid_disks; i++) { 398 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 399 if (rdev && test_bit(Faulty, &rdev->flags)) 400 rdev = rcu_dereference(conf->disks[i].replacement); 401 if (!rdev || test_bit(Faulty, &rdev->flags)) 402 degraded++; 403 else if (test_bit(In_sync, &rdev->flags)) 404 ; 405 else 406 /* not in-sync or faulty. 407 * If the reshape increases the number of devices, 408 * this is being recovered by the reshape, so 409 * this 'previous' section is not in_sync. 410 * If the number of devices is being reduced however, 411 * the device can only be part of the array if 412 * we are reverting a reshape, so this section will 413 * be in-sync. 414 */ 415 if (conf->raid_disks >= conf->previous_raid_disks) 416 degraded++; 417 } 418 rcu_read_unlock(); 419 if (conf->raid_disks == conf->previous_raid_disks) 420 return degraded; 421 rcu_read_lock(); 422 degraded2 = 0; 423 for (i = 0; i < conf->raid_disks; i++) { 424 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 425 if (rdev && test_bit(Faulty, &rdev->flags)) 426 rdev = rcu_dereference(conf->disks[i].replacement); 427 if (!rdev || test_bit(Faulty, &rdev->flags)) 428 degraded2++; 429 else if (test_bit(In_sync, &rdev->flags)) 430 ; 431 else 432 /* not in-sync or faulty. 433 * If reshape increases the number of devices, this 434 * section has already been recovered, else it 435 * almost certainly hasn't. 436 */ 437 if (conf->raid_disks <= conf->previous_raid_disks) 438 degraded2++; 439 } 440 rcu_read_unlock(); 441 if (degraded2 > degraded) 442 return degraded2; 443 return degraded; 444 } 445 446 static int has_failed(struct r5conf *conf) 447 { 448 int degraded; 449 450 if (conf->mddev->reshape_position == MaxSector) 451 return conf->mddev->degraded > conf->max_degraded; 452 453 degraded = calc_degraded(conf); 454 if (degraded > conf->max_degraded) 455 return 1; 456 return 0; 457 } 458 459 static struct stripe_head * 460 get_active_stripe(struct r5conf *conf, sector_t sector, 461 int previous, int noblock, int noquiesce) 462 { 463 struct stripe_head *sh; 464 465 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 466 467 spin_lock_irq(&conf->device_lock); 468 469 do { 470 wait_event_lock_irq(conf->wait_for_stripe, 471 conf->quiesce == 0 || noquiesce, 472 conf->device_lock); 473 sh = __find_stripe(conf, sector, conf->generation - previous); 474 if (!sh) { 475 if (!conf->inactive_blocked) 476 sh = get_free_stripe(conf); 477 if (noblock && sh == NULL) 478 break; 479 if (!sh) { 480 conf->inactive_blocked = 1; 481 wait_event_lock_irq(conf->wait_for_stripe, 482 !list_empty(&conf->inactive_list) && 483 (atomic_read(&conf->active_stripes) 484 < (conf->max_nr_stripes *3/4) 485 || !conf->inactive_blocked), 486 conf->device_lock); 487 conf->inactive_blocked = 0; 488 } else 489 init_stripe(sh, sector, previous); 490 } else { 491 if (atomic_read(&sh->count)) { 492 BUG_ON(!list_empty(&sh->lru) 493 && !test_bit(STRIPE_EXPANDING, &sh->state) 494 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); 495 } else { 496 if (!test_bit(STRIPE_HANDLE, &sh->state)) 497 atomic_inc(&conf->active_stripes); 498 if (list_empty(&sh->lru) && 499 !test_bit(STRIPE_EXPANDING, &sh->state)) 500 BUG(); 501 list_del_init(&sh->lru); 502 } 503 } 504 } while (sh == NULL); 505 506 if (sh) 507 atomic_inc(&sh->count); 508 509 spin_unlock_irq(&conf->device_lock); 510 
	return sh;
}

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				rw |= REQ_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			rw |= REQ_SYNC;

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		bi->bi_rw = rw;
		rbi->bi_rw = rw;
		if (rw & WRITE) {
			bi->bi_end_io = raid5_end_write_request;
			rbi->bi_end_io = raid5_end_write_request;
		} else
			bi->bi_end_io = raid5_end_read_request;

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (rw & WRITE) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now we
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
613 */ 614 while ((rw & WRITE) && rdev && 615 test_bit(WriteErrorSeen, &rdev->flags)) { 616 sector_t first_bad; 617 int bad_sectors; 618 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 619 &first_bad, &bad_sectors); 620 if (!bad) 621 break; 622 623 if (bad < 0) { 624 set_bit(BlockedBadBlocks, &rdev->flags); 625 if (!conf->mddev->external && 626 conf->mddev->flags) { 627 /* It is very unlikely, but we might 628 * still need to write out the 629 * bad block log - better give it 630 * a chance*/ 631 md_check_recovery(conf->mddev); 632 } 633 /* 634 * Because md_wait_for_blocked_rdev 635 * will dec nr_pending, we must 636 * increment it first. 637 */ 638 atomic_inc(&rdev->nr_pending); 639 md_wait_for_blocked_rdev(rdev, conf->mddev); 640 } else { 641 /* Acknowledged bad block - skip the write */ 642 rdev_dec_pending(rdev, conf->mddev); 643 rdev = NULL; 644 } 645 } 646 647 if (rdev) { 648 if (s->syncing || s->expanding || s->expanded 649 || s->replacing) 650 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 651 652 set_bit(STRIPE_IO_STARTED, &sh->state); 653 654 bi->bi_bdev = rdev->bdev; 655 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 656 __func__, (unsigned long long)sh->sector, 657 bi->bi_rw, i); 658 atomic_inc(&sh->count); 659 if (use_new_offset(conf, sh)) 660 bi->bi_sector = (sh->sector 661 + rdev->new_data_offset); 662 else 663 bi->bi_sector = (sh->sector 664 + rdev->data_offset); 665 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 666 bi->bi_rw |= REQ_FLUSH; 667 668 bi->bi_flags = 1 << BIO_UPTODATE; 669 bi->bi_idx = 0; 670 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 671 bi->bi_io_vec[0].bv_offset = 0; 672 bi->bi_size = STRIPE_SIZE; 673 bi->bi_next = NULL; 674 if (rrdev) 675 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 676 677 if (conf->mddev->gendisk) 678 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 679 bi, disk_devt(conf->mddev->gendisk), 680 sh->dev[i].sector); 681 generic_make_request(bi); 682 } 683 if (rrdev) { 684 if (s->syncing || s->expanding || s->expanded 685 || s->replacing) 686 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 687 688 set_bit(STRIPE_IO_STARTED, &sh->state); 689 690 rbi->bi_bdev = rrdev->bdev; 691 pr_debug("%s: for %llu schedule op %ld on " 692 "replacement disc %d\n", 693 __func__, (unsigned long long)sh->sector, 694 rbi->bi_rw, i); 695 atomic_inc(&sh->count); 696 if (use_new_offset(conf, sh)) 697 rbi->bi_sector = (sh->sector 698 + rrdev->new_data_offset); 699 else 700 rbi->bi_sector = (sh->sector 701 + rrdev->data_offset); 702 rbi->bi_flags = 1 << BIO_UPTODATE; 703 rbi->bi_idx = 0; 704 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 705 rbi->bi_io_vec[0].bv_offset = 0; 706 rbi->bi_size = STRIPE_SIZE; 707 rbi->bi_next = NULL; 708 if (conf->mddev->gendisk) 709 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 710 rbi, disk_devt(conf->mddev->gendisk), 711 sh->dev[i].sector); 712 generic_make_request(rbi); 713 } 714 if (!rdev && !rrdev) { 715 if (rw & WRITE) 716 set_bit(STRIPE_DEGRADED, &sh->state); 717 pr_debug("skip op %ld on disc %d for sector %llu\n", 718 bi->bi_rw, i, (unsigned long long)sh->sector); 719 clear_bit(R5_LOCKED, &sh->dev[i].flags); 720 set_bit(STRIPE_HANDLE, &sh->state); 721 } 722 } 723 } 724 725 static struct dma_async_tx_descriptor * 726 async_copy_data(int frombio, struct bio *bio, struct page *page, 727 sector_t sector, struct dma_async_tx_descriptor *tx) 728 { 729 struct bio_vec *bvl; 730 struct page *bio_page; 731 int i; 732 int page_offset; 733 struct async_submit_ctl submit; 734 enum async_tx_flags flags = 0; 735 736 if (bio->bi_sector >= 
sector) 737 page_offset = (signed)(bio->bi_sector - sector) * 512; 738 else 739 page_offset = (signed)(sector - bio->bi_sector) * -512; 740 741 if (frombio) 742 flags |= ASYNC_TX_FENCE; 743 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 744 745 bio_for_each_segment(bvl, bio, i) { 746 int len = bvl->bv_len; 747 int clen; 748 int b_offset = 0; 749 750 if (page_offset < 0) { 751 b_offset = -page_offset; 752 page_offset += b_offset; 753 len -= b_offset; 754 } 755 756 if (len > 0 && page_offset + len > STRIPE_SIZE) 757 clen = STRIPE_SIZE - page_offset; 758 else 759 clen = len; 760 761 if (clen > 0) { 762 b_offset += bvl->bv_offset; 763 bio_page = bvl->bv_page; 764 if (frombio) 765 tx = async_memcpy(page, bio_page, page_offset, 766 b_offset, clen, &submit); 767 else 768 tx = async_memcpy(bio_page, page, b_offset, 769 page_offset, clen, &submit); 770 } 771 /* chain the operations */ 772 submit.depend_tx = tx; 773 774 if (clen < len) /* hit end of page */ 775 break; 776 page_offset += len; 777 } 778 779 return tx; 780 } 781 782 static void ops_complete_biofill(void *stripe_head_ref) 783 { 784 struct stripe_head *sh = stripe_head_ref; 785 struct bio *return_bi = NULL; 786 int i; 787 788 pr_debug("%s: stripe %llu\n", __func__, 789 (unsigned long long)sh->sector); 790 791 /* clear completed biofills */ 792 for (i = sh->disks; i--; ) { 793 struct r5dev *dev = &sh->dev[i]; 794 795 /* acknowledge completion of a biofill operation */ 796 /* and check if we need to reply to a read request, 797 * new R5_Wantfill requests are held off until 798 * !STRIPE_BIOFILL_RUN 799 */ 800 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 801 struct bio *rbi, *rbi2; 802 803 BUG_ON(!dev->read); 804 rbi = dev->read; 805 dev->read = NULL; 806 while (rbi && rbi->bi_sector < 807 dev->sector + STRIPE_SECTORS) { 808 rbi2 = r5_next_bio(rbi, dev->sector); 809 if (!raid5_dec_bi_active_stripes(rbi)) { 810 rbi->bi_next = return_bi; 811 return_bi = rbi; 812 } 813 rbi = rbi2; 814 } 815 } 816 } 817 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 818 819 return_io(return_bi); 820 821 set_bit(STRIPE_HANDLE, &sh->state); 822 release_stripe(sh); 823 } 824 825 static void ops_run_biofill(struct stripe_head *sh) 826 { 827 struct dma_async_tx_descriptor *tx = NULL; 828 struct async_submit_ctl submit; 829 int i; 830 831 pr_debug("%s: stripe %llu\n", __func__, 832 (unsigned long long)sh->sector); 833 834 for (i = sh->disks; i--; ) { 835 struct r5dev *dev = &sh->dev[i]; 836 if (test_bit(R5_Wantfill, &dev->flags)) { 837 struct bio *rbi; 838 spin_lock_irq(&sh->stripe_lock); 839 dev->read = rbi = dev->toread; 840 dev->toread = NULL; 841 spin_unlock_irq(&sh->stripe_lock); 842 while (rbi && rbi->bi_sector < 843 dev->sector + STRIPE_SECTORS) { 844 tx = async_copy_data(0, rbi, dev->page, 845 dev->sector, tx); 846 rbi = r5_next_bio(rbi, dev->sector); 847 } 848 } 849 } 850 851 atomic_inc(&sh->count); 852 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 853 async_trigger_callback(&submit); 854 } 855 856 static void mark_target_uptodate(struct stripe_head *sh, int target) 857 { 858 struct r5dev *tgt; 859 860 if (target < 0) 861 return; 862 863 tgt = &sh->dev[target]; 864 set_bit(R5_UPTODATE, &tgt->flags); 865 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 866 clear_bit(R5_Wantcompute, &tgt->flags); 867 } 868 869 static void ops_complete_compute(void *stripe_head_ref) 870 { 871 struct stripe_head *sh = stripe_head_ref; 872 873 pr_debug("%s: stripe %llu\n", __func__, 874 (unsigned long long)sh->sector); 875 876 /* mark 
	 * the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ?
disks : (disks - 2); 940 int d0_idx = raid6_d0(sh); 941 int count; 942 int i; 943 944 for (i = 0; i < disks; i++) 945 srcs[i] = NULL; 946 947 count = 0; 948 i = d0_idx; 949 do { 950 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 951 952 srcs[slot] = sh->dev[i].page; 953 i = raid6_next_disk(i, disks); 954 } while (i != d0_idx); 955 956 return syndrome_disks; 957 } 958 959 static struct dma_async_tx_descriptor * 960 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 961 { 962 int disks = sh->disks; 963 struct page **blocks = percpu->scribble; 964 int target; 965 int qd_idx = sh->qd_idx; 966 struct dma_async_tx_descriptor *tx; 967 struct async_submit_ctl submit; 968 struct r5dev *tgt; 969 struct page *dest; 970 int i; 971 int count; 972 973 if (sh->ops.target < 0) 974 target = sh->ops.target2; 975 else if (sh->ops.target2 < 0) 976 target = sh->ops.target; 977 else 978 /* we should only have one valid target */ 979 BUG(); 980 BUG_ON(target < 0); 981 pr_debug("%s: stripe %llu block: %d\n", 982 __func__, (unsigned long long)sh->sector, target); 983 984 tgt = &sh->dev[target]; 985 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 986 dest = tgt->page; 987 988 atomic_inc(&sh->count); 989 990 if (target == qd_idx) { 991 count = set_syndrome_sources(blocks, sh); 992 blocks[count] = NULL; /* regenerating p is not necessary */ 993 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 994 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 995 ops_complete_compute, sh, 996 to_addr_conv(sh, percpu)); 997 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 998 } else { 999 /* Compute any data- or p-drive using XOR */ 1000 count = 0; 1001 for (i = disks; i-- ; ) { 1002 if (i == target || i == qd_idx) 1003 continue; 1004 blocks[count++] = sh->dev[i].page; 1005 } 1006 1007 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1008 NULL, ops_complete_compute, sh, 1009 to_addr_conv(sh, percpu)); 1010 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1011 } 1012 1013 return tx; 1014 } 1015 1016 static struct dma_async_tx_descriptor * 1017 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1018 { 1019 int i, count, disks = sh->disks; 1020 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 1021 int d0_idx = raid6_d0(sh); 1022 int faila = -1, failb = -1; 1023 int target = sh->ops.target; 1024 int target2 = sh->ops.target2; 1025 struct r5dev *tgt = &sh->dev[target]; 1026 struct r5dev *tgt2 = &sh->dev[target2]; 1027 struct dma_async_tx_descriptor *tx; 1028 struct page **blocks = percpu->scribble; 1029 struct async_submit_ctl submit; 1030 1031 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1032 __func__, (unsigned long long)sh->sector, target, target2); 1033 BUG_ON(target < 0 || target2 < 0); 1034 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1035 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1036 1037 /* we need to open-code set_syndrome_sources to handle the 1038 * slot number conversion for 'faila' and 'failb' 1039 */ 1040 for (i = 0; i < disks ; i++) 1041 blocks[i] = NULL; 1042 count = 0; 1043 i = d0_idx; 1044 do { 1045 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1046 1047 blocks[slot] = sh->dev[i].page; 1048 1049 if (i == target) 1050 faila = slot; 1051 if (i == target2) 1052 failb = slot; 1053 i = raid6_next_disk(i, disks); 1054 } while (i != d0_idx); 1055 1056 BUG_ON(faila == failb); 1057 if (failb < faila) 1058 swap(faila, failb); 1059 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1060 __func__, (unsigned long long)sh->sector, faila, failb); 1061 1062 atomic_inc(&sh->count); 1063 1064 if (failb == syndrome_disks+1) { 1065 /* Q disk is one of the missing disks */ 1066 if (faila == syndrome_disks) { 1067 /* Missing P+Q, just recompute */ 1068 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1069 ops_complete_compute, sh, 1070 to_addr_conv(sh, percpu)); 1071 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1072 STRIPE_SIZE, &submit); 1073 } else { 1074 struct page *dest; 1075 int data_target; 1076 int qd_idx = sh->qd_idx; 1077 1078 /* Missing D+Q: recompute D from P, then recompute Q */ 1079 if (target == qd_idx) 1080 data_target = target2; 1081 else 1082 data_target = target; 1083 1084 count = 0; 1085 for (i = disks; i-- ; ) { 1086 if (i == data_target || i == qd_idx) 1087 continue; 1088 blocks[count++] = sh->dev[i].page; 1089 } 1090 dest = sh->dev[data_target].page; 1091 init_async_submit(&submit, 1092 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1093 NULL, NULL, NULL, 1094 to_addr_conv(sh, percpu)); 1095 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1096 &submit); 1097 1098 count = set_syndrome_sources(blocks, sh); 1099 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1100 ops_complete_compute, sh, 1101 to_addr_conv(sh, percpu)); 1102 return async_gen_syndrome(blocks, 0, count+2, 1103 STRIPE_SIZE, &submit); 1104 } 1105 } else { 1106 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1107 ops_complete_compute, sh, 1108 to_addr_conv(sh, percpu)); 1109 if (failb == syndrome_disks) { 1110 /* We're missing D+P. */ 1111 return async_raid6_datap_recov(syndrome_disks+2, 1112 STRIPE_SIZE, faila, 1113 blocks, &submit); 1114 } else { 1115 /* We're missing D+D. 
*/ 1116 return async_raid6_2data_recov(syndrome_disks+2, 1117 STRIPE_SIZE, faila, failb, 1118 blocks, &submit); 1119 } 1120 } 1121 } 1122 1123 1124 static void ops_complete_prexor(void *stripe_head_ref) 1125 { 1126 struct stripe_head *sh = stripe_head_ref; 1127 1128 pr_debug("%s: stripe %llu\n", __func__, 1129 (unsigned long long)sh->sector); 1130 } 1131 1132 static struct dma_async_tx_descriptor * 1133 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 1134 struct dma_async_tx_descriptor *tx) 1135 { 1136 int disks = sh->disks; 1137 struct page **xor_srcs = percpu->scribble; 1138 int count = 0, pd_idx = sh->pd_idx, i; 1139 struct async_submit_ctl submit; 1140 1141 /* existing parity data subtracted */ 1142 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1143 1144 pr_debug("%s: stripe %llu\n", __func__, 1145 (unsigned long long)sh->sector); 1146 1147 for (i = disks; i--; ) { 1148 struct r5dev *dev = &sh->dev[i]; 1149 /* Only process blocks that are known to be uptodate */ 1150 if (test_bit(R5_Wantdrain, &dev->flags)) 1151 xor_srcs[count++] = dev->page; 1152 } 1153 1154 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1155 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1156 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1157 1158 return tx; 1159 } 1160 1161 static struct dma_async_tx_descriptor * 1162 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1163 { 1164 int disks = sh->disks; 1165 int i; 1166 1167 pr_debug("%s: stripe %llu\n", __func__, 1168 (unsigned long long)sh->sector); 1169 1170 for (i = disks; i--; ) { 1171 struct r5dev *dev = &sh->dev[i]; 1172 struct bio *chosen; 1173 1174 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1175 struct bio *wbi; 1176 1177 spin_lock_irq(&sh->stripe_lock); 1178 chosen = dev->towrite; 1179 dev->towrite = NULL; 1180 BUG_ON(dev->written); 1181 wbi = dev->written = chosen; 1182 spin_unlock_irq(&sh->stripe_lock); 1183 1184 while (wbi && wbi->bi_sector < 1185 dev->sector + STRIPE_SECTORS) { 1186 if (wbi->bi_rw & REQ_FUA) 1187 set_bit(R5_WantFUA, &dev->flags); 1188 if (wbi->bi_rw & REQ_SYNC) 1189 set_bit(R5_SyncIO, &dev->flags); 1190 if (wbi->bi_rw & REQ_DISCARD) 1191 set_bit(R5_Discard, &dev->flags); 1192 else 1193 tx = async_copy_data(1, wbi, dev->page, 1194 dev->sector, tx); 1195 wbi = r5_next_bio(wbi, dev->sector); 1196 } 1197 } 1198 } 1199 1200 return tx; 1201 } 1202 1203 static void ops_complete_reconstruct(void *stripe_head_ref) 1204 { 1205 struct stripe_head *sh = stripe_head_ref; 1206 int disks = sh->disks; 1207 int pd_idx = sh->pd_idx; 1208 int qd_idx = sh->qd_idx; 1209 int i; 1210 bool fua = false, sync = false, discard = false; 1211 1212 pr_debug("%s: stripe %llu\n", __func__, 1213 (unsigned long long)sh->sector); 1214 1215 for (i = disks; i--; ) { 1216 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1217 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1218 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1219 } 1220 1221 for (i = disks; i--; ) { 1222 struct r5dev *dev = &sh->dev[i]; 1223 1224 if (dev->written || i == pd_idx || i == qd_idx) { 1225 if (!discard) 1226 set_bit(R5_UPTODATE, &dev->flags); 1227 if (fua) 1228 set_bit(R5_WantFUA, &dev->flags); 1229 if (sync) 1230 set_bit(R5_SyncIO, &dev->flags); 1231 } 1232 } 1233 1234 if (sh->reconstruct_state == reconstruct_state_drain_run) 1235 sh->reconstruct_state = reconstruct_state_drain_result; 1236 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1237 
sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1238 else { 1239 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1240 sh->reconstruct_state = reconstruct_state_result; 1241 } 1242 1243 set_bit(STRIPE_HANDLE, &sh->state); 1244 release_stripe(sh); 1245 } 1246 1247 static void 1248 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1249 struct dma_async_tx_descriptor *tx) 1250 { 1251 int disks = sh->disks; 1252 struct page **xor_srcs = percpu->scribble; 1253 struct async_submit_ctl submit; 1254 int count = 0, pd_idx = sh->pd_idx, i; 1255 struct page *xor_dest; 1256 int prexor = 0; 1257 unsigned long flags; 1258 1259 pr_debug("%s: stripe %llu\n", __func__, 1260 (unsigned long long)sh->sector); 1261 1262 for (i = 0; i < sh->disks; i++) { 1263 if (pd_idx == i) 1264 continue; 1265 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1266 break; 1267 } 1268 if (i >= sh->disks) { 1269 atomic_inc(&sh->count); 1270 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1271 ops_complete_reconstruct(sh); 1272 return; 1273 } 1274 /* check if prexor is active which means only process blocks 1275 * that are part of a read-modify-write (written) 1276 */ 1277 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1278 prexor = 1; 1279 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1280 for (i = disks; i--; ) { 1281 struct r5dev *dev = &sh->dev[i]; 1282 if (dev->written) 1283 xor_srcs[count++] = dev->page; 1284 } 1285 } else { 1286 xor_dest = sh->dev[pd_idx].page; 1287 for (i = disks; i--; ) { 1288 struct r5dev *dev = &sh->dev[i]; 1289 if (i != pd_idx) 1290 xor_srcs[count++] = dev->page; 1291 } 1292 } 1293 1294 /* 1/ if we prexor'd then the dest is reused as a source 1295 * 2/ if we did not prexor then we are redoing the parity 1296 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1297 * for the synchronous xor case 1298 */ 1299 flags = ASYNC_TX_ACK | 1300 (prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1301 1302 atomic_inc(&sh->count); 1303 1304 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1305 to_addr_conv(sh, percpu)); 1306 if (unlikely(count == 1)) 1307 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1308 else 1309 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1310 } 1311 1312 static void 1313 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1314 struct dma_async_tx_descriptor *tx) 1315 { 1316 struct async_submit_ctl submit; 1317 struct page **blocks = percpu->scribble; 1318 int count, i; 1319 1320 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1321 1322 for (i = 0; i < sh->disks; i++) { 1323 if (sh->pd_idx == i || sh->qd_idx == i) 1324 continue; 1325 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1326 break; 1327 } 1328 if (i >= sh->disks) { 1329 atomic_inc(&sh->count); 1330 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1331 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1332 ops_complete_reconstruct(sh); 1333 return; 1334 } 1335 1336 count = set_syndrome_sources(blocks, sh); 1337 1338 atomic_inc(&sh->count); 1339 1340 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1341 sh, to_addr_conv(sh, percpu)); 1342 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1343 } 1344 1345 static void ops_complete_check(void *stripe_head_ref) 1346 { 1347 struct stripe_head *sh = stripe_head_ref; 1348 1349 pr_debug("%s: stripe %llu\n", __func__, 1350 (unsigned long long)sh->sector); 1351 1352 sh->check_state = check_state_check_result; 1353 set_bit(STRIPE_HANDLE, &sh->state); 1354 release_stripe(sh); 1355 } 1356 1357 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1358 { 1359 int disks = sh->disks; 1360 int pd_idx = sh->pd_idx; 1361 int qd_idx = sh->qd_idx; 1362 struct page *xor_dest; 1363 struct page **xor_srcs = percpu->scribble; 1364 struct dma_async_tx_descriptor *tx; 1365 struct async_submit_ctl submit; 1366 int count; 1367 int i; 1368 1369 pr_debug("%s: stripe %llu\n", __func__, 1370 (unsigned long long)sh->sector); 1371 1372 count = 0; 1373 xor_dest = sh->dev[pd_idx].page; 1374 xor_srcs[count++] = xor_dest; 1375 for (i = disks; i--; ) { 1376 if (i == pd_idx || i == qd_idx) 1377 continue; 1378 xor_srcs[count++] = sh->dev[i].page; 1379 } 1380 1381 init_async_submit(&submit, 0, NULL, NULL, NULL, 1382 to_addr_conv(sh, percpu)); 1383 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1384 &sh->ops.zero_sum_result, &submit); 1385 1386 atomic_inc(&sh->count); 1387 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1388 tx = async_trigger_callback(&submit); 1389 } 1390 1391 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1392 { 1393 struct page **srcs = percpu->scribble; 1394 struct async_submit_ctl submit; 1395 int count; 1396 1397 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1398 (unsigned long long)sh->sector, checkp); 1399 1400 count = set_syndrome_sources(srcs, sh); 1401 if (!checkp) 1402 srcs[count] = NULL; 1403 1404 atomic_inc(&sh->count); 1405 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1406 sh, to_addr_conv(sh, percpu)); 1407 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1408 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1409 } 1410 1411 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1412 { 1413 int 
overlap_clear = 0, i, disks = sh->disks; 1414 struct dma_async_tx_descriptor *tx = NULL; 1415 struct r5conf *conf = sh->raid_conf; 1416 int level = conf->level; 1417 struct raid5_percpu *percpu; 1418 unsigned long cpu; 1419 1420 cpu = get_cpu(); 1421 percpu = per_cpu_ptr(conf->percpu, cpu); 1422 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1423 ops_run_biofill(sh); 1424 overlap_clear++; 1425 } 1426 1427 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1428 if (level < 6) 1429 tx = ops_run_compute5(sh, percpu); 1430 else { 1431 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1432 tx = ops_run_compute6_1(sh, percpu); 1433 else 1434 tx = ops_run_compute6_2(sh, percpu); 1435 } 1436 /* terminate the chain if reconstruct is not set to be run */ 1437 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1438 async_tx_ack(tx); 1439 } 1440 1441 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1442 tx = ops_run_prexor(sh, percpu, tx); 1443 1444 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1445 tx = ops_run_biodrain(sh, tx); 1446 overlap_clear++; 1447 } 1448 1449 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1450 if (level < 6) 1451 ops_run_reconstruct5(sh, percpu, tx); 1452 else 1453 ops_run_reconstruct6(sh, percpu, tx); 1454 } 1455 1456 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1457 if (sh->check_state == check_state_run) 1458 ops_run_check_p(sh, percpu); 1459 else if (sh->check_state == check_state_run_q) 1460 ops_run_check_pq(sh, percpu, 0); 1461 else if (sh->check_state == check_state_run_pq) 1462 ops_run_check_pq(sh, percpu, 1); 1463 else 1464 BUG(); 1465 } 1466 1467 if (overlap_clear) 1468 for (i = disks; i--; ) { 1469 struct r5dev *dev = &sh->dev[i]; 1470 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1471 wake_up(&sh->raid_conf->wait_for_overlap); 1472 } 1473 put_cpu(); 1474 } 1475 1476 static int grow_one_stripe(struct r5conf *conf) 1477 { 1478 struct stripe_head *sh; 1479 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1480 if (!sh) 1481 return 0; 1482 1483 sh->raid_conf = conf; 1484 1485 spin_lock_init(&sh->stripe_lock); 1486 1487 if (grow_buffers(sh)) { 1488 shrink_buffers(sh); 1489 kmem_cache_free(conf->slab_cache, sh); 1490 return 0; 1491 } 1492 /* we just created an active stripe so... 
	 */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(struct r5conf *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}

static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
1573 */ 1574 struct stripe_head *osh, *nsh; 1575 LIST_HEAD(newstripes); 1576 struct disk_info *ndisks; 1577 unsigned long cpu; 1578 int err; 1579 struct kmem_cache *sc; 1580 int i; 1581 1582 if (newsize <= conf->pool_size) 1583 return 0; /* never bother to shrink */ 1584 1585 err = md_allow_write(conf->mddev); 1586 if (err) 1587 return err; 1588 1589 /* Step 1 */ 1590 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1591 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1592 0, 0, NULL); 1593 if (!sc) 1594 return -ENOMEM; 1595 1596 for (i = conf->max_nr_stripes; i; i--) { 1597 nsh = kmem_cache_zalloc(sc, GFP_KERNEL); 1598 if (!nsh) 1599 break; 1600 1601 nsh->raid_conf = conf; 1602 spin_lock_init(&nsh->stripe_lock); 1603 1604 list_add(&nsh->lru, &newstripes); 1605 } 1606 if (i) { 1607 /* didn't get enough, give up */ 1608 while (!list_empty(&newstripes)) { 1609 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1610 list_del(&nsh->lru); 1611 kmem_cache_free(sc, nsh); 1612 } 1613 kmem_cache_destroy(sc); 1614 return -ENOMEM; 1615 } 1616 /* Step 2 - Must use GFP_NOIO now. 1617 * OK, we have enough stripes, start collecting inactive 1618 * stripes and copying them over 1619 */ 1620 list_for_each_entry(nsh, &newstripes, lru) { 1621 spin_lock_irq(&conf->device_lock); 1622 wait_event_lock_irq(conf->wait_for_stripe, 1623 !list_empty(&conf->inactive_list), 1624 conf->device_lock); 1625 osh = get_free_stripe(conf); 1626 spin_unlock_irq(&conf->device_lock); 1627 atomic_set(&nsh->count, 1); 1628 for(i=0; i<conf->pool_size; i++) 1629 nsh->dev[i].page = osh->dev[i].page; 1630 for( ; i<newsize; i++) 1631 nsh->dev[i].page = NULL; 1632 kmem_cache_free(conf->slab_cache, osh); 1633 } 1634 kmem_cache_destroy(conf->slab_cache); 1635 1636 /* Step 3. 
1637 * At this point, we are holding all the stripes so the array 1638 * is completely stalled, so now is a good time to resize 1639 * conf->disks and the scribble region 1640 */ 1641 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1642 if (ndisks) { 1643 for (i=0; i<conf->raid_disks; i++) 1644 ndisks[i] = conf->disks[i]; 1645 kfree(conf->disks); 1646 conf->disks = ndisks; 1647 } else 1648 err = -ENOMEM; 1649 1650 get_online_cpus(); 1651 conf->scribble_len = scribble_len(newsize); 1652 for_each_present_cpu(cpu) { 1653 struct raid5_percpu *percpu; 1654 void *scribble; 1655 1656 percpu = per_cpu_ptr(conf->percpu, cpu); 1657 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1658 1659 if (scribble) { 1660 kfree(percpu->scribble); 1661 percpu->scribble = scribble; 1662 } else { 1663 err = -ENOMEM; 1664 break; 1665 } 1666 } 1667 put_online_cpus(); 1668 1669 /* Step 4, return new stripes to service */ 1670 while(!list_empty(&newstripes)) { 1671 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1672 list_del_init(&nsh->lru); 1673 1674 for (i=conf->raid_disks; i < newsize; i++) 1675 if (nsh->dev[i].page == NULL) { 1676 struct page *p = alloc_page(GFP_NOIO); 1677 nsh->dev[i].page = p; 1678 if (!p) 1679 err = -ENOMEM; 1680 } 1681 release_stripe(nsh); 1682 } 1683 /* critical section pass, GFP_NOIO no longer needed */ 1684 1685 conf->slab_cache = sc; 1686 conf->active_name = 1-conf->active_name; 1687 conf->pool_size = newsize; 1688 return err; 1689 } 1690 1691 static int drop_one_stripe(struct r5conf *conf) 1692 { 1693 struct stripe_head *sh; 1694 1695 spin_lock_irq(&conf->device_lock); 1696 sh = get_free_stripe(conf); 1697 spin_unlock_irq(&conf->device_lock); 1698 if (!sh) 1699 return 0; 1700 BUG_ON(atomic_read(&sh->count)); 1701 shrink_buffers(sh); 1702 kmem_cache_free(conf->slab_cache, sh); 1703 atomic_dec(&conf->active_stripes); 1704 return 1; 1705 } 1706 1707 static void shrink_stripes(struct r5conf *conf) 1708 { 1709 while (drop_one_stripe(conf)) 1710 ; 1711 1712 if (conf->slab_cache) 1713 kmem_cache_destroy(conf->slab_cache); 1714 conf->slab_cache = NULL; 1715 } 1716 1717 static void raid5_end_read_request(struct bio * bi, int error) 1718 { 1719 struct stripe_head *sh = bi->bi_private; 1720 struct r5conf *conf = sh->raid_conf; 1721 int disks = sh->disks, i; 1722 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1723 char b[BDEVNAME_SIZE]; 1724 struct md_rdev *rdev = NULL; 1725 sector_t s; 1726 1727 for (i=0 ; i<disks; i++) 1728 if (bi == &sh->dev[i].req) 1729 break; 1730 1731 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1732 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1733 uptodate); 1734 if (i == disks) { 1735 BUG(); 1736 return; 1737 } 1738 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1739 /* If replacement finished while this request was outstanding, 1740 * 'replacement' might be NULL already. 1741 * In that case it moved down to 'rdev'. 1742 * rdev is not removed until all requests are finished. 1743 */ 1744 rdev = conf->disks[i].replacement; 1745 if (!rdev) 1746 rdev = conf->disks[i].rdev; 1747 1748 if (use_new_offset(conf, sh)) 1749 s = sh->sector + rdev->new_data_offset; 1750 else 1751 s = sh->sector + rdev->data_offset; 1752 if (uptodate) { 1753 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1754 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1755 /* Note that this cannot happen on a 1756 * replacement device. 
We just fail those on 1757 * any error 1758 */ 1759 printk_ratelimited( 1760 KERN_INFO 1761 "md/raid:%s: read error corrected" 1762 " (%lu sectors at %llu on %s)\n", 1763 mdname(conf->mddev), STRIPE_SECTORS, 1764 (unsigned long long)s, 1765 bdevname(rdev->bdev, b)); 1766 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1767 clear_bit(R5_ReadError, &sh->dev[i].flags); 1768 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1769 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1770 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1771 1772 if (atomic_read(&rdev->read_errors)) 1773 atomic_set(&rdev->read_errors, 0); 1774 } else { 1775 const char *bdn = bdevname(rdev->bdev, b); 1776 int retry = 0; 1777 int set_bad = 0; 1778 1779 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1780 atomic_inc(&rdev->read_errors); 1781 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1782 printk_ratelimited( 1783 KERN_WARNING 1784 "md/raid:%s: read error on replacement device " 1785 "(sector %llu on %s).\n", 1786 mdname(conf->mddev), 1787 (unsigned long long)s, 1788 bdn); 1789 else if (conf->mddev->degraded >= conf->max_degraded) { 1790 set_bad = 1; 1791 printk_ratelimited( 1792 KERN_WARNING 1793 "md/raid:%s: read error not correctable " 1794 "(sector %llu on %s).\n", 1795 mdname(conf->mddev), 1796 (unsigned long long)s, 1797 bdn); 1798 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 1799 /* Oh, no!!! */ 1800 set_bad = 1; 1801 printk_ratelimited( 1802 KERN_WARNING 1803 "md/raid:%s: read error NOT corrected!! " 1804 "(sector %llu on %s).\n", 1805 mdname(conf->mddev), 1806 (unsigned long long)s, 1807 bdn); 1808 } else if (atomic_read(&rdev->read_errors) 1809 > conf->max_nr_stripes) 1810 printk(KERN_WARNING 1811 "md/raid:%s: Too many read errors, failing device %s.\n", 1812 mdname(conf->mddev), bdn); 1813 else 1814 retry = 1; 1815 if (retry) 1816 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 1817 set_bit(R5_ReadError, &sh->dev[i].flags); 1818 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1819 } else 1820 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1821 else { 1822 clear_bit(R5_ReadError, &sh->dev[i].flags); 1823 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1824 if (!(set_bad 1825 && test_bit(In_sync, &rdev->flags) 1826 && rdev_set_badblocks( 1827 rdev, sh->sector, STRIPE_SECTORS, 0))) 1828 md_error(conf->mddev, rdev); 1829 } 1830 } 1831 rdev_dec_pending(rdev, conf->mddev); 1832 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1833 set_bit(STRIPE_HANDLE, &sh->state); 1834 release_stripe(sh); 1835 } 1836 1837 static void raid5_end_write_request(struct bio *bi, int error) 1838 { 1839 struct stripe_head *sh = bi->bi_private; 1840 struct r5conf *conf = sh->raid_conf; 1841 int disks = sh->disks, i; 1842 struct md_rdev *uninitialized_var(rdev); 1843 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1844 sector_t first_bad; 1845 int bad_sectors; 1846 int replacement = 0; 1847 1848 for (i = 0 ; i < disks; i++) { 1849 if (bi == &sh->dev[i].req) { 1850 rdev = conf->disks[i].rdev; 1851 break; 1852 } 1853 if (bi == &sh->dev[i].rreq) { 1854 rdev = conf->disks[i].replacement; 1855 if (rdev) 1856 replacement = 1; 1857 else 1858 /* rdev was removed and 'replacement' 1859 * replaced it. rdev is not removed 1860 * until all requests are finished. 
1861 */ 1862 rdev = conf->disks[i].rdev; 1863 break; 1864 } 1865 } 1866 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1867 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1868 uptodate); 1869 if (i == disks) { 1870 BUG(); 1871 return; 1872 } 1873 1874 if (replacement) { 1875 if (!uptodate) 1876 md_error(conf->mddev, rdev); 1877 else if (is_badblock(rdev, sh->sector, 1878 STRIPE_SECTORS, 1879 &first_bad, &bad_sectors)) 1880 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1881 } else { 1882 if (!uptodate) { 1883 set_bit(WriteErrorSeen, &rdev->flags); 1884 set_bit(R5_WriteError, &sh->dev[i].flags); 1885 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1886 set_bit(MD_RECOVERY_NEEDED, 1887 &rdev->mddev->recovery); 1888 } else if (is_badblock(rdev, sh->sector, 1889 STRIPE_SECTORS, 1890 &first_bad, &bad_sectors)) { 1891 set_bit(R5_MadeGood, &sh->dev[i].flags); 1892 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 1893 /* That was a successful write so make 1894 * sure it looks like we already did 1895 * a re-write. 1896 */ 1897 set_bit(R5_ReWrite, &sh->dev[i].flags); 1898 } 1899 } 1900 rdev_dec_pending(rdev, conf->mddev); 1901 1902 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 1903 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1904 set_bit(STRIPE_HANDLE, &sh->state); 1905 release_stripe(sh); 1906 } 1907 1908 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1909 1910 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1911 { 1912 struct r5dev *dev = &sh->dev[i]; 1913 1914 bio_init(&dev->req); 1915 dev->req.bi_io_vec = &dev->vec; 1916 dev->req.bi_vcnt++; 1917 dev->req.bi_max_vecs++; 1918 dev->req.bi_private = sh; 1919 dev->vec.bv_page = dev->page; 1920 1921 bio_init(&dev->rreq); 1922 dev->rreq.bi_io_vec = &dev->rvec; 1923 dev->rreq.bi_vcnt++; 1924 dev->rreq.bi_max_vecs++; 1925 dev->rreq.bi_private = sh; 1926 dev->rvec.bv_page = dev->page; 1927 1928 dev->flags = 0; 1929 dev->sector = compute_blocknr(sh, i, previous); 1930 } 1931 1932 static void error(struct mddev *mddev, struct md_rdev *rdev) 1933 { 1934 char b[BDEVNAME_SIZE]; 1935 struct r5conf *conf = mddev->private; 1936 unsigned long flags; 1937 pr_debug("raid456: error called\n"); 1938 1939 spin_lock_irqsave(&conf->device_lock, flags); 1940 clear_bit(In_sync, &rdev->flags); 1941 mddev->degraded = calc_degraded(conf); 1942 spin_unlock_irqrestore(&conf->device_lock, flags); 1943 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1944 1945 set_bit(Blocked, &rdev->flags); 1946 set_bit(Faulty, &rdev->flags); 1947 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1948 printk(KERN_ALERT 1949 "md/raid:%s: Disk failure on %s, disabling device.\n" 1950 "md/raid:%s: Operation continuing on %d devices.\n", 1951 mdname(mddev), 1952 bdevname(rdev->bdev, b), 1953 mdname(mddev), 1954 conf->raid_disks - mddev->degraded); 1955 } 1956 1957 /* 1958 * Input: a 'big' sector number, 1959 * Output: index of the data and parity disk, and the sector # in them. 1960 */ 1961 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1962 int previous, int *dd_idx, 1963 struct stripe_head *sh) 1964 { 1965 sector_t stripe, stripe2; 1966 sector_t chunk_number; 1967 unsigned int chunk_offset; 1968 int pd_idx, qd_idx; 1969 int ddf_layout = 0; 1970 sector_t new_sector; 1971 int algorithm = previous ? conf->prev_algo 1972 : conf->algorithm; 1973 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1974 : conf->chunk_sectors; 1975 int raid_disks = previous ? 
conf->previous_raid_disks 1976 : conf->raid_disks; 1977 int data_disks = raid_disks - conf->max_degraded; 1978 1979 /* First compute the information on this sector */ 1980 1981 /* 1982 * Compute the chunk number and the sector offset inside the chunk 1983 */ 1984 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1985 chunk_number = r_sector; 1986 1987 /* 1988 * Compute the stripe number 1989 */ 1990 stripe = chunk_number; 1991 *dd_idx = sector_div(stripe, data_disks); 1992 stripe2 = stripe; 1993 /* 1994 * Select the parity disk based on the user selected algorithm. 1995 */ 1996 pd_idx = qd_idx = -1; 1997 switch(conf->level) { 1998 case 4: 1999 pd_idx = data_disks; 2000 break; 2001 case 5: 2002 switch (algorithm) { 2003 case ALGORITHM_LEFT_ASYMMETRIC: 2004 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2005 if (*dd_idx >= pd_idx) 2006 (*dd_idx)++; 2007 break; 2008 case ALGORITHM_RIGHT_ASYMMETRIC: 2009 pd_idx = sector_div(stripe2, raid_disks); 2010 if (*dd_idx >= pd_idx) 2011 (*dd_idx)++; 2012 break; 2013 case ALGORITHM_LEFT_SYMMETRIC: 2014 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2015 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2016 break; 2017 case ALGORITHM_RIGHT_SYMMETRIC: 2018 pd_idx = sector_div(stripe2, raid_disks); 2019 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2020 break; 2021 case ALGORITHM_PARITY_0: 2022 pd_idx = 0; 2023 (*dd_idx)++; 2024 break; 2025 case ALGORITHM_PARITY_N: 2026 pd_idx = data_disks; 2027 break; 2028 default: 2029 BUG(); 2030 } 2031 break; 2032 case 6: 2033 2034 switch (algorithm) { 2035 case ALGORITHM_LEFT_ASYMMETRIC: 2036 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2037 qd_idx = pd_idx + 1; 2038 if (pd_idx == raid_disks-1) { 2039 (*dd_idx)++; /* Q D D D P */ 2040 qd_idx = 0; 2041 } else if (*dd_idx >= pd_idx) 2042 (*dd_idx) += 2; /* D D P Q D */ 2043 break; 2044 case ALGORITHM_RIGHT_ASYMMETRIC: 2045 pd_idx = sector_div(stripe2, raid_disks); 2046 qd_idx = pd_idx + 1; 2047 if (pd_idx == raid_disks-1) { 2048 (*dd_idx)++; /* Q D D D P */ 2049 qd_idx = 0; 2050 } else if (*dd_idx >= pd_idx) 2051 (*dd_idx) += 2; /* D D P Q D */ 2052 break; 2053 case ALGORITHM_LEFT_SYMMETRIC: 2054 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2055 qd_idx = (pd_idx + 1) % raid_disks; 2056 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2057 break; 2058 case ALGORITHM_RIGHT_SYMMETRIC: 2059 pd_idx = sector_div(stripe2, raid_disks); 2060 qd_idx = (pd_idx + 1) % raid_disks; 2061 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2062 break; 2063 2064 case ALGORITHM_PARITY_0: 2065 pd_idx = 0; 2066 qd_idx = 1; 2067 (*dd_idx) += 2; 2068 break; 2069 case ALGORITHM_PARITY_N: 2070 pd_idx = data_disks; 2071 qd_idx = data_disks + 1; 2072 break; 2073 2074 case ALGORITHM_ROTATING_ZERO_RESTART: 2075 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2076 * of blocks for computing Q is different. 
2077 */ 2078 pd_idx = sector_div(stripe2, raid_disks); 2079 qd_idx = pd_idx + 1; 2080 if (pd_idx == raid_disks-1) { 2081 (*dd_idx)++; /* Q D D D P */ 2082 qd_idx = 0; 2083 } else if (*dd_idx >= pd_idx) 2084 (*dd_idx) += 2; /* D D P Q D */ 2085 ddf_layout = 1; 2086 break; 2087 2088 case ALGORITHM_ROTATING_N_RESTART: 2089 /* Same a left_asymmetric, by first stripe is 2090 * D D D P Q rather than 2091 * Q D D D P 2092 */ 2093 stripe2 += 1; 2094 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2095 qd_idx = pd_idx + 1; 2096 if (pd_idx == raid_disks-1) { 2097 (*dd_idx)++; /* Q D D D P */ 2098 qd_idx = 0; 2099 } else if (*dd_idx >= pd_idx) 2100 (*dd_idx) += 2; /* D D P Q D */ 2101 ddf_layout = 1; 2102 break; 2103 2104 case ALGORITHM_ROTATING_N_CONTINUE: 2105 /* Same as left_symmetric but Q is before P */ 2106 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2107 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2108 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2109 ddf_layout = 1; 2110 break; 2111 2112 case ALGORITHM_LEFT_ASYMMETRIC_6: 2113 /* RAID5 left_asymmetric, with Q on last device */ 2114 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2115 if (*dd_idx >= pd_idx) 2116 (*dd_idx)++; 2117 qd_idx = raid_disks - 1; 2118 break; 2119 2120 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2121 pd_idx = sector_div(stripe2, raid_disks-1); 2122 if (*dd_idx >= pd_idx) 2123 (*dd_idx)++; 2124 qd_idx = raid_disks - 1; 2125 break; 2126 2127 case ALGORITHM_LEFT_SYMMETRIC_6: 2128 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2129 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2130 qd_idx = raid_disks - 1; 2131 break; 2132 2133 case ALGORITHM_RIGHT_SYMMETRIC_6: 2134 pd_idx = sector_div(stripe2, raid_disks-1); 2135 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2136 qd_idx = raid_disks - 1; 2137 break; 2138 2139 case ALGORITHM_PARITY_0_6: 2140 pd_idx = 0; 2141 (*dd_idx)++; 2142 qd_idx = raid_disks - 1; 2143 break; 2144 2145 default: 2146 BUG(); 2147 } 2148 break; 2149 } 2150 2151 if (sh) { 2152 sh->pd_idx = pd_idx; 2153 sh->qd_idx = qd_idx; 2154 sh->ddf_layout = ddf_layout; 2155 } 2156 /* 2157 * Finally, compute the new sector number 2158 */ 2159 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2160 return new_sector; 2161 } 2162 2163 2164 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2165 { 2166 struct r5conf *conf = sh->raid_conf; 2167 int raid_disks = sh->disks; 2168 int data_disks = raid_disks - conf->max_degraded; 2169 sector_t new_sector = sh->sector, check; 2170 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2171 : conf->chunk_sectors; 2172 int algorithm = previous ? 
conf->prev_algo 2173 : conf->algorithm; 2174 sector_t stripe; 2175 int chunk_offset; 2176 sector_t chunk_number; 2177 int dummy1, dd_idx = i; 2178 sector_t r_sector; 2179 struct stripe_head sh2; 2180 2181 2182 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2183 stripe = new_sector; 2184 2185 if (i == sh->pd_idx) 2186 return 0; 2187 switch(conf->level) { 2188 case 4: break; 2189 case 5: 2190 switch (algorithm) { 2191 case ALGORITHM_LEFT_ASYMMETRIC: 2192 case ALGORITHM_RIGHT_ASYMMETRIC: 2193 if (i > sh->pd_idx) 2194 i--; 2195 break; 2196 case ALGORITHM_LEFT_SYMMETRIC: 2197 case ALGORITHM_RIGHT_SYMMETRIC: 2198 if (i < sh->pd_idx) 2199 i += raid_disks; 2200 i -= (sh->pd_idx + 1); 2201 break; 2202 case ALGORITHM_PARITY_0: 2203 i -= 1; 2204 break; 2205 case ALGORITHM_PARITY_N: 2206 break; 2207 default: 2208 BUG(); 2209 } 2210 break; 2211 case 6: 2212 if (i == sh->qd_idx) 2213 return 0; /* It is the Q disk */ 2214 switch (algorithm) { 2215 case ALGORITHM_LEFT_ASYMMETRIC: 2216 case ALGORITHM_RIGHT_ASYMMETRIC: 2217 case ALGORITHM_ROTATING_ZERO_RESTART: 2218 case ALGORITHM_ROTATING_N_RESTART: 2219 if (sh->pd_idx == raid_disks-1) 2220 i--; /* Q D D D P */ 2221 else if (i > sh->pd_idx) 2222 i -= 2; /* D D P Q D */ 2223 break; 2224 case ALGORITHM_LEFT_SYMMETRIC: 2225 case ALGORITHM_RIGHT_SYMMETRIC: 2226 if (sh->pd_idx == raid_disks-1) 2227 i--; /* Q D D D P */ 2228 else { 2229 /* D D P Q D */ 2230 if (i < sh->pd_idx) 2231 i += raid_disks; 2232 i -= (sh->pd_idx + 2); 2233 } 2234 break; 2235 case ALGORITHM_PARITY_0: 2236 i -= 2; 2237 break; 2238 case ALGORITHM_PARITY_N: 2239 break; 2240 case ALGORITHM_ROTATING_N_CONTINUE: 2241 /* Like left_symmetric, but P is before Q */ 2242 if (sh->pd_idx == 0) 2243 i--; /* P D D D Q */ 2244 else { 2245 /* D D Q P D */ 2246 if (i < sh->pd_idx) 2247 i += raid_disks; 2248 i -= (sh->pd_idx + 1); 2249 } 2250 break; 2251 case ALGORITHM_LEFT_ASYMMETRIC_6: 2252 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2253 if (i > sh->pd_idx) 2254 i--; 2255 break; 2256 case ALGORITHM_LEFT_SYMMETRIC_6: 2257 case ALGORITHM_RIGHT_SYMMETRIC_6: 2258 if (i < sh->pd_idx) 2259 i += data_disks + 1; 2260 i -= (sh->pd_idx + 1); 2261 break; 2262 case ALGORITHM_PARITY_0_6: 2263 i -= 1; 2264 break; 2265 default: 2266 BUG(); 2267 } 2268 break; 2269 } 2270 2271 chunk_number = stripe * data_disks + i; 2272 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2273 2274 check = raid5_compute_sector(conf, r_sector, 2275 previous, &dummy1, &sh2); 2276 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2277 || sh2.qd_idx != sh->qd_idx) { 2278 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2279 mdname(conf->mddev)); 2280 return 0; 2281 } 2282 return r_sector; 2283 } 2284 2285 2286 static void 2287 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2288 int rcw, int expand) 2289 { 2290 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2291 struct r5conf *conf = sh->raid_conf; 2292 int level = conf->level; 2293 2294 if (rcw) { 2295 2296 for (i = disks; i--; ) { 2297 struct r5dev *dev = &sh->dev[i]; 2298 2299 if (dev->towrite) { 2300 set_bit(R5_LOCKED, &dev->flags); 2301 set_bit(R5_Wantdrain, &dev->flags); 2302 if (!expand) 2303 clear_bit(R5_UPTODATE, &dev->flags); 2304 s->locked++; 2305 } 2306 } 2307 /* if we are not expanding this is a proper write request, and 2308 * there will be bios with new data to be drained into the 2309 * stripe cache 2310 */ 2311 if (!expand) { 2312 if (!s->locked) 2313 /* False alarm, nothing to do */ 2314 return; 
2315 sh->reconstruct_state = reconstruct_state_drain_run; 2316 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2317 } else 2318 sh->reconstruct_state = reconstruct_state_run; 2319 2320 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2321 2322 if (s->locked + conf->max_degraded == disks) 2323 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2324 atomic_inc(&conf->pending_full_writes); 2325 } else { 2326 BUG_ON(level == 6); 2327 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2328 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2329 2330 for (i = disks; i--; ) { 2331 struct r5dev *dev = &sh->dev[i]; 2332 if (i == pd_idx) 2333 continue; 2334 2335 if (dev->towrite && 2336 (test_bit(R5_UPTODATE, &dev->flags) || 2337 test_bit(R5_Wantcompute, &dev->flags))) { 2338 set_bit(R5_Wantdrain, &dev->flags); 2339 set_bit(R5_LOCKED, &dev->flags); 2340 clear_bit(R5_UPTODATE, &dev->flags); 2341 s->locked++; 2342 } 2343 } 2344 if (!s->locked) 2345 /* False alarm - nothing to do */ 2346 return; 2347 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2348 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2349 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2350 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2351 } 2352 2353 /* keep the parity disk(s) locked while asynchronous operations 2354 * are in flight 2355 */ 2356 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2357 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2358 s->locked++; 2359 2360 if (level == 6) { 2361 int qd_idx = sh->qd_idx; 2362 struct r5dev *dev = &sh->dev[qd_idx]; 2363 2364 set_bit(R5_LOCKED, &dev->flags); 2365 clear_bit(R5_UPTODATE, &dev->flags); 2366 s->locked++; 2367 } 2368 2369 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2370 __func__, (unsigned long long)sh->sector, 2371 s->locked, s->ops_request); 2372 } 2373 2374 /* 2375 * Each stripe/dev can have one or more bios attached. 2376 * toread/towrite point to the first in a chain. 2377 * The bi_next chain must be in order. 2378 */ 2379 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2380 { 2381 struct bio **bip; 2382 struct r5conf *conf = sh->raid_conf; 2383 int firstwrite=0; 2384 2385 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2386 (unsigned long long)bi->bi_sector, 2387 (unsigned long long)sh->sector); 2388 2389 /* 2390 * If several bios share a stripe, the bio bi_phys_segments field acts as a 2391 * reference count to avoid races. The reference count should already be 2392 * increased before this function is called (for example, in 2393 * make_request()), so other bios sharing this stripe will not free the 2394 * stripe. If the stripe is referenced by only one bio, the stripe lock will 2395 * protect it.
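 *
 * Rough example of the counting (illustrative sizes): a bio that
 * covers four stripes has raid5_inc_bi_active_stripes() called once
 * per stripe as it is added below, in addition to the initial
 * reference taken by the submitter; the bio is only completed once
 * raid5_dec_bi_active_stripes() has brought the active count in the
 * low 16 bits back down to zero.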
2396 */ 2397 spin_lock_irq(&sh->stripe_lock); 2398 if (forwrite) { 2399 bip = &sh->dev[dd_idx].towrite; 2400 if (*bip == NULL) 2401 firstwrite = 1; 2402 } else 2403 bip = &sh->dev[dd_idx].toread; 2404 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2405 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2406 goto overlap; 2407 bip = & (*bip)->bi_next; 2408 } 2409 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2410 goto overlap; 2411 2412 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2413 if (*bip) 2414 bi->bi_next = *bip; 2415 *bip = bi; 2416 raid5_inc_bi_active_stripes(bi); 2417 2418 if (forwrite) { 2419 /* check if page is covered */ 2420 sector_t sector = sh->dev[dd_idx].sector; 2421 for (bi=sh->dev[dd_idx].towrite; 2422 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2423 bi && bi->bi_sector <= sector; 2424 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2425 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2426 sector = bi->bi_sector + (bi->bi_size>>9); 2427 } 2428 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2429 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2430 } 2431 2432 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2433 (unsigned long long)(*bip)->bi_sector, 2434 (unsigned long long)sh->sector, dd_idx); 2435 spin_unlock_irq(&sh->stripe_lock); 2436 2437 if (conf->mddev->bitmap && firstwrite) { 2438 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2439 STRIPE_SECTORS, 0); 2440 sh->bm_seq = conf->seq_flush+1; 2441 set_bit(STRIPE_BIT_DELAY, &sh->state); 2442 } 2443 return 1; 2444 2445 overlap: 2446 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2447 spin_unlock_irq(&sh->stripe_lock); 2448 return 0; 2449 } 2450 2451 static void end_reshape(struct r5conf *conf); 2452 2453 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2454 struct stripe_head *sh) 2455 { 2456 int sectors_per_chunk = 2457 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2458 int dd_idx; 2459 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2460 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2461 2462 raid5_compute_sector(conf, 2463 stripe * (disks - conf->max_degraded) 2464 *sectors_per_chunk + chunk_offset, 2465 previous, 2466 &dd_idx, sh); 2467 } 2468 2469 static void 2470 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2471 struct stripe_head_state *s, int disks, 2472 struct bio **return_bi) 2473 { 2474 int i; 2475 for (i = disks; i--; ) { 2476 struct bio *bi; 2477 int bitmap_end = 0; 2478 2479 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2480 struct md_rdev *rdev; 2481 rcu_read_lock(); 2482 rdev = rcu_dereference(conf->disks[i].rdev); 2483 if (rdev && test_bit(In_sync, &rdev->flags)) 2484 atomic_inc(&rdev->nr_pending); 2485 else 2486 rdev = NULL; 2487 rcu_read_unlock(); 2488 if (rdev) { 2489 if (!rdev_set_badblocks( 2490 rdev, 2491 sh->sector, 2492 STRIPE_SECTORS, 0)) 2493 md_error(conf->mddev, rdev); 2494 rdev_dec_pending(rdev, conf->mddev); 2495 } 2496 } 2497 spin_lock_irq(&sh->stripe_lock); 2498 /* fail all writes first */ 2499 bi = sh->dev[i].towrite; 2500 sh->dev[i].towrite = NULL; 2501 spin_unlock_irq(&sh->stripe_lock); 2502 if (bi) 2503 bitmap_end = 1; 2504 2505 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2506 wake_up(&conf->wait_for_overlap); 2507 2508 while (bi && bi->bi_sector < 2509 sh->dev[i].sector + STRIPE_SECTORS) { 2510 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2511 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2512 if (!raid5_dec_bi_active_stripes(bi)) { 2513 md_write_end(conf->mddev); 2514 bi->bi_next = *return_bi; 2515 *return_bi = bi; 2516 } 2517 bi = nextbi; 2518 } 2519 if (bitmap_end) 2520 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2521 STRIPE_SECTORS, 0, 0); 2522 bitmap_end = 0; 2523 /* and fail all 'written' */ 2524 bi = sh->dev[i].written; 2525 sh->dev[i].written = NULL; 2526 if (bi) bitmap_end = 1; 2527 while (bi && bi->bi_sector < 2528 sh->dev[i].sector + STRIPE_SECTORS) { 2529 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2530 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2531 if (!raid5_dec_bi_active_stripes(bi)) { 2532 md_write_end(conf->mddev); 2533 bi->bi_next = *return_bi; 2534 *return_bi = bi; 2535 } 2536 bi = bi2; 2537 } 2538 2539 /* fail any reads if this device is non-operational and 2540 * the data has not reached the cache yet. 
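 *
 * (R5_Wantfill, tested first below, means a biofill copy of data
 * already in the stripe cache into the bio has been scheduled, so
 * such reads can still complete normally and are left alone here.)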
2541 */ 2542 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2543 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2544 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2545 spin_lock_irq(&sh->stripe_lock); 2546 bi = sh->dev[i].toread; 2547 sh->dev[i].toread = NULL; 2548 spin_unlock_irq(&sh->stripe_lock); 2549 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2550 wake_up(&conf->wait_for_overlap); 2551 while (bi && bi->bi_sector < 2552 sh->dev[i].sector + STRIPE_SECTORS) { 2553 struct bio *nextbi = 2554 r5_next_bio(bi, sh->dev[i].sector); 2555 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2556 if (!raid5_dec_bi_active_stripes(bi)) { 2557 bi->bi_next = *return_bi; 2558 *return_bi = bi; 2559 } 2560 bi = nextbi; 2561 } 2562 } 2563 if (bitmap_end) 2564 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2565 STRIPE_SECTORS, 0, 0); 2566 /* If we were in the middle of a write the parity block might 2567 * still be locked - so just clear all R5_LOCKED flags 2568 */ 2569 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2570 } 2571 2572 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2573 if (atomic_dec_and_test(&conf->pending_full_writes)) 2574 md_wakeup_thread(conf->mddev->thread); 2575 } 2576 2577 static void 2578 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2579 struct stripe_head_state *s) 2580 { 2581 int abort = 0; 2582 int i; 2583 2584 clear_bit(STRIPE_SYNCING, &sh->state); 2585 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 2586 wake_up(&conf->wait_for_overlap); 2587 s->syncing = 0; 2588 s->replacing = 0; 2589 /* There is nothing more to do for sync/check/repair. 2590 * Don't even need to abort as that is handled elsewhere 2591 * if needed, and not always wanted e.g. if there is a known 2592 * bad block here. 2593 * For recover/replace we need to record a bad block on all 2594 * non-sync devices, or abort the recovery 2595 */ 2596 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2597 /* During recovery devices cannot be removed, so 2598 * locking and refcounting of rdevs is not needed 2599 */ 2600 for (i = 0; i < conf->raid_disks; i++) { 2601 struct md_rdev *rdev = conf->disks[i].rdev; 2602 if (rdev 2603 && !test_bit(Faulty, &rdev->flags) 2604 && !test_bit(In_sync, &rdev->flags) 2605 && !rdev_set_badblocks(rdev, sh->sector, 2606 STRIPE_SECTORS, 0)) 2607 abort = 1; 2608 rdev = conf->disks[i].replacement; 2609 if (rdev 2610 && !test_bit(Faulty, &rdev->flags) 2611 && !test_bit(In_sync, &rdev->flags) 2612 && !rdev_set_badblocks(rdev, sh->sector, 2613 STRIPE_SECTORS, 0)) 2614 abort = 1; 2615 } 2616 if (abort) 2617 conf->recovery_disabled = 2618 conf->mddev->recovery_disabled; 2619 } 2620 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2621 } 2622 2623 static int want_replace(struct stripe_head *sh, int disk_idx) 2624 { 2625 struct md_rdev *rdev; 2626 int rv = 0; 2627 /* Doing recovery so rcu locking not required */ 2628 rdev = sh->raid_conf->disks[disk_idx].replacement; 2629 if (rdev 2630 && !test_bit(Faulty, &rdev->flags) 2631 && !test_bit(In_sync, &rdev->flags) 2632 && (rdev->recovery_offset <= sh->sector 2633 || rdev->mddev->recovery_cp <= sh->sector)) 2634 rv = 1; 2635 2636 return rv; 2637 } 2638 2639 /* fetch_block - checks the given member device to see if its data needs 2640 * to be read or computed to satisfy a request. 
2641 * 2642 * Returns 1 when no more member devices need to be checked, otherwise returns 2643 * 0 to tell the loop in handle_stripe_fill to continue 2644 */ 2645 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2646 int disk_idx, int disks) 2647 { 2648 struct r5dev *dev = &sh->dev[disk_idx]; 2649 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2650 &sh->dev[s->failed_num[1]] }; 2651 2652 /* is the data in this block needed, and can we get it? */ 2653 if (!test_bit(R5_LOCKED, &dev->flags) && 2654 !test_bit(R5_UPTODATE, &dev->flags) && 2655 (dev->toread || 2656 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2657 s->syncing || s->expanding || 2658 (s->replacing && want_replace(sh, disk_idx)) || 2659 (s->failed >= 1 && fdev[0]->toread) || 2660 (s->failed >= 2 && fdev[1]->toread) || 2661 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2662 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2663 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2664 /* we would like to get this block, possibly by computing it, 2665 * otherwise read it if the backing disk is insync 2666 */ 2667 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2668 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2669 if ((s->uptodate == disks - 1) && 2670 (s->failed && (disk_idx == s->failed_num[0] || 2671 disk_idx == s->failed_num[1]))) { 2672 /* have disk failed, and we're requested to fetch it; 2673 * do compute it 2674 */ 2675 pr_debug("Computing stripe %llu block %d\n", 2676 (unsigned long long)sh->sector, disk_idx); 2677 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2678 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2679 set_bit(R5_Wantcompute, &dev->flags); 2680 sh->ops.target = disk_idx; 2681 sh->ops.target2 = -1; /* no 2nd target */ 2682 s->req_compute = 1; 2683 /* Careful: from this point on 'uptodate' is in the eye 2684 * of raid_run_ops which services 'compute' operations 2685 * before writes. R5_Wantcompute flags a block that will 2686 * be R5_UPTODATE by the time it is needed for a 2687 * subsequent operation. 2688 */ 2689 s->uptodate++; 2690 return 1; 2691 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2692 /* Computing 2-failure is *very* expensive; only 2693 * do it if failed >= 2 2694 */ 2695 int other; 2696 for (other = disks; other--; ) { 2697 if (other == disk_idx) 2698 continue; 2699 if (!test_bit(R5_UPTODATE, 2700 &sh->dev[other].flags)) 2701 break; 2702 } 2703 BUG_ON(other < 0); 2704 pr_debug("Computing stripe %llu blocks %d,%d\n", 2705 (unsigned long long)sh->sector, 2706 disk_idx, other); 2707 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2708 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2709 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2710 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2711 sh->ops.target = disk_idx; 2712 sh->ops.target2 = other; 2713 s->uptodate += 2; 2714 s->req_compute = 1; 2715 return 1; 2716 } else if (test_bit(R5_Insync, &dev->flags)) { 2717 set_bit(R5_LOCKED, &dev->flags); 2718 set_bit(R5_Wantread, &dev->flags); 2719 s->locked++; 2720 pr_debug("Reading block %d (sync=%d)\n", 2721 disk_idx, s->syncing); 2722 } 2723 } 2724 2725 return 0; 2726 } 2727 2728 /** 2729 * handle_stripe_fill - read or compute data to satisfy pending requests. 
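 * @sh: stripe to examine
 * @s: stripe state as gathered by analyse_stripe()
 * @disks: number of devices in the stripe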
2730 */ 2731 static void handle_stripe_fill(struct stripe_head *sh, 2732 struct stripe_head_state *s, 2733 int disks) 2734 { 2735 int i; 2736 2737 /* look for blocks to read/compute, skip this if a compute 2738 * is already in flight, or if the stripe contents are in the 2739 * midst of changing due to a write 2740 */ 2741 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2742 !sh->reconstruct_state) 2743 for (i = disks; i--; ) 2744 if (fetch_block(sh, s, i, disks)) 2745 break; 2746 set_bit(STRIPE_HANDLE, &sh->state); 2747 } 2748 2749 2750 /* handle_stripe_clean_event 2751 * any written block on an uptodate or failed drive can be returned. 2752 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2753 * never LOCKED, so we don't need to test 'failed' directly. 2754 */ 2755 static void handle_stripe_clean_event(struct r5conf *conf, 2756 struct stripe_head *sh, int disks, struct bio **return_bi) 2757 { 2758 int i; 2759 struct r5dev *dev; 2760 int discard_pending = 0; 2761 2762 for (i = disks; i--; ) 2763 if (sh->dev[i].written) { 2764 dev = &sh->dev[i]; 2765 if (!test_bit(R5_LOCKED, &dev->flags) && 2766 (test_bit(R5_UPTODATE, &dev->flags) || 2767 test_bit(R5_Discard, &dev->flags))) { 2768 /* We can return any write requests */ 2769 struct bio *wbi, *wbi2; 2770 pr_debug("Return write for disc %d\n", i); 2771 if (test_and_clear_bit(R5_Discard, &dev->flags)) 2772 clear_bit(R5_UPTODATE, &dev->flags); 2773 wbi = dev->written; 2774 dev->written = NULL; 2775 while (wbi && wbi->bi_sector < 2776 dev->sector + STRIPE_SECTORS) { 2777 wbi2 = r5_next_bio(wbi, dev->sector); 2778 if (!raid5_dec_bi_active_stripes(wbi)) { 2779 md_write_end(conf->mddev); 2780 wbi->bi_next = *return_bi; 2781 *return_bi = wbi; 2782 } 2783 wbi = wbi2; 2784 } 2785 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2786 STRIPE_SECTORS, 2787 !test_bit(STRIPE_DEGRADED, &sh->state), 2788 0); 2789 } else if (test_bit(R5_Discard, &dev->flags)) 2790 discard_pending = 1; 2791 } 2792 if (!discard_pending && 2793 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 2794 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 2795 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2796 if (sh->qd_idx >= 0) { 2797 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 2798 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 2799 } 2800 /* now that discard is done we can proceed with any sync */ 2801 clear_bit(STRIPE_DISCARD, &sh->state); 2802 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 2803 set_bit(STRIPE_HANDLE, &sh->state); 2804 2805 } 2806 2807 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2808 if (atomic_dec_and_test(&conf->pending_full_writes)) 2809 md_wakeup_thread(conf->mddev->thread); 2810 } 2811 2812 static void handle_stripe_dirtying(struct r5conf *conf, 2813 struct stripe_head *sh, 2814 struct stripe_head_state *s, 2815 int disks) 2816 { 2817 int rmw = 0, rcw = 0, i; 2818 sector_t recovery_cp = conf->mddev->recovery_cp; 2819 2820 /* RAID6 requires 'rcw' in current implementation. 2821 * Otherwise, check whether resync is now happening or should start. 2822 * If yes, then the array is dirty (after unclean shutdown or 2823 * initial creation), so parity in some stripes might be inconsistent. 2824 * In this case, we need to always do reconstruct-write, to ensure 2825 * that in case of drive failure or read-error correction, we 2826 * generate correct data from the parity. 
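 *
 * As a rough illustration of the rmw/rcw costs counted below: on a
 * 5-drive RAID5 (4 data + 1 parity), updating a single data block
 * takes two reads for read-modify-write (old data + old parity) but
 * three for reconstruct-write (the untouched data blocks), so rmw is
 * preferred; overwrite three of the four data blocks and the
 * comparison flips the other way.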
2827 */ 2828 if (conf->max_degraded == 2 || 2829 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { 2830 /* Calculate the real rcw later - for now make it 2831 * look like rcw is cheaper 2832 */ 2833 rcw = 1; rmw = 2; 2834 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 2835 conf->max_degraded, (unsigned long long)recovery_cp, 2836 (unsigned long long)sh->sector); 2837 } else for (i = disks; i--; ) { 2838 /* would I have to read this buffer for read_modify_write */ 2839 struct r5dev *dev = &sh->dev[i]; 2840 if ((dev->towrite || i == sh->pd_idx) && 2841 !test_bit(R5_LOCKED, &dev->flags) && 2842 !(test_bit(R5_UPTODATE, &dev->flags) || 2843 test_bit(R5_Wantcompute, &dev->flags))) { 2844 if (test_bit(R5_Insync, &dev->flags)) 2845 rmw++; 2846 else 2847 rmw += 2*disks; /* cannot read it */ 2848 } 2849 /* Would I have to read this buffer for reconstruct_write */ 2850 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2851 !test_bit(R5_LOCKED, &dev->flags) && 2852 !(test_bit(R5_UPTODATE, &dev->flags) || 2853 test_bit(R5_Wantcompute, &dev->flags))) { 2854 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2855 else 2856 rcw += 2*disks; 2857 } 2858 } 2859 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2860 (unsigned long long)sh->sector, rmw, rcw); 2861 set_bit(STRIPE_HANDLE, &sh->state); 2862 if (rmw < rcw && rmw > 0) { 2863 /* prefer read-modify-write, but need to get some data */ 2864 if (conf->mddev->queue) 2865 blk_add_trace_msg(conf->mddev->queue, 2866 "raid5 rmw %llu %d", 2867 (unsigned long long)sh->sector, rmw); 2868 for (i = disks; i--; ) { 2869 struct r5dev *dev = &sh->dev[i]; 2870 if ((dev->towrite || i == sh->pd_idx) && 2871 !test_bit(R5_LOCKED, &dev->flags) && 2872 !(test_bit(R5_UPTODATE, &dev->flags) || 2873 test_bit(R5_Wantcompute, &dev->flags)) && 2874 test_bit(R5_Insync, &dev->flags)) { 2875 if ( 2876 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2877 pr_debug("Read_old block " 2878 "%d for r-m-w\n", i); 2879 set_bit(R5_LOCKED, &dev->flags); 2880 set_bit(R5_Wantread, &dev->flags); 2881 s->locked++; 2882 } else { 2883 set_bit(STRIPE_DELAYED, &sh->state); 2884 set_bit(STRIPE_HANDLE, &sh->state); 2885 } 2886 } 2887 } 2888 } 2889 if (rcw <= rmw && rcw > 0) { 2890 /* want reconstruct write, but need to get some data */ 2891 int qread =0; 2892 rcw = 0; 2893 for (i = disks; i--; ) { 2894 struct r5dev *dev = &sh->dev[i]; 2895 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2896 i != sh->pd_idx && i != sh->qd_idx && 2897 !test_bit(R5_LOCKED, &dev->flags) && 2898 !(test_bit(R5_UPTODATE, &dev->flags) || 2899 test_bit(R5_Wantcompute, &dev->flags))) { 2900 rcw++; 2901 if (!test_bit(R5_Insync, &dev->flags)) 2902 continue; /* it's a failed drive */ 2903 if ( 2904 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2905 pr_debug("Read_old block " 2906 "%d for Reconstruct\n", i); 2907 set_bit(R5_LOCKED, &dev->flags); 2908 set_bit(R5_Wantread, &dev->flags); 2909 s->locked++; 2910 qread++; 2911 } else { 2912 set_bit(STRIPE_DELAYED, &sh->state); 2913 set_bit(STRIPE_HANDLE, &sh->state); 2914 } 2915 } 2916 } 2917 if (rcw && conf->mddev->queue) 2918 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 2919 (unsigned long long)sh->sector, 2920 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 2921 } 2922 /* now if nothing is locked, and if we have enough data, 2923 * we can start a write request 2924 */ 2925 /* since handle_stripe can be called at any time we need to handle the 2926 * case where a compute block operation has been submitted and then a 2927 * 
subsequent call wants to start a write request. raid_run_ops only 2928 * handles the case where compute block and reconstruct are requested 2929 * simultaneously. If this is not the case then new writes need to be 2930 * held off until the compute completes. 2931 */ 2932 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2933 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2934 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2935 schedule_reconstruction(sh, s, rcw == 0, 0); 2936 } 2937 2938 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2939 struct stripe_head_state *s, int disks) 2940 { 2941 struct r5dev *dev = NULL; 2942 2943 set_bit(STRIPE_HANDLE, &sh->state); 2944 2945 switch (sh->check_state) { 2946 case check_state_idle: 2947 /* start a new check operation if there are no failures */ 2948 if (s->failed == 0) { 2949 BUG_ON(s->uptodate != disks); 2950 sh->check_state = check_state_run; 2951 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2952 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2953 s->uptodate--; 2954 break; 2955 } 2956 dev = &sh->dev[s->failed_num[0]]; 2957 /* fall through */ 2958 case check_state_compute_result: 2959 sh->check_state = check_state_idle; 2960 if (!dev) 2961 dev = &sh->dev[sh->pd_idx]; 2962 2963 /* check that a write has not made the stripe insync */ 2964 if (test_bit(STRIPE_INSYNC, &sh->state)) 2965 break; 2966 2967 /* either failed parity check, or recovery is happening */ 2968 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2969 BUG_ON(s->uptodate != disks); 2970 2971 set_bit(R5_LOCKED, &dev->flags); 2972 s->locked++; 2973 set_bit(R5_Wantwrite, &dev->flags); 2974 2975 clear_bit(STRIPE_DEGRADED, &sh->state); 2976 set_bit(STRIPE_INSYNC, &sh->state); 2977 break; 2978 case check_state_run: 2979 break; /* we will be called again upon completion */ 2980 case check_state_check_result: 2981 sh->check_state = check_state_idle; 2982 2983 /* if a failure occurred during the check operation, leave 2984 * STRIPE_INSYNC not set and let the stripe be handled again 2985 */ 2986 if (s->failed) 2987 break; 2988 2989 /* handle a successful check operation, if parity is correct 2990 * we are done. Otherwise update the mismatch count and repair 2991 * parity if !MD_RECOVERY_CHECK 2992 */ 2993 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2994 /* parity is correct (on disc, 2995 * not in buffer any more) 2996 */ 2997 set_bit(STRIPE_INSYNC, &sh->state); 2998 else { 2999 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3000 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3001 /* don't try to repair!! 
*/ 3002 set_bit(STRIPE_INSYNC, &sh->state); 3003 else { 3004 sh->check_state = check_state_compute_run; 3005 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3006 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3007 set_bit(R5_Wantcompute, 3008 &sh->dev[sh->pd_idx].flags); 3009 sh->ops.target = sh->pd_idx; 3010 sh->ops.target2 = -1; 3011 s->uptodate++; 3012 } 3013 } 3014 break; 3015 case check_state_compute_run: 3016 break; 3017 default: 3018 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3019 __func__, sh->check_state, 3020 (unsigned long long) sh->sector); 3021 BUG(); 3022 } 3023 } 3024 3025 3026 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3027 struct stripe_head_state *s, 3028 int disks) 3029 { 3030 int pd_idx = sh->pd_idx; 3031 int qd_idx = sh->qd_idx; 3032 struct r5dev *dev; 3033 3034 set_bit(STRIPE_HANDLE, &sh->state); 3035 3036 BUG_ON(s->failed > 2); 3037 3038 /* Want to check and possibly repair P and Q. 3039 * However there could be one 'failed' device, in which 3040 * case we can only check one of them, possibly using the 3041 * other to generate missing data 3042 */ 3043 3044 switch (sh->check_state) { 3045 case check_state_idle: 3046 /* start a new check operation if there are < 2 failures */ 3047 if (s->failed == s->q_failed) { 3048 /* The only possible failed device holds Q, so it 3049 * makes sense to check P (If anything else were failed, 3050 * we would have used P to recreate it). 3051 */ 3052 sh->check_state = check_state_run; 3053 } 3054 if (!s->q_failed && s->failed < 2) { 3055 /* Q is not failed, and we didn't use it to generate 3056 * anything, so it makes sense to check it 3057 */ 3058 if (sh->check_state == check_state_run) 3059 sh->check_state = check_state_run_pq; 3060 else 3061 sh->check_state = check_state_run_q; 3062 } 3063 3064 /* discard potentially stale zero_sum_result */ 3065 sh->ops.zero_sum_result = 0; 3066 3067 if (sh->check_state == check_state_run) { 3068 /* async_xor_zero_sum destroys the contents of P */ 3069 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3070 s->uptodate--; 3071 } 3072 if (sh->check_state >= check_state_run && 3073 sh->check_state <= check_state_run_pq) { 3074 /* async_syndrome_zero_sum preserves P and Q, so 3075 * no need to mark them !uptodate here 3076 */ 3077 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3078 break; 3079 } 3080 3081 /* we have 2-disk failure */ 3082 BUG_ON(s->failed != 2); 3083 /* fall through */ 3084 case check_state_compute_result: 3085 sh->check_state = check_state_idle; 3086 3087 /* check that a write has not made the stripe insync */ 3088 if (test_bit(STRIPE_INSYNC, &sh->state)) 3089 break; 3090 3091 /* now write out any block on a failed drive, 3092 * or P or Q if they were recomputed 3093 */ 3094 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3095 if (s->failed == 2) { 3096 dev = &sh->dev[s->failed_num[1]]; 3097 s->locked++; 3098 set_bit(R5_LOCKED, &dev->flags); 3099 set_bit(R5_Wantwrite, &dev->flags); 3100 } 3101 if (s->failed >= 1) { 3102 dev = &sh->dev[s->failed_num[0]]; 3103 s->locked++; 3104 set_bit(R5_LOCKED, &dev->flags); 3105 set_bit(R5_Wantwrite, &dev->flags); 3106 } 3107 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3108 dev = &sh->dev[pd_idx]; 3109 s->locked++; 3110 set_bit(R5_LOCKED, &dev->flags); 3111 set_bit(R5_Wantwrite, &dev->flags); 3112 } 3113 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3114 dev = &sh->dev[qd_idx]; 3115 s->locked++; 3116 set_bit(R5_LOCKED, &dev->flags); 3117 set_bit(R5_Wantwrite, &dev->flags); 
3118 } 3119 clear_bit(STRIPE_DEGRADED, &sh->state); 3120 3121 set_bit(STRIPE_INSYNC, &sh->state); 3122 break; 3123 case check_state_run: 3124 case check_state_run_q: 3125 case check_state_run_pq: 3126 break; /* we will be called again upon completion */ 3127 case check_state_check_result: 3128 sh->check_state = check_state_idle; 3129 3130 /* handle a successful check operation, if parity is correct 3131 * we are done. Otherwise update the mismatch count and repair 3132 * parity if !MD_RECOVERY_CHECK 3133 */ 3134 if (sh->ops.zero_sum_result == 0) { 3135 /* both parities are correct */ 3136 if (!s->failed) 3137 set_bit(STRIPE_INSYNC, &sh->state); 3138 else { 3139 /* in contrast to the raid5 case we can validate 3140 * parity, but still have a failure to write 3141 * back 3142 */ 3143 sh->check_state = check_state_compute_result; 3144 /* Returning at this point means that we may go 3145 * off and bring p and/or q uptodate again so 3146 * we make sure to check zero_sum_result again 3147 * to verify if p or q need writeback 3148 */ 3149 } 3150 } else { 3151 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3152 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3153 /* don't try to repair!! */ 3154 set_bit(STRIPE_INSYNC, &sh->state); 3155 else { 3156 int *target = &sh->ops.target; 3157 3158 sh->ops.target = -1; 3159 sh->ops.target2 = -1; 3160 sh->check_state = check_state_compute_run; 3161 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3162 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3163 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3164 set_bit(R5_Wantcompute, 3165 &sh->dev[pd_idx].flags); 3166 *target = pd_idx; 3167 target = &sh->ops.target2; 3168 s->uptodate++; 3169 } 3170 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3171 set_bit(R5_Wantcompute, 3172 &sh->dev[qd_idx].flags); 3173 *target = qd_idx; 3174 s->uptodate++; 3175 } 3176 } 3177 } 3178 break; 3179 case check_state_compute_run: 3180 break; 3181 default: 3182 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3183 __func__, sh->check_state, 3184 (unsigned long long) sh->sector); 3185 BUG(); 3186 } 3187 } 3188 3189 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3190 { 3191 int i; 3192 3193 /* We have read all the blocks in this stripe and now we need to 3194 * copy some of them into a target stripe for expand. 3195 */ 3196 struct dma_async_tx_descriptor *tx = NULL; 3197 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3198 for (i = 0; i < sh->disks; i++) 3199 if (i != sh->pd_idx && i != sh->qd_idx) { 3200 int dd_idx, j; 3201 struct stripe_head *sh2; 3202 struct async_submit_ctl submit; 3203 3204 sector_t bn = compute_blocknr(sh, i, 1); 3205 sector_t s = raid5_compute_sector(conf, bn, 0, 3206 &dd_idx, NULL); 3207 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3208 if (sh2 == NULL) 3209 /* so far only the early blocks of this stripe 3210 * have been requested. 
When later blocks 3211 * get requested, we will try again 3212 */ 3213 continue; 3214 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3215 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3216 /* must have already done this block */ 3217 release_stripe(sh2); 3218 continue; 3219 } 3220 3221 /* place all the copies on one channel */ 3222 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3223 tx = async_memcpy(sh2->dev[dd_idx].page, 3224 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3225 &submit); 3226 3227 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3228 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3229 for (j = 0; j < conf->raid_disks; j++) 3230 if (j != sh2->pd_idx && 3231 j != sh2->qd_idx && 3232 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3233 break; 3234 if (j == conf->raid_disks) { 3235 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3236 set_bit(STRIPE_HANDLE, &sh2->state); 3237 } 3238 release_stripe(sh2); 3239 3240 } 3241 /* done submitting copies, wait for them to complete */ 3242 async_tx_quiesce(&tx); 3243 } 3244 3245 /* 3246 * handle_stripe - do things to a stripe. 3247 * 3248 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3249 * state of various bits to see what needs to be done. 3250 * Possible results: 3251 * return some read requests which now have data 3252 * return some write requests which are safely on storage 3253 * schedule a read on some buffers 3254 * schedule a write of some buffers 3255 * return confirmation of parity correctness 3256 * 3257 */ 3258 3259 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3260 { 3261 struct r5conf *conf = sh->raid_conf; 3262 int disks = sh->disks; 3263 struct r5dev *dev; 3264 int i; 3265 int do_recovery = 0; 3266 3267 memset(s, 0, sizeof(*s)); 3268 3269 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3270 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3271 s->failed_num[0] = -1; 3272 s->failed_num[1] = -1; 3273 3274 /* Now to look around and see what can be done */ 3275 rcu_read_lock(); 3276 for (i=disks; i--; ) { 3277 struct md_rdev *rdev; 3278 sector_t first_bad; 3279 int bad_sectors; 3280 int is_bad = 0; 3281 3282 dev = &sh->dev[i]; 3283 3284 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3285 i, dev->flags, 3286 dev->toread, dev->towrite, dev->written); 3287 /* maybe we can reply to a read 3288 * 3289 * new wantfill requests are only permitted while 3290 * ops_complete_biofill is guaranteed to be inactive 3291 */ 3292 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3293 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3294 set_bit(R5_Wantfill, &dev->flags); 3295 3296 /* now count some things */ 3297 if (test_bit(R5_LOCKED, &dev->flags)) 3298 s->locked++; 3299 if (test_bit(R5_UPTODATE, &dev->flags)) 3300 s->uptodate++; 3301 if (test_bit(R5_Wantcompute, &dev->flags)) { 3302 s->compute++; 3303 BUG_ON(s->compute > 2); 3304 } 3305 3306 if (test_bit(R5_Wantfill, &dev->flags)) 3307 s->to_fill++; 3308 else if (dev->toread) 3309 s->to_read++; 3310 if (dev->towrite) { 3311 s->to_write++; 3312 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3313 s->non_overwrite++; 3314 } 3315 if (dev->written) 3316 s->written++; 3317 /* Prefer to use the replacement for reads, but only 3318 * if it is recovered enough and has no bad blocks. 
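 *
 * (For example, a replacement whose recovery_offset already covers
 * sh->sector + STRIPE_SECTORS and that has no bad block in that
 * range gets R5_ReadRepl set below, and later reads for this slot
 * are directed at the replacement rather than the original rdev.)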
3319 */ 3320 rdev = rcu_dereference(conf->disks[i].replacement); 3321 if (rdev && !test_bit(Faulty, &rdev->flags) && 3322 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3323 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3324 &first_bad, &bad_sectors)) 3325 set_bit(R5_ReadRepl, &dev->flags); 3326 else { 3327 if (rdev) 3328 set_bit(R5_NeedReplace, &dev->flags); 3329 rdev = rcu_dereference(conf->disks[i].rdev); 3330 clear_bit(R5_ReadRepl, &dev->flags); 3331 } 3332 if (rdev && test_bit(Faulty, &rdev->flags)) 3333 rdev = NULL; 3334 if (rdev) { 3335 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3336 &first_bad, &bad_sectors); 3337 if (s->blocked_rdev == NULL 3338 && (test_bit(Blocked, &rdev->flags) 3339 || is_bad < 0)) { 3340 if (is_bad < 0) 3341 set_bit(BlockedBadBlocks, 3342 &rdev->flags); 3343 s->blocked_rdev = rdev; 3344 atomic_inc(&rdev->nr_pending); 3345 } 3346 } 3347 clear_bit(R5_Insync, &dev->flags); 3348 if (!rdev) 3349 /* Not in-sync */; 3350 else if (is_bad) { 3351 /* also not in-sync */ 3352 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3353 test_bit(R5_UPTODATE, &dev->flags)) { 3354 /* treat as in-sync, but with a read error 3355 * which we can now try to correct 3356 */ 3357 set_bit(R5_Insync, &dev->flags); 3358 set_bit(R5_ReadError, &dev->flags); 3359 } 3360 } else if (test_bit(In_sync, &rdev->flags)) 3361 set_bit(R5_Insync, &dev->flags); 3362 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3363 /* in sync if before recovery_offset */ 3364 set_bit(R5_Insync, &dev->flags); 3365 else if (test_bit(R5_UPTODATE, &dev->flags) && 3366 test_bit(R5_Expanded, &dev->flags)) 3367 /* If we've reshaped into here, we assume it is Insync. 3368 * We will shortly update recovery_offset to make 3369 * it official. 3370 */ 3371 set_bit(R5_Insync, &dev->flags); 3372 3373 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3374 /* This flag does not apply to '.replacement' 3375 * only to .rdev, so make sure to check that*/ 3376 struct md_rdev *rdev2 = rcu_dereference( 3377 conf->disks[i].rdev); 3378 if (rdev2 == rdev) 3379 clear_bit(R5_Insync, &dev->flags); 3380 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3381 s->handle_bad_blocks = 1; 3382 atomic_inc(&rdev2->nr_pending); 3383 } else 3384 clear_bit(R5_WriteError, &dev->flags); 3385 } 3386 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3387 /* This flag does not apply to '.replacement' 3388 * only to .rdev, so make sure to check that*/ 3389 struct md_rdev *rdev2 = rcu_dereference( 3390 conf->disks[i].rdev); 3391 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3392 s->handle_bad_blocks = 1; 3393 atomic_inc(&rdev2->nr_pending); 3394 } else 3395 clear_bit(R5_MadeGood, &dev->flags); 3396 } 3397 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3398 struct md_rdev *rdev2 = rcu_dereference( 3399 conf->disks[i].replacement); 3400 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3401 s->handle_bad_blocks = 1; 3402 atomic_inc(&rdev2->nr_pending); 3403 } else 3404 clear_bit(R5_MadeGoodRepl, &dev->flags); 3405 } 3406 if (!test_bit(R5_Insync, &dev->flags)) { 3407 /* The ReadError flag will just be confusing now */ 3408 clear_bit(R5_ReadError, &dev->flags); 3409 clear_bit(R5_ReWrite, &dev->flags); 3410 } 3411 if (test_bit(R5_ReadError, &dev->flags)) 3412 clear_bit(R5_Insync, &dev->flags); 3413 if (!test_bit(R5_Insync, &dev->flags)) { 3414 if (s->failed < 2) 3415 s->failed_num[s->failed] = i; 3416 s->failed++; 3417 if (rdev && !test_bit(Faulty, &rdev->flags)) 3418 do_recovery = 1; 3419 } 3420 } 3421 if (test_bit(STRIPE_SYNCING, 
&sh->state)) { 3422 /* If there is a failed device being replaced, 3423 * we must be recovering. 3424 * else if we are after recovery_cp, we must be syncing 3425 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 3426 * else we can only be replacing 3427 * sync and recovery both need to read all devices, and so 3428 * use the same flag. 3429 */ 3430 if (do_recovery || 3431 sh->sector >= conf->mddev->recovery_cp || 3432 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 3433 s->syncing = 1; 3434 else 3435 s->replacing = 1; 3436 } 3437 rcu_read_unlock(); 3438 } 3439 3440 static void handle_stripe(struct stripe_head *sh) 3441 { 3442 struct stripe_head_state s; 3443 struct r5conf *conf = sh->raid_conf; 3444 int i; 3445 int prexor; 3446 int disks = sh->disks; 3447 struct r5dev *pdev, *qdev; 3448 3449 clear_bit(STRIPE_HANDLE, &sh->state); 3450 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3451 /* already being handled, ensure it gets handled 3452 * again when current action finishes */ 3453 set_bit(STRIPE_HANDLE, &sh->state); 3454 return; 3455 } 3456 3457 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3458 spin_lock(&sh->stripe_lock); 3459 /* Cannot process 'sync' concurrently with 'discard' */ 3460 if (!test_bit(STRIPE_DISCARD, &sh->state) && 3461 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3462 set_bit(STRIPE_SYNCING, &sh->state); 3463 clear_bit(STRIPE_INSYNC, &sh->state); 3464 } 3465 spin_unlock(&sh->stripe_lock); 3466 } 3467 clear_bit(STRIPE_DELAYED, &sh->state); 3468 3469 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3470 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3471 (unsigned long long)sh->sector, sh->state, 3472 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3473 sh->check_state, sh->reconstruct_state); 3474 3475 analyse_stripe(sh, &s); 3476 3477 if (s.handle_bad_blocks) { 3478 set_bit(STRIPE_HANDLE, &sh->state); 3479 goto finish; 3480 } 3481 3482 if (unlikely(s.blocked_rdev)) { 3483 if (s.syncing || s.expanding || s.expanded || 3484 s.replacing || s.to_write || s.written) { 3485 set_bit(STRIPE_HANDLE, &sh->state); 3486 goto finish; 3487 } 3488 /* There is nothing for the blocked_rdev to block */ 3489 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3490 s.blocked_rdev = NULL; 3491 } 3492 3493 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3494 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3495 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3496 } 3497 3498 pr_debug("locked=%d uptodate=%d to_read=%d" 3499 " to_write=%d failed=%d failed_num=%d,%d\n", 3500 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3501 s.failed_num[0], s.failed_num[1]); 3502 /* check if the array has lost more than max_degraded devices and, 3503 * if so, some requests might need to be failed. 
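 *
 * (max_degraded is 1 for RAID4/5 and 2 for RAID6, so for instance a
 * RAID6 stripe with three unusable members can no longer be
 * reconstructed and its pending bios have to be failed below.)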
3504 */ 3505 if (s.failed > conf->max_degraded) { 3506 sh->check_state = 0; 3507 sh->reconstruct_state = 0; 3508 if (s.to_read+s.to_write+s.written) 3509 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3510 if (s.syncing + s.replacing) 3511 handle_failed_sync(conf, sh, &s); 3512 } 3513 3514 /* Now we check to see if any write operations have recently 3515 * completed 3516 */ 3517 prexor = 0; 3518 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3519 prexor = 1; 3520 if (sh->reconstruct_state == reconstruct_state_drain_result || 3521 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3522 sh->reconstruct_state = reconstruct_state_idle; 3523 3524 /* All the 'written' buffers and the parity block are ready to 3525 * be written back to disk 3526 */ 3527 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3528 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 3529 BUG_ON(sh->qd_idx >= 0 && 3530 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3531 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 3532 for (i = disks; i--; ) { 3533 struct r5dev *dev = &sh->dev[i]; 3534 if (test_bit(R5_LOCKED, &dev->flags) && 3535 (i == sh->pd_idx || i == sh->qd_idx || 3536 dev->written)) { 3537 pr_debug("Writing block %d\n", i); 3538 set_bit(R5_Wantwrite, &dev->flags); 3539 if (prexor) 3540 continue; 3541 if (!test_bit(R5_Insync, &dev->flags) || 3542 ((i == sh->pd_idx || i == sh->qd_idx) && 3543 s.failed == 0)) 3544 set_bit(STRIPE_INSYNC, &sh->state); 3545 } 3546 } 3547 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3548 s.dec_preread_active = 1; 3549 } 3550 3551 /* 3552 * might be able to return some write requests if the parity blocks 3553 * are safe, or on a failed drive 3554 */ 3555 pdev = &sh->dev[sh->pd_idx]; 3556 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3557 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3558 qdev = &sh->dev[sh->qd_idx]; 3559 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3560 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3561 || conf->level < 6; 3562 3563 if (s.written && 3564 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3565 && !test_bit(R5_LOCKED, &pdev->flags) 3566 && (test_bit(R5_UPTODATE, &pdev->flags) || 3567 test_bit(R5_Discard, &pdev->flags))))) && 3568 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3569 && !test_bit(R5_LOCKED, &qdev->flags) 3570 && (test_bit(R5_UPTODATE, &qdev->flags) || 3571 test_bit(R5_Discard, &qdev->flags)))))) 3572 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3573 3574 /* Now we might consider reading some blocks, either to check/generate 3575 * parity, or to satisfy requests 3576 * or to load a block that is being partially written. 3577 */ 3578 if (s.to_read || s.non_overwrite 3579 || (conf->level == 6 && s.to_write && s.failed) 3580 || (s.syncing && (s.uptodate + s.compute < disks)) 3581 || s.replacing 3582 || s.expanding) 3583 handle_stripe_fill(sh, &s, disks); 3584 3585 /* Now to consider new write requests and what else, if anything 3586 * should be read. We do not handle new writes when: 3587 * 1/ A 'write' operation (copy+xor) is already in flight. 3588 * 2/ A 'check' operation is in flight, as it may clobber the parity 3589 * block. 
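 *
 * (Point 2 mirrors the note in the parity-check code above: the
 * check operation works on, and may destroy, the in-memory copy of
 * the parity block, so a write scheduled at the same time could end
 * up using stale parity.)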
3590 */ 3591 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3592 handle_stripe_dirtying(conf, sh, &s, disks); 3593 3594 /* maybe we need to check and possibly fix the parity for this stripe 3595 * Any reads will already have been scheduled, so we just see if enough 3596 * data is available. The parity check is held off while parity 3597 * dependent operations are in flight. 3598 */ 3599 if (sh->check_state || 3600 (s.syncing && s.locked == 0 && 3601 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3602 !test_bit(STRIPE_INSYNC, &sh->state))) { 3603 if (conf->level == 6) 3604 handle_parity_checks6(conf, sh, &s, disks); 3605 else 3606 handle_parity_checks5(conf, sh, &s, disks); 3607 } 3608 3609 if (s.replacing && s.locked == 0 3610 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3611 /* Write out to replacement devices where possible */ 3612 for (i = 0; i < conf->raid_disks; i++) 3613 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && 3614 test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3615 set_bit(R5_WantReplace, &sh->dev[i].flags); 3616 set_bit(R5_LOCKED, &sh->dev[i].flags); 3617 s.locked++; 3618 } 3619 set_bit(STRIPE_INSYNC, &sh->state); 3620 } 3621 if ((s.syncing || s.replacing) && s.locked == 0 && 3622 test_bit(STRIPE_INSYNC, &sh->state)) { 3623 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3624 clear_bit(STRIPE_SYNCING, &sh->state); 3625 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3626 wake_up(&conf->wait_for_overlap); 3627 } 3628 3629 /* If the failed drives are just a ReadError, then we might need 3630 * to progress the repair/check process 3631 */ 3632 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3633 for (i = 0; i < s.failed; i++) { 3634 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3635 if (test_bit(R5_ReadError, &dev->flags) 3636 && !test_bit(R5_LOCKED, &dev->flags) 3637 && test_bit(R5_UPTODATE, &dev->flags) 3638 ) { 3639 if (!test_bit(R5_ReWrite, &dev->flags)) { 3640 set_bit(R5_Wantwrite, &dev->flags); 3641 set_bit(R5_ReWrite, &dev->flags); 3642 set_bit(R5_LOCKED, &dev->flags); 3643 s.locked++; 3644 } else { 3645 /* let's read it back */ 3646 set_bit(R5_Wantread, &dev->flags); 3647 set_bit(R5_LOCKED, &dev->flags); 3648 s.locked++; 3649 } 3650 } 3651 } 3652 3653 3654 /* Finish reconstruct operations initiated by the expansion process */ 3655 if (sh->reconstruct_state == reconstruct_state_result) { 3656 struct stripe_head *sh_src 3657 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3658 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3659 /* sh cannot be written until sh_src has been read. 
3660 * so arrange for sh to be delayed a little 3661 */ 3662 set_bit(STRIPE_DELAYED, &sh->state); 3663 set_bit(STRIPE_HANDLE, &sh->state); 3664 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3665 &sh_src->state)) 3666 atomic_inc(&conf->preread_active_stripes); 3667 release_stripe(sh_src); 3668 goto finish; 3669 } 3670 if (sh_src) 3671 release_stripe(sh_src); 3672 3673 sh->reconstruct_state = reconstruct_state_idle; 3674 clear_bit(STRIPE_EXPANDING, &sh->state); 3675 for (i = conf->raid_disks; i--; ) { 3676 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3677 set_bit(R5_LOCKED, &sh->dev[i].flags); 3678 s.locked++; 3679 } 3680 } 3681 3682 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3683 !sh->reconstruct_state) { 3684 /* Need to write out all blocks after computing parity */ 3685 sh->disks = conf->raid_disks; 3686 stripe_set_idx(sh->sector, conf, 0, sh); 3687 schedule_reconstruction(sh, &s, 1, 1); 3688 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3689 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3690 atomic_dec(&conf->reshape_stripes); 3691 wake_up(&conf->wait_for_overlap); 3692 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3693 } 3694 3695 if (s.expanding && s.locked == 0 && 3696 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3697 handle_stripe_expansion(conf, sh); 3698 3699 finish: 3700 /* wait for this device to become unblocked */ 3701 if (unlikely(s.blocked_rdev)) { 3702 if (conf->mddev->external) 3703 md_wait_for_blocked_rdev(s.blocked_rdev, 3704 conf->mddev); 3705 else 3706 /* Internal metadata will immediately 3707 * be written by raid5d, so we don't 3708 * need to wait here. 3709 */ 3710 rdev_dec_pending(s.blocked_rdev, 3711 conf->mddev); 3712 } 3713 3714 if (s.handle_bad_blocks) 3715 for (i = disks; i--; ) { 3716 struct md_rdev *rdev; 3717 struct r5dev *dev = &sh->dev[i]; 3718 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3719 /* We own a safe reference to the rdev */ 3720 rdev = conf->disks[i].rdev; 3721 if (!rdev_set_badblocks(rdev, sh->sector, 3722 STRIPE_SECTORS, 0)) 3723 md_error(conf->mddev, rdev); 3724 rdev_dec_pending(rdev, conf->mddev); 3725 } 3726 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3727 rdev = conf->disks[i].rdev; 3728 rdev_clear_badblocks(rdev, sh->sector, 3729 STRIPE_SECTORS, 0); 3730 rdev_dec_pending(rdev, conf->mddev); 3731 } 3732 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3733 rdev = conf->disks[i].replacement; 3734 if (!rdev) 3735 /* rdev have been moved down */ 3736 rdev = conf->disks[i].rdev; 3737 rdev_clear_badblocks(rdev, sh->sector, 3738 STRIPE_SECTORS, 0); 3739 rdev_dec_pending(rdev, conf->mddev); 3740 } 3741 } 3742 3743 if (s.ops_request) 3744 raid_run_ops(sh, s.ops_request); 3745 3746 ops_run_io(sh, &s); 3747 3748 if (s.dec_preread_active) { 3749 /* We delay this until after ops_run_io so that if make_request 3750 * is waiting on a flush, it won't continue until the writes 3751 * have actually been submitted. 
3752 */ 3753 atomic_dec(&conf->preread_active_stripes); 3754 if (atomic_read(&conf->preread_active_stripes) < 3755 IO_THRESHOLD) 3756 md_wakeup_thread(conf->mddev->thread); 3757 } 3758 3759 return_io(s.return_bi); 3760 3761 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3762 } 3763 3764 static void raid5_activate_delayed(struct r5conf *conf) 3765 { 3766 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3767 while (!list_empty(&conf->delayed_list)) { 3768 struct list_head *l = conf->delayed_list.next; 3769 struct stripe_head *sh; 3770 sh = list_entry(l, struct stripe_head, lru); 3771 list_del_init(l); 3772 clear_bit(STRIPE_DELAYED, &sh->state); 3773 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3774 atomic_inc(&conf->preread_active_stripes); 3775 list_add_tail(&sh->lru, &conf->hold_list); 3776 } 3777 } 3778 } 3779 3780 static void activate_bit_delay(struct r5conf *conf) 3781 { 3782 /* device_lock is held */ 3783 struct list_head head; 3784 list_add(&head, &conf->bitmap_list); 3785 list_del_init(&conf->bitmap_list); 3786 while (!list_empty(&head)) { 3787 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3788 list_del_init(&sh->lru); 3789 atomic_inc(&sh->count); 3790 __release_stripe(conf, sh); 3791 } 3792 } 3793 3794 int md_raid5_congested(struct mddev *mddev, int bits) 3795 { 3796 struct r5conf *conf = mddev->private; 3797 3798 /* No difference between reads and writes. Just check 3799 * how busy the stripe_cache is 3800 */ 3801 3802 if (conf->inactive_blocked) 3803 return 1; 3804 if (conf->quiesce) 3805 return 1; 3806 if (list_empty_careful(&conf->inactive_list)) 3807 return 1; 3808 3809 return 0; 3810 } 3811 EXPORT_SYMBOL_GPL(md_raid5_congested); 3812 3813 static int raid5_congested(void *data, int bits) 3814 { 3815 struct mddev *mddev = data; 3816 3817 return mddev_congested(mddev, bits) || 3818 md_raid5_congested(mddev, bits); 3819 } 3820 3821 /* We want read requests to align with chunks where possible, 3822 * but write requests don't need to. 3823 */ 3824 static int raid5_mergeable_bvec(struct request_queue *q, 3825 struct bvec_merge_data *bvm, 3826 struct bio_vec *biovec) 3827 { 3828 struct mddev *mddev = q->queuedata; 3829 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3830 int max; 3831 unsigned int chunk_sectors = mddev->chunk_sectors; 3832 unsigned int bio_sectors = bvm->bi_size >> 9; 3833 3834 if ((bvm->bi_rw & 1) == WRITE) 3835 return biovec->bv_len; /* always allow writes to be mergeable */ 3836 3837 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3838 chunk_sectors = mddev->new_chunk_sectors; 3839 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3840 if (max < 0) max = 0; 3841 if (max <= biovec->bv_len && bio_sectors == 0) 3842 return biovec->bv_len; 3843 else 3844 return max; 3845 } 3846 3847 3848 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3849 { 3850 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3851 unsigned int chunk_sectors = mddev->chunk_sectors; 3852 unsigned int bio_sectors = bio->bi_size >> 9; 3853 3854 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3855 chunk_sectors = mddev->new_chunk_sectors; 3856 return chunk_sectors >= 3857 ((sector & (chunk_sectors - 1)) + bio_sectors); 3858 } 3859 3860 /* 3861 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3862 * later sampled by raid5d. 
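* The push below is a single pointer update onto
* conf->retry_read_aligned_list under device_lock; raid5d later drains
* the list one bio at a time through remove_bio_from_retry().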
3863 */ 3864 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3865 { 3866 unsigned long flags; 3867 3868 spin_lock_irqsave(&conf->device_lock, flags); 3869 3870 bi->bi_next = conf->retry_read_aligned_list; 3871 conf->retry_read_aligned_list = bi; 3872 3873 spin_unlock_irqrestore(&conf->device_lock, flags); 3874 md_wakeup_thread(conf->mddev->thread); 3875 } 3876 3877 3878 static struct bio *remove_bio_from_retry(struct r5conf *conf) 3879 { 3880 struct bio *bi; 3881 3882 bi = conf->retry_read_aligned; 3883 if (bi) { 3884 conf->retry_read_aligned = NULL; 3885 return bi; 3886 } 3887 bi = conf->retry_read_aligned_list; 3888 if(bi) { 3889 conf->retry_read_aligned_list = bi->bi_next; 3890 bi->bi_next = NULL; 3891 /* 3892 * this sets the active stripe count to 1 and the processed 3893 * stripe count to zero (upper 16 bits) 3894 */ 3895 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 3896 } 3897 3898 return bi; 3899 } 3900 3901 3902 /* 3903 * The "raid5_align_endio" should check if the read succeeded and if it 3904 * did, call bio_endio on the original bio (having bio_put the new bio 3905 * first). 3906 * If the read failed, the original bio is handed to add_bio_to_retry() instead. 3907 */ 3908 static void raid5_align_endio(struct bio *bi, int error) 3909 { 3910 struct bio* raid_bi = bi->bi_private; 3911 struct mddev *mddev; 3912 struct r5conf *conf; 3913 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3914 struct md_rdev *rdev; 3915 3916 bio_put(bi); 3917 3918 rdev = (void*)raid_bi->bi_next; 3919 raid_bi->bi_next = NULL; 3920 mddev = rdev->mddev; 3921 conf = mddev->private; 3922 3923 rdev_dec_pending(rdev, conf->mddev); 3924 3925 if (!error && uptodate) { 3926 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 3927 raid_bi, 0); 3928 bio_endio(raid_bi, 0); 3929 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3930 wake_up(&conf->wait_for_stripe); 3931 return; 3932 } 3933 3934 3935 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3936 3937 add_bio_to_retry(raid_bi, conf); 3938 } 3939 3940 static int bio_fits_rdev(struct bio *bi) 3941 { 3942 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3943 3944 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3945 return 0; 3946 blk_recount_segments(q, bi); 3947 if (bi->bi_phys_segments > queue_max_segments(q)) 3948 return 0; 3949 3950 if (q->merge_bvec_fn) 3951 /* it's too hard to apply the merge_bvec_fn at this stage, 3952 * so just give up 3953 */ 3954 return 0; 3955 3956 return 1; 3957 } 3958 3959 3960 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3961 { 3962 struct r5conf *conf = mddev->private; 3963 int dd_idx; 3964 struct bio* align_bi; 3965 struct md_rdev *rdev; 3966 sector_t end_sector; 3967 3968 if (!in_chunk_boundary(mddev, raid_bio)) { 3969 pr_debug("chunk_aligned_read : non aligned\n"); 3970 return 0; 3971 } 3972 /* 3973 * use bio_clone_mddev to make a copy of the bio 3974 */ 3975 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3976 if (!align_bi) 3977 return 0; 3978 /* 3979 * set bi_end_io to a new function, and set bi_private to the 3980 * original bio.
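* The rdev selected further down is stashed in raid_bio->bi_next (cast
* through void *) so that raid5_align_endio() can fetch it again and
* drop the pending reference once the cloned read completes.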
3981 */ 3982 align_bi->bi_end_io = raid5_align_endio; 3983 align_bi->bi_private = raid_bio; 3984 /* 3985 * compute position 3986 */ 3987 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3988 0, 3989 &dd_idx, NULL); 3990 3991 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3992 rcu_read_lock(); 3993 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3994 if (!rdev || test_bit(Faulty, &rdev->flags) || 3995 rdev->recovery_offset < end_sector) { 3996 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3997 if (rdev && 3998 (test_bit(Faulty, &rdev->flags) || 3999 !(test_bit(In_sync, &rdev->flags) || 4000 rdev->recovery_offset >= end_sector))) 4001 rdev = NULL; 4002 } 4003 if (rdev) { 4004 sector_t first_bad; 4005 int bad_sectors; 4006 4007 atomic_inc(&rdev->nr_pending); 4008 rcu_read_unlock(); 4009 raid_bio->bi_next = (void*)rdev; 4010 align_bi->bi_bdev = rdev->bdev; 4011 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4012 4013 if (!bio_fits_rdev(align_bi) || 4014 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 4015 &first_bad, &bad_sectors)) { 4016 /* too big in some way, or has a known bad block */ 4017 bio_put(align_bi); 4018 rdev_dec_pending(rdev, mddev); 4019 return 0; 4020 } 4021 4022 /* No reshape active, so we can trust rdev->data_offset */ 4023 align_bi->bi_sector += rdev->data_offset; 4024 4025 spin_lock_irq(&conf->device_lock); 4026 wait_event_lock_irq(conf->wait_for_stripe, 4027 conf->quiesce == 0, 4028 conf->device_lock); 4029 atomic_inc(&conf->active_aligned_reads); 4030 spin_unlock_irq(&conf->device_lock); 4031 4032 if (mddev->gendisk) 4033 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4034 align_bi, disk_devt(mddev->gendisk), 4035 raid_bio->bi_sector); 4036 generic_make_request(align_bi); 4037 return 1; 4038 } else { 4039 rcu_read_unlock(); 4040 bio_put(align_bi); 4041 return 0; 4042 } 4043 } 4044 4045 /* __get_priority_stripe - get the next stripe to process 4046 * 4047 * Full stripe writes are allowed to pass preread active stripes up until 4048 * the bypass_threshold is exceeded. In general the bypass_count 4049 * increments when the handle_list is handled before the hold_list; however, it 4050 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 4051 * stripe with in flight i/o. The bypass_count will be reset when the 4052 * head of the hold_list has changed, i.e. the head was promoted to the 4053 * handle_list. 4054 */ 4055 static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 4056 { 4057 struct stripe_head *sh; 4058 4059 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4060 __func__, 4061 list_empty(&conf->handle_list) ? "empty" : "busy", 4062 list_empty(&conf->hold_list) ? 
"empty" : "busy", 4063 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4064 4065 if (!list_empty(&conf->handle_list)) { 4066 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 4067 4068 if (list_empty(&conf->hold_list)) 4069 conf->bypass_count = 0; 4070 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 4071 if (conf->hold_list.next == conf->last_hold) 4072 conf->bypass_count++; 4073 else { 4074 conf->last_hold = conf->hold_list.next; 4075 conf->bypass_count -= conf->bypass_threshold; 4076 if (conf->bypass_count < 0) 4077 conf->bypass_count = 0; 4078 } 4079 } 4080 } else if (!list_empty(&conf->hold_list) && 4081 ((conf->bypass_threshold && 4082 conf->bypass_count > conf->bypass_threshold) || 4083 atomic_read(&conf->pending_full_writes) == 0)) { 4084 sh = list_entry(conf->hold_list.next, 4085 typeof(*sh), lru); 4086 conf->bypass_count -= conf->bypass_threshold; 4087 if (conf->bypass_count < 0) 4088 conf->bypass_count = 0; 4089 } else 4090 return NULL; 4091 4092 list_del_init(&sh->lru); 4093 atomic_inc(&sh->count); 4094 BUG_ON(atomic_read(&sh->count) != 1); 4095 return sh; 4096 } 4097 4098 struct raid5_plug_cb { 4099 struct blk_plug_cb cb; 4100 struct list_head list; 4101 }; 4102 4103 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4104 { 4105 struct raid5_plug_cb *cb = container_of( 4106 blk_cb, struct raid5_plug_cb, cb); 4107 struct stripe_head *sh; 4108 struct mddev *mddev = cb->cb.data; 4109 struct r5conf *conf = mddev->private; 4110 int cnt = 0; 4111 4112 if (cb->list.next && !list_empty(&cb->list)) { 4113 spin_lock_irq(&conf->device_lock); 4114 while (!list_empty(&cb->list)) { 4115 sh = list_first_entry(&cb->list, struct stripe_head, lru); 4116 list_del_init(&sh->lru); 4117 /* 4118 * avoid race release_stripe_plug() sees 4119 * STRIPE_ON_UNPLUG_LIST clear but the stripe 4120 * is still in our list 4121 */ 4122 smp_mb__before_clear_bit(); 4123 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4124 __release_stripe(conf, sh); 4125 cnt++; 4126 } 4127 spin_unlock_irq(&conf->device_lock); 4128 } 4129 if (mddev->queue) 4130 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4131 kfree(cb); 4132 } 4133 4134 static void release_stripe_plug(struct mddev *mddev, 4135 struct stripe_head *sh) 4136 { 4137 struct blk_plug_cb *blk_cb = blk_check_plugged( 4138 raid5_unplug, mddev, 4139 sizeof(struct raid5_plug_cb)); 4140 struct raid5_plug_cb *cb; 4141 4142 if (!blk_cb) { 4143 release_stripe(sh); 4144 return; 4145 } 4146 4147 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4148 4149 if (cb->list.next == NULL) 4150 INIT_LIST_HEAD(&cb->list); 4151 4152 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4153 list_add_tail(&sh->lru, &cb->list); 4154 else 4155 release_stripe(sh); 4156 } 4157 4158 static void make_discard_request(struct mddev *mddev, struct bio *bi) 4159 { 4160 struct r5conf *conf = mddev->private; 4161 sector_t logical_sector, last_sector; 4162 struct stripe_head *sh; 4163 int remaining; 4164 int stripe_sectors; 4165 4166 if (mddev->reshape_position != MaxSector) 4167 /* Skip discard while reshape is happening */ 4168 return; 4169 4170 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4171 last_sector = bi->bi_sector + (bi->bi_size>>9); 4172 4173 bi->bi_next = NULL; 4174 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4175 4176 stripe_sectors = conf->chunk_sectors * 4177 (conf->raid_disks - conf->max_degraded); 4178 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 4179 stripe_sectors); 4180 
sector_div(last_sector, stripe_sectors); 4181 4182 logical_sector *= conf->chunk_sectors; 4183 last_sector *= conf->chunk_sectors; 4184 4185 for (; logical_sector < last_sector; 4186 logical_sector += STRIPE_SECTORS) { 4187 DEFINE_WAIT(w); 4188 int d; 4189 again: 4190 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4191 prepare_to_wait(&conf->wait_for_overlap, &w, 4192 TASK_UNINTERRUPTIBLE); 4193 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4194 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4195 release_stripe(sh); 4196 schedule(); 4197 goto again; 4198 } 4199 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4200 spin_lock_irq(&sh->stripe_lock); 4201 for (d = 0; d < conf->raid_disks; d++) { 4202 if (d == sh->pd_idx || d == sh->qd_idx) 4203 continue; 4204 if (sh->dev[d].towrite || sh->dev[d].toread) { 4205 set_bit(R5_Overlap, &sh->dev[d].flags); 4206 spin_unlock_irq(&sh->stripe_lock); 4207 release_stripe(sh); 4208 schedule(); 4209 goto again; 4210 } 4211 } 4212 set_bit(STRIPE_DISCARD, &sh->state); 4213 finish_wait(&conf->wait_for_overlap, &w); 4214 for (d = 0; d < conf->raid_disks; d++) { 4215 if (d == sh->pd_idx || d == sh->qd_idx) 4216 continue; 4217 sh->dev[d].towrite = bi; 4218 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 4219 raid5_inc_bi_active_stripes(bi); 4220 } 4221 spin_unlock_irq(&sh->stripe_lock); 4222 if (conf->mddev->bitmap) { 4223 for (d = 0; 4224 d < conf->raid_disks - conf->max_degraded; 4225 d++) 4226 bitmap_startwrite(mddev->bitmap, 4227 sh->sector, 4228 STRIPE_SECTORS, 4229 0); 4230 sh->bm_seq = conf->seq_flush + 1; 4231 set_bit(STRIPE_BIT_DELAY, &sh->state); 4232 } 4233 4234 set_bit(STRIPE_HANDLE, &sh->state); 4235 clear_bit(STRIPE_DELAYED, &sh->state); 4236 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4237 atomic_inc(&conf->preread_active_stripes); 4238 release_stripe_plug(mddev, sh); 4239 } 4240 4241 remaining = raid5_dec_bi_active_stripes(bi); 4242 if (remaining == 0) { 4243 md_write_end(mddev); 4244 bio_endio(bi, 0); 4245 } 4246 } 4247 4248 static void make_request(struct mddev *mddev, struct bio * bi) 4249 { 4250 struct r5conf *conf = mddev->private; 4251 int dd_idx; 4252 sector_t new_sector; 4253 sector_t logical_sector, last_sector; 4254 struct stripe_head *sh; 4255 const int rw = bio_data_dir(bi); 4256 int remaining; 4257 4258 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4259 md_flush_request(mddev, bi); 4260 return; 4261 } 4262 4263 md_write_start(mddev, bi); 4264 4265 if (rw == READ && 4266 mddev->reshape_position == MaxSector && 4267 chunk_aligned_read(mddev,bi)) 4268 return; 4269 4270 if (unlikely(bi->bi_rw & REQ_DISCARD)) { 4271 make_discard_request(mddev, bi); 4272 return; 4273 } 4274 4275 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4276 last_sector = bi->bi_sector + (bi->bi_size>>9); 4277 bi->bi_next = NULL; 4278 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4279 4280 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4281 DEFINE_WAIT(w); 4282 int previous; 4283 4284 retry: 4285 previous = 0; 4286 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4287 if (unlikely(conf->reshape_progress != MaxSector)) { 4288 /* spinlock is needed as reshape_progress may be 4289 * 64bit on a 32bit platform, and so it might be 4290 * possible to see a half-updated value 4291 * Of course reshape_progress could change after 4292 * the lock is dropped, so once we get a reference 4293 * to the stripe that we think it is, we will have 4294 * to check again. 
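* (For a forward-running reshape, sectors at or beyond reshape_progress
* have not been relocated yet and are still addressed through the old
* geometry, which is why 'previous' is set to 1 for them below; the
* mirror-image test applies when reshape_backwards is set.)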
4295 */ 4296 spin_lock_irq(&conf->device_lock); 4297 if (mddev->reshape_backwards 4298 ? logical_sector < conf->reshape_progress 4299 : logical_sector >= conf->reshape_progress) { 4300 previous = 1; 4301 } else { 4302 if (mddev->reshape_backwards 4303 ? logical_sector < conf->reshape_safe 4304 : logical_sector >= conf->reshape_safe) { 4305 spin_unlock_irq(&conf->device_lock); 4306 schedule(); 4307 goto retry; 4308 } 4309 } 4310 spin_unlock_irq(&conf->device_lock); 4311 } 4312 4313 new_sector = raid5_compute_sector(conf, logical_sector, 4314 previous, 4315 &dd_idx, NULL); 4316 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4317 (unsigned long long)new_sector, 4318 (unsigned long long)logical_sector); 4319 4320 sh = get_active_stripe(conf, new_sector, previous, 4321 (bi->bi_rw&RWA_MASK), 0); 4322 if (sh) { 4323 if (unlikely(previous)) { 4324 /* expansion might have moved on while waiting for a 4325 * stripe, so we must do the range check again. 4326 * Expansion could still move past after this 4327 * test, but as we are holding a reference to 4328 * 'sh', we know that if that happens, 4329 * STRIPE_EXPANDING will get set and the expansion 4330 * won't proceed until we finish with the stripe. 4331 */ 4332 int must_retry = 0; 4333 spin_lock_irq(&conf->device_lock); 4334 if (mddev->reshape_backwards 4335 ? logical_sector >= conf->reshape_progress 4336 : logical_sector < conf->reshape_progress) 4337 /* mismatch, need to try again */ 4338 must_retry = 1; 4339 spin_unlock_irq(&conf->device_lock); 4340 if (must_retry) { 4341 release_stripe(sh); 4342 schedule(); 4343 goto retry; 4344 } 4345 } 4346 4347 if (rw == WRITE && 4348 logical_sector >= mddev->suspend_lo && 4349 logical_sector < mddev->suspend_hi) { 4350 release_stripe(sh); 4351 /* As the suspend_* range is controlled by 4352 * userspace, we want an interruptible 4353 * wait. 4354 */ 4355 flush_signals(current); 4356 prepare_to_wait(&conf->wait_for_overlap, 4357 &w, TASK_INTERRUPTIBLE); 4358 if (logical_sector >= mddev->suspend_lo && 4359 logical_sector < mddev->suspend_hi) 4360 schedule(); 4361 goto retry; 4362 } 4363 4364 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4365 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4366 /* Stripe is busy expanding or 4367 * add failed due to overlap. Flush everything 4368 * and wait a while 4369 */ 4370 md_wakeup_thread(mddev->thread); 4371 release_stripe(sh); 4372 schedule(); 4373 goto retry; 4374 } 4375 finish_wait(&conf->wait_for_overlap, &w); 4376 set_bit(STRIPE_HANDLE, &sh->state); 4377 clear_bit(STRIPE_DELAYED, &sh->state); 4378 if ((bi->bi_rw & REQ_SYNC) && 4379 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4380 atomic_inc(&conf->preread_active_stripes); 4381 release_stripe_plug(mddev, sh); 4382 } else { 4383 /* cannot get stripe for read-ahead, just give-up */ 4384 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4385 finish_wait(&conf->wait_for_overlap, &w); 4386 break; 4387 } 4388 } 4389 4390 remaining = raid5_dec_bi_active_stripes(bi); 4391 if (remaining == 0) { 4392 4393 if ( rw == WRITE ) 4394 md_write_end(mddev); 4395 4396 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 4397 bi, 0); 4398 bio_endio(bi, 0); 4399 } 4400 } 4401 4402 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4403 4404 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4405 { 4406 /* reshaping is quite different to recovery/resync so it is 4407 * handled quite separately ... here. 
4408 * 4409 * On each call to sync_request, we gather one chunk worth of 4410 * destination stripes and flag them as expanding. 4411 * Then we find all the source stripes and request reads. 4412 * As the reads complete, handle_stripe will copy the data 4413 * into the destination stripe and release that stripe. 4414 */ 4415 struct r5conf *conf = mddev->private; 4416 struct stripe_head *sh; 4417 sector_t first_sector, last_sector; 4418 int raid_disks = conf->previous_raid_disks; 4419 int data_disks = raid_disks - conf->max_degraded; 4420 int new_data_disks = conf->raid_disks - conf->max_degraded; 4421 int i; 4422 int dd_idx; 4423 sector_t writepos, readpos, safepos; 4424 sector_t stripe_addr; 4425 int reshape_sectors; 4426 struct list_head stripes; 4427 4428 if (sector_nr == 0) { 4429 /* If restarting in the middle, skip the initial sectors */ 4430 if (mddev->reshape_backwards && 4431 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4432 sector_nr = raid5_size(mddev, 0, 0) 4433 - conf->reshape_progress; 4434 } else if (!mddev->reshape_backwards && 4435 conf->reshape_progress > 0) 4436 sector_nr = conf->reshape_progress; 4437 sector_div(sector_nr, new_data_disks); 4438 if (sector_nr) { 4439 mddev->curr_resync_completed = sector_nr; 4440 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4441 *skipped = 1; 4442 return sector_nr; 4443 } 4444 } 4445 4446 /* We need to process a full chunk at a time. 4447 * If old and new chunk sizes differ, we need to process the 4448 * largest of these 4449 */ 4450 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4451 reshape_sectors = mddev->new_chunk_sectors; 4452 else 4453 reshape_sectors = mddev->chunk_sectors; 4454 4455 /* We update the metadata at least every 10 seconds, or when 4456 * the data about to be copied would over-write the source of 4457 * the data at the front of the range. i.e. one new_stripe 4458 * along from reshape_progress new_maps to after where 4459 * reshape_safe old_maps to 4460 */ 4461 writepos = conf->reshape_progress; 4462 sector_div(writepos, new_data_disks); 4463 readpos = conf->reshape_progress; 4464 sector_div(readpos, data_disks); 4465 safepos = conf->reshape_safe; 4466 sector_div(safepos, data_disks); 4467 if (mddev->reshape_backwards) { 4468 writepos -= min_t(sector_t, reshape_sectors, writepos); 4469 readpos += reshape_sectors; 4470 safepos += reshape_sectors; 4471 } else { 4472 writepos += reshape_sectors; 4473 readpos -= min_t(sector_t, reshape_sectors, readpos); 4474 safepos -= min_t(sector_t, reshape_sectors, safepos); 4475 } 4476 4477 /* Having calculated the 'writepos' possibly use it 4478 * to set 'stripe_addr' which is where we will write to. 4479 */ 4480 if (mddev->reshape_backwards) { 4481 BUG_ON(conf->reshape_progress == 0); 4482 stripe_addr = writepos; 4483 BUG_ON((mddev->dev_sectors & 4484 ~((sector_t)reshape_sectors - 1)) 4485 - reshape_sectors - stripe_addr 4486 != sector_nr); 4487 } else { 4488 BUG_ON(writepos != sector_nr + reshape_sectors); 4489 stripe_addr = sector_nr; 4490 } 4491 4492 /* 'writepos' is the most advanced device address we might write. 4493 * 'readpos' is the least advanced device address we might read. 4494 * 'safepos' is the least address recorded in the metadata as having 4495 * been reshaped. 4496 * If there is a min_offset_diff, these are adjusted either by 4497 * increasing the safepos/readpos if diff is negative, or 4498 * increasing writepos if diff is positive. 
4499 * If 'readpos' is then behind 'writepos', there is no way that we can 4500 * ensure safety in the face of a crash - that must be done by userspace 4501 * making a backup of the data. So in that case there is no particular 4502 * rush to update metadata. 4503 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4504 * update the metadata to advance 'safepos' to match 'readpos' so that 4505 * we can be safe in the event of a crash. 4506 * So we insist on updating metadata if safepos is behind writepos and 4507 * readpos is beyond writepos. 4508 * In any case, update the metadata every 10 seconds. 4509 * Maybe that number should be configurable, but I'm not sure it is 4510 * worth it.... maybe it could be a multiple of safemode_delay??? 4511 */ 4512 if (conf->min_offset_diff < 0) { 4513 safepos += -conf->min_offset_diff; 4514 readpos += -conf->min_offset_diff; 4515 } else 4516 writepos += conf->min_offset_diff; 4517 4518 if ((mddev->reshape_backwards 4519 ? (safepos > writepos && readpos < writepos) 4520 : (safepos < writepos && readpos > writepos)) || 4521 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4522 /* Cannot proceed until we've updated the superblock... */ 4523 wait_event(conf->wait_for_overlap, 4524 atomic_read(&conf->reshape_stripes)==0); 4525 mddev->reshape_position = conf->reshape_progress; 4526 mddev->curr_resync_completed = sector_nr; 4527 conf->reshape_checkpoint = jiffies; 4528 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4529 md_wakeup_thread(mddev->thread); 4530 wait_event(mddev->sb_wait, mddev->flags == 0 || 4531 kthread_should_stop()); 4532 spin_lock_irq(&conf->device_lock); 4533 conf->reshape_safe = mddev->reshape_position; 4534 spin_unlock_irq(&conf->device_lock); 4535 wake_up(&conf->wait_for_overlap); 4536 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4537 } 4538 4539 INIT_LIST_HEAD(&stripes); 4540 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4541 int j; 4542 int skipped_disk = 0; 4543 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4544 set_bit(STRIPE_EXPANDING, &sh->state); 4545 atomic_inc(&conf->reshape_stripes); 4546 /* If any of this stripe is beyond the end of the old 4547 * array, then we need to zero those blocks 4548 */ 4549 for (j=sh->disks; j--;) { 4550 sector_t s; 4551 if (j == sh->pd_idx) 4552 continue; 4553 if (conf->level == 6 && 4554 j == sh->qd_idx) 4555 continue; 4556 s = compute_blocknr(sh, j, 0); 4557 if (s < raid5_size(mddev, 0, 0)) { 4558 skipped_disk = 1; 4559 continue; 4560 } 4561 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4562 set_bit(R5_Expanded, &sh->dev[j].flags); 4563 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4564 } 4565 if (!skipped_disk) { 4566 set_bit(STRIPE_EXPAND_READY, &sh->state); 4567 set_bit(STRIPE_HANDLE, &sh->state); 4568 } 4569 list_add(&sh->lru, &stripes); 4570 } 4571 spin_lock_irq(&conf->device_lock); 4572 if (mddev->reshape_backwards) 4573 conf->reshape_progress -= reshape_sectors * new_data_disks; 4574 else 4575 conf->reshape_progress += reshape_sectors * new_data_disks; 4576 spin_unlock_irq(&conf->device_lock); 4577 /* Ok, those stripe are ready. We can start scheduling 4578 * reads on the source stripes. 4579 * The source stripes are determined by mapping the first and last 4580 * block on the destination stripes. 
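* As a purely illustrative example (the numbers are invented): when
* growing from 4 to 5 data disks with 128-sector chunks, the
* destination stripes starting at device address stripe_addr cover
* array addresses from stripe_addr * 5 onwards; feeding those array
* addresses back through raid5_compute_sector() with previous == 1
* gives the old-geometry device sectors that have to be read before the
* new stripes may be written.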
4581 */ 4582 first_sector = 4583 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4584 1, &dd_idx, NULL); 4585 last_sector = 4586 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4587 * new_data_disks - 1), 4588 1, &dd_idx, NULL); 4589 if (last_sector >= mddev->dev_sectors) 4590 last_sector = mddev->dev_sectors - 1; 4591 while (first_sector <= last_sector) { 4592 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4593 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4594 set_bit(STRIPE_HANDLE, &sh->state); 4595 release_stripe(sh); 4596 first_sector += STRIPE_SECTORS; 4597 } 4598 /* Now that the sources are clearly marked, we can release 4599 * the destination stripes 4600 */ 4601 while (!list_empty(&stripes)) { 4602 sh = list_entry(stripes.next, struct stripe_head, lru); 4603 list_del_init(&sh->lru); 4604 release_stripe(sh); 4605 } 4606 /* If this takes us to the resync_max point where we have to pause, 4607 * then we need to write out the superblock. 4608 */ 4609 sector_nr += reshape_sectors; 4610 if ((sector_nr - mddev->curr_resync_completed) * 2 4611 >= mddev->resync_max - mddev->curr_resync_completed) { 4612 /* Cannot proceed until we've updated the superblock... */ 4613 wait_event(conf->wait_for_overlap, 4614 atomic_read(&conf->reshape_stripes) == 0); 4615 mddev->reshape_position = conf->reshape_progress; 4616 mddev->curr_resync_completed = sector_nr; 4617 conf->reshape_checkpoint = jiffies; 4618 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4619 md_wakeup_thread(mddev->thread); 4620 wait_event(mddev->sb_wait, 4621 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4622 || kthread_should_stop()); 4623 spin_lock_irq(&conf->device_lock); 4624 conf->reshape_safe = mddev->reshape_position; 4625 spin_unlock_irq(&conf->device_lock); 4626 wake_up(&conf->wait_for_overlap); 4627 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4628 } 4629 return reshape_sectors; 4630 } 4631 4632 /* FIXME go_faster isn't used */ 4633 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4634 { 4635 struct r5conf *conf = mddev->private; 4636 struct stripe_head *sh; 4637 sector_t max_sector = mddev->dev_sectors; 4638 sector_t sync_blocks; 4639 int still_degraded = 0; 4640 int i; 4641 4642 if (sector_nr >= max_sector) { 4643 /* just being told to finish up .. nothing much to do */ 4644 4645 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4646 end_reshape(conf); 4647 return 0; 4648 } 4649 4650 if (mddev->curr_resync < max_sector) /* aborted */ 4651 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4652 &sync_blocks, 1); 4653 else /* completed sync */ 4654 conf->fullsync = 0; 4655 bitmap_close_sync(mddev->bitmap); 4656 4657 return 0; 4658 } 4659 4660 /* Allow raid5_quiesce to complete */ 4661 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4662 4663 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4664 return reshape_request(mddev, sector_nr, skipped); 4665 4666 /* No need to check resync_max as we never do more than one 4667 * stripe, and as resync_max will always be on a chunk boundary, 4668 * if the check in md_do_sync didn't fire, there is no chance 4669 * of overstepping resync_max here 4670 */ 4671 4672 /* if there is too many failed drives and we are trying 4673 * to resync, then assert that we are finished, because there is 4674 * nothing we can do. 
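* (For instance, a RAID5 that has lost a member has no redundancy left
* to check against, so the remaining range is simply reported back as
* skipped.)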
4675 */ 4676 if (mddev->degraded >= conf->max_degraded && 4677 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4678 sector_t rv = mddev->dev_sectors - sector_nr; 4679 *skipped = 1; 4680 return rv; 4681 } 4682 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4683 !conf->fullsync && 4684 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4685 sync_blocks >= STRIPE_SECTORS) { 4686 /* we can skip this block, and probably more */ 4687 sync_blocks /= STRIPE_SECTORS; 4688 *skipped = 1; 4689 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4690 } 4691 4692 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4693 4694 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4695 if (sh == NULL) { 4696 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4697 /* make sure we don't swamp the stripe cache if someone else 4698 * is trying to get access 4699 */ 4700 schedule_timeout_uninterruptible(1); 4701 } 4702 /* Need to check if array will still be degraded after recovery/resync 4703 * We don't need to check the 'failed' flag as when that gets set, 4704 * recovery aborts. 4705 */ 4706 for (i = 0; i < conf->raid_disks; i++) 4707 if (conf->disks[i].rdev == NULL) 4708 still_degraded = 1; 4709 4710 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4711 4712 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4713 4714 handle_stripe(sh); 4715 release_stripe(sh); 4716 4717 return STRIPE_SECTORS; 4718 } 4719 4720 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4721 { 4722 /* We may not be able to submit a whole bio at once as there 4723 * may not be enough stripe_heads available. 4724 * We cannot pre-allocate enough stripe_heads as we may need 4725 * more than exist in the cache (if we allow ever-larger chunks). 4726 * So we do one stripe head at a time and record in 4727 * the upper bits of ->bi_phys_segments how many have been done. 4728 * 4729 * We *know* that this entire raid_bio is in one chunk, so 4730 * it will use only one 'dd_idx' and need only one call to raid5_compute_sector.
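* For example, if the bio spans six stripes and only the first three
* could be handled before the stripe cache ran dry, the processed count
* is set to 3 and the bio is parked on conf->retry_read_aligned; the
* next pass skips scnt < 3 and resumes with the fourth stripe.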
4731 */ 4732 struct stripe_head *sh; 4733 int dd_idx; 4734 sector_t sector, logical_sector, last_sector; 4735 int scnt = 0; 4736 int remaining; 4737 int handled = 0; 4738 4739 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4740 sector = raid5_compute_sector(conf, logical_sector, 4741 0, &dd_idx, NULL); 4742 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4743 4744 for (; logical_sector < last_sector; 4745 logical_sector += STRIPE_SECTORS, 4746 sector += STRIPE_SECTORS, 4747 scnt++) { 4748 4749 if (scnt < raid5_bi_processed_stripes(raid_bio)) 4750 /* already done this stripe */ 4751 continue; 4752 4753 sh = get_active_stripe(conf, sector, 0, 1, 0); 4754 4755 if (!sh) { 4756 /* failed to get a stripe - must wait */ 4757 raid5_set_bi_processed_stripes(raid_bio, scnt); 4758 conf->retry_read_aligned = raid_bio; 4759 return handled; 4760 } 4761 4762 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4763 release_stripe(sh); 4764 raid5_set_bi_processed_stripes(raid_bio, scnt); 4765 conf->retry_read_aligned = raid_bio; 4766 return handled; 4767 } 4768 4769 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 4770 handle_stripe(sh); 4771 release_stripe(sh); 4772 handled++; 4773 } 4774 remaining = raid5_dec_bi_active_stripes(raid_bio); 4775 if (remaining == 0) { 4776 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 4777 raid_bio, 0); 4778 bio_endio(raid_bio, 0); 4779 } 4780 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4781 wake_up(&conf->wait_for_stripe); 4782 return handled; 4783 } 4784 4785 #define MAX_STRIPE_BATCH 8 4786 static int handle_active_stripes(struct r5conf *conf) 4787 { 4788 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 4789 int i, batch_size = 0; 4790 4791 while (batch_size < MAX_STRIPE_BATCH && 4792 (sh = __get_priority_stripe(conf)) != NULL) 4793 batch[batch_size++] = sh; 4794 4795 if (batch_size == 0) 4796 return batch_size; 4797 spin_unlock_irq(&conf->device_lock); 4798 4799 for (i = 0; i < batch_size; i++) 4800 handle_stripe(batch[i]); 4801 4802 cond_resched(); 4803 4804 spin_lock_irq(&conf->device_lock); 4805 for (i = 0; i < batch_size; i++) 4806 __release_stripe(conf, batch[i]); 4807 return batch_size; 4808 } 4809 4810 /* 4811 * This is our raid5 kernel thread. 4812 * 4813 * We scan the hash table for stripes which can be handled now. 4814 * During the scan, completed stripes are saved for us by the interrupt 4815 * handler, so that they will not have to wait for our next wakeup. 
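* Stripes are pulled off in batches of at most MAX_STRIPE_BATCH (8) by
* handle_active_stripes(), which drops device_lock around the actual
* handle_stripe() calls; see below.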
4816 */ 4817 static void raid5d(struct md_thread *thread) 4818 { 4819 struct mddev *mddev = thread->mddev; 4820 struct r5conf *conf = mddev->private; 4821 int handled; 4822 struct blk_plug plug; 4823 4824 pr_debug("+++ raid5d active\n"); 4825 4826 md_check_recovery(mddev); 4827 4828 blk_start_plug(&plug); 4829 handled = 0; 4830 spin_lock_irq(&conf->device_lock); 4831 while (1) { 4832 struct bio *bio; 4833 int batch_size; 4834 4835 if ( 4836 !list_empty(&conf->bitmap_list)) { 4837 /* Now is a good time to flush some bitmap updates */ 4838 conf->seq_flush++; 4839 spin_unlock_irq(&conf->device_lock); 4840 bitmap_unplug(mddev->bitmap); 4841 spin_lock_irq(&conf->device_lock); 4842 conf->seq_write = conf->seq_flush; 4843 activate_bit_delay(conf); 4844 } 4845 raid5_activate_delayed(conf); 4846 4847 while ((bio = remove_bio_from_retry(conf))) { 4848 int ok; 4849 spin_unlock_irq(&conf->device_lock); 4850 ok = retry_aligned_read(conf, bio); 4851 spin_lock_irq(&conf->device_lock); 4852 if (!ok) 4853 break; 4854 handled++; 4855 } 4856 4857 batch_size = handle_active_stripes(conf); 4858 if (!batch_size) 4859 break; 4860 handled += batch_size; 4861 4862 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 4863 spin_unlock_irq(&conf->device_lock); 4864 md_check_recovery(mddev); 4865 spin_lock_irq(&conf->device_lock); 4866 } 4867 } 4868 pr_debug("%d stripes handled\n", handled); 4869 4870 spin_unlock_irq(&conf->device_lock); 4871 4872 async_tx_issue_pending_all(); 4873 blk_finish_plug(&plug); 4874 4875 pr_debug("--- raid5d inactive\n"); 4876 } 4877 4878 static ssize_t 4879 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4880 { 4881 struct r5conf *conf = mddev->private; 4882 if (conf) 4883 return sprintf(page, "%d\n", conf->max_nr_stripes); 4884 else 4885 return 0; 4886 } 4887 4888 int 4889 raid5_set_cache_size(struct mddev *mddev, int size) 4890 { 4891 struct r5conf *conf = mddev->private; 4892 int err; 4893 4894 if (size <= 16 || size > 32768) 4895 return -EINVAL; 4896 while (size < conf->max_nr_stripes) { 4897 if (drop_one_stripe(conf)) 4898 conf->max_nr_stripes--; 4899 else 4900 break; 4901 } 4902 err = md_allow_write(mddev); 4903 if (err) 4904 return err; 4905 while (size > conf->max_nr_stripes) { 4906 if (grow_one_stripe(conf)) 4907 conf->max_nr_stripes++; 4908 else break; 4909 } 4910 return 0; 4911 } 4912 EXPORT_SYMBOL(raid5_set_cache_size); 4913 4914 static ssize_t 4915 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4916 { 4917 struct r5conf *conf = mddev->private; 4918 unsigned long new; 4919 int err; 4920 4921 if (len >= PAGE_SIZE) 4922 return -EINVAL; 4923 if (!conf) 4924 return -ENODEV; 4925 4926 if (strict_strtoul(page, 10, &new)) 4927 return -EINVAL; 4928 err = raid5_set_cache_size(mddev, new); 4929 if (err) 4930 return err; 4931 return len; 4932 } 4933 4934 static struct md_sysfs_entry 4935 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4936 raid5_show_stripe_cache_size, 4937 raid5_store_stripe_cache_size); 4938 4939 static ssize_t 4940 raid5_show_preread_threshold(struct mddev *mddev, char *page) 4941 { 4942 struct r5conf *conf = mddev->private; 4943 if (conf) 4944 return sprintf(page, "%d\n", conf->bypass_threshold); 4945 else 4946 return 0; 4947 } 4948 4949 static ssize_t 4950 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4951 { 4952 struct r5conf *conf = mddev->private; 4953 unsigned long new; 4954 if (len >= PAGE_SIZE) 4955 return -EINVAL; 4956 if (!conf) 4957 return -ENODEV; 4958 4959 if 
(strict_strtoul(page, 10, &new)) 4960 return -EINVAL; 4961 if (new > conf->max_nr_stripes) 4962 return -EINVAL; 4963 conf->bypass_threshold = new; 4964 return len; 4965 } 4966 4967 static struct md_sysfs_entry 4968 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4969 S_IRUGO | S_IWUSR, 4970 raid5_show_preread_threshold, 4971 raid5_store_preread_threshold); 4972 4973 static ssize_t 4974 stripe_cache_active_show(struct mddev *mddev, char *page) 4975 { 4976 struct r5conf *conf = mddev->private; 4977 if (conf) 4978 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4979 else 4980 return 0; 4981 } 4982 4983 static struct md_sysfs_entry 4984 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4985 4986 static struct attribute *raid5_attrs[] = { 4987 &raid5_stripecache_size.attr, 4988 &raid5_stripecache_active.attr, 4989 &raid5_preread_bypass_threshold.attr, 4990 NULL, 4991 }; 4992 static struct attribute_group raid5_attrs_group = { 4993 .name = NULL, 4994 .attrs = raid5_attrs, 4995 }; 4996 4997 static sector_t 4998 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4999 { 5000 struct r5conf *conf = mddev->private; 5001 5002 if (!sectors) 5003 sectors = mddev->dev_sectors; 5004 if (!raid_disks) 5005 /* size is defined by the smallest of previous and new size */ 5006 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 5007 5008 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5009 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 5010 return sectors * (raid_disks - conf->max_degraded); 5011 } 5012 5013 static void raid5_free_percpu(struct r5conf *conf) 5014 { 5015 struct raid5_percpu *percpu; 5016 unsigned long cpu; 5017 5018 if (!conf->percpu) 5019 return; 5020 5021 get_online_cpus(); 5022 for_each_possible_cpu(cpu) { 5023 percpu = per_cpu_ptr(conf->percpu, cpu); 5024 safe_put_page(percpu->spare_page); 5025 kfree(percpu->scribble); 5026 } 5027 #ifdef CONFIG_HOTPLUG_CPU 5028 unregister_cpu_notifier(&conf->cpu_notify); 5029 #endif 5030 put_online_cpus(); 5031 5032 free_percpu(conf->percpu); 5033 } 5034 5035 static void free_conf(struct r5conf *conf) 5036 { 5037 shrink_stripes(conf); 5038 raid5_free_percpu(conf); 5039 kfree(conf->disks); 5040 kfree(conf->stripe_hashtbl); 5041 kfree(conf); 5042 } 5043 5044 #ifdef CONFIG_HOTPLUG_CPU 5045 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 5046 void *hcpu) 5047 { 5048 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 5049 long cpu = (long)hcpu; 5050 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 5051 5052 switch (action) { 5053 case CPU_UP_PREPARE: 5054 case CPU_UP_PREPARE_FROZEN: 5055 if (conf->level == 6 && !percpu->spare_page) 5056 percpu->spare_page = alloc_page(GFP_KERNEL); 5057 if (!percpu->scribble) 5058 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5059 5060 if (!percpu->scribble || 5061 (conf->level == 6 && !percpu->spare_page)) { 5062 safe_put_page(percpu->spare_page); 5063 kfree(percpu->scribble); 5064 pr_err("%s: failed memory allocation for cpu%ld\n", 5065 __func__, cpu); 5066 return notifier_from_errno(-ENOMEM); 5067 } 5068 break; 5069 case CPU_DEAD: 5070 case CPU_DEAD_FROZEN: 5071 safe_put_page(percpu->spare_page); 5072 kfree(percpu->scribble); 5073 percpu->spare_page = NULL; 5074 percpu->scribble = NULL; 5075 break; 5076 default: 5077 break; 5078 } 5079 return NOTIFY_OK; 5080 } 5081 #endif 5082 5083 static int raid5_alloc_percpu(struct r5conf *conf) 5084 { 5085 unsigned long cpu; 5086 struct 
page *spare_page; 5087 struct raid5_percpu __percpu *allcpus; 5088 void *scribble; 5089 int err; 5090 5091 allcpus = alloc_percpu(struct raid5_percpu); 5092 if (!allcpus) 5093 return -ENOMEM; 5094 conf->percpu = allcpus; 5095 5096 get_online_cpus(); 5097 err = 0; 5098 for_each_present_cpu(cpu) { 5099 if (conf->level == 6) { 5100 spare_page = alloc_page(GFP_KERNEL); 5101 if (!spare_page) { 5102 err = -ENOMEM; 5103 break; 5104 } 5105 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 5106 } 5107 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5108 if (!scribble) { 5109 err = -ENOMEM; 5110 break; 5111 } 5112 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 5113 } 5114 #ifdef CONFIG_HOTPLUG_CPU 5115 conf->cpu_notify.notifier_call = raid456_cpu_notify; 5116 conf->cpu_notify.priority = 0; 5117 if (err == 0) 5118 err = register_cpu_notifier(&conf->cpu_notify); 5119 #endif 5120 put_online_cpus(); 5121 5122 return err; 5123 } 5124 5125 static struct r5conf *setup_conf(struct mddev *mddev) 5126 { 5127 struct r5conf *conf; 5128 int raid_disk, memory, max_disks; 5129 struct md_rdev *rdev; 5130 struct disk_info *disk; 5131 char pers_name[6]; 5132 5133 if (mddev->new_level != 5 5134 && mddev->new_level != 4 5135 && mddev->new_level != 6) { 5136 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 5137 mdname(mddev), mddev->new_level); 5138 return ERR_PTR(-EIO); 5139 } 5140 if ((mddev->new_level == 5 5141 && !algorithm_valid_raid5(mddev->new_layout)) || 5142 (mddev->new_level == 6 5143 && !algorithm_valid_raid6(mddev->new_layout))) { 5144 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 5145 mdname(mddev), mddev->new_layout); 5146 return ERR_PTR(-EIO); 5147 } 5148 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 5149 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 5150 mdname(mddev), mddev->raid_disks); 5151 return ERR_PTR(-EINVAL); 5152 } 5153 5154 if (!mddev->new_chunk_sectors || 5155 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 5156 !is_power_of_2(mddev->new_chunk_sectors)) { 5157 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 5158 mdname(mddev), mddev->new_chunk_sectors << 9); 5159 return ERR_PTR(-EINVAL); 5160 } 5161 5162 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 5163 if (conf == NULL) 5164 goto abort; 5165 spin_lock_init(&conf->device_lock); 5166 init_waitqueue_head(&conf->wait_for_stripe); 5167 init_waitqueue_head(&conf->wait_for_overlap); 5168 INIT_LIST_HEAD(&conf->handle_list); 5169 INIT_LIST_HEAD(&conf->hold_list); 5170 INIT_LIST_HEAD(&conf->delayed_list); 5171 INIT_LIST_HEAD(&conf->bitmap_list); 5172 INIT_LIST_HEAD(&conf->inactive_list); 5173 atomic_set(&conf->active_stripes, 0); 5174 atomic_set(&conf->preread_active_stripes, 0); 5175 atomic_set(&conf->active_aligned_reads, 0); 5176 conf->bypass_threshold = BYPASS_THRESHOLD; 5177 conf->recovery_disabled = mddev->recovery_disabled - 1; 5178 5179 conf->raid_disks = mddev->raid_disks; 5180 if (mddev->reshape_position == MaxSector) 5181 conf->previous_raid_disks = mddev->raid_disks; 5182 else 5183 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 5184 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 5185 conf->scribble_len = scribble_len(max_disks); 5186 5187 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 5188 GFP_KERNEL); 5189 if (!conf->disks) 5190 goto abort; 5191 5192 conf->mddev = mddev; 5193 5194 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5195 goto abort; 5196 5197 conf->level = 
mddev->new_level; 5198 if (raid5_alloc_percpu(conf) != 0) 5199 goto abort; 5200 5201 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 5202 5203 rdev_for_each(rdev, mddev) { 5204 raid_disk = rdev->raid_disk; 5205 if (raid_disk >= max_disks 5206 || raid_disk < 0) 5207 continue; 5208 disk = conf->disks + raid_disk; 5209 5210 if (test_bit(Replacement, &rdev->flags)) { 5211 if (disk->replacement) 5212 goto abort; 5213 disk->replacement = rdev; 5214 } else { 5215 if (disk->rdev) 5216 goto abort; 5217 disk->rdev = rdev; 5218 } 5219 5220 if (test_bit(In_sync, &rdev->flags)) { 5221 char b[BDEVNAME_SIZE]; 5222 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 5223 " disk %d\n", 5224 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 5225 } else if (rdev->saved_raid_disk != raid_disk) 5226 /* Cannot rely on bitmap to complete recovery */ 5227 conf->fullsync = 1; 5228 } 5229 5230 conf->chunk_sectors = mddev->new_chunk_sectors; 5231 conf->level = mddev->new_level; 5232 if (conf->level == 6) 5233 conf->max_degraded = 2; 5234 else 5235 conf->max_degraded = 1; 5236 conf->algorithm = mddev->new_layout; 5237 conf->max_nr_stripes = NR_STRIPES; 5238 conf->reshape_progress = mddev->reshape_position; 5239 if (conf->reshape_progress != MaxSector) { 5240 conf->prev_chunk_sectors = mddev->chunk_sectors; 5241 conf->prev_algo = mddev->layout; 5242 } 5243 5244 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5245 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5246 if (grow_stripes(conf, conf->max_nr_stripes)) { 5247 printk(KERN_ERR 5248 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5249 mdname(mddev), memory); 5250 goto abort; 5251 } else 5252 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 5253 mdname(mddev), memory); 5254 5255 sprintf(pers_name, "raid%d", mddev->new_level); 5256 conf->thread = md_register_thread(raid5d, mddev, pers_name); 5257 if (!conf->thread) { 5258 printk(KERN_ERR 5259 "md/raid:%s: couldn't allocate thread.\n", 5260 mdname(mddev)); 5261 goto abort; 5262 } 5263 5264 return conf; 5265 5266 abort: 5267 if (conf) { 5268 free_conf(conf); 5269 return ERR_PTR(-EIO); 5270 } else 5271 return ERR_PTR(-ENOMEM); 5272 } 5273 5274 5275 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 5276 { 5277 switch (algo) { 5278 case ALGORITHM_PARITY_0: 5279 if (raid_disk < max_degraded) 5280 return 1; 5281 break; 5282 case ALGORITHM_PARITY_N: 5283 if (raid_disk >= raid_disks - max_degraded) 5284 return 1; 5285 break; 5286 case ALGORITHM_PARITY_0_6: 5287 if (raid_disk == 0 || 5288 raid_disk == raid_disks - 1) 5289 return 1; 5290 break; 5291 case ALGORITHM_LEFT_ASYMMETRIC_6: 5292 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5293 case ALGORITHM_LEFT_SYMMETRIC_6: 5294 case ALGORITHM_RIGHT_SYMMETRIC_6: 5295 if (raid_disk == raid_disks - 1) 5296 return 1; 5297 } 5298 return 0; 5299 } 5300 5301 static int run(struct mddev *mddev) 5302 { 5303 struct r5conf *conf; 5304 int working_disks = 0; 5305 int dirty_parity_disks = 0; 5306 struct md_rdev *rdev; 5307 sector_t reshape_offset = 0; 5308 int i; 5309 long long min_offset_diff = 0; 5310 int first = 1; 5311 5312 if (mddev->recovery_cp != MaxSector) 5313 printk(KERN_NOTICE "md/raid:%s: not clean" 5314 " -- starting background reconstruction\n", 5315 mdname(mddev)); 5316 5317 rdev_for_each(rdev, mddev) { 5318 long long diff; 5319 if (rdev->raid_disk < 0) 5320 continue; 5321 diff = (rdev->new_data_offset - rdev->data_offset); 5322 if (first) { 5323 min_offset_diff = diff; 5324 first = 0; 5325 } else if 
(mddev->reshape_backwards && 5326 diff < min_offset_diff) 5327 min_offset_diff = diff; 5328 else if (!mddev->reshape_backwards && 5329 diff > min_offset_diff) 5330 min_offset_diff = diff; 5331 } 5332 5333 if (mddev->reshape_position != MaxSector) { 5334 /* Check that we can continue the reshape. 5335 * Difficulties arise if the stripe we would write to 5336 * next is at or after the stripe we would read from next. 5337 * For a reshape that changes the number of devices, this 5338 * is only possible for a very short time, and mdadm makes 5339 * sure that time appears to have passed before assembling 5340 * the array. So we fail if that time hasn't passed. 5341 * For a reshape that keeps the number of devices the same 5342 * mdadm must be monitoring the reshape and keeping the 5343 * critical areas read-only and backed up. It will start 5344 * the array in read-only mode, so we check for that. 5345 */ 5346 sector_t here_new, here_old; 5347 int old_disks; 5348 int max_degraded = (mddev->level == 6 ? 2 : 1); 5349 5350 if (mddev->new_level != mddev->level) { 5351 printk(KERN_ERR "md/raid:%s: unsupported reshape " 5352 "required - aborting.\n", 5353 mdname(mddev)); 5354 return -EINVAL; 5355 } 5356 old_disks = mddev->raid_disks - mddev->delta_disks; 5357 /* reshape_position must be on a new-stripe boundary, and one 5358 * further up in new geometry must map after here in old 5359 * geometry. 5360 */ 5361 here_new = mddev->reshape_position; 5362 if (sector_div(here_new, mddev->new_chunk_sectors * 5363 (mddev->raid_disks - max_degraded))) { 5364 printk(KERN_ERR "md/raid:%s: reshape_position not " 5365 "on a stripe boundary\n", mdname(mddev)); 5366 return -EINVAL; 5367 } 5368 reshape_offset = here_new * mddev->new_chunk_sectors; 5369 /* here_new is the stripe we will write to */ 5370 here_old = mddev->reshape_position; 5371 sector_div(here_old, mddev->chunk_sectors * 5372 (old_disks-max_degraded)); 5373 /* here_old is the first stripe that we might need to read 5374 * from */ 5375 if (mddev->delta_disks == 0) { 5376 if ((here_new * mddev->new_chunk_sectors != 5377 here_old * mddev->chunk_sectors)) { 5378 printk(KERN_ERR "md/raid:%s: reshape position is" 5379 " confused - aborting\n", mdname(mddev)); 5380 return -EINVAL; 5381 } 5382 /* We cannot be sure it is safe to start an in-place 5383 * reshape. It is only safe if user-space is monitoring 5384 * and taking constant backups. 5385 * mdadm always starts a situation like this in 5386 * readonly mode so it can take control before 5387 * allowing any writes. So just check for that. 5388 */ 5389 if (abs(min_offset_diff) >= mddev->chunk_sectors && 5390 abs(min_offset_diff) >= mddev->new_chunk_sectors) 5391 /* not really in-place - so OK */; 5392 else if (mddev->ro == 0) { 5393 printk(KERN_ERR "md/raid:%s: in-place reshape " 5394 "must be started in read-only mode " 5395 "- aborting\n", 5396 mdname(mddev)); 5397 return -EINVAL; 5398 } 5399 } else if (mddev->reshape_backwards 5400 ?
(here_new * mddev->new_chunk_sectors + min_offset_diff <= 5401 here_old * mddev->chunk_sectors) 5402 : (here_new * mddev->new_chunk_sectors >= 5403 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5404 /* Reading from the same stripe as writing to - bad */ 5405 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5406 "auto-recovery - aborting.\n", 5407 mdname(mddev)); 5408 return -EINVAL; 5409 } 5410 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5411 mdname(mddev)); 5412 /* OK, we should be able to continue; */ 5413 } else { 5414 BUG_ON(mddev->level != mddev->new_level); 5415 BUG_ON(mddev->layout != mddev->new_layout); 5416 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5417 BUG_ON(mddev->delta_disks != 0); 5418 } 5419 5420 if (mddev->private == NULL) 5421 conf = setup_conf(mddev); 5422 else 5423 conf = mddev->private; 5424 5425 if (IS_ERR(conf)) 5426 return PTR_ERR(conf); 5427 5428 conf->min_offset_diff = min_offset_diff; 5429 mddev->thread = conf->thread; 5430 conf->thread = NULL; 5431 mddev->private = conf; 5432 5433 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5434 i++) { 5435 rdev = conf->disks[i].rdev; 5436 if (!rdev && conf->disks[i].replacement) { 5437 /* The replacement is all we have yet */ 5438 rdev = conf->disks[i].replacement; 5439 conf->disks[i].replacement = NULL; 5440 clear_bit(Replacement, &rdev->flags); 5441 conf->disks[i].rdev = rdev; 5442 } 5443 if (!rdev) 5444 continue; 5445 if (conf->disks[i].replacement && 5446 conf->reshape_progress != MaxSector) { 5447 /* replacements and reshape simply do not mix. */ 5448 printk(KERN_ERR "md: cannot handle concurrent " 5449 "replacement and reshape.\n"); 5450 goto abort; 5451 } 5452 if (test_bit(In_sync, &rdev->flags)) { 5453 working_disks++; 5454 continue; 5455 } 5456 /* This disc is not fully in-sync. However if it 5457 * just stored parity (beyond the recovery_offset), 5458 * when we don't need to be concerned about the 5459 * array being dirty. 5460 * When reshape goes 'backwards', we never have 5461 * partially completed devices, so we only need 5462 * to worry about reshape going forwards. 5463 */ 5464 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5465 if (mddev->major_version == 0 && 5466 mddev->minor_version > 90) 5467 rdev->recovery_offset = reshape_offset; 5468 5469 if (rdev->recovery_offset < reshape_offset) { 5470 /* We need to check old and new layout */ 5471 if (!only_parity(rdev->raid_disk, 5472 conf->algorithm, 5473 conf->raid_disks, 5474 conf->max_degraded)) 5475 continue; 5476 } 5477 if (!only_parity(rdev->raid_disk, 5478 conf->prev_algo, 5479 conf->previous_raid_disks, 5480 conf->max_degraded)) 5481 continue; 5482 dirty_parity_disks++; 5483 } 5484 5485 /* 5486 * 0 for a fully functional array, 1 or 2 for a degraded array. 
5487 */ 5488 mddev->degraded = calc_degraded(conf); 5489 5490 if (has_failed(conf)) { 5491 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5492 " (%d/%d failed)\n", 5493 mdname(mddev), mddev->degraded, conf->raid_disks); 5494 goto abort; 5495 } 5496 5497 /* device size must be a multiple of chunk size */ 5498 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5499 mddev->resync_max_sectors = mddev->dev_sectors; 5500 5501 if (mddev->degraded > dirty_parity_disks && 5502 mddev->recovery_cp != MaxSector) { 5503 if (mddev->ok_start_degraded) 5504 printk(KERN_WARNING 5505 "md/raid:%s: starting dirty degraded array" 5506 " - data corruption possible.\n", 5507 mdname(mddev)); 5508 else { 5509 printk(KERN_ERR 5510 "md/raid:%s: cannot start dirty degraded array.\n", 5511 mdname(mddev)); 5512 goto abort; 5513 } 5514 } 5515 5516 if (mddev->degraded == 0) 5517 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5518 " devices, algorithm %d\n", mdname(mddev), conf->level, 5519 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5520 mddev->new_layout); 5521 else 5522 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5523 " out of %d devices, algorithm %d\n", 5524 mdname(mddev), conf->level, 5525 mddev->raid_disks - mddev->degraded, 5526 mddev->raid_disks, mddev->new_layout); 5527 5528 print_raid5_conf(conf); 5529 5530 if (conf->reshape_progress != MaxSector) { 5531 conf->reshape_safe = conf->reshape_progress; 5532 atomic_set(&conf->reshape_stripes, 0); 5533 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5534 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5535 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5536 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5537 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5538 "reshape"); 5539 } 5540 5541 5542 /* Ok, everything is just fine now */ 5543 if (mddev->to_remove == &raid5_attrs_group) 5544 mddev->to_remove = NULL; 5545 else if (mddev->kobj.sd && 5546 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5547 printk(KERN_WARNING 5548 "raid5: failed to create sysfs attributes for %s\n", 5549 mdname(mddev)); 5550 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5551 5552 if (mddev->queue) { 5553 int chunk_size; 5554 bool discard_supported = true; 5555 /* read-ahead size must cover two whole stripes, which 5556 * is 2 * (datadisks) * chunksize where 'n' is the 5557 * number of raid devices 5558 */ 5559 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5560 int stripe = data_disks * 5561 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5562 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5563 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5564 5565 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5566 5567 mddev->queue->backing_dev_info.congested_data = mddev; 5568 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5569 5570 chunk_size = mddev->chunk_sectors << 9; 5571 blk_queue_io_min(mddev->queue, chunk_size); 5572 blk_queue_io_opt(mddev->queue, chunk_size * 5573 (conf->raid_disks - conf->max_degraded)); 5574 /* 5575 * We can only discard a whole stripe. 
It doesn't make sense to 5576 * discard a data disk but write the parity disk */ 5577 5578 stripe = stripe * PAGE_SIZE; 5579 /* Round up to power of 2, as discard handling 5580 * currently assumes that */ 5581 while ((stripe-1) & stripe) 5582 stripe = (stripe | (stripe-1)) + 1; 5583 mddev->queue->limits.discard_alignment = stripe; 5584 mddev->queue->limits.discard_granularity = stripe; 5585 /* 5586 * unaligned part of discard request will be ignored, so can't 5587 * guarantee discard_zeroes_data 5588 */ 5589 mddev->queue->limits.discard_zeroes_data = 0; 5590 5591 rdev_for_each(rdev, mddev) { 5592 disk_stack_limits(mddev->gendisk, rdev->bdev, 5593 rdev->data_offset << 9); 5594 disk_stack_limits(mddev->gendisk, rdev->bdev, 5595 rdev->new_data_offset << 9); 5596 /* 5597 * discard_zeroes_data is required, otherwise data 5598 * could be lost. Consider a scenario: discard a stripe 5599 * (the stripe could be inconsistent if 5600 * discard_zeroes_data is 0); write one disk of the 5601 * stripe (the stripe could be inconsistent again 5602 * depending on which disks are used to calculate 5603 * parity); the disk is broken; the stripe data of this 5604 * disk is lost. 5605 */ 5606 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 5607 !bdev_get_queue(rdev->bdev)-> 5608 limits.discard_zeroes_data) 5609 discard_supported = false; 5610 } 5611 5612 if (discard_supported && 5613 mddev->queue->limits.max_discard_sectors >= stripe && 5614 mddev->queue->limits.discard_granularity >= stripe) 5615 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 5616 mddev->queue); 5617 else 5618 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 5619 mddev->queue); 5620 } 5621 5622 return 0; 5623 abort: 5624 md_unregister_thread(&mddev->thread); 5625 print_raid5_conf(conf); 5626 free_conf(conf); 5627 mddev->private = NULL; 5628 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5629 return -EIO; 5630 } 5631 5632 static int stop(struct mddev *mddev) 5633 { 5634 struct r5conf *conf = mddev->private; 5635 5636 md_unregister_thread(&mddev->thread); 5637 if (mddev->queue) 5638 mddev->queue->backing_dev_info.congested_fn = NULL; 5639 free_conf(conf); 5640 mddev->private = NULL; 5641 mddev->to_remove = &raid5_attrs_group; 5642 return 0; 5643 } 5644 5645 static void status(struct seq_file *seq, struct mddev *mddev) 5646 { 5647 struct r5conf *conf = mddev->private; 5648 int i; 5649 5650 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5651 mddev->chunk_sectors / 2, mddev->layout); 5652 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5653 for (i = 0; i < conf->raid_disks; i++) 5654 seq_printf (seq, "%s", 5655 conf->disks[i].rdev && 5656 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 5657 seq_printf (seq, "]"); 5658 } 5659 5660 static void print_raid5_conf (struct r5conf *conf) 5661 { 5662 int i; 5663 struct disk_info *tmp; 5664 5665 printk(KERN_DEBUG "RAID conf printout:\n"); 5666 if (!conf) { 5667 printk("(conf==NULL)\n"); 5668 return; 5669 } 5670 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5671 conf->raid_disks, 5672 conf->raid_disks - conf->mddev->degraded); 5673 5674 for (i = 0; i < conf->raid_disks; i++) { 5675 char b[BDEVNAME_SIZE]; 5676 tmp = conf->disks + i; 5677 if (tmp->rdev) 5678 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5679 i, !test_bit(Faulty, &tmp->rdev->flags), 5680 bdevname(tmp->rdev->bdev, b)); 5681 } 5682 } 5683 5684 static int raid5_spare_active(struct mddev *mddev) 5685 { 5686 int i; 5687 struct r5conf *conf = mddev->private; 5688 struct disk_info *tmp; 5689 int count = 0; 5690 unsigned long flags; 5691 5692 for (i = 0; i < conf->raid_disks; i++) { 5693 tmp = conf->disks + i; 5694 if (tmp->replacement 5695 && tmp->replacement->recovery_offset == MaxSector 5696 && !test_bit(Faulty, &tmp->replacement->flags) 5697 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 5698 /* Replacement has just become active. */ 5699 if (!tmp->rdev 5700 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 5701 count++; 5702 if (tmp->rdev) { 5703 /* Replaced device not technically faulty, 5704 * but we need to be sure it gets removed 5705 * and never re-added. 5706 */ 5707 set_bit(Faulty, &tmp->rdev->flags); 5708 sysfs_notify_dirent_safe( 5709 tmp->rdev->sysfs_state); 5710 } 5711 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 5712 } else if (tmp->rdev 5713 && tmp->rdev->recovery_offset == MaxSector 5714 && !test_bit(Faulty, &tmp->rdev->flags) 5715 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5716 count++; 5717 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5718 } 5719 } 5720 spin_lock_irqsave(&conf->device_lock, flags); 5721 mddev->degraded = calc_degraded(conf); 5722 spin_unlock_irqrestore(&conf->device_lock, flags); 5723 print_raid5_conf(conf); 5724 return count; 5725 } 5726 5727 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5728 { 5729 struct r5conf *conf = mddev->private; 5730 int err = 0; 5731 int number = rdev->raid_disk; 5732 struct md_rdev **rdevp; 5733 struct disk_info *p = conf->disks + number; 5734 5735 print_raid5_conf(conf); 5736 if (rdev == p->rdev) 5737 rdevp = &p->rdev; 5738 else if (rdev == p->replacement) 5739 rdevp = &p->replacement; 5740 else 5741 return 0; 5742 5743 if (number >= conf->raid_disks && 5744 conf->reshape_progress == MaxSector) 5745 clear_bit(In_sync, &rdev->flags); 5746 5747 if (test_bit(In_sync, &rdev->flags) || 5748 atomic_read(&rdev->nr_pending)) { 5749 err = -EBUSY; 5750 goto abort; 5751 } 5752 /* Only remove non-faulty devices if recovery 5753 * isn't possible. 
5754 */ 5755 if (!test_bit(Faulty, &rdev->flags) && 5756 mddev->recovery_disabled != conf->recovery_disabled && 5757 !has_failed(conf) && 5758 (!p->replacement || p->replacement == rdev) && 5759 number < conf->raid_disks) { 5760 err = -EBUSY; 5761 goto abort; 5762 } 5763 *rdevp = NULL; 5764 synchronize_rcu(); 5765 if (atomic_read(&rdev->nr_pending)) { 5766 /* lost the race, try later */ 5767 err = -EBUSY; 5768 *rdevp = rdev; 5769 } else if (p->replacement) { 5770 /* We must have just cleared 'rdev' */ 5771 p->rdev = p->replacement; 5772 clear_bit(Replacement, &p->replacement->flags); 5773 smp_mb(); /* Make sure other CPUs may see both as identical 5774 * but will never see neither - if they are careful 5775 */ 5776 p->replacement = NULL; 5777 clear_bit(WantReplacement, &rdev->flags); 5778 } else 5779 /* We might have just removed the Replacement as faulty- 5780 * clear the bit just in case 5781 */ 5782 clear_bit(WantReplacement, &rdev->flags); 5783 abort: 5784 5785 print_raid5_conf(conf); 5786 return err; 5787 } 5788 5789 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 5790 { 5791 struct r5conf *conf = mddev->private; 5792 int err = -EEXIST; 5793 int disk; 5794 struct disk_info *p; 5795 int first = 0; 5796 int last = conf->raid_disks - 1; 5797 5798 if (mddev->recovery_disabled == conf->recovery_disabled) 5799 return -EBUSY; 5800 5801 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 5802 /* no point adding a device */ 5803 return -EINVAL; 5804 5805 if (rdev->raid_disk >= 0) 5806 first = last = rdev->raid_disk; 5807 5808 /* 5809 * find the disk ... but prefer rdev->saved_raid_disk 5810 * if possible. 5811 */ 5812 if (rdev->saved_raid_disk >= 0 && 5813 rdev->saved_raid_disk >= first && 5814 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5815 first = rdev->saved_raid_disk; 5816 5817 for (disk = first; disk <= last; disk++) { 5818 p = conf->disks + disk; 5819 if (p->rdev == NULL) { 5820 clear_bit(In_sync, &rdev->flags); 5821 rdev->raid_disk = disk; 5822 err = 0; 5823 if (rdev->saved_raid_disk != disk) 5824 conf->fullsync = 1; 5825 rcu_assign_pointer(p->rdev, rdev); 5826 goto out; 5827 } 5828 } 5829 for (disk = first; disk <= last; disk++) { 5830 p = conf->disks + disk; 5831 if (test_bit(WantReplacement, &p->rdev->flags) && 5832 p->replacement == NULL) { 5833 clear_bit(In_sync, &rdev->flags); 5834 set_bit(Replacement, &rdev->flags); 5835 rdev->raid_disk = disk; 5836 err = 0; 5837 conf->fullsync = 1; 5838 rcu_assign_pointer(p->replacement, rdev); 5839 break; 5840 } 5841 } 5842 out: 5843 print_raid5_conf(conf); 5844 return err; 5845 } 5846 5847 static int raid5_resize(struct mddev *mddev, sector_t sectors) 5848 { 5849 /* no resync is happening, and there is enough space 5850 * on all devices, so we can resize. 5851 * We need to make sure resync covers any new space. 5852 * If the array is shrinking we should possibly wait until 5853 * any io in the removed space completes, but it hardly seems 5854 * worth it. 
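 * (Worked example with hypothetical sizes: growing each member of a 4-drive
 * RAID-5 with 512KiB chunks to 1TiB yields roughly 3TiB of array space;
 * 'sectors' is first rounded down to a chunk multiple and raid5_size()
 * multiplies it by the number of data disks.)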
5855 */ 5856 sector_t newsize; 5857 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5858 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 5859 if (mddev->external_size && 5860 mddev->array_sectors > newsize) 5861 return -EINVAL; 5862 if (mddev->bitmap) { 5863 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 5864 if (ret) 5865 return ret; 5866 } 5867 md_set_array_sectors(mddev, newsize); 5868 set_capacity(mddev->gendisk, mddev->array_sectors); 5869 revalidate_disk(mddev->gendisk); 5870 if (sectors > mddev->dev_sectors && 5871 mddev->recovery_cp > mddev->dev_sectors) { 5872 mddev->recovery_cp = mddev->dev_sectors; 5873 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5874 } 5875 mddev->dev_sectors = sectors; 5876 mddev->resync_max_sectors = sectors; 5877 return 0; 5878 } 5879 5880 static int check_stripe_cache(struct mddev *mddev) 5881 { 5882 /* Can only proceed if there are plenty of stripe_heads. 5883 * We need a minimum of one full stripe,, and for sensible progress 5884 * it is best to have about 4 times that. 5885 * If we require 4 times, then the default 256 4K stripe_heads will 5886 * allow for chunk sizes up to 256K, which is probably OK. 5887 * If the chunk size is greater, user-space should request more 5888 * stripe_heads first. 5889 */ 5890 struct r5conf *conf = mddev->private; 5891 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5892 > conf->max_nr_stripes || 5893 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5894 > conf->max_nr_stripes) { 5895 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5896 mdname(mddev), 5897 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5898 / STRIPE_SIZE)*4); 5899 return 0; 5900 } 5901 return 1; 5902 } 5903 5904 static int check_reshape(struct mddev *mddev) 5905 { 5906 struct r5conf *conf = mddev->private; 5907 5908 if (mddev->delta_disks == 0 && 5909 mddev->new_layout == mddev->layout && 5910 mddev->new_chunk_sectors == mddev->chunk_sectors) 5911 return 0; /* nothing to do */ 5912 if (has_failed(conf)) 5913 return -EINVAL; 5914 if (mddev->delta_disks < 0) { 5915 /* We might be able to shrink, but the devices must 5916 * be made bigger first. 5917 * For raid6, 4 is the minimum size. 5918 * Otherwise 2 is the minimum 5919 */ 5920 int min = 2; 5921 if (mddev->level == 6) 5922 min = 4; 5923 if (mddev->raid_disks + mddev->delta_disks < min) 5924 return -EINVAL; 5925 } 5926 5927 if (!check_stripe_cache(mddev)) 5928 return -ENOSPC; 5929 5930 return resize_stripes(conf, (conf->previous_raid_disks 5931 + mddev->delta_disks)); 5932 } 5933 5934 static int raid5_start_reshape(struct mddev *mddev) 5935 { 5936 struct r5conf *conf = mddev->private; 5937 struct md_rdev *rdev; 5938 int spares = 0; 5939 unsigned long flags; 5940 5941 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5942 return -EBUSY; 5943 5944 if (!check_stripe_cache(mddev)) 5945 return -ENOSPC; 5946 5947 if (has_failed(conf)) 5948 return -EINVAL; 5949 5950 rdev_for_each(rdev, mddev) { 5951 if (!test_bit(In_sync, &rdev->flags) 5952 && !test_bit(Faulty, &rdev->flags)) 5953 spares++; 5954 } 5955 5956 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5957 /* Not enough devices even to make a degraded array 5958 * of that size 5959 */ 5960 return -EINVAL; 5961 5962 /* Refuse to reduce size of the array. Any reductions in 5963 * array size must be through explicit setting of array_size 5964 * attribute. 
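 * For example, reshaping a 5-drive RAID-5 down to 4 drives loses one data
 * disk of capacity, so the check below refuses the request until the
 * exported size has first been shrunk (via the array_size attribute) to fit
 * the smaller layout.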
5965 */ 5966 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5967 < mddev->array_sectors) { 5968 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5969 "before number of disks\n", mdname(mddev)); 5970 return -EINVAL; 5971 } 5972 5973 atomic_set(&conf->reshape_stripes, 0); 5974 spin_lock_irq(&conf->device_lock); 5975 conf->previous_raid_disks = conf->raid_disks; 5976 conf->raid_disks += mddev->delta_disks; 5977 conf->prev_chunk_sectors = conf->chunk_sectors; 5978 conf->chunk_sectors = mddev->new_chunk_sectors; 5979 conf->prev_algo = conf->algorithm; 5980 conf->algorithm = mddev->new_layout; 5981 conf->generation++; 5982 /* Code that selects data_offset needs to see the generation update 5983 * if reshape_progress has been set - so a memory barrier needed. 5984 */ 5985 smp_mb(); 5986 if (mddev->reshape_backwards) 5987 conf->reshape_progress = raid5_size(mddev, 0, 0); 5988 else 5989 conf->reshape_progress = 0; 5990 conf->reshape_safe = conf->reshape_progress; 5991 spin_unlock_irq(&conf->device_lock); 5992 5993 /* Add some new drives, as many as will fit. 5994 * We know there are enough to make the newly sized array work. 5995 * Don't add devices if we are reducing the number of 5996 * devices in the array. This is because it is not possible 5997 * to correctly record the "partially reconstructed" state of 5998 * such devices during the reshape and confusion could result. 5999 */ 6000 if (mddev->delta_disks >= 0) { 6001 rdev_for_each(rdev, mddev) 6002 if (rdev->raid_disk < 0 && 6003 !test_bit(Faulty, &rdev->flags)) { 6004 if (raid5_add_disk(mddev, rdev) == 0) { 6005 if (rdev->raid_disk 6006 >= conf->previous_raid_disks) 6007 set_bit(In_sync, &rdev->flags); 6008 else 6009 rdev->recovery_offset = 0; 6010 6011 if (sysfs_link_rdev(mddev, rdev)) 6012 /* Failure here is OK */; 6013 } 6014 } else if (rdev->raid_disk >= conf->previous_raid_disks 6015 && !test_bit(Faulty, &rdev->flags)) { 6016 /* This is a spare that was manually added */ 6017 set_bit(In_sync, &rdev->flags); 6018 } 6019 6020 /* When a reshape changes the number of devices, 6021 * ->degraded is measured against the larger of the 6022 * pre and post number of devices. 
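 * (E.g. during a 3 -> 4 device grow, or a 4 -> 3 shrink, a missing member is
 * counted against the 4-device geometry, so the array is never reported as
 * healthier than it really is while the reshape runs.)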
6023 */ 6024 spin_lock_irqsave(&conf->device_lock, flags); 6025 mddev->degraded = calc_degraded(conf); 6026 spin_unlock_irqrestore(&conf->device_lock, flags); 6027 } 6028 mddev->raid_disks = conf->raid_disks; 6029 mddev->reshape_position = conf->reshape_progress; 6030 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6031 6032 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6033 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6034 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6035 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6036 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 6037 "reshape"); 6038 if (!mddev->sync_thread) { 6039 mddev->recovery = 0; 6040 spin_lock_irq(&conf->device_lock); 6041 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6042 rdev_for_each(rdev, mddev) 6043 rdev->new_data_offset = rdev->data_offset; 6044 smp_wmb(); 6045 conf->reshape_progress = MaxSector; 6046 mddev->reshape_position = MaxSector; 6047 spin_unlock_irq(&conf->device_lock); 6048 return -EAGAIN; 6049 } 6050 conf->reshape_checkpoint = jiffies; 6051 md_wakeup_thread(mddev->sync_thread); 6052 md_new_event(mddev); 6053 return 0; 6054 } 6055 6056 /* This is called from the reshape thread and should make any 6057 * changes needed in 'conf' 6058 */ 6059 static void end_reshape(struct r5conf *conf) 6060 { 6061 6062 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 6063 struct md_rdev *rdev; 6064 6065 spin_lock_irq(&conf->device_lock); 6066 conf->previous_raid_disks = conf->raid_disks; 6067 rdev_for_each(rdev, conf->mddev) 6068 rdev->data_offset = rdev->new_data_offset; 6069 smp_wmb(); 6070 conf->reshape_progress = MaxSector; 6071 spin_unlock_irq(&conf->device_lock); 6072 wake_up(&conf->wait_for_overlap); 6073 6074 /* read-ahead size must cover two whole stripes, which is 6075 * 2 * (datadisks) * chunksize, where 'datadisks' is the number of raid devices minus max_degraded 6076 */ 6077 if (conf->mddev->queue) { 6078 int data_disks = conf->raid_disks - conf->max_degraded; 6079 int stripe = data_disks * ((conf->chunk_sectors << 9) 6080 / PAGE_SIZE); 6081 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 6082 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 6083 } 6084 } 6085 } 6086 6087 /* This is called from the raid5d thread with mddev_lock held. 6088 * It makes config changes to the device.
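 * After a grow it publishes the new capacity with md_set_array_sectors();
 * after a shrink it clears In_sync on the devices (and replacements) that no
 * longer own a slot, so that they can subsequently be removed.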
6089 */ 6090 static void raid5_finish_reshape(struct mddev *mddev) 6091 { 6092 struct r5conf *conf = mddev->private; 6093 6094 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6095 6096 if (mddev->delta_disks > 0) { 6097 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 6098 set_capacity(mddev->gendisk, mddev->array_sectors); 6099 revalidate_disk(mddev->gendisk); 6100 } else { 6101 int d; 6102 spin_lock_irq(&conf->device_lock); 6103 mddev->degraded = calc_degraded(conf); 6104 spin_unlock_irq(&conf->device_lock); 6105 for (d = conf->raid_disks ; 6106 d < conf->raid_disks - mddev->delta_disks; 6107 d++) { 6108 struct md_rdev *rdev = conf->disks[d].rdev; 6109 if (rdev) 6110 clear_bit(In_sync, &rdev->flags); 6111 rdev = conf->disks[d].replacement; 6112 if (rdev) 6113 clear_bit(In_sync, &rdev->flags); 6114 } 6115 } 6116 mddev->layout = conf->algorithm; 6117 mddev->chunk_sectors = conf->chunk_sectors; 6118 mddev->reshape_position = MaxSector; 6119 mddev->delta_disks = 0; 6120 mddev->reshape_backwards = 0; 6121 } 6122 } 6123 6124 static void raid5_quiesce(struct mddev *mddev, int state) 6125 { 6126 struct r5conf *conf = mddev->private; 6127 6128 switch(state) { 6129 case 2: /* resume for a suspend */ 6130 wake_up(&conf->wait_for_overlap); 6131 break; 6132 6133 case 1: /* stop all writes */ 6134 spin_lock_irq(&conf->device_lock); 6135 /* '2' tells resync/reshape to pause so that all 6136 * active stripes can drain 6137 */ 6138 conf->quiesce = 2; 6139 wait_event_lock_irq(conf->wait_for_stripe, 6140 atomic_read(&conf->active_stripes) == 0 && 6141 atomic_read(&conf->active_aligned_reads) == 0, 6142 conf->device_lock); 6143 conf->quiesce = 1; 6144 spin_unlock_irq(&conf->device_lock); 6145 /* allow reshape to continue */ 6146 wake_up(&conf->wait_for_overlap); 6147 break; 6148 6149 case 0: /* re-enable writes */ 6150 spin_lock_irq(&conf->device_lock); 6151 conf->quiesce = 0; 6152 wake_up(&conf->wait_for_stripe); 6153 wake_up(&conf->wait_for_overlap); 6154 spin_unlock_irq(&conf->device_lock); 6155 break; 6156 } 6157 } 6158 6159 6160 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 6161 { 6162 struct r0conf *raid0_conf = mddev->private; 6163 sector_t sectors; 6164 6165 /* for raid0 takeover only one zone is supported */ 6166 if (raid0_conf->nr_strip_zones > 1) { 6167 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 6168 mdname(mddev)); 6169 return ERR_PTR(-EINVAL); 6170 } 6171 6172 sectors = raid0_conf->strip_zone[0].zone_end; 6173 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 6174 mddev->dev_sectors = sectors; 6175 mddev->new_level = level; 6176 mddev->new_layout = ALGORITHM_PARITY_N; 6177 mddev->new_chunk_sectors = mddev->chunk_sectors; 6178 mddev->raid_disks += 1; 6179 mddev->delta_disks = 1; 6180 /* make sure it will be not marked as dirty */ 6181 mddev->recovery_cp = MaxSector; 6182 6183 return setup_conf(mddev); 6184 } 6185 6186 6187 static void *raid5_takeover_raid1(struct mddev *mddev) 6188 { 6189 int chunksect; 6190 6191 if (mddev->raid_disks != 2 || 6192 mddev->degraded > 1) 6193 return ERR_PTR(-EINVAL); 6194 6195 /* Should check if there are write-behind devices? 
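 * The conversion below picks the largest power-of-two chunk, starting from
 * 64KiB, that divides the array size evenly - e.g. a 1000-sector RAID-1
 * would end up with an 8-sector (4KiB) chunk, still acceptable with 4KiB
 * pages - and gives up if the result would be smaller than STRIPE_SIZE.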
*/ 6196 6197 chunksect = 64*2; /* 64K by default */ 6198 6199 /* The array must be an exact multiple of chunksize */ 6200 while (chunksect && (mddev->array_sectors & (chunksect-1))) 6201 chunksect >>= 1; 6202 6203 if ((chunksect<<9) < STRIPE_SIZE) 6204 /* array size does not allow a suitable chunk size */ 6205 return ERR_PTR(-EINVAL); 6206 6207 mddev->new_level = 5; 6208 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 6209 mddev->new_chunk_sectors = chunksect; 6210 6211 return setup_conf(mddev); 6212 } 6213 6214 static void *raid5_takeover_raid6(struct mddev *mddev) 6215 { 6216 int new_layout; 6217 6218 switch (mddev->layout) { 6219 case ALGORITHM_LEFT_ASYMMETRIC_6: 6220 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 6221 break; 6222 case ALGORITHM_RIGHT_ASYMMETRIC_6: 6223 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 6224 break; 6225 case ALGORITHM_LEFT_SYMMETRIC_6: 6226 new_layout = ALGORITHM_LEFT_SYMMETRIC; 6227 break; 6228 case ALGORITHM_RIGHT_SYMMETRIC_6: 6229 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 6230 break; 6231 case ALGORITHM_PARITY_0_6: 6232 new_layout = ALGORITHM_PARITY_0; 6233 break; 6234 case ALGORITHM_PARITY_N: 6235 new_layout = ALGORITHM_PARITY_N; 6236 break; 6237 default: 6238 return ERR_PTR(-EINVAL); 6239 } 6240 mddev->new_level = 5; 6241 mddev->new_layout = new_layout; 6242 mddev->delta_disks = -1; 6243 mddev->raid_disks -= 1; 6244 return setup_conf(mddev); 6245 } 6246 6247 6248 static int raid5_check_reshape(struct mddev *mddev) 6249 { 6250 /* For a 2-drive array, the layout and chunk size can be changed 6251 * immediately as not restriping is needed. 6252 * For larger arrays we record the new value - after validation 6253 * to be used by a reshape pass. 6254 */ 6255 struct r5conf *conf = mddev->private; 6256 int new_chunk = mddev->new_chunk_sectors; 6257 6258 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 6259 return -EINVAL; 6260 if (new_chunk > 0) { 6261 if (!is_power_of_2(new_chunk)) 6262 return -EINVAL; 6263 if (new_chunk < (PAGE_SIZE>>9)) 6264 return -EINVAL; 6265 if (mddev->array_sectors & (new_chunk-1)) 6266 /* not factor of array size */ 6267 return -EINVAL; 6268 } 6269 6270 /* They look valid */ 6271 6272 if (mddev->raid_disks == 2) { 6273 /* can make the change immediately */ 6274 if (mddev->new_layout >= 0) { 6275 conf->algorithm = mddev->new_layout; 6276 mddev->layout = mddev->new_layout; 6277 } 6278 if (new_chunk > 0) { 6279 conf->chunk_sectors = new_chunk ; 6280 mddev->chunk_sectors = new_chunk; 6281 } 6282 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6283 md_wakeup_thread(mddev->thread); 6284 } 6285 return check_reshape(mddev); 6286 } 6287 6288 static int raid6_check_reshape(struct mddev *mddev) 6289 { 6290 int new_chunk = mddev->new_chunk_sectors; 6291 6292 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 6293 return -EINVAL; 6294 if (new_chunk > 0) { 6295 if (!is_power_of_2(new_chunk)) 6296 return -EINVAL; 6297 if (new_chunk < (PAGE_SIZE >> 9)) 6298 return -EINVAL; 6299 if (mddev->array_sectors & (new_chunk-1)) 6300 /* not factor of array size */ 6301 return -EINVAL; 6302 } 6303 6304 /* They look valid */ 6305 return check_reshape(mddev); 6306 } 6307 6308 static void *raid5_takeover(struct mddev *mddev) 6309 { 6310 /* raid5 can take over: 6311 * raid0 - if there is only one strip zone - make it a raid4 layout 6312 * raid1 - if there are two drives. We need to know the chunk size 6313 * raid4 - trivial - just use a raid4 layout. 
6314 * raid6 - Providing it is a *_6 layout 6315 */ 6316 if (mddev->level == 0) 6317 return raid45_takeover_raid0(mddev, 5); 6318 if (mddev->level == 1) 6319 return raid5_takeover_raid1(mddev); 6320 if (mddev->level == 4) { 6321 mddev->new_layout = ALGORITHM_PARITY_N; 6322 mddev->new_level = 5; 6323 return setup_conf(mddev); 6324 } 6325 if (mddev->level == 6) 6326 return raid5_takeover_raid6(mddev); 6327 6328 return ERR_PTR(-EINVAL); 6329 } 6330 6331 static void *raid4_takeover(struct mddev *mddev) 6332 { 6333 /* raid4 can take over: 6334 * raid0 - if there is only one strip zone 6335 * raid5 - if layout is right 6336 */ 6337 if (mddev->level == 0) 6338 return raid45_takeover_raid0(mddev, 4); 6339 if (mddev->level == 5 && 6340 mddev->layout == ALGORITHM_PARITY_N) { 6341 mddev->new_layout = 0; 6342 mddev->new_level = 4; 6343 return setup_conf(mddev); 6344 } 6345 return ERR_PTR(-EINVAL); 6346 } 6347 6348 static struct md_personality raid5_personality; 6349 6350 static void *raid6_takeover(struct mddev *mddev) 6351 { 6352 /* Currently can only take over a raid5. We map the 6353 * personality to an equivalent raid6 personality 6354 * with the Q block at the end. 6355 */ 6356 int new_layout; 6357 6358 if (mddev->pers != &raid5_personality) 6359 return ERR_PTR(-EINVAL); 6360 if (mddev->degraded > 1) 6361 return ERR_PTR(-EINVAL); 6362 if (mddev->raid_disks > 253) 6363 return ERR_PTR(-EINVAL); 6364 if (mddev->raid_disks < 3) 6365 return ERR_PTR(-EINVAL); 6366 6367 switch (mddev->layout) { 6368 case ALGORITHM_LEFT_ASYMMETRIC: 6369 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 6370 break; 6371 case ALGORITHM_RIGHT_ASYMMETRIC: 6372 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 6373 break; 6374 case ALGORITHM_LEFT_SYMMETRIC: 6375 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 6376 break; 6377 case ALGORITHM_RIGHT_SYMMETRIC: 6378 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 6379 break; 6380 case ALGORITHM_PARITY_0: 6381 new_layout = ALGORITHM_PARITY_0_6; 6382 break; 6383 case ALGORITHM_PARITY_N: 6384 new_layout = ALGORITHM_PARITY_N; 6385 break; 6386 default: 6387 return ERR_PTR(-EINVAL); 6388 } 6389 mddev->new_level = 6; 6390 mddev->new_layout = new_layout; 6391 mddev->delta_disks = 1; 6392 mddev->raid_disks += 1; 6393 return setup_conf(mddev); 6394 } 6395 6396 6397 static struct md_personality raid6_personality = 6398 { 6399 .name = "raid6", 6400 .level = 6, 6401 .owner = THIS_MODULE, 6402 .make_request = make_request, 6403 .run = run, 6404 .stop = stop, 6405 .status = status, 6406 .error_handler = error, 6407 .hot_add_disk = raid5_add_disk, 6408 .hot_remove_disk= raid5_remove_disk, 6409 .spare_active = raid5_spare_active, 6410 .sync_request = sync_request, 6411 .resize = raid5_resize, 6412 .size = raid5_size, 6413 .check_reshape = raid6_check_reshape, 6414 .start_reshape = raid5_start_reshape, 6415 .finish_reshape = raid5_finish_reshape, 6416 .quiesce = raid5_quiesce, 6417 .takeover = raid6_takeover, 6418 }; 6419 static struct md_personality raid5_personality = 6420 { 6421 .name = "raid5", 6422 .level = 5, 6423 .owner = THIS_MODULE, 6424 .make_request = make_request, 6425 .run = run, 6426 .stop = stop, 6427 .status = status, 6428 .error_handler = error, 6429 .hot_add_disk = raid5_add_disk, 6430 .hot_remove_disk= raid5_remove_disk, 6431 .spare_active = raid5_spare_active, 6432 .sync_request = sync_request, 6433 .resize = raid5_resize, 6434 .size = raid5_size, 6435 .check_reshape = raid5_check_reshape, 6436 .start_reshape = raid5_start_reshape, 6437 .finish_reshape = raid5_finish_reshape, 6438 .quiesce = 
raid5_quiesce, 6439 .takeover = raid5_takeover, 6440 }; 6441 6442 static struct md_personality raid4_personality = 6443 { 6444 .name = "raid4", 6445 .level = 4, 6446 .owner = THIS_MODULE, 6447 .make_request = make_request, 6448 .run = run, 6449 .stop = stop, 6450 .status = status, 6451 .error_handler = error, 6452 .hot_add_disk = raid5_add_disk, 6453 .hot_remove_disk= raid5_remove_disk, 6454 .spare_active = raid5_spare_active, 6455 .sync_request = sync_request, 6456 .resize = raid5_resize, 6457 .size = raid5_size, 6458 .check_reshape = raid5_check_reshape, 6459 .start_reshape = raid5_start_reshape, 6460 .finish_reshape = raid5_finish_reshape, 6461 .quiesce = raid5_quiesce, 6462 .takeover = raid4_takeover, 6463 }; 6464 6465 static int __init raid5_init(void) 6466 { 6467 register_md_personality(&raid6_personality); 6468 register_md_personality(&raid5_personality); 6469 register_md_personality(&raid4_personality); 6470 return 0; 6471 } 6472 6473 static void raid5_exit(void) 6474 { 6475 unregister_md_personality(&raid6_personality); 6476 unregister_md_personality(&raid5_personality); 6477 unregister_md_personality(&raid4_personality); 6478 } 6479 6480 module_init(raid5_init); 6481 module_exit(raid5_exit); 6482 MODULE_LICENSE("GPL"); 6483 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 6484 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 6485 MODULE_ALIAS("md-raid5"); 6486 MODULE_ALIAS("md-raid4"); 6487 MODULE_ALIAS("md-level-5"); 6488 MODULE_ALIAS("md-level-4"); 6489 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 6490 MODULE_ALIAS("md-raid6"); 6491 MODULE_ALIAS("md-level-6"); 6492 6493 /* This used to be two separate modules, they were: */ 6494 MODULE_ALIAS("raid5"); 6495 MODULE_ALIAS("raid6"); 6496
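/*
 * Note (illustrative): a level change such as
 *   mdadm --grow /dev/md0 --level=6 --raid-devices=N
 * reaches this module through the md-level-6 alias above and enters
 * raid6_takeover(), which maps the existing RAID-5 layout to its *_6
 * counterpart (e.g. ALGORITHM_LEFT_SYMMETRIC -> ALGORITHM_LEFT_SYMMETRIC_6)
 * and adds one device for the Q parity.
 */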