/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->bm_write is the number of the last batch successfully written.
 * conf->bm_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is bm_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *    we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *    batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)

/*
 * The following can be used to debug the driver
 */
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

#ifdef DEBUG
#define inline
#define __inline__
#endif

#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline int raid5_dec_bi_hw_segments(struct bio *bio)
{
	unsigned short val = raid5_bi_hw_segments(bio);

	--val;
	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
	return val;
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
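 * For example (one possible md layout, given purely as an illustration):
 * with 6 devices, pd_idx == 4 and qd_idx == 5, raid6_d0() is 0, so the
 * data disks 0..3 map to slots 0..3, the P disk maps to slot 4
 * (syndrome_disks) and the Q disk to slot 5 (syndrome_disks + 1).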
157 */ 158 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 159 int *count, int syndrome_disks) 160 { 161 int slot = *count; 162 163 if (sh->ddf_layout) 164 (*count)++; 165 if (idx == sh->pd_idx) 166 return syndrome_disks; 167 if (idx == sh->qd_idx) 168 return syndrome_disks + 1; 169 if (!sh->ddf_layout) 170 (*count)++; 171 return slot; 172 } 173 174 static void return_io(struct bio *return_bi) 175 { 176 struct bio *bi = return_bi; 177 while (bi) { 178 179 return_bi = bi->bi_next; 180 bi->bi_next = NULL; 181 bi->bi_size = 0; 182 bio_endio(bi, 0); 183 bi = return_bi; 184 } 185 } 186 187 static void print_raid5_conf (raid5_conf_t *conf); 188 189 static int stripe_operations_active(struct stripe_head *sh) 190 { 191 return sh->check_state || sh->reconstruct_state || 192 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 193 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 194 } 195 196 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 197 { 198 if (atomic_dec_and_test(&sh->count)) { 199 BUG_ON(!list_empty(&sh->lru)); 200 BUG_ON(atomic_read(&conf->active_stripes)==0); 201 if (test_bit(STRIPE_HANDLE, &sh->state)) { 202 if (test_bit(STRIPE_DELAYED, &sh->state)) { 203 list_add_tail(&sh->lru, &conf->delayed_list); 204 blk_plug_device(conf->mddev->queue); 205 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 206 sh->bm_seq - conf->seq_write > 0) { 207 list_add_tail(&sh->lru, &conf->bitmap_list); 208 blk_plug_device(conf->mddev->queue); 209 } else { 210 clear_bit(STRIPE_BIT_DELAY, &sh->state); 211 list_add_tail(&sh->lru, &conf->handle_list); 212 } 213 md_wakeup_thread(conf->mddev->thread); 214 } else { 215 BUG_ON(stripe_operations_active(sh)); 216 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 217 atomic_dec(&conf->preread_active_stripes); 218 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 219 md_wakeup_thread(conf->mddev->thread); 220 } 221 atomic_dec(&conf->active_stripes); 222 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 223 list_add_tail(&sh->lru, &conf->inactive_list); 224 wake_up(&conf->wait_for_stripe); 225 if (conf->retry_read_aligned) 226 md_wakeup_thread(conf->mddev->thread); 227 } 228 } 229 } 230 } 231 232 static void release_stripe(struct stripe_head *sh) 233 { 234 raid5_conf_t *conf = sh->raid_conf; 235 unsigned long flags; 236 237 spin_lock_irqsave(&conf->device_lock, flags); 238 __release_stripe(conf, sh); 239 spin_unlock_irqrestore(&conf->device_lock, flags); 240 } 241 242 static inline void remove_hash(struct stripe_head *sh) 243 { 244 pr_debug("remove_hash(), stripe %llu\n", 245 (unsigned long long)sh->sector); 246 247 hlist_del_init(&sh->hash); 248 } 249 250 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 251 { 252 struct hlist_head *hp = stripe_hash(conf, sh->sector); 253 254 pr_debug("insert_hash(), stripe %llu\n", 255 (unsigned long long)sh->sector); 256 257 CHECK_DEVLOCK(); 258 hlist_add_head(&sh->hash, hp); 259 } 260 261 262 /* find an idle stripe, make sure it is unhashed, and return it. 
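 * The caller must hold conf->device_lock (CHECK_DEVLOCK() below asserts this).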
*/ 263 static struct stripe_head *get_free_stripe(raid5_conf_t *conf) 264 { 265 struct stripe_head *sh = NULL; 266 struct list_head *first; 267 268 CHECK_DEVLOCK(); 269 if (list_empty(&conf->inactive_list)) 270 goto out; 271 first = conf->inactive_list.next; 272 sh = list_entry(first, struct stripe_head, lru); 273 list_del_init(first); 274 remove_hash(sh); 275 atomic_inc(&conf->active_stripes); 276 out: 277 return sh; 278 } 279 280 static void shrink_buffers(struct stripe_head *sh) 281 { 282 struct page *p; 283 int i; 284 int num = sh->raid_conf->pool_size; 285 286 for (i = 0; i < num ; i++) { 287 p = sh->dev[i].page; 288 if (!p) 289 continue; 290 sh->dev[i].page = NULL; 291 put_page(p); 292 } 293 } 294 295 static int grow_buffers(struct stripe_head *sh) 296 { 297 int i; 298 int num = sh->raid_conf->pool_size; 299 300 for (i = 0; i < num; i++) { 301 struct page *page; 302 303 if (!(page = alloc_page(GFP_KERNEL))) { 304 return 1; 305 } 306 sh->dev[i].page = page; 307 } 308 return 0; 309 } 310 311 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 312 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 313 struct stripe_head *sh); 314 315 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 316 { 317 raid5_conf_t *conf = sh->raid_conf; 318 int i; 319 320 BUG_ON(atomic_read(&sh->count) != 0); 321 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 322 BUG_ON(stripe_operations_active(sh)); 323 324 CHECK_DEVLOCK(); 325 pr_debug("init_stripe called, stripe %llu\n", 326 (unsigned long long)sh->sector); 327 328 remove_hash(sh); 329 330 sh->generation = conf->generation - previous; 331 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 332 sh->sector = sector; 333 stripe_set_idx(sector, conf, previous, sh); 334 sh->state = 0; 335 336 337 for (i = sh->disks; i--; ) { 338 struct r5dev *dev = &sh->dev[i]; 339 340 if (dev->toread || dev->read || dev->towrite || dev->written || 341 test_bit(R5_LOCKED, &dev->flags)) { 342 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 343 (unsigned long long)sh->sector, i, dev->toread, 344 dev->read, dev->towrite, dev->written, 345 test_bit(R5_LOCKED, &dev->flags)); 346 BUG(); 347 } 348 dev->flags = 0; 349 raid5_build_block(sh, i, previous); 350 } 351 insert_hash(conf, sh); 352 } 353 354 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, 355 short generation) 356 { 357 struct stripe_head *sh; 358 struct hlist_node *hn; 359 360 CHECK_DEVLOCK(); 361 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 362 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 363 if (sh->sector == sector && sh->generation == generation) 364 return sh; 365 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 366 return NULL; 367 } 368 369 /* 370 * Need to check if array has failed when deciding whether to: 371 * - start an array 372 * - remove non-faulty devices 373 * - add a spare 374 * - allow a reshape 375 * This determination is simple when no reshape is happening. 376 * However if there is a reshape, we need to carefully check 377 * both the before and after sections. 378 * This is because some failed devices may only affect one 379 * of the two sections, and some non-in_sync devices may 380 * be insync in the section most affected by failed devices. 
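 * One illustrative (hypothetical) case: while growing a 3-disk RAID5 to
 * 4 disks, a device in one of the original slots that has not finished
 * recovery counts as failed for the 'previous' 3-disk section, but the
 * reshape rebuilds it as it goes, so it counts as working in the new
 * 4-disk section; the two degraded counts can therefore differ.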
381 */ 382 static int has_failed(raid5_conf_t *conf) 383 { 384 int degraded; 385 int i; 386 if (conf->mddev->reshape_position == MaxSector) 387 return conf->mddev->degraded > conf->max_degraded; 388 389 rcu_read_lock(); 390 degraded = 0; 391 for (i = 0; i < conf->previous_raid_disks; i++) { 392 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 393 if (!rdev || test_bit(Faulty, &rdev->flags)) 394 degraded++; 395 else if (test_bit(In_sync, &rdev->flags)) 396 ; 397 else 398 /* not in-sync or faulty. 399 * If the reshape increases the number of devices, 400 * this is being recovered by the reshape, so 401 * this 'previous' section is not in_sync. 402 * If the number of devices is being reduced however, 403 * the device can only be part of the array if 404 * we are reverting a reshape, so this section will 405 * be in-sync. 406 */ 407 if (conf->raid_disks >= conf->previous_raid_disks) 408 degraded++; 409 } 410 rcu_read_unlock(); 411 if (degraded > conf->max_degraded) 412 return 1; 413 rcu_read_lock(); 414 degraded = 0; 415 for (i = 0; i < conf->raid_disks; i++) { 416 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 417 if (!rdev || test_bit(Faulty, &rdev->flags)) 418 degraded++; 419 else if (test_bit(In_sync, &rdev->flags)) 420 ; 421 else 422 /* not in-sync or faulty. 423 * If reshape increases the number of devices, this 424 * section has already been recovered, else it 425 * almost certainly hasn't. 426 */ 427 if (conf->raid_disks <= conf->previous_raid_disks) 428 degraded++; 429 } 430 rcu_read_unlock(); 431 if (degraded > conf->max_degraded) 432 return 1; 433 return 0; 434 } 435 436 static void unplug_slaves(mddev_t *mddev); 437 static void raid5_unplug_device(struct request_queue *q); 438 439 static struct stripe_head * 440 get_active_stripe(raid5_conf_t *conf, sector_t sector, 441 int previous, int noblock, int noquiesce) 442 { 443 struct stripe_head *sh; 444 445 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 446 447 spin_lock_irq(&conf->device_lock); 448 449 do { 450 wait_event_lock_irq(conf->wait_for_stripe, 451 conf->quiesce == 0 || noquiesce, 452 conf->device_lock, /* nothing */); 453 sh = __find_stripe(conf, sector, conf->generation - previous); 454 if (!sh) { 455 if (!conf->inactive_blocked) 456 sh = get_free_stripe(conf); 457 if (noblock && sh == NULL) 458 break; 459 if (!sh) { 460 conf->inactive_blocked = 1; 461 wait_event_lock_irq(conf->wait_for_stripe, 462 !list_empty(&conf->inactive_list) && 463 (atomic_read(&conf->active_stripes) 464 < (conf->max_nr_stripes *3/4) 465 || !conf->inactive_blocked), 466 conf->device_lock, 467 raid5_unplug_device(conf->mddev->queue) 468 ); 469 conf->inactive_blocked = 0; 470 } else 471 init_stripe(sh, sector, previous); 472 } else { 473 if (atomic_read(&sh->count)) { 474 BUG_ON(!list_empty(&sh->lru) 475 && !test_bit(STRIPE_EXPANDING, &sh->state)); 476 } else { 477 if (!test_bit(STRIPE_HANDLE, &sh->state)) 478 atomic_inc(&conf->active_stripes); 479 if (list_empty(&sh->lru) && 480 !test_bit(STRIPE_EXPANDING, &sh->state)) 481 BUG(); 482 list_del_init(&sh->lru); 483 } 484 } 485 } while (sh == NULL); 486 487 if (sh) 488 atomic_inc(&sh->count); 489 490 spin_unlock_irq(&conf->device_lock); 491 return sh; 492 } 493 494 static void 495 raid5_end_read_request(struct bio *bi, int error); 496 static void 497 raid5_end_write_request(struct bio *bi, int error); 498 499 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 500 { 501 raid5_conf_t *conf = sh->raid_conf; 502 int i, disks = sh->disks; 503 504 
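	/* Submit the actual reads/writes for the devices that were flagged
	 * R5_Wantread/R5_Wantwrite while handling this stripe.  Submitting
	 * bios can sleep, hence the might_sleep() below.
	 */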
might_sleep(); 505 506 for (i = disks; i--; ) { 507 int rw; 508 struct bio *bi; 509 mdk_rdev_t *rdev; 510 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 511 rw = WRITE; 512 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 513 rw = READ; 514 else 515 continue; 516 517 bi = &sh->dev[i].req; 518 519 bi->bi_rw = rw; 520 if (rw == WRITE) 521 bi->bi_end_io = raid5_end_write_request; 522 else 523 bi->bi_end_io = raid5_end_read_request; 524 525 rcu_read_lock(); 526 rdev = rcu_dereference(conf->disks[i].rdev); 527 if (rdev && test_bit(Faulty, &rdev->flags)) 528 rdev = NULL; 529 if (rdev) 530 atomic_inc(&rdev->nr_pending); 531 rcu_read_unlock(); 532 533 if (rdev) { 534 if (s->syncing || s->expanding || s->expanded) 535 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 536 537 set_bit(STRIPE_IO_STARTED, &sh->state); 538 539 bi->bi_bdev = rdev->bdev; 540 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 541 __func__, (unsigned long long)sh->sector, 542 bi->bi_rw, i); 543 atomic_inc(&sh->count); 544 bi->bi_sector = sh->sector + rdev->data_offset; 545 bi->bi_flags = 1 << BIO_UPTODATE; 546 bi->bi_vcnt = 1; 547 bi->bi_max_vecs = 1; 548 bi->bi_idx = 0; 549 bi->bi_io_vec = &sh->dev[i].vec; 550 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 551 bi->bi_io_vec[0].bv_offset = 0; 552 bi->bi_size = STRIPE_SIZE; 553 bi->bi_next = NULL; 554 if (rw == WRITE && 555 test_bit(R5_ReWrite, &sh->dev[i].flags)) 556 atomic_add(STRIPE_SECTORS, 557 &rdev->corrected_errors); 558 generic_make_request(bi); 559 } else { 560 if (rw == WRITE) 561 set_bit(STRIPE_DEGRADED, &sh->state); 562 pr_debug("skip op %ld on disc %d for sector %llu\n", 563 bi->bi_rw, i, (unsigned long long)sh->sector); 564 clear_bit(R5_LOCKED, &sh->dev[i].flags); 565 set_bit(STRIPE_HANDLE, &sh->state); 566 } 567 } 568 } 569 570 static struct dma_async_tx_descriptor * 571 async_copy_data(int frombio, struct bio *bio, struct page *page, 572 sector_t sector, struct dma_async_tx_descriptor *tx) 573 { 574 struct bio_vec *bvl; 575 struct page *bio_page; 576 int i; 577 int page_offset; 578 struct async_submit_ctl submit; 579 enum async_tx_flags flags = 0; 580 581 if (bio->bi_sector >= sector) 582 page_offset = (signed)(bio->bi_sector - sector) * 512; 583 else 584 page_offset = (signed)(sector - bio->bi_sector) * -512; 585 586 if (frombio) 587 flags |= ASYNC_TX_FENCE; 588 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 589 590 bio_for_each_segment(bvl, bio, i) { 591 int len = bio_iovec_idx(bio, i)->bv_len; 592 int clen; 593 int b_offset = 0; 594 595 if (page_offset < 0) { 596 b_offset = -page_offset; 597 page_offset += b_offset; 598 len -= b_offset; 599 } 600 601 if (len > 0 && page_offset + len > STRIPE_SIZE) 602 clen = STRIPE_SIZE - page_offset; 603 else 604 clen = len; 605 606 if (clen > 0) { 607 b_offset += bio_iovec_idx(bio, i)->bv_offset; 608 bio_page = bio_iovec_idx(bio, i)->bv_page; 609 if (frombio) 610 tx = async_memcpy(page, bio_page, page_offset, 611 b_offset, clen, &submit); 612 else 613 tx = async_memcpy(bio_page, page, b_offset, 614 page_offset, clen, &submit); 615 } 616 /* chain the operations */ 617 submit.depend_tx = tx; 618 619 if (clen < len) /* hit end of page */ 620 break; 621 page_offset += len; 622 } 623 624 return tx; 625 } 626 627 static void ops_complete_biofill(void *stripe_head_ref) 628 { 629 struct stripe_head *sh = stripe_head_ref; 630 struct bio *return_bi = NULL; 631 raid5_conf_t *conf = sh->raid_conf; 632 int i; 633 634 pr_debug("%s: stripe %llu\n", __func__, 635 (unsigned long long)sh->sector); 636 637 /* clear 
completed biofills */ 638 spin_lock_irq(&conf->device_lock); 639 for (i = sh->disks; i--; ) { 640 struct r5dev *dev = &sh->dev[i]; 641 642 /* acknowledge completion of a biofill operation */ 643 /* and check if we need to reply to a read request, 644 * new R5_Wantfill requests are held off until 645 * !STRIPE_BIOFILL_RUN 646 */ 647 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 648 struct bio *rbi, *rbi2; 649 650 BUG_ON(!dev->read); 651 rbi = dev->read; 652 dev->read = NULL; 653 while (rbi && rbi->bi_sector < 654 dev->sector + STRIPE_SECTORS) { 655 rbi2 = r5_next_bio(rbi, dev->sector); 656 if (!raid5_dec_bi_phys_segments(rbi)) { 657 rbi->bi_next = return_bi; 658 return_bi = rbi; 659 } 660 rbi = rbi2; 661 } 662 } 663 } 664 spin_unlock_irq(&conf->device_lock); 665 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 666 667 return_io(return_bi); 668 669 set_bit(STRIPE_HANDLE, &sh->state); 670 release_stripe(sh); 671 } 672 673 static void ops_run_biofill(struct stripe_head *sh) 674 { 675 struct dma_async_tx_descriptor *tx = NULL; 676 raid5_conf_t *conf = sh->raid_conf; 677 struct async_submit_ctl submit; 678 int i; 679 680 pr_debug("%s: stripe %llu\n", __func__, 681 (unsigned long long)sh->sector); 682 683 for (i = sh->disks; i--; ) { 684 struct r5dev *dev = &sh->dev[i]; 685 if (test_bit(R5_Wantfill, &dev->flags)) { 686 struct bio *rbi; 687 spin_lock_irq(&conf->device_lock); 688 dev->read = rbi = dev->toread; 689 dev->toread = NULL; 690 spin_unlock_irq(&conf->device_lock); 691 while (rbi && rbi->bi_sector < 692 dev->sector + STRIPE_SECTORS) { 693 tx = async_copy_data(0, rbi, dev->page, 694 dev->sector, tx); 695 rbi = r5_next_bio(rbi, dev->sector); 696 } 697 } 698 } 699 700 atomic_inc(&sh->count); 701 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 702 async_trigger_callback(&submit); 703 } 704 705 static void mark_target_uptodate(struct stripe_head *sh, int target) 706 { 707 struct r5dev *tgt; 708 709 if (target < 0) 710 return; 711 712 tgt = &sh->dev[target]; 713 set_bit(R5_UPTODATE, &tgt->flags); 714 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 715 clear_bit(R5_Wantcompute, &tgt->flags); 716 } 717 718 static void ops_complete_compute(void *stripe_head_ref) 719 { 720 struct stripe_head *sh = stripe_head_ref; 721 722 pr_debug("%s: stripe %llu\n", __func__, 723 (unsigned long long)sh->sector); 724 725 /* mark the computed target(s) as uptodate */ 726 mark_target_uptodate(sh, sh->ops.target); 727 mark_target_uptodate(sh, sh->ops.target2); 728 729 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 730 if (sh->check_state == check_state_compute_run) 731 sh->check_state = check_state_compute_result; 732 set_bit(STRIPE_HANDLE, &sh->state); 733 release_stripe(sh); 734 } 735 736 /* return a pointer to the address conversion region of the scribble buffer */ 737 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 738 struct raid5_percpu *percpu) 739 { 740 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 741 } 742 743 static struct dma_async_tx_descriptor * 744 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 745 { 746 int disks = sh->disks; 747 struct page **xor_srcs = percpu->scribble; 748 int target = sh->ops.target; 749 struct r5dev *tgt = &sh->dev[target]; 750 struct page *xor_dest = tgt->page; 751 int count = 0; 752 struct dma_async_tx_descriptor *tx; 753 struct async_submit_ctl submit; 754 int i; 755 756 pr_debug("%s: stripe %llu block: %d\n", 757 __func__, (unsigned long long)sh->sector, target); 758 BUG_ON(!test_bit(R5_Wantcompute, 
&tgt->flags)); 759 760 for (i = disks; i--; ) 761 if (i != target) 762 xor_srcs[count++] = sh->dev[i].page; 763 764 atomic_inc(&sh->count); 765 766 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 767 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 768 if (unlikely(count == 1)) 769 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 770 else 771 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 772 773 return tx; 774 } 775 776 /* set_syndrome_sources - populate source buffers for gen_syndrome 777 * @srcs - (struct page *) array of size sh->disks 778 * @sh - stripe_head to parse 779 * 780 * Populates srcs in proper layout order for the stripe and returns the 781 * 'count' of sources to be used in a call to async_gen_syndrome. The P 782 * destination buffer is recorded in srcs[count] and the Q destination 783 * is recorded in srcs[count+1]]. 784 */ 785 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 786 { 787 int disks = sh->disks; 788 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 789 int d0_idx = raid6_d0(sh); 790 int count; 791 int i; 792 793 for (i = 0; i < disks; i++) 794 srcs[i] = NULL; 795 796 count = 0; 797 i = d0_idx; 798 do { 799 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 800 801 srcs[slot] = sh->dev[i].page; 802 i = raid6_next_disk(i, disks); 803 } while (i != d0_idx); 804 805 return syndrome_disks; 806 } 807 808 static struct dma_async_tx_descriptor * 809 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 810 { 811 int disks = sh->disks; 812 struct page **blocks = percpu->scribble; 813 int target; 814 int qd_idx = sh->qd_idx; 815 struct dma_async_tx_descriptor *tx; 816 struct async_submit_ctl submit; 817 struct r5dev *tgt; 818 struct page *dest; 819 int i; 820 int count; 821 822 if (sh->ops.target < 0) 823 target = sh->ops.target2; 824 else if (sh->ops.target2 < 0) 825 target = sh->ops.target; 826 else 827 /* we should only have one valid target */ 828 BUG(); 829 BUG_ON(target < 0); 830 pr_debug("%s: stripe %llu block: %d\n", 831 __func__, (unsigned long long)sh->sector, target); 832 833 tgt = &sh->dev[target]; 834 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 835 dest = tgt->page; 836 837 atomic_inc(&sh->count); 838 839 if (target == qd_idx) { 840 count = set_syndrome_sources(blocks, sh); 841 blocks[count] = NULL; /* regenerating p is not necessary */ 842 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 843 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 844 ops_complete_compute, sh, 845 to_addr_conv(sh, percpu)); 846 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 847 } else { 848 /* Compute any data- or p-drive using XOR */ 849 count = 0; 850 for (i = disks; i-- ; ) { 851 if (i == target || i == qd_idx) 852 continue; 853 blocks[count++] = sh->dev[i].page; 854 } 855 856 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 857 NULL, ops_complete_compute, sh, 858 to_addr_conv(sh, percpu)); 859 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 860 } 861 862 return tx; 863 } 864 865 static struct dma_async_tx_descriptor * 866 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 867 { 868 int i, count, disks = sh->disks; 869 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 870 int d0_idx = raid6_d0(sh); 871 int faila = -1, failb = -1; 872 int target = sh->ops.target; 873 int target2 = sh->ops.target2; 874 struct r5dev *tgt = &sh->dev[target]; 875 struct r5dev *tgt2 = &sh->dev[target2]; 876 struct dma_async_tx_descriptor *tx; 877 struct page **blocks = percpu->scribble; 878 struct async_submit_ctl submit; 879 880 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 881 __func__, (unsigned long long)sh->sector, target, target2); 882 BUG_ON(target < 0 || target2 < 0); 883 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 884 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 885 886 /* we need to open-code set_syndrome_sources to handle the 887 * slot number conversion for 'faila' and 'failb' 888 */ 889 for (i = 0; i < disks ; i++) 890 blocks[i] = NULL; 891 count = 0; 892 i = d0_idx; 893 do { 894 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 895 896 blocks[slot] = sh->dev[i].page; 897 898 if (i == target) 899 faila = slot; 900 if (i == target2) 901 failb = slot; 902 i = raid6_next_disk(i, disks); 903 } while (i != d0_idx); 904 905 BUG_ON(faila == failb); 906 if (failb < faila) 907 swap(faila, failb); 908 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 909 __func__, (unsigned long long)sh->sector, faila, failb); 910 911 atomic_inc(&sh->count); 912 913 if (failb == syndrome_disks+1) { 914 /* Q disk is one of the missing disks */ 915 if (faila == syndrome_disks) { 916 /* Missing P+Q, just recompute */ 917 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 918 ops_complete_compute, sh, 919 to_addr_conv(sh, percpu)); 920 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 921 STRIPE_SIZE, &submit); 922 } else { 923 struct page *dest; 924 int data_target; 925 int qd_idx = sh->qd_idx; 926 927 /* Missing D+Q: recompute D from P, then recompute Q */ 928 if (target == qd_idx) 929 data_target = target2; 930 else 931 data_target = target; 932 933 count = 0; 934 for (i = disks; i-- ; ) { 935 if (i == data_target || i == qd_idx) 936 continue; 937 blocks[count++] = sh->dev[i].page; 938 } 939 dest = sh->dev[data_target].page; 940 init_async_submit(&submit, 941 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 942 NULL, NULL, NULL, 943 to_addr_conv(sh, percpu)); 944 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 945 &submit); 946 947 count = set_syndrome_sources(blocks, sh); 948 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 949 ops_complete_compute, sh, 950 to_addr_conv(sh, percpu)); 951 return async_gen_syndrome(blocks, 0, count+2, 952 STRIPE_SIZE, &submit); 953 } 954 } else { 955 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 956 ops_complete_compute, sh, 957 to_addr_conv(sh, percpu)); 958 if (failb == syndrome_disks) { 959 /* We're missing D+P. */ 960 return async_raid6_datap_recov(syndrome_disks+2, 961 STRIPE_SIZE, faila, 962 blocks, &submit); 963 } else { 964 /* We're missing D+D. 
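			 * Both missing data blocks are rebuilt in a single
			 * pass from the surviving data blocks plus P and Q.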
*/ 965 return async_raid6_2data_recov(syndrome_disks+2, 966 STRIPE_SIZE, faila, failb, 967 blocks, &submit); 968 } 969 } 970 } 971 972 973 static void ops_complete_prexor(void *stripe_head_ref) 974 { 975 struct stripe_head *sh = stripe_head_ref; 976 977 pr_debug("%s: stripe %llu\n", __func__, 978 (unsigned long long)sh->sector); 979 } 980 981 static struct dma_async_tx_descriptor * 982 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 983 struct dma_async_tx_descriptor *tx) 984 { 985 int disks = sh->disks; 986 struct page **xor_srcs = percpu->scribble; 987 int count = 0, pd_idx = sh->pd_idx, i; 988 struct async_submit_ctl submit; 989 990 /* existing parity data subtracted */ 991 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 992 993 pr_debug("%s: stripe %llu\n", __func__, 994 (unsigned long long)sh->sector); 995 996 for (i = disks; i--; ) { 997 struct r5dev *dev = &sh->dev[i]; 998 /* Only process blocks that are known to be uptodate */ 999 if (test_bit(R5_Wantdrain, &dev->flags)) 1000 xor_srcs[count++] = dev->page; 1001 } 1002 1003 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1004 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1005 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1006 1007 return tx; 1008 } 1009 1010 static struct dma_async_tx_descriptor * 1011 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1012 { 1013 int disks = sh->disks; 1014 int i; 1015 1016 pr_debug("%s: stripe %llu\n", __func__, 1017 (unsigned long long)sh->sector); 1018 1019 for (i = disks; i--; ) { 1020 struct r5dev *dev = &sh->dev[i]; 1021 struct bio *chosen; 1022 1023 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1024 struct bio *wbi; 1025 1026 spin_lock(&sh->lock); 1027 chosen = dev->towrite; 1028 dev->towrite = NULL; 1029 BUG_ON(dev->written); 1030 wbi = dev->written = chosen; 1031 spin_unlock(&sh->lock); 1032 1033 while (wbi && wbi->bi_sector < 1034 dev->sector + STRIPE_SECTORS) { 1035 tx = async_copy_data(1, wbi, dev->page, 1036 dev->sector, tx); 1037 wbi = r5_next_bio(wbi, dev->sector); 1038 } 1039 } 1040 } 1041 1042 return tx; 1043 } 1044 1045 static void ops_complete_reconstruct(void *stripe_head_ref) 1046 { 1047 struct stripe_head *sh = stripe_head_ref; 1048 int disks = sh->disks; 1049 int pd_idx = sh->pd_idx; 1050 int qd_idx = sh->qd_idx; 1051 int i; 1052 1053 pr_debug("%s: stripe %llu\n", __func__, 1054 (unsigned long long)sh->sector); 1055 1056 for (i = disks; i--; ) { 1057 struct r5dev *dev = &sh->dev[i]; 1058 1059 if (dev->written || i == pd_idx || i == qd_idx) 1060 set_bit(R5_UPTODATE, &dev->flags); 1061 } 1062 1063 if (sh->reconstruct_state == reconstruct_state_drain_run) 1064 sh->reconstruct_state = reconstruct_state_drain_result; 1065 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1066 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1067 else { 1068 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1069 sh->reconstruct_state = reconstruct_state_result; 1070 } 1071 1072 set_bit(STRIPE_HANDLE, &sh->state); 1073 release_stripe(sh); 1074 } 1075 1076 static void 1077 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1078 struct dma_async_tx_descriptor *tx) 1079 { 1080 int disks = sh->disks; 1081 struct page **xor_srcs = percpu->scribble; 1082 struct async_submit_ctl submit; 1083 int count = 0, pd_idx = sh->pd_idx, i; 1084 struct page *xor_dest; 1085 int prexor = 0; 1086 unsigned long flags; 1087 1088 pr_debug("%s: 
stripe %llu\n", __func__, 1089 (unsigned long long)sh->sector); 1090 1091 /* check if prexor is active which means only process blocks 1092 * that are part of a read-modify-write (written) 1093 */ 1094 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1095 prexor = 1; 1096 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1097 for (i = disks; i--; ) { 1098 struct r5dev *dev = &sh->dev[i]; 1099 if (dev->written) 1100 xor_srcs[count++] = dev->page; 1101 } 1102 } else { 1103 xor_dest = sh->dev[pd_idx].page; 1104 for (i = disks; i--; ) { 1105 struct r5dev *dev = &sh->dev[i]; 1106 if (i != pd_idx) 1107 xor_srcs[count++] = dev->page; 1108 } 1109 } 1110 1111 /* 1/ if we prexor'd then the dest is reused as a source 1112 * 2/ if we did not prexor then we are redoing the parity 1113 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1114 * for the synchronous xor case 1115 */ 1116 flags = ASYNC_TX_ACK | 1117 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1118 1119 atomic_inc(&sh->count); 1120 1121 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1122 to_addr_conv(sh, percpu)); 1123 if (unlikely(count == 1)) 1124 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1125 else 1126 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1127 } 1128 1129 static void 1130 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1131 struct dma_async_tx_descriptor *tx) 1132 { 1133 struct async_submit_ctl submit; 1134 struct page **blocks = percpu->scribble; 1135 int count; 1136 1137 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1138 1139 count = set_syndrome_sources(blocks, sh); 1140 1141 atomic_inc(&sh->count); 1142 1143 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1144 sh, to_addr_conv(sh, percpu)); 1145 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1146 } 1147 1148 static void ops_complete_check(void *stripe_head_ref) 1149 { 1150 struct stripe_head *sh = stripe_head_ref; 1151 1152 pr_debug("%s: stripe %llu\n", __func__, 1153 (unsigned long long)sh->sector); 1154 1155 sh->check_state = check_state_check_result; 1156 set_bit(STRIPE_HANDLE, &sh->state); 1157 release_stripe(sh); 1158 } 1159 1160 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1161 { 1162 int disks = sh->disks; 1163 int pd_idx = sh->pd_idx; 1164 int qd_idx = sh->qd_idx; 1165 struct page *xor_dest; 1166 struct page **xor_srcs = percpu->scribble; 1167 struct dma_async_tx_descriptor *tx; 1168 struct async_submit_ctl submit; 1169 int count; 1170 int i; 1171 1172 pr_debug("%s: stripe %llu\n", __func__, 1173 (unsigned long long)sh->sector); 1174 1175 count = 0; 1176 xor_dest = sh->dev[pd_idx].page; 1177 xor_srcs[count++] = xor_dest; 1178 for (i = disks; i--; ) { 1179 if (i == pd_idx || i == qd_idx) 1180 continue; 1181 xor_srcs[count++] = sh->dev[i].page; 1182 } 1183 1184 init_async_submit(&submit, 0, NULL, NULL, NULL, 1185 to_addr_conv(sh, percpu)); 1186 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1187 &sh->ops.zero_sum_result, &submit); 1188 1189 atomic_inc(&sh->count); 1190 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1191 tx = async_trigger_callback(&submit); 1192 } 1193 1194 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1195 { 1196 struct page **srcs = percpu->scribble; 1197 struct async_submit_ctl submit; 1198 int count; 1199 1200 pr_debug("%s: 
stripe %llu checkp: %d\n", __func__, 1201 (unsigned long long)sh->sector, checkp); 1202 1203 count = set_syndrome_sources(srcs, sh); 1204 if (!checkp) 1205 srcs[count] = NULL; 1206 1207 atomic_inc(&sh->count); 1208 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1209 sh, to_addr_conv(sh, percpu)); 1210 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1211 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1212 } 1213 1214 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1215 { 1216 int overlap_clear = 0, i, disks = sh->disks; 1217 struct dma_async_tx_descriptor *tx = NULL; 1218 raid5_conf_t *conf = sh->raid_conf; 1219 int level = conf->level; 1220 struct raid5_percpu *percpu; 1221 unsigned long cpu; 1222 1223 cpu = get_cpu(); 1224 percpu = per_cpu_ptr(conf->percpu, cpu); 1225 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1226 ops_run_biofill(sh); 1227 overlap_clear++; 1228 } 1229 1230 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1231 if (level < 6) 1232 tx = ops_run_compute5(sh, percpu); 1233 else { 1234 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1235 tx = ops_run_compute6_1(sh, percpu); 1236 else 1237 tx = ops_run_compute6_2(sh, percpu); 1238 } 1239 /* terminate the chain if reconstruct is not set to be run */ 1240 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1241 async_tx_ack(tx); 1242 } 1243 1244 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1245 tx = ops_run_prexor(sh, percpu, tx); 1246 1247 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1248 tx = ops_run_biodrain(sh, tx); 1249 overlap_clear++; 1250 } 1251 1252 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1253 if (level < 6) 1254 ops_run_reconstruct5(sh, percpu, tx); 1255 else 1256 ops_run_reconstruct6(sh, percpu, tx); 1257 } 1258 1259 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1260 if (sh->check_state == check_state_run) 1261 ops_run_check_p(sh, percpu); 1262 else if (sh->check_state == check_state_run_q) 1263 ops_run_check_pq(sh, percpu, 0); 1264 else if (sh->check_state == check_state_run_pq) 1265 ops_run_check_pq(sh, percpu, 1); 1266 else 1267 BUG(); 1268 } 1269 1270 if (overlap_clear) 1271 for (i = disks; i--; ) { 1272 struct r5dev *dev = &sh->dev[i]; 1273 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1274 wake_up(&sh->raid_conf->wait_for_overlap); 1275 } 1276 put_cpu(); 1277 } 1278 1279 #ifdef CONFIG_MULTICORE_RAID456 1280 static void async_run_ops(void *param, async_cookie_t cookie) 1281 { 1282 struct stripe_head *sh = param; 1283 unsigned long ops_request = sh->ops.request; 1284 1285 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); 1286 wake_up(&sh->ops.wait_for_ops); 1287 1288 __raid_run_ops(sh, ops_request); 1289 release_stripe(sh); 1290 } 1291 1292 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1293 { 1294 /* since handle_stripe can be called outside of raid5d context 1295 * we need to ensure sh->ops.request is de-staged before another 1296 * request arrives 1297 */ 1298 wait_event(sh->ops.wait_for_ops, 1299 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); 1300 sh->ops.request = ops_request; 1301 1302 atomic_inc(&sh->count); 1303 async_schedule(async_run_ops, sh); 1304 } 1305 #else 1306 #define raid_run_ops __raid_run_ops 1307 #endif 1308 1309 static int grow_one_stripe(raid5_conf_t *conf) 1310 { 1311 struct stripe_head *sh; 1312 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); 1313 if (!sh) 1314 return 0; 1315 memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct 
r5dev));
	sh->raid_conf = conf;
	spin_lock_init(&sh->lock);
	#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
	#endif

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(raid5_conf_t *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	sprintf(conf->cache_name[0],
		"raid%d-%s", conf->level, mdname(conf->mddev));
	sprintf(conf->cache_name[1],
		"raid%d-%s-alt", conf->level, mdname(conf->mddev));
	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}

static int resize_stripes(raid5_conf_t *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
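	 *
	 * (Illustrative example only: growing pool_size from 4 to 5 means each
	 * new stripe_head carries 5 r5dev slots; the 4 existing pages move over
	 * in step 2, conf->disks and the per-cpu scribble buffers grow in step 3
	 * while the array is stalled, and the 5th page is allocated in step 4.)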
1404 */ 1405 struct stripe_head *osh, *nsh; 1406 LIST_HEAD(newstripes); 1407 struct disk_info *ndisks; 1408 unsigned long cpu; 1409 int err; 1410 struct kmem_cache *sc; 1411 int i; 1412 1413 if (newsize <= conf->pool_size) 1414 return 0; /* never bother to shrink */ 1415 1416 err = md_allow_write(conf->mddev); 1417 if (err) 1418 return err; 1419 1420 /* Step 1 */ 1421 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1422 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1423 0, 0, NULL); 1424 if (!sc) 1425 return -ENOMEM; 1426 1427 for (i = conf->max_nr_stripes; i; i--) { 1428 nsh = kmem_cache_alloc(sc, GFP_KERNEL); 1429 if (!nsh) 1430 break; 1431 1432 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); 1433 1434 nsh->raid_conf = conf; 1435 spin_lock_init(&nsh->lock); 1436 #ifdef CONFIG_MULTICORE_RAID456 1437 init_waitqueue_head(&nsh->ops.wait_for_ops); 1438 #endif 1439 1440 list_add(&nsh->lru, &newstripes); 1441 } 1442 if (i) { 1443 /* didn't get enough, give up */ 1444 while (!list_empty(&newstripes)) { 1445 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1446 list_del(&nsh->lru); 1447 kmem_cache_free(sc, nsh); 1448 } 1449 kmem_cache_destroy(sc); 1450 return -ENOMEM; 1451 } 1452 /* Step 2 - Must use GFP_NOIO now. 1453 * OK, we have enough stripes, start collecting inactive 1454 * stripes and copying them over 1455 */ 1456 list_for_each_entry(nsh, &newstripes, lru) { 1457 spin_lock_irq(&conf->device_lock); 1458 wait_event_lock_irq(conf->wait_for_stripe, 1459 !list_empty(&conf->inactive_list), 1460 conf->device_lock, 1461 unplug_slaves(conf->mddev) 1462 ); 1463 osh = get_free_stripe(conf); 1464 spin_unlock_irq(&conf->device_lock); 1465 atomic_set(&nsh->count, 1); 1466 for(i=0; i<conf->pool_size; i++) 1467 nsh->dev[i].page = osh->dev[i].page; 1468 for( ; i<newsize; i++) 1469 nsh->dev[i].page = NULL; 1470 kmem_cache_free(conf->slab_cache, osh); 1471 } 1472 kmem_cache_destroy(conf->slab_cache); 1473 1474 /* Step 3. 
1475 * At this point, we are holding all the stripes so the array 1476 * is completely stalled, so now is a good time to resize 1477 * conf->disks and the scribble region 1478 */ 1479 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1480 if (ndisks) { 1481 for (i=0; i<conf->raid_disks; i++) 1482 ndisks[i] = conf->disks[i]; 1483 kfree(conf->disks); 1484 conf->disks = ndisks; 1485 } else 1486 err = -ENOMEM; 1487 1488 get_online_cpus(); 1489 conf->scribble_len = scribble_len(newsize); 1490 for_each_present_cpu(cpu) { 1491 struct raid5_percpu *percpu; 1492 void *scribble; 1493 1494 percpu = per_cpu_ptr(conf->percpu, cpu); 1495 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1496 1497 if (scribble) { 1498 kfree(percpu->scribble); 1499 percpu->scribble = scribble; 1500 } else { 1501 err = -ENOMEM; 1502 break; 1503 } 1504 } 1505 put_online_cpus(); 1506 1507 /* Step 4, return new stripes to service */ 1508 while(!list_empty(&newstripes)) { 1509 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1510 list_del_init(&nsh->lru); 1511 1512 for (i=conf->raid_disks; i < newsize; i++) 1513 if (nsh->dev[i].page == NULL) { 1514 struct page *p = alloc_page(GFP_NOIO); 1515 nsh->dev[i].page = p; 1516 if (!p) 1517 err = -ENOMEM; 1518 } 1519 release_stripe(nsh); 1520 } 1521 /* critical section pass, GFP_NOIO no longer needed */ 1522 1523 conf->slab_cache = sc; 1524 conf->active_name = 1-conf->active_name; 1525 conf->pool_size = newsize; 1526 return err; 1527 } 1528 1529 static int drop_one_stripe(raid5_conf_t *conf) 1530 { 1531 struct stripe_head *sh; 1532 1533 spin_lock_irq(&conf->device_lock); 1534 sh = get_free_stripe(conf); 1535 spin_unlock_irq(&conf->device_lock); 1536 if (!sh) 1537 return 0; 1538 BUG_ON(atomic_read(&sh->count)); 1539 shrink_buffers(sh); 1540 kmem_cache_free(conf->slab_cache, sh); 1541 atomic_dec(&conf->active_stripes); 1542 return 1; 1543 } 1544 1545 static void shrink_stripes(raid5_conf_t *conf) 1546 { 1547 while (drop_one_stripe(conf)) 1548 ; 1549 1550 if (conf->slab_cache) 1551 kmem_cache_destroy(conf->slab_cache); 1552 conf->slab_cache = NULL; 1553 } 1554 1555 static void raid5_end_read_request(struct bio * bi, int error) 1556 { 1557 struct stripe_head *sh = bi->bi_private; 1558 raid5_conf_t *conf = sh->raid_conf; 1559 int disks = sh->disks, i; 1560 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1561 char b[BDEVNAME_SIZE]; 1562 mdk_rdev_t *rdev; 1563 1564 1565 for (i=0 ; i<disks; i++) 1566 if (bi == &sh->dev[i].req) 1567 break; 1568 1569 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1570 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1571 uptodate); 1572 if (i == disks) { 1573 BUG(); 1574 return; 1575 } 1576 1577 if (uptodate) { 1578 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1579 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1580 rdev = conf->disks[i].rdev; 1581 printk_rl(KERN_INFO "md/raid:%s: read error corrected" 1582 " (%lu sectors at %llu on %s)\n", 1583 mdname(conf->mddev), STRIPE_SECTORS, 1584 (unsigned long long)(sh->sector 1585 + rdev->data_offset), 1586 bdevname(rdev->bdev, b)); 1587 clear_bit(R5_ReadError, &sh->dev[i].flags); 1588 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1589 } 1590 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1591 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1592 } else { 1593 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1594 int retry = 0; 1595 rdev = conf->disks[i].rdev; 1596 1597 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1598 atomic_inc(&rdev->read_errors); 1599 
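		/* Decide whether this read error is retryable (mark the block
		 * R5_ReadError and attempt a rewrite later) or whether the
		 * device has to be failed via md_error().
		 */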
if (conf->mddev->degraded >= conf->max_degraded) 1600 printk_rl(KERN_WARNING 1601 "md/raid:%s: read error not correctable " 1602 "(sector %llu on %s).\n", 1603 mdname(conf->mddev), 1604 (unsigned long long)(sh->sector 1605 + rdev->data_offset), 1606 bdn); 1607 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1608 /* Oh, no!!! */ 1609 printk_rl(KERN_WARNING 1610 "md/raid:%s: read error NOT corrected!! " 1611 "(sector %llu on %s).\n", 1612 mdname(conf->mddev), 1613 (unsigned long long)(sh->sector 1614 + rdev->data_offset), 1615 bdn); 1616 else if (atomic_read(&rdev->read_errors) 1617 > conf->max_nr_stripes) 1618 printk(KERN_WARNING 1619 "md/raid:%s: Too many read errors, failing device %s.\n", 1620 mdname(conf->mddev), bdn); 1621 else 1622 retry = 1; 1623 if (retry) 1624 set_bit(R5_ReadError, &sh->dev[i].flags); 1625 else { 1626 clear_bit(R5_ReadError, &sh->dev[i].flags); 1627 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1628 md_error(conf->mddev, rdev); 1629 } 1630 } 1631 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1632 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1633 set_bit(STRIPE_HANDLE, &sh->state); 1634 release_stripe(sh); 1635 } 1636 1637 static void raid5_end_write_request(struct bio *bi, int error) 1638 { 1639 struct stripe_head *sh = bi->bi_private; 1640 raid5_conf_t *conf = sh->raid_conf; 1641 int disks = sh->disks, i; 1642 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1643 1644 for (i=0 ; i<disks; i++) 1645 if (bi == &sh->dev[i].req) 1646 break; 1647 1648 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1649 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1650 uptodate); 1651 if (i == disks) { 1652 BUG(); 1653 return; 1654 } 1655 1656 if (!uptodate) 1657 md_error(conf->mddev, conf->disks[i].rdev); 1658 1659 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1660 1661 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1662 set_bit(STRIPE_HANDLE, &sh->state); 1663 release_stripe(sh); 1664 } 1665 1666 1667 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1668 1669 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1670 { 1671 struct r5dev *dev = &sh->dev[i]; 1672 1673 bio_init(&dev->req); 1674 dev->req.bi_io_vec = &dev->vec; 1675 dev->req.bi_vcnt++; 1676 dev->req.bi_max_vecs++; 1677 dev->vec.bv_page = dev->page; 1678 dev->vec.bv_len = STRIPE_SIZE; 1679 dev->vec.bv_offset = 0; 1680 1681 dev->req.bi_sector = sh->sector; 1682 dev->req.bi_private = sh; 1683 1684 dev->flags = 0; 1685 dev->sector = compute_blocknr(sh, i, previous); 1686 } 1687 1688 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1689 { 1690 char b[BDEVNAME_SIZE]; 1691 raid5_conf_t *conf = mddev->private; 1692 pr_debug("raid456: error called\n"); 1693 1694 if (!test_bit(Faulty, &rdev->flags)) { 1695 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1696 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1697 unsigned long flags; 1698 spin_lock_irqsave(&conf->device_lock, flags); 1699 mddev->degraded++; 1700 spin_unlock_irqrestore(&conf->device_lock, flags); 1701 /* 1702 * if recovery was running, make sure it aborts. 
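			 * (Setting MD_RECOVERY_INTR below makes a running
			 * resync/recovery thread notice the failure and stop.)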
1703 */ 1704 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1705 } 1706 set_bit(Faulty, &rdev->flags); 1707 printk(KERN_ALERT 1708 "md/raid:%s: Disk failure on %s, disabling device.\n" 1709 KERN_ALERT 1710 "md/raid:%s: Operation continuing on %d devices.\n", 1711 mdname(mddev), 1712 bdevname(rdev->bdev, b), 1713 mdname(mddev), 1714 conf->raid_disks - mddev->degraded); 1715 } 1716 } 1717 1718 /* 1719 * Input: a 'big' sector number, 1720 * Output: index of the data and parity disk, and the sector # in them. 1721 */ 1722 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, 1723 int previous, int *dd_idx, 1724 struct stripe_head *sh) 1725 { 1726 sector_t stripe, stripe2; 1727 sector_t chunk_number; 1728 unsigned int chunk_offset; 1729 int pd_idx, qd_idx; 1730 int ddf_layout = 0; 1731 sector_t new_sector; 1732 int algorithm = previous ? conf->prev_algo 1733 : conf->algorithm; 1734 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1735 : conf->chunk_sectors; 1736 int raid_disks = previous ? conf->previous_raid_disks 1737 : conf->raid_disks; 1738 int data_disks = raid_disks - conf->max_degraded; 1739 1740 /* First compute the information on this sector */ 1741 1742 /* 1743 * Compute the chunk number and the sector offset inside the chunk 1744 */ 1745 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1746 chunk_number = r_sector; 1747 1748 /* 1749 * Compute the stripe number 1750 */ 1751 stripe = chunk_number; 1752 *dd_idx = sector_div(stripe, data_disks); 1753 stripe2 = stripe; 1754 /* 1755 * Select the parity disk based on the user selected algorithm. 1756 */ 1757 pd_idx = qd_idx = ~0; 1758 switch(conf->level) { 1759 case 4: 1760 pd_idx = data_disks; 1761 break; 1762 case 5: 1763 switch (algorithm) { 1764 case ALGORITHM_LEFT_ASYMMETRIC: 1765 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1766 if (*dd_idx >= pd_idx) 1767 (*dd_idx)++; 1768 break; 1769 case ALGORITHM_RIGHT_ASYMMETRIC: 1770 pd_idx = sector_div(stripe2, raid_disks); 1771 if (*dd_idx >= pd_idx) 1772 (*dd_idx)++; 1773 break; 1774 case ALGORITHM_LEFT_SYMMETRIC: 1775 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1776 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1777 break; 1778 case ALGORITHM_RIGHT_SYMMETRIC: 1779 pd_idx = sector_div(stripe2, raid_disks); 1780 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1781 break; 1782 case ALGORITHM_PARITY_0: 1783 pd_idx = 0; 1784 (*dd_idx)++; 1785 break; 1786 case ALGORITHM_PARITY_N: 1787 pd_idx = data_disks; 1788 break; 1789 default: 1790 BUG(); 1791 } 1792 break; 1793 case 6: 1794 1795 switch (algorithm) { 1796 case ALGORITHM_LEFT_ASYMMETRIC: 1797 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1798 qd_idx = pd_idx + 1; 1799 if (pd_idx == raid_disks-1) { 1800 (*dd_idx)++; /* Q D D D P */ 1801 qd_idx = 0; 1802 } else if (*dd_idx >= pd_idx) 1803 (*dd_idx) += 2; /* D D P Q D */ 1804 break; 1805 case ALGORITHM_RIGHT_ASYMMETRIC: 1806 pd_idx = sector_div(stripe2, raid_disks); 1807 qd_idx = pd_idx + 1; 1808 if (pd_idx == raid_disks-1) { 1809 (*dd_idx)++; /* Q D D D P */ 1810 qd_idx = 0; 1811 } else if (*dd_idx >= pd_idx) 1812 (*dd_idx) += 2; /* D D P Q D */ 1813 break; 1814 case ALGORITHM_LEFT_SYMMETRIC: 1815 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1816 qd_idx = (pd_idx + 1) % raid_disks; 1817 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1818 break; 1819 case ALGORITHM_RIGHT_SYMMETRIC: 1820 pd_idx = sector_div(stripe2, raid_disks); 1821 qd_idx = (pd_idx + 1) % raid_disks; 1822 *dd_idx = (pd_idx + 2 + *dd_idx) % 
raid_disks;
			break;

		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			qd_idx = 1;
			(*dd_idx) += 2;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			qd_idx = data_disks + 1;
			break;

		case ALGORITHM_ROTATING_ZERO_RESTART:
			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
			 * of blocks for computing Q is different.
			 */
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_RESTART:
			/* Same as left_asymmetric, but the first stripe is
			 * D D D P Q  rather than
			 * Q D D D P
			 */
			stripe2 += 1;
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_CONTINUE:
			/* Same as left_symmetric but Q is before P */
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			ddf_layout = 1;
			break;

		case ALGORITHM_LEFT_ASYMMETRIC_6:
			/* RAID5 left_asymmetric, with Q on last device */
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_ASYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_LEFT_SYMMETRIC_6:
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_SYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_PARITY_0_6:
			pd_idx = 0;
			(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		default:
			BUG();
		}
		break;
	}

	if (sh) {
		sh->pd_idx = pd_idx;
		sh->qd_idx = qd_idx;
		sh->ddf_layout = ddf_layout;
	}
	/*
	 * Finally, compute the new sector number
	 */
	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}


static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = sh->disks;
	int data_disks = raid_disks - conf->max_degraded;
	sector_t new_sector = sh->sector, check;
	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
					 : conf->chunk_sectors;
	int algorithm = previous ?
conf->prev_algo 1934 : conf->algorithm; 1935 sector_t stripe; 1936 int chunk_offset; 1937 sector_t chunk_number; 1938 int dummy1, dd_idx = i; 1939 sector_t r_sector; 1940 struct stripe_head sh2; 1941 1942 1943 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1944 stripe = new_sector; 1945 1946 if (i == sh->pd_idx) 1947 return 0; 1948 switch(conf->level) { 1949 case 4: break; 1950 case 5: 1951 switch (algorithm) { 1952 case ALGORITHM_LEFT_ASYMMETRIC: 1953 case ALGORITHM_RIGHT_ASYMMETRIC: 1954 if (i > sh->pd_idx) 1955 i--; 1956 break; 1957 case ALGORITHM_LEFT_SYMMETRIC: 1958 case ALGORITHM_RIGHT_SYMMETRIC: 1959 if (i < sh->pd_idx) 1960 i += raid_disks; 1961 i -= (sh->pd_idx + 1); 1962 break; 1963 case ALGORITHM_PARITY_0: 1964 i -= 1; 1965 break; 1966 case ALGORITHM_PARITY_N: 1967 break; 1968 default: 1969 BUG(); 1970 } 1971 break; 1972 case 6: 1973 if (i == sh->qd_idx) 1974 return 0; /* It is the Q disk */ 1975 switch (algorithm) { 1976 case ALGORITHM_LEFT_ASYMMETRIC: 1977 case ALGORITHM_RIGHT_ASYMMETRIC: 1978 case ALGORITHM_ROTATING_ZERO_RESTART: 1979 case ALGORITHM_ROTATING_N_RESTART: 1980 if (sh->pd_idx == raid_disks-1) 1981 i--; /* Q D D D P */ 1982 else if (i > sh->pd_idx) 1983 i -= 2; /* D D P Q D */ 1984 break; 1985 case ALGORITHM_LEFT_SYMMETRIC: 1986 case ALGORITHM_RIGHT_SYMMETRIC: 1987 if (sh->pd_idx == raid_disks-1) 1988 i--; /* Q D D D P */ 1989 else { 1990 /* D D P Q D */ 1991 if (i < sh->pd_idx) 1992 i += raid_disks; 1993 i -= (sh->pd_idx + 2); 1994 } 1995 break; 1996 case ALGORITHM_PARITY_0: 1997 i -= 2; 1998 break; 1999 case ALGORITHM_PARITY_N: 2000 break; 2001 case ALGORITHM_ROTATING_N_CONTINUE: 2002 /* Like left_symmetric, but P is before Q */ 2003 if (sh->pd_idx == 0) 2004 i--; /* P D D D Q */ 2005 else { 2006 /* D D Q P D */ 2007 if (i < sh->pd_idx) 2008 i += raid_disks; 2009 i -= (sh->pd_idx + 1); 2010 } 2011 break; 2012 case ALGORITHM_LEFT_ASYMMETRIC_6: 2013 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2014 if (i > sh->pd_idx) 2015 i--; 2016 break; 2017 case ALGORITHM_LEFT_SYMMETRIC_6: 2018 case ALGORITHM_RIGHT_SYMMETRIC_6: 2019 if (i < sh->pd_idx) 2020 i += data_disks + 1; 2021 i -= (sh->pd_idx + 1); 2022 break; 2023 case ALGORITHM_PARITY_0_6: 2024 i -= 1; 2025 break; 2026 default: 2027 BUG(); 2028 } 2029 break; 2030 } 2031 2032 chunk_number = stripe * data_disks + i; 2033 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2034 2035 check = raid5_compute_sector(conf, r_sector, 2036 previous, &dummy1, &sh2); 2037 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2038 || sh2.qd_idx != sh->qd_idx) { 2039 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2040 mdname(conf->mddev)); 2041 return 0; 2042 } 2043 return r_sector; 2044 } 2045 2046 2047 static void 2048 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2049 int rcw, int expand) 2050 { 2051 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2052 raid5_conf_t *conf = sh->raid_conf; 2053 int level = conf->level; 2054 2055 if (rcw) { 2056 /* if we are not expanding this is a proper write request, and 2057 * there will be bios with new data to be drained into the 2058 * stripe cache 2059 */ 2060 if (!expand) { 2061 sh->reconstruct_state = reconstruct_state_drain_run; 2062 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2063 } else 2064 sh->reconstruct_state = reconstruct_state_run; 2065 2066 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2067 2068 for (i = disks; i--; ) { 2069 struct r5dev *dev = &sh->dev[i]; 2070 2071 if (dev->towrite) { 2072 
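				/* This leg has new data queued: lock it and mark it
				 * R5_Wantdrain so the biodrain operation copies the
				 * pending bios into the stripe cache before parity is
				 * recalculated.
				 */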
set_bit(R5_LOCKED, &dev->flags); 2073 set_bit(R5_Wantdrain, &dev->flags); 2074 if (!expand) 2075 clear_bit(R5_UPTODATE, &dev->flags); 2076 s->locked++; 2077 } 2078 } 2079 if (s->locked + conf->max_degraded == disks) 2080 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2081 atomic_inc(&conf->pending_full_writes); 2082 } else { 2083 BUG_ON(level == 6); 2084 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2085 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2086 2087 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2088 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2089 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2090 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2091 2092 for (i = disks; i--; ) { 2093 struct r5dev *dev = &sh->dev[i]; 2094 if (i == pd_idx) 2095 continue; 2096 2097 if (dev->towrite && 2098 (test_bit(R5_UPTODATE, &dev->flags) || 2099 test_bit(R5_Wantcompute, &dev->flags))) { 2100 set_bit(R5_Wantdrain, &dev->flags); 2101 set_bit(R5_LOCKED, &dev->flags); 2102 clear_bit(R5_UPTODATE, &dev->flags); 2103 s->locked++; 2104 } 2105 } 2106 } 2107 2108 /* keep the parity disk(s) locked while asynchronous operations 2109 * are in flight 2110 */ 2111 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2112 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2113 s->locked++; 2114 2115 if (level == 6) { 2116 int qd_idx = sh->qd_idx; 2117 struct r5dev *dev = &sh->dev[qd_idx]; 2118 2119 set_bit(R5_LOCKED, &dev->flags); 2120 clear_bit(R5_UPTODATE, &dev->flags); 2121 s->locked++; 2122 } 2123 2124 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2125 __func__, (unsigned long long)sh->sector, 2126 s->locked, s->ops_request); 2127 } 2128 2129 /* 2130 * Each stripe/dev can have one or more bion attached. 2131 * toread/towrite point to the first in a chain. 2132 * The bi_next chain must be in order. 
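 * add_stripe_bio() walks the chain to find the insertion point and takes
 * a reference on the bio via bi_phys_segments; if the new bio would
 * overlap one already queued it sets R5_Overlap and returns 0 so the
 * caller can wait and retry.  Roughly (a sketch, not the exact
 * make_request() code):
 *
 *	sh = get_active_stripe(conf, new_sector, previous, 0, 0);
 *	if (sh && !add_stripe_bio(sh, bi, dd_idx, rw == WRITE)) {
 *		release_stripe(sh);
 *		(overlap with an in-flight bio: back off and retry later)
 *	}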
2133 */ 2134 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2135 { 2136 struct bio **bip; 2137 raid5_conf_t *conf = sh->raid_conf; 2138 int firstwrite=0; 2139 2140 pr_debug("adding bh b#%llu to stripe s#%llu\n", 2141 (unsigned long long)bi->bi_sector, 2142 (unsigned long long)sh->sector); 2143 2144 2145 spin_lock(&sh->lock); 2146 spin_lock_irq(&conf->device_lock); 2147 if (forwrite) { 2148 bip = &sh->dev[dd_idx].towrite; 2149 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2150 firstwrite = 1; 2151 } else 2152 bip = &sh->dev[dd_idx].toread; 2153 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2154 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2155 goto overlap; 2156 bip = & (*bip)->bi_next; 2157 } 2158 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2159 goto overlap; 2160 2161 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2162 if (*bip) 2163 bi->bi_next = *bip; 2164 *bip = bi; 2165 bi->bi_phys_segments++; 2166 spin_unlock_irq(&conf->device_lock); 2167 spin_unlock(&sh->lock); 2168 2169 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2170 (unsigned long long)bi->bi_sector, 2171 (unsigned long long)sh->sector, dd_idx); 2172 2173 if (conf->mddev->bitmap && firstwrite) { 2174 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2175 STRIPE_SECTORS, 0); 2176 sh->bm_seq = conf->seq_flush+1; 2177 set_bit(STRIPE_BIT_DELAY, &sh->state); 2178 } 2179 2180 if (forwrite) { 2181 /* check if page is covered */ 2182 sector_t sector = sh->dev[dd_idx].sector; 2183 for (bi=sh->dev[dd_idx].towrite; 2184 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2185 bi && bi->bi_sector <= sector; 2186 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2187 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2188 sector = bi->bi_sector + (bi->bi_size>>9); 2189 } 2190 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2191 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2192 } 2193 return 1; 2194 2195 overlap: 2196 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2197 spin_unlock_irq(&conf->device_lock); 2198 spin_unlock(&sh->lock); 2199 return 0; 2200 } 2201 2202 static void end_reshape(raid5_conf_t *conf); 2203 2204 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2205 struct stripe_head *sh) 2206 { 2207 int sectors_per_chunk = 2208 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2209 int dd_idx; 2210 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2211 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2212 2213 raid5_compute_sector(conf, 2214 stripe * (disks - conf->max_degraded) 2215 *sectors_per_chunk + chunk_offset, 2216 previous, 2217 &dd_idx, sh); 2218 } 2219 2220 static void 2221 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, 2222 struct stripe_head_state *s, int disks, 2223 struct bio **return_bi) 2224 { 2225 int i; 2226 for (i = disks; i--; ) { 2227 struct bio *bi; 2228 int bitmap_end = 0; 2229 2230 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2231 mdk_rdev_t *rdev; 2232 rcu_read_lock(); 2233 rdev = rcu_dereference(conf->disks[i].rdev); 2234 if (rdev && test_bit(In_sync, &rdev->flags)) 2235 /* multiple read failures in one stripe */ 2236 md_error(conf->mddev, rdev); 2237 rcu_read_unlock(); 2238 } 2239 spin_lock_irq(&conf->device_lock); 2240 /* fail all writes first */ 2241 bi = sh->dev[i].towrite; 2242 sh->dev[i].towrite = NULL; 2243 if (bi) { 2244 s->to_write--; 2245 bitmap_end = 1; 2246 } 2247 2248 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2249 wake_up(&conf->wait_for_overlap); 2250 2251 while (bi && bi->bi_sector < 2252 sh->dev[i].sector + STRIPE_SECTORS) { 2253 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2254 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2255 if (!raid5_dec_bi_phys_segments(bi)) { 2256 md_write_end(conf->mddev); 2257 bi->bi_next = *return_bi; 2258 *return_bi = bi; 2259 } 2260 bi = nextbi; 2261 } 2262 /* and fail all 'written' */ 2263 bi = sh->dev[i].written; 2264 sh->dev[i].written = NULL; 2265 if (bi) bitmap_end = 1; 2266 while (bi && bi->bi_sector < 2267 sh->dev[i].sector + STRIPE_SECTORS) { 2268 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2269 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2270 if (!raid5_dec_bi_phys_segments(bi)) { 2271 md_write_end(conf->mddev); 2272 bi->bi_next = *return_bi; 2273 *return_bi = bi; 2274 } 2275 bi = bi2; 2276 } 2277 2278 /* fail any reads if this device is non-operational and 2279 * the data has not reached the cache yet. 2280 */ 2281 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2282 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2283 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2284 bi = sh->dev[i].toread; 2285 sh->dev[i].toread = NULL; 2286 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2287 wake_up(&conf->wait_for_overlap); 2288 if (bi) s->to_read--; 2289 while (bi && bi->bi_sector < 2290 sh->dev[i].sector + STRIPE_SECTORS) { 2291 struct bio *nextbi = 2292 r5_next_bio(bi, sh->dev[i].sector); 2293 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2294 if (!raid5_dec_bi_phys_segments(bi)) { 2295 bi->bi_next = *return_bi; 2296 *return_bi = bi; 2297 } 2298 bi = nextbi; 2299 } 2300 } 2301 spin_unlock_irq(&conf->device_lock); 2302 if (bitmap_end) 2303 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2304 STRIPE_SECTORS, 0, 0); 2305 } 2306 2307 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2308 if (atomic_dec_and_test(&conf->pending_full_writes)) 2309 md_wakeup_thread(conf->mddev->thread); 2310 } 2311 2312 /* fetch_block5 - checks the given member device to see if its data needs 2313 * to be read or computed to satisfy a request. 
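 * A block is wanted if it has a pending read, a partial (non-overwrite)
 * write, if the stripe is being synced or expanded, or if it is needed
 * to service requests against the single failed device.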
2314 * 2315 * Returns 1 when no more member devices need to be checked, otherwise returns 2316 * 0 to tell the loop in handle_stripe_fill5 to continue 2317 */ 2318 static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, 2319 int disk_idx, int disks) 2320 { 2321 struct r5dev *dev = &sh->dev[disk_idx]; 2322 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 2323 2324 /* is the data in this block needed, and can we get it? */ 2325 if (!test_bit(R5_LOCKED, &dev->flags) && 2326 !test_bit(R5_UPTODATE, &dev->flags) && 2327 (dev->toread || 2328 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2329 s->syncing || s->expanding || 2330 (s->failed && 2331 (failed_dev->toread || 2332 (failed_dev->towrite && 2333 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { 2334 /* We would like to get this block, possibly by computing it, 2335 * otherwise read it if the backing disk is insync 2336 */ 2337 if ((s->uptodate == disks - 1) && 2338 (s->failed && disk_idx == s->failed_num)) { 2339 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2340 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2341 set_bit(R5_Wantcompute, &dev->flags); 2342 sh->ops.target = disk_idx; 2343 sh->ops.target2 = -1; 2344 s->req_compute = 1; 2345 /* Careful: from this point on 'uptodate' is in the eye 2346 * of raid_run_ops which services 'compute' operations 2347 * before writes. R5_Wantcompute flags a block that will 2348 * be R5_UPTODATE by the time it is needed for a 2349 * subsequent operation. 2350 */ 2351 s->uptodate++; 2352 return 1; /* uptodate + compute == disks */ 2353 } else if (test_bit(R5_Insync, &dev->flags)) { 2354 set_bit(R5_LOCKED, &dev->flags); 2355 set_bit(R5_Wantread, &dev->flags); 2356 s->locked++; 2357 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 2358 s->syncing); 2359 } 2360 } 2361 2362 return 0; 2363 } 2364 2365 /** 2366 * handle_stripe_fill5 - read or compute data to satisfy pending requests. 2367 */ 2368 static void handle_stripe_fill5(struct stripe_head *sh, 2369 struct stripe_head_state *s, int disks) 2370 { 2371 int i; 2372 2373 /* look for blocks to read/compute, skip this if a compute 2374 * is already in flight, or if the stripe contents are in the 2375 * midst of changing due to a write 2376 */ 2377 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2378 !sh->reconstruct_state) 2379 for (i = disks; i--; ) 2380 if (fetch_block5(sh, s, i, disks)) 2381 break; 2382 set_bit(STRIPE_HANDLE, &sh->state); 2383 } 2384 2385 /* fetch_block6 - checks the given member device to see if its data needs 2386 * to be read or computed to satisfy a request. 
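 * Unlike the raid5 variant this may set up a two-target compute
 * (ops.target and ops.target2) when two devices have failed and all the
 * other blocks in the stripe are already up to date.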
2387 * 2388 * Returns 1 when no more member devices need to be checked, otherwise returns 2389 * 0 to tell the loop in handle_stripe_fill6 to continue 2390 */ 2391 static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, 2392 struct r6_state *r6s, int disk_idx, int disks) 2393 { 2394 struct r5dev *dev = &sh->dev[disk_idx]; 2395 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], 2396 &sh->dev[r6s->failed_num[1]] }; 2397 2398 if (!test_bit(R5_LOCKED, &dev->flags) && 2399 !test_bit(R5_UPTODATE, &dev->flags) && 2400 (dev->toread || 2401 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2402 s->syncing || s->expanding || 2403 (s->failed >= 1 && 2404 (fdev[0]->toread || s->to_write)) || 2405 (s->failed >= 2 && 2406 (fdev[1]->toread || s->to_write)))) { 2407 /* we would like to get this block, possibly by computing it, 2408 * otherwise read it if the backing disk is insync 2409 */ 2410 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2411 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2412 if ((s->uptodate == disks - 1) && 2413 (s->failed && (disk_idx == r6s->failed_num[0] || 2414 disk_idx == r6s->failed_num[1]))) { 2415 /* have disk failed, and we're requested to fetch it; 2416 * do compute it 2417 */ 2418 pr_debug("Computing stripe %llu block %d\n", 2419 (unsigned long long)sh->sector, disk_idx); 2420 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2421 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2422 set_bit(R5_Wantcompute, &dev->flags); 2423 sh->ops.target = disk_idx; 2424 sh->ops.target2 = -1; /* no 2nd target */ 2425 s->req_compute = 1; 2426 s->uptodate++; 2427 return 1; 2428 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2429 /* Computing 2-failure is *very* expensive; only 2430 * do it if failed >= 2 2431 */ 2432 int other; 2433 for (other = disks; other--; ) { 2434 if (other == disk_idx) 2435 continue; 2436 if (!test_bit(R5_UPTODATE, 2437 &sh->dev[other].flags)) 2438 break; 2439 } 2440 BUG_ON(other < 0); 2441 pr_debug("Computing stripe %llu blocks %d,%d\n", 2442 (unsigned long long)sh->sector, 2443 disk_idx, other); 2444 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2445 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2446 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2447 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2448 sh->ops.target = disk_idx; 2449 sh->ops.target2 = other; 2450 s->uptodate += 2; 2451 s->req_compute = 1; 2452 return 1; 2453 } else if (test_bit(R5_Insync, &dev->flags)) { 2454 set_bit(R5_LOCKED, &dev->flags); 2455 set_bit(R5_Wantread, &dev->flags); 2456 s->locked++; 2457 pr_debug("Reading block %d (sync=%d)\n", 2458 disk_idx, s->syncing); 2459 } 2460 } 2461 2462 return 0; 2463 } 2464 2465 /** 2466 * handle_stripe_fill6 - read or compute data to satisfy pending requests. 2467 */ 2468 static void handle_stripe_fill6(struct stripe_head *sh, 2469 struct stripe_head_state *s, struct r6_state *r6s, 2470 int disks) 2471 { 2472 int i; 2473 2474 /* look for blocks to read/compute, skip this if a compute 2475 * is already in flight, or if the stripe contents are in the 2476 * midst of changing due to a write 2477 */ 2478 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2479 !sh->reconstruct_state) 2480 for (i = disks; i--; ) 2481 if (fetch_block6(sh, s, r6s, i, disks)) 2482 break; 2483 set_bit(STRIPE_HANDLE, &sh->state); 2484 } 2485 2486 2487 /* handle_stripe_clean_event 2488 * any written block on an uptodate or failed drive can be returned. 
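 * Completed bios are unhooked from dev->written and collected on the
 * return_bi list; if no further writes are queued against the device
 * (dev->towrite is NULL), bitmap_endwrite() is told this stripe write
 * has finished.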
2489 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2490 * never LOCKED, so we don't need to test 'failed' directly. 2491 */ 2492 static void handle_stripe_clean_event(raid5_conf_t *conf, 2493 struct stripe_head *sh, int disks, struct bio **return_bi) 2494 { 2495 int i; 2496 struct r5dev *dev; 2497 2498 for (i = disks; i--; ) 2499 if (sh->dev[i].written) { 2500 dev = &sh->dev[i]; 2501 if (!test_bit(R5_LOCKED, &dev->flags) && 2502 test_bit(R5_UPTODATE, &dev->flags)) { 2503 /* We can return any write requests */ 2504 struct bio *wbi, *wbi2; 2505 int bitmap_end = 0; 2506 pr_debug("Return write for disc %d\n", i); 2507 spin_lock_irq(&conf->device_lock); 2508 wbi = dev->written; 2509 dev->written = NULL; 2510 while (wbi && wbi->bi_sector < 2511 dev->sector + STRIPE_SECTORS) { 2512 wbi2 = r5_next_bio(wbi, dev->sector); 2513 if (!raid5_dec_bi_phys_segments(wbi)) { 2514 md_write_end(conf->mddev); 2515 wbi->bi_next = *return_bi; 2516 *return_bi = wbi; 2517 } 2518 wbi = wbi2; 2519 } 2520 if (dev->towrite == NULL) 2521 bitmap_end = 1; 2522 spin_unlock_irq(&conf->device_lock); 2523 if (bitmap_end) 2524 bitmap_endwrite(conf->mddev->bitmap, 2525 sh->sector, 2526 STRIPE_SECTORS, 2527 !test_bit(STRIPE_DEGRADED, &sh->state), 2528 0); 2529 } 2530 } 2531 2532 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2533 if (atomic_dec_and_test(&conf->pending_full_writes)) 2534 md_wakeup_thread(conf->mddev->thread); 2535 } 2536 2537 static void handle_stripe_dirtying5(raid5_conf_t *conf, 2538 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2539 { 2540 int rmw = 0, rcw = 0, i; 2541 for (i = disks; i--; ) { 2542 /* would I have to read this buffer for read_modify_write */ 2543 struct r5dev *dev = &sh->dev[i]; 2544 if ((dev->towrite || i == sh->pd_idx) && 2545 !test_bit(R5_LOCKED, &dev->flags) && 2546 !(test_bit(R5_UPTODATE, &dev->flags) || 2547 test_bit(R5_Wantcompute, &dev->flags))) { 2548 if (test_bit(R5_Insync, &dev->flags)) 2549 rmw++; 2550 else 2551 rmw += 2*disks; /* cannot read it */ 2552 } 2553 /* Would I have to read this buffer for reconstruct_write */ 2554 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2555 !test_bit(R5_LOCKED, &dev->flags) && 2556 !(test_bit(R5_UPTODATE, &dev->flags) || 2557 test_bit(R5_Wantcompute, &dev->flags))) { 2558 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2559 else 2560 rcw += 2*disks; 2561 } 2562 } 2563 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2564 (unsigned long long)sh->sector, rmw, rcw); 2565 set_bit(STRIPE_HANDLE, &sh->state); 2566 if (rmw < rcw && rmw > 0) 2567 /* prefer read-modify-write, but need to get some data */ 2568 for (i = disks; i--; ) { 2569 struct r5dev *dev = &sh->dev[i]; 2570 if ((dev->towrite || i == sh->pd_idx) && 2571 !test_bit(R5_LOCKED, &dev->flags) && 2572 !(test_bit(R5_UPTODATE, &dev->flags) || 2573 test_bit(R5_Wantcompute, &dev->flags)) && 2574 test_bit(R5_Insync, &dev->flags)) { 2575 if ( 2576 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2577 pr_debug("Read_old block " 2578 "%d for r-m-w\n", i); 2579 set_bit(R5_LOCKED, &dev->flags); 2580 set_bit(R5_Wantread, &dev->flags); 2581 s->locked++; 2582 } else { 2583 set_bit(STRIPE_DELAYED, &sh->state); 2584 set_bit(STRIPE_HANDLE, &sh->state); 2585 } 2586 } 2587 } 2588 if (rcw <= rmw && rcw > 0) 2589 /* want reconstruct write, but need to get some data */ 2590 for (i = disks; i--; ) { 2591 struct r5dev *dev = &sh->dev[i]; 2592 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2593 i != sh->pd_idx && 2594 !test_bit(R5_LOCKED, &dev->flags) && 2595 
!(test_bit(R5_UPTODATE, &dev->flags) || 2596 test_bit(R5_Wantcompute, &dev->flags)) && 2597 test_bit(R5_Insync, &dev->flags)) { 2598 if ( 2599 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2600 pr_debug("Read_old block " 2601 "%d for Reconstruct\n", i); 2602 set_bit(R5_LOCKED, &dev->flags); 2603 set_bit(R5_Wantread, &dev->flags); 2604 s->locked++; 2605 } else { 2606 set_bit(STRIPE_DELAYED, &sh->state); 2607 set_bit(STRIPE_HANDLE, &sh->state); 2608 } 2609 } 2610 } 2611 /* now if nothing is locked, and if we have enough data, 2612 * we can start a write request 2613 */ 2614 /* since handle_stripe can be called at any time we need to handle the 2615 * case where a compute block operation has been submitted and then a 2616 * subsequent call wants to start a write request. raid_run_ops only 2617 * handles the case where compute block and reconstruct are requested 2618 * simultaneously. If this is not the case then new writes need to be 2619 * held off until the compute completes. 2620 */ 2621 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2622 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2623 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2624 schedule_reconstruction(sh, s, rcw == 0, 0); 2625 } 2626 2627 static void handle_stripe_dirtying6(raid5_conf_t *conf, 2628 struct stripe_head *sh, struct stripe_head_state *s, 2629 struct r6_state *r6s, int disks) 2630 { 2631 int rcw = 0, pd_idx = sh->pd_idx, i; 2632 int qd_idx = sh->qd_idx; 2633 2634 set_bit(STRIPE_HANDLE, &sh->state); 2635 for (i = disks; i--; ) { 2636 struct r5dev *dev = &sh->dev[i]; 2637 /* check if we haven't enough data */ 2638 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2639 i != pd_idx && i != qd_idx && 2640 !test_bit(R5_LOCKED, &dev->flags) && 2641 !(test_bit(R5_UPTODATE, &dev->flags) || 2642 test_bit(R5_Wantcompute, &dev->flags))) { 2643 rcw++; 2644 if (!test_bit(R5_Insync, &dev->flags)) 2645 continue; /* it's a failed drive */ 2646 2647 if ( 2648 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2649 pr_debug("Read_old stripe %llu " 2650 "block %d for Reconstruct\n", 2651 (unsigned long long)sh->sector, i); 2652 set_bit(R5_LOCKED, &dev->flags); 2653 set_bit(R5_Wantread, &dev->flags); 2654 s->locked++; 2655 } else { 2656 pr_debug("Request delayed stripe %llu " 2657 "block %d for Reconstruct\n", 2658 (unsigned long long)sh->sector, i); 2659 set_bit(STRIPE_DELAYED, &sh->state); 2660 set_bit(STRIPE_HANDLE, &sh->state); 2661 } 2662 } 2663 } 2664 /* now if nothing is locked, and if we have enough data, we can start a 2665 * write request 2666 */ 2667 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2668 s->locked == 0 && rcw == 0 && 2669 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2670 schedule_reconstruction(sh, s, 1, 0); 2671 } 2672 } 2673 2674 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2675 struct stripe_head_state *s, int disks) 2676 { 2677 struct r5dev *dev = NULL; 2678 2679 set_bit(STRIPE_HANDLE, &sh->state); 2680 2681 switch (sh->check_state) { 2682 case check_state_idle: 2683 /* start a new check operation if there are no failures */ 2684 if (s->failed == 0) { 2685 BUG_ON(s->uptodate != disks); 2686 sh->check_state = check_state_run; 2687 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2688 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2689 s->uptodate--; 2690 break; 2691 } 2692 dev = &sh->dev[s->failed_num]; 2693 /* fall through */ 2694 case check_state_compute_result: 2695 sh->check_state = check_state_idle; 2696 if (!dev) 2697 dev = &sh->dev[sh->pd_idx]; 
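		/* 'dev' now points either at the failed device whose data was
		 * just recomputed, or at the parity device itself; either way
		 * the block has to be written back below.
		 */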
2698 2699 /* check that a write has not made the stripe insync */ 2700 if (test_bit(STRIPE_INSYNC, &sh->state)) 2701 break; 2702 2703 /* either failed parity check, or recovery is happening */ 2704 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2705 BUG_ON(s->uptodate != disks); 2706 2707 set_bit(R5_LOCKED, &dev->flags); 2708 s->locked++; 2709 set_bit(R5_Wantwrite, &dev->flags); 2710 2711 clear_bit(STRIPE_DEGRADED, &sh->state); 2712 set_bit(STRIPE_INSYNC, &sh->state); 2713 break; 2714 case check_state_run: 2715 break; /* we will be called again upon completion */ 2716 case check_state_check_result: 2717 sh->check_state = check_state_idle; 2718 2719 /* if a failure occurred during the check operation, leave 2720 * STRIPE_INSYNC not set and let the stripe be handled again 2721 */ 2722 if (s->failed) 2723 break; 2724 2725 /* handle a successful check operation, if parity is correct 2726 * we are done. Otherwise update the mismatch count and repair 2727 * parity if !MD_RECOVERY_CHECK 2728 */ 2729 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2730 /* parity is correct (on disc, 2731 * not in buffer any more) 2732 */ 2733 set_bit(STRIPE_INSYNC, &sh->state); 2734 else { 2735 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2736 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2737 /* don't try to repair!! */ 2738 set_bit(STRIPE_INSYNC, &sh->state); 2739 else { 2740 sh->check_state = check_state_compute_run; 2741 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2742 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2743 set_bit(R5_Wantcompute, 2744 &sh->dev[sh->pd_idx].flags); 2745 sh->ops.target = sh->pd_idx; 2746 sh->ops.target2 = -1; 2747 s->uptodate++; 2748 } 2749 } 2750 break; 2751 case check_state_compute_run: 2752 break; 2753 default: 2754 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2755 __func__, sh->check_state, 2756 (unsigned long long) sh->sector); 2757 BUG(); 2758 } 2759 } 2760 2761 2762 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2763 struct stripe_head_state *s, 2764 struct r6_state *r6s, int disks) 2765 { 2766 int pd_idx = sh->pd_idx; 2767 int qd_idx = sh->qd_idx; 2768 struct r5dev *dev; 2769 2770 set_bit(STRIPE_HANDLE, &sh->state); 2771 2772 BUG_ON(s->failed > 2); 2773 2774 /* Want to check and possibly repair P and Q. 2775 * However there could be one 'failed' device, in which 2776 * case we can only check one of them, possibly using the 2777 * other to generate missing data 2778 */ 2779 2780 switch (sh->check_state) { 2781 case check_state_idle: 2782 /* start a new check operation if there are < 2 failures */ 2783 if (s->failed == r6s->q_failed) { 2784 /* The only possible failed device holds Q, so it 2785 * makes sense to check P (If anything else were failed, 2786 * we would have used P to recreate it). 
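 * To summarise how check_state is chosen below:
 *   failed == 0               - check both P and Q  (check_state_run_pq)
 *   failed == 1, Q has failed - check P only         (check_state_run)
 *   failed == 1, Q survives   - check Q only         (check_state_run_q)
 *   failed == 2               - nothing to check; fall through and write
 *                               out the recomputed blocks.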
2787 */ 2788 sh->check_state = check_state_run; 2789 } 2790 if (!r6s->q_failed && s->failed < 2) { 2791 /* Q is not failed, and we didn't use it to generate 2792 * anything, so it makes sense to check it 2793 */ 2794 if (sh->check_state == check_state_run) 2795 sh->check_state = check_state_run_pq; 2796 else 2797 sh->check_state = check_state_run_q; 2798 } 2799 2800 /* discard potentially stale zero_sum_result */ 2801 sh->ops.zero_sum_result = 0; 2802 2803 if (sh->check_state == check_state_run) { 2804 /* async_xor_zero_sum destroys the contents of P */ 2805 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2806 s->uptodate--; 2807 } 2808 if (sh->check_state >= check_state_run && 2809 sh->check_state <= check_state_run_pq) { 2810 /* async_syndrome_zero_sum preserves P and Q, so 2811 * no need to mark them !uptodate here 2812 */ 2813 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2814 break; 2815 } 2816 2817 /* we have 2-disk failure */ 2818 BUG_ON(s->failed != 2); 2819 /* fall through */ 2820 case check_state_compute_result: 2821 sh->check_state = check_state_idle; 2822 2823 /* check that a write has not made the stripe insync */ 2824 if (test_bit(STRIPE_INSYNC, &sh->state)) 2825 break; 2826 2827 /* now write out any block on a failed drive, 2828 * or P or Q if they were recomputed 2829 */ 2830 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2831 if (s->failed == 2) { 2832 dev = &sh->dev[r6s->failed_num[1]]; 2833 s->locked++; 2834 set_bit(R5_LOCKED, &dev->flags); 2835 set_bit(R5_Wantwrite, &dev->flags); 2836 } 2837 if (s->failed >= 1) { 2838 dev = &sh->dev[r6s->failed_num[0]]; 2839 s->locked++; 2840 set_bit(R5_LOCKED, &dev->flags); 2841 set_bit(R5_Wantwrite, &dev->flags); 2842 } 2843 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2844 dev = &sh->dev[pd_idx]; 2845 s->locked++; 2846 set_bit(R5_LOCKED, &dev->flags); 2847 set_bit(R5_Wantwrite, &dev->flags); 2848 } 2849 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2850 dev = &sh->dev[qd_idx]; 2851 s->locked++; 2852 set_bit(R5_LOCKED, &dev->flags); 2853 set_bit(R5_Wantwrite, &dev->flags); 2854 } 2855 clear_bit(STRIPE_DEGRADED, &sh->state); 2856 2857 set_bit(STRIPE_INSYNC, &sh->state); 2858 break; 2859 case check_state_run: 2860 case check_state_run_q: 2861 case check_state_run_pq: 2862 break; /* we will be called again upon completion */ 2863 case check_state_check_result: 2864 sh->check_state = check_state_idle; 2865 2866 /* handle a successful check operation, if parity is correct 2867 * we are done. Otherwise update the mismatch count and repair 2868 * parity if !MD_RECOVERY_CHECK 2869 */ 2870 if (sh->ops.zero_sum_result == 0) { 2871 /* both parities are correct */ 2872 if (!s->failed) 2873 set_bit(STRIPE_INSYNC, &sh->state); 2874 else { 2875 /* in contrast to the raid5 case we can validate 2876 * parity, but still have a failure to write 2877 * back 2878 */ 2879 sh->check_state = check_state_compute_result; 2880 /* Returning at this point means that we may go 2881 * off and bring p and/or q uptodate again so 2882 * we make sure to check zero_sum_result again 2883 * to verify if p or q need writeback 2884 */ 2885 } 2886 } else { 2887 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2888 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2889 /* don't try to repair!! 
*/ 2890 set_bit(STRIPE_INSYNC, &sh->state); 2891 else { 2892 int *target = &sh->ops.target; 2893 2894 sh->ops.target = -1; 2895 sh->ops.target2 = -1; 2896 sh->check_state = check_state_compute_run; 2897 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2898 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2899 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2900 set_bit(R5_Wantcompute, 2901 &sh->dev[pd_idx].flags); 2902 *target = pd_idx; 2903 target = &sh->ops.target2; 2904 s->uptodate++; 2905 } 2906 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2907 set_bit(R5_Wantcompute, 2908 &sh->dev[qd_idx].flags); 2909 *target = qd_idx; 2910 s->uptodate++; 2911 } 2912 } 2913 } 2914 break; 2915 case check_state_compute_run: 2916 break; 2917 default: 2918 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2919 __func__, sh->check_state, 2920 (unsigned long long) sh->sector); 2921 BUG(); 2922 } 2923 } 2924 2925 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2926 struct r6_state *r6s) 2927 { 2928 int i; 2929 2930 /* We have read all the blocks in this stripe and now we need to 2931 * copy some of them into a target stripe for expand. 2932 */ 2933 struct dma_async_tx_descriptor *tx = NULL; 2934 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2935 for (i = 0; i < sh->disks; i++) 2936 if (i != sh->pd_idx && i != sh->qd_idx) { 2937 int dd_idx, j; 2938 struct stripe_head *sh2; 2939 struct async_submit_ctl submit; 2940 2941 sector_t bn = compute_blocknr(sh, i, 1); 2942 sector_t s = raid5_compute_sector(conf, bn, 0, 2943 &dd_idx, NULL); 2944 sh2 = get_active_stripe(conf, s, 0, 1, 1); 2945 if (sh2 == NULL) 2946 /* so far only the early blocks of this stripe 2947 * have been requested. When later blocks 2948 * get requested, we will try again 2949 */ 2950 continue; 2951 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 2952 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 2953 /* must have already done this block */ 2954 release_stripe(sh2); 2955 continue; 2956 } 2957 2958 /* place all the copies on one channel */ 2959 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 2960 tx = async_memcpy(sh2->dev[dd_idx].page, 2961 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2962 &submit); 2963 2964 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2965 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2966 for (j = 0; j < conf->raid_disks; j++) 2967 if (j != sh2->pd_idx && 2968 (!r6s || j != sh2->qd_idx) && 2969 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2970 break; 2971 if (j == conf->raid_disks) { 2972 set_bit(STRIPE_EXPAND_READY, &sh2->state); 2973 set_bit(STRIPE_HANDLE, &sh2->state); 2974 } 2975 release_stripe(sh2); 2976 2977 } 2978 /* done submitting copies, wait for them to complete */ 2979 if (tx) { 2980 async_tx_ack(tx); 2981 dma_wait_for_async_tx(tx); 2982 } 2983 } 2984 2985 2986 /* 2987 * handle_stripe - do things to a stripe. 2988 * 2989 * We lock the stripe and then examine the state of various bits 2990 * to see what needs to be done. 2991 * Possible results: 2992 * return some read request which now have data 2993 * return some write requests which are safely on disc 2994 * schedule a read on some buffers 2995 * schedule a write of some buffers 2996 * return confirmation of parity correctness 2997 * 2998 * buffers are taken off read_list or write_list, and bh_cache buffers 2999 * get BH_Lock set before the stripe lock is released. 
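 * The real work is done in handle_stripe5() and handle_stripe6() below;
 * each summarises the per-device state into a struct stripe_head_state
 * and then dispatches to the handle_stripe_* helpers above.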
3000 * 3001 */ 3002 3003 static void handle_stripe5(struct stripe_head *sh) 3004 { 3005 raid5_conf_t *conf = sh->raid_conf; 3006 int disks = sh->disks, i; 3007 struct bio *return_bi = NULL; 3008 struct stripe_head_state s; 3009 struct r5dev *dev; 3010 mdk_rdev_t *blocked_rdev = NULL; 3011 int prexor; 3012 int dec_preread_active = 0; 3013 3014 memset(&s, 0, sizeof(s)); 3015 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 3016 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, 3017 atomic_read(&sh->count), sh->pd_idx, sh->check_state, 3018 sh->reconstruct_state); 3019 3020 spin_lock(&sh->lock); 3021 clear_bit(STRIPE_HANDLE, &sh->state); 3022 clear_bit(STRIPE_DELAYED, &sh->state); 3023 3024 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 3025 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3026 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3027 3028 /* Now to look around and see what can be done */ 3029 rcu_read_lock(); 3030 for (i=disks; i--; ) { 3031 mdk_rdev_t *rdev; 3032 3033 dev = &sh->dev[i]; 3034 3035 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 3036 "written %p\n", i, dev->flags, dev->toread, dev->read, 3037 dev->towrite, dev->written); 3038 3039 /* maybe we can request a biofill operation 3040 * 3041 * new wantfill requests are only permitted while 3042 * ops_complete_biofill is guaranteed to be inactive 3043 */ 3044 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3045 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3046 set_bit(R5_Wantfill, &dev->flags); 3047 3048 /* now count some things */ 3049 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3050 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3051 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 3052 3053 if (test_bit(R5_Wantfill, &dev->flags)) 3054 s.to_fill++; 3055 else if (dev->toread) 3056 s.to_read++; 3057 if (dev->towrite) { 3058 s.to_write++; 3059 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3060 s.non_overwrite++; 3061 } 3062 if (dev->written) 3063 s.written++; 3064 rdev = rcu_dereference(conf->disks[i].rdev); 3065 if (blocked_rdev == NULL && 3066 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3067 blocked_rdev = rdev; 3068 atomic_inc(&rdev->nr_pending); 3069 } 3070 clear_bit(R5_Insync, &dev->flags); 3071 if (!rdev) 3072 /* Not in-sync */; 3073 else if (test_bit(In_sync, &rdev->flags)) 3074 set_bit(R5_Insync, &dev->flags); 3075 else { 3076 /* could be in-sync depending on recovery/reshape status */ 3077 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3078 set_bit(R5_Insync, &dev->flags); 3079 } 3080 if (!test_bit(R5_Insync, &dev->flags)) { 3081 /* The ReadError flag will just be confusing now */ 3082 clear_bit(R5_ReadError, &dev->flags); 3083 clear_bit(R5_ReWrite, &dev->flags); 3084 } 3085 if (test_bit(R5_ReadError, &dev->flags)) 3086 clear_bit(R5_Insync, &dev->flags); 3087 if (!test_bit(R5_Insync, &dev->flags)) { 3088 s.failed++; 3089 s.failed_num = i; 3090 } 3091 } 3092 rcu_read_unlock(); 3093 3094 if (unlikely(blocked_rdev)) { 3095 if (s.syncing || s.expanding || s.expanded || 3096 s.to_write || s.written) { 3097 set_bit(STRIPE_HANDLE, &sh->state); 3098 goto unlock; 3099 } 3100 /* There is nothing for the blocked_rdev to block */ 3101 rdev_dec_pending(blocked_rdev, conf->mddev); 3102 blocked_rdev = NULL; 3103 } 3104 3105 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3106 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3107 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3108 } 3109 3110 pr_debug("locked=%d 
uptodate=%d to_read=%d" 3111 " to_write=%d failed=%d failed_num=%d\n", 3112 s.locked, s.uptodate, s.to_read, s.to_write, 3113 s.failed, s.failed_num); 3114 /* check if the array has lost two devices and, if so, some requests might 3115 * need to be failed 3116 */ 3117 if (s.failed > 1 && s.to_read+s.to_write+s.written) 3118 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3119 if (s.failed > 1 && s.syncing) { 3120 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3121 clear_bit(STRIPE_SYNCING, &sh->state); 3122 s.syncing = 0; 3123 } 3124 3125 /* might be able to return some write requests if the parity block 3126 * is safe, or on a failed drive 3127 */ 3128 dev = &sh->dev[sh->pd_idx]; 3129 if ( s.written && 3130 ((test_bit(R5_Insync, &dev->flags) && 3131 !test_bit(R5_LOCKED, &dev->flags) && 3132 test_bit(R5_UPTODATE, &dev->flags)) || 3133 (s.failed == 1 && s.failed_num == sh->pd_idx))) 3134 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3135 3136 /* Now we might consider reading some blocks, either to check/generate 3137 * parity, or to satisfy requests 3138 * or to load a block that is being partially written. 3139 */ 3140 if (s.to_read || s.non_overwrite || 3141 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3142 handle_stripe_fill5(sh, &s, disks); 3143 3144 /* Now we check to see if any write operations have recently 3145 * completed 3146 */ 3147 prexor = 0; 3148 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3149 prexor = 1; 3150 if (sh->reconstruct_state == reconstruct_state_drain_result || 3151 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3152 sh->reconstruct_state = reconstruct_state_idle; 3153 3154 /* All the 'written' buffers and the parity block are ready to 3155 * be written back to disk 3156 */ 3157 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3158 for (i = disks; i--; ) { 3159 dev = &sh->dev[i]; 3160 if (test_bit(R5_LOCKED, &dev->flags) && 3161 (i == sh->pd_idx || dev->written)) { 3162 pr_debug("Writing block %d\n", i); 3163 set_bit(R5_Wantwrite, &dev->flags); 3164 if (prexor) 3165 continue; 3166 if (!test_bit(R5_Insync, &dev->flags) || 3167 (i == sh->pd_idx && s.failed == 0)) 3168 set_bit(STRIPE_INSYNC, &sh->state); 3169 } 3170 } 3171 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3172 dec_preread_active = 1; 3173 } 3174 3175 /* Now to consider new write requests and what else, if anything 3176 * should be read. We do not handle new writes when: 3177 * 1/ A 'write' operation (copy+xor) is already in flight. 3178 * 2/ A 'check' operation is in flight, as it may clobber the parity 3179 * block. 3180 */ 3181 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3182 handle_stripe_dirtying5(conf, sh, &s, disks); 3183 3184 /* maybe we need to check and possibly fix the parity for this stripe 3185 * Any reads will already have been scheduled, so we just see if enough 3186 * data is available. The parity check is held off while parity 3187 * dependent operations are in flight. 
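 * (A new check is only started once s.locked == 0 and no compute is
 * running, so the blocks being verified are stable for the duration of
 * the check.)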
3188 */ 3189 if (sh->check_state || 3190 (s.syncing && s.locked == 0 && 3191 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3192 !test_bit(STRIPE_INSYNC, &sh->state))) 3193 handle_parity_checks5(conf, sh, &s, disks); 3194 3195 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3196 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3197 clear_bit(STRIPE_SYNCING, &sh->state); 3198 } 3199 3200 /* If the failed drive is just a ReadError, then we might need to progress 3201 * the repair/check process 3202 */ 3203 if (s.failed == 1 && !conf->mddev->ro && 3204 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) 3205 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) 3206 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) 3207 ) { 3208 dev = &sh->dev[s.failed_num]; 3209 if (!test_bit(R5_ReWrite, &dev->flags)) { 3210 set_bit(R5_Wantwrite, &dev->flags); 3211 set_bit(R5_ReWrite, &dev->flags); 3212 set_bit(R5_LOCKED, &dev->flags); 3213 s.locked++; 3214 } else { 3215 /* let's read it back */ 3216 set_bit(R5_Wantread, &dev->flags); 3217 set_bit(R5_LOCKED, &dev->flags); 3218 s.locked++; 3219 } 3220 } 3221 3222 /* Finish reconstruct operations initiated by the expansion process */ 3223 if (sh->reconstruct_state == reconstruct_state_result) { 3224 struct stripe_head *sh2 3225 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3226 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3227 /* sh cannot be written until sh2 has been read. 3228 * so arrange for sh to be delayed a little 3229 */ 3230 set_bit(STRIPE_DELAYED, &sh->state); 3231 set_bit(STRIPE_HANDLE, &sh->state); 3232 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3233 &sh2->state)) 3234 atomic_inc(&conf->preread_active_stripes); 3235 release_stripe(sh2); 3236 goto unlock; 3237 } 3238 if (sh2) 3239 release_stripe(sh2); 3240 3241 sh->reconstruct_state = reconstruct_state_idle; 3242 clear_bit(STRIPE_EXPANDING, &sh->state); 3243 for (i = conf->raid_disks; i--; ) { 3244 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3245 set_bit(R5_LOCKED, &sh->dev[i].flags); 3246 s.locked++; 3247 } 3248 } 3249 3250 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3251 !sh->reconstruct_state) { 3252 /* Need to write out all blocks after computing parity */ 3253 sh->disks = conf->raid_disks; 3254 stripe_set_idx(sh->sector, conf, 0, sh); 3255 schedule_reconstruction(sh, &s, 1, 1); 3256 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3257 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3258 atomic_dec(&conf->reshape_stripes); 3259 wake_up(&conf->wait_for_overlap); 3260 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3261 } 3262 3263 if (s.expanding && s.locked == 0 && 3264 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3265 handle_stripe_expansion(conf, sh, NULL); 3266 3267 unlock: 3268 spin_unlock(&sh->lock); 3269 3270 /* wait for this device to become unblocked */ 3271 if (unlikely(blocked_rdev)) 3272 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3273 3274 if (s.ops_request) 3275 raid_run_ops(sh, s.ops_request); 3276 3277 ops_run_io(sh, &s); 3278 3279 if (dec_preread_active) { 3280 /* We delay this until after ops_run_io so that if make_request 3281 * is waiting on a barrier, it won't continue until the writes 3282 * have actually been submitted. 
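 * Once the count drops below IO_THRESHOLD the raid5d thread is woken,
 * which allows delayed stripes to be activated again (see
 * raid5_activate_delayed()).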
3283 */ 3284 atomic_dec(&conf->preread_active_stripes); 3285 if (atomic_read(&conf->preread_active_stripes) < 3286 IO_THRESHOLD) 3287 md_wakeup_thread(conf->mddev->thread); 3288 } 3289 return_io(return_bi); 3290 } 3291 3292 static void handle_stripe6(struct stripe_head *sh) 3293 { 3294 raid5_conf_t *conf = sh->raid_conf; 3295 int disks = sh->disks; 3296 struct bio *return_bi = NULL; 3297 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; 3298 struct stripe_head_state s; 3299 struct r6_state r6s; 3300 struct r5dev *dev, *pdev, *qdev; 3301 mdk_rdev_t *blocked_rdev = NULL; 3302 int dec_preread_active = 0; 3303 3304 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3305 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3306 (unsigned long long)sh->sector, sh->state, 3307 atomic_read(&sh->count), pd_idx, qd_idx, 3308 sh->check_state, sh->reconstruct_state); 3309 memset(&s, 0, sizeof(s)); 3310 3311 spin_lock(&sh->lock); 3312 clear_bit(STRIPE_HANDLE, &sh->state); 3313 clear_bit(STRIPE_DELAYED, &sh->state); 3314 3315 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 3316 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3317 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3318 /* Now to look around and see what can be done */ 3319 3320 rcu_read_lock(); 3321 for (i=disks; i--; ) { 3322 mdk_rdev_t *rdev; 3323 dev = &sh->dev[i]; 3324 3325 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3326 i, dev->flags, dev->toread, dev->towrite, dev->written); 3327 /* maybe we can reply to a read 3328 * 3329 * new wantfill requests are only permitted while 3330 * ops_complete_biofill is guaranteed to be inactive 3331 */ 3332 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3333 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3334 set_bit(R5_Wantfill, &dev->flags); 3335 3336 /* now count some things */ 3337 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3338 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3339 if (test_bit(R5_Wantcompute, &dev->flags)) { 3340 s.compute++; 3341 BUG_ON(s.compute > 2); 3342 } 3343 3344 if (test_bit(R5_Wantfill, &dev->flags)) { 3345 s.to_fill++; 3346 } else if (dev->toread) 3347 s.to_read++; 3348 if (dev->towrite) { 3349 s.to_write++; 3350 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3351 s.non_overwrite++; 3352 } 3353 if (dev->written) 3354 s.written++; 3355 rdev = rcu_dereference(conf->disks[i].rdev); 3356 if (blocked_rdev == NULL && 3357 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3358 blocked_rdev = rdev; 3359 atomic_inc(&rdev->nr_pending); 3360 } 3361 clear_bit(R5_Insync, &dev->flags); 3362 if (!rdev) 3363 /* Not in-sync */; 3364 else if (test_bit(In_sync, &rdev->flags)) 3365 set_bit(R5_Insync, &dev->flags); 3366 else { 3367 /* in sync if before recovery_offset */ 3368 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3369 set_bit(R5_Insync, &dev->flags); 3370 } 3371 if (!test_bit(R5_Insync, &dev->flags)) { 3372 /* The ReadError flag will just be confusing now */ 3373 clear_bit(R5_ReadError, &dev->flags); 3374 clear_bit(R5_ReWrite, &dev->flags); 3375 } 3376 if (test_bit(R5_ReadError, &dev->flags)) 3377 clear_bit(R5_Insync, &dev->flags); 3378 if (!test_bit(R5_Insync, &dev->flags)) { 3379 if (s.failed < 2) 3380 r6s.failed_num[s.failed] = i; 3381 s.failed++; 3382 } 3383 } 3384 rcu_read_unlock(); 3385 3386 if (unlikely(blocked_rdev)) { 3387 if (s.syncing || s.expanding || s.expanded || 3388 s.to_write || s.written) { 3389 set_bit(STRIPE_HANDLE, &sh->state); 3390 goto unlock; 3391 } 3392 /* There is nothing for the 
blocked_rdev to block */ 3393 rdev_dec_pending(blocked_rdev, conf->mddev); 3394 blocked_rdev = NULL; 3395 } 3396 3397 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3398 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3399 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3400 } 3401 3402 pr_debug("locked=%d uptodate=%d to_read=%d" 3403 " to_write=%d failed=%d failed_num=%d,%d\n", 3404 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3405 r6s.failed_num[0], r6s.failed_num[1]); 3406 /* check if the array has lost >2 devices and, if so, some requests 3407 * might need to be failed 3408 */ 3409 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3410 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3411 if (s.failed > 2 && s.syncing) { 3412 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3413 clear_bit(STRIPE_SYNCING, &sh->state); 3414 s.syncing = 0; 3415 } 3416 3417 /* 3418 * might be able to return some write requests if the parity blocks 3419 * are safe, or on a failed drive 3420 */ 3421 pdev = &sh->dev[pd_idx]; 3422 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3423 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3424 qdev = &sh->dev[qd_idx]; 3425 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) 3426 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); 3427 3428 if ( s.written && 3429 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3430 && !test_bit(R5_LOCKED, &pdev->flags) 3431 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3432 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3433 && !test_bit(R5_LOCKED, &qdev->flags) 3434 && test_bit(R5_UPTODATE, &qdev->flags))))) 3435 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3436 3437 /* Now we might consider reading some blocks, either to check/generate 3438 * parity, or to satisfy requests 3439 * or to load a block that is being partially written. 3440 */ 3441 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3442 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3443 handle_stripe_fill6(sh, &s, &r6s, disks); 3444 3445 /* Now we check to see if any write operations have recently 3446 * completed 3447 */ 3448 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3449 3450 sh->reconstruct_state = reconstruct_state_idle; 3451 /* All the 'written' buffers and the parity blocks are ready to 3452 * be written back to disk 3453 */ 3454 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3455 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); 3456 for (i = disks; i--; ) { 3457 dev = &sh->dev[i]; 3458 if (test_bit(R5_LOCKED, &dev->flags) && 3459 (i == sh->pd_idx || i == qd_idx || 3460 dev->written)) { 3461 pr_debug("Writing block %d\n", i); 3462 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3463 set_bit(R5_Wantwrite, &dev->flags); 3464 if (!test_bit(R5_Insync, &dev->flags) || 3465 ((i == sh->pd_idx || i == qd_idx) && 3466 s.failed == 0)) 3467 set_bit(STRIPE_INSYNC, &sh->state); 3468 } 3469 } 3470 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3471 dec_preread_active = 1; 3472 } 3473 3474 /* Now to consider new write requests and what else, if anything 3475 * should be read. We do not handle new writes when: 3476 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. 3477 * 2/ A 'check' operation is in flight, as it may clobber the parity 3478 * block. 
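 * Note that raid6 has no read-modify-write path: handle_stripe_dirtying6()
 * only counts the blocks missing for a reconstruct-write, reads them in,
 * and then schedules a full parity reconstruction.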
3479 */ 3480 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3481 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3482 3483 /* maybe we need to check and possibly fix the parity for this stripe 3484 * Any reads will already have been scheduled, so we just see if enough 3485 * data is available. The parity check is held off while parity 3486 * dependent operations are in flight. 3487 */ 3488 if (sh->check_state || 3489 (s.syncing && s.locked == 0 && 3490 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3491 !test_bit(STRIPE_INSYNC, &sh->state))) 3492 handle_parity_checks6(conf, sh, &s, &r6s, disks); 3493 3494 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3495 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3496 clear_bit(STRIPE_SYNCING, &sh->state); 3497 } 3498 3499 /* If the failed drives are just a ReadError, then we might need 3500 * to progress the repair/check process 3501 */ 3502 if (s.failed <= 2 && !conf->mddev->ro) 3503 for (i = 0; i < s.failed; i++) { 3504 dev = &sh->dev[r6s.failed_num[i]]; 3505 if (test_bit(R5_ReadError, &dev->flags) 3506 && !test_bit(R5_LOCKED, &dev->flags) 3507 && test_bit(R5_UPTODATE, &dev->flags) 3508 ) { 3509 if (!test_bit(R5_ReWrite, &dev->flags)) { 3510 set_bit(R5_Wantwrite, &dev->flags); 3511 set_bit(R5_ReWrite, &dev->flags); 3512 set_bit(R5_LOCKED, &dev->flags); 3513 s.locked++; 3514 } else { 3515 /* let's read it back */ 3516 set_bit(R5_Wantread, &dev->flags); 3517 set_bit(R5_LOCKED, &dev->flags); 3518 s.locked++; 3519 } 3520 } 3521 } 3522 3523 /* Finish reconstruct operations initiated by the expansion process */ 3524 if (sh->reconstruct_state == reconstruct_state_result) { 3525 sh->reconstruct_state = reconstruct_state_idle; 3526 clear_bit(STRIPE_EXPANDING, &sh->state); 3527 for (i = conf->raid_disks; i--; ) { 3528 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3529 set_bit(R5_LOCKED, &sh->dev[i].flags); 3530 s.locked++; 3531 } 3532 } 3533 3534 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3535 !sh->reconstruct_state) { 3536 struct stripe_head *sh2 3537 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3538 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3539 /* sh cannot be written until sh2 has been read. 
3540 * so arrange for sh to be delayed a little 3541 */ 3542 set_bit(STRIPE_DELAYED, &sh->state); 3543 set_bit(STRIPE_HANDLE, &sh->state); 3544 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3545 &sh2->state)) 3546 atomic_inc(&conf->preread_active_stripes); 3547 release_stripe(sh2); 3548 goto unlock; 3549 } 3550 if (sh2) 3551 release_stripe(sh2); 3552 3553 /* Need to write out all blocks after computing P&Q */ 3554 sh->disks = conf->raid_disks; 3555 stripe_set_idx(sh->sector, conf, 0, sh); 3556 schedule_reconstruction(sh, &s, 1, 1); 3557 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3558 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3559 atomic_dec(&conf->reshape_stripes); 3560 wake_up(&conf->wait_for_overlap); 3561 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3562 } 3563 3564 if (s.expanding && s.locked == 0 && 3565 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3566 handle_stripe_expansion(conf, sh, &r6s); 3567 3568 unlock: 3569 spin_unlock(&sh->lock); 3570 3571 /* wait for this device to become unblocked */ 3572 if (unlikely(blocked_rdev)) 3573 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3574 3575 if (s.ops_request) 3576 raid_run_ops(sh, s.ops_request); 3577 3578 ops_run_io(sh, &s); 3579 3580 3581 if (dec_preread_active) { 3582 /* We delay this until after ops_run_io so that if make_request 3583 * is waiting on a barrier, it won't continue until the writes 3584 * have actually been submitted. 3585 */ 3586 atomic_dec(&conf->preread_active_stripes); 3587 if (atomic_read(&conf->preread_active_stripes) < 3588 IO_THRESHOLD) 3589 md_wakeup_thread(conf->mddev->thread); 3590 } 3591 3592 return_io(return_bi); 3593 } 3594 3595 static void handle_stripe(struct stripe_head *sh) 3596 { 3597 if (sh->raid_conf->level == 6) 3598 handle_stripe6(sh); 3599 else 3600 handle_stripe5(sh); 3601 } 3602 3603 static void raid5_activate_delayed(raid5_conf_t *conf) 3604 { 3605 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3606 while (!list_empty(&conf->delayed_list)) { 3607 struct list_head *l = conf->delayed_list.next; 3608 struct stripe_head *sh; 3609 sh = list_entry(l, struct stripe_head, lru); 3610 list_del_init(l); 3611 clear_bit(STRIPE_DELAYED, &sh->state); 3612 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3613 atomic_inc(&conf->preread_active_stripes); 3614 list_add_tail(&sh->lru, &conf->hold_list); 3615 } 3616 } else 3617 blk_plug_device(conf->mddev->queue); 3618 } 3619 3620 static void activate_bit_delay(raid5_conf_t *conf) 3621 { 3622 /* device_lock is held */ 3623 struct list_head head; 3624 list_add(&head, &conf->bitmap_list); 3625 list_del_init(&conf->bitmap_list); 3626 while (!list_empty(&head)) { 3627 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3628 list_del_init(&sh->lru); 3629 atomic_inc(&sh->count); 3630 __release_stripe(conf, sh); 3631 } 3632 } 3633 3634 static void unplug_slaves(mddev_t *mddev) 3635 { 3636 raid5_conf_t *conf = mddev->private; 3637 int i; 3638 int devs = max(conf->raid_disks, conf->previous_raid_disks); 3639 3640 rcu_read_lock(); 3641 for (i = 0; i < devs; i++) { 3642 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 3643 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 3644 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); 3645 3646 atomic_inc(&rdev->nr_pending); 3647 rcu_read_unlock(); 3648 3649 blk_unplug(r_queue); 3650 3651 rdev_dec_pending(rdev, mddev); 3652 rcu_read_lock(); 3653 } 3654 } 3655 rcu_read_unlock(); 3656 } 3657 3658 static void 
raid5_unplug_device(struct request_queue *q) 3659 { 3660 mddev_t *mddev = q->queuedata; 3661 raid5_conf_t *conf = mddev->private; 3662 unsigned long flags; 3663 3664 spin_lock_irqsave(&conf->device_lock, flags); 3665 3666 if (blk_remove_plug(q)) { 3667 conf->seq_flush++; 3668 raid5_activate_delayed(conf); 3669 } 3670 md_wakeup_thread(mddev->thread); 3671 3672 spin_unlock_irqrestore(&conf->device_lock, flags); 3673 3674 unplug_slaves(mddev); 3675 } 3676 3677 static int raid5_congested(void *data, int bits) 3678 { 3679 mddev_t *mddev = data; 3680 raid5_conf_t *conf = mddev->private; 3681 3682 /* No difference between reads and writes. Just check 3683 * how busy the stripe_cache is 3684 */ 3685 3686 if (mddev_congested(mddev, bits)) 3687 return 1; 3688 if (conf->inactive_blocked) 3689 return 1; 3690 if (conf->quiesce) 3691 return 1; 3692 if (list_empty_careful(&conf->inactive_list)) 3693 return 1; 3694 3695 return 0; 3696 } 3697 3698 /* We want read requests to align with chunks where possible, 3699 * but write requests don't need to. 3700 */ 3701 static int raid5_mergeable_bvec(struct request_queue *q, 3702 struct bvec_merge_data *bvm, 3703 struct bio_vec *biovec) 3704 { 3705 mddev_t *mddev = q->queuedata; 3706 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3707 int max; 3708 unsigned int chunk_sectors = mddev->chunk_sectors; 3709 unsigned int bio_sectors = bvm->bi_size >> 9; 3710 3711 if ((bvm->bi_rw & 1) == WRITE) 3712 return biovec->bv_len; /* always allow writes to be mergeable */ 3713 3714 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3715 chunk_sectors = mddev->new_chunk_sectors; 3716 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3717 if (max < 0) max = 0; 3718 if (max <= biovec->bv_len && bio_sectors == 0) 3719 return biovec->bv_len; 3720 else 3721 return max; 3722 } 3723 3724 3725 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) 3726 { 3727 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3728 unsigned int chunk_sectors = mddev->chunk_sectors; 3729 unsigned int bio_sectors = bio->bi_size >> 9; 3730 3731 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3732 chunk_sectors = mddev->new_chunk_sectors; 3733 return chunk_sectors >= 3734 ((sector & (chunk_sectors - 1)) + bio_sectors); 3735 } 3736 3737 /* 3738 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3739 * later sampled by raid5d. 
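 * The list is a simple bi_next chain protected by device_lock;
 * remove_bio_from_retry() below pops entries one at a time and resets
 * bi_phys_segments so the bio can be resubmitted through the normal
 * stripe-cache path.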
3740 */ 3741 static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) 3742 { 3743 unsigned long flags; 3744 3745 spin_lock_irqsave(&conf->device_lock, flags); 3746 3747 bi->bi_next = conf->retry_read_aligned_list; 3748 conf->retry_read_aligned_list = bi; 3749 3750 spin_unlock_irqrestore(&conf->device_lock, flags); 3751 md_wakeup_thread(conf->mddev->thread); 3752 } 3753 3754 3755 static struct bio *remove_bio_from_retry(raid5_conf_t *conf) 3756 { 3757 struct bio *bi; 3758 3759 bi = conf->retry_read_aligned; 3760 if (bi) { 3761 conf->retry_read_aligned = NULL; 3762 return bi; 3763 } 3764 bi = conf->retry_read_aligned_list; 3765 if(bi) { 3766 conf->retry_read_aligned_list = bi->bi_next; 3767 bi->bi_next = NULL; 3768 /* 3769 * this sets the active strip count to 1 and the processed 3770 * strip count to zero (upper 8 bits) 3771 */ 3772 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3773 } 3774 3775 return bi; 3776 } 3777 3778 3779 /* 3780 * The "raid5_align_endio" should check if the read succeeded and if it 3781 * did, call bio_endio on the original bio (having bio_put the new bio 3782 * first). 3783 * If the read failed.. 3784 */ 3785 static void raid5_align_endio(struct bio *bi, int error) 3786 { 3787 struct bio* raid_bi = bi->bi_private; 3788 mddev_t *mddev; 3789 raid5_conf_t *conf; 3790 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3791 mdk_rdev_t *rdev; 3792 3793 bio_put(bi); 3794 3795 rdev = (void*)raid_bi->bi_next; 3796 raid_bi->bi_next = NULL; 3797 mddev = rdev->mddev; 3798 conf = mddev->private; 3799 3800 rdev_dec_pending(rdev, conf->mddev); 3801 3802 if (!error && uptodate) { 3803 bio_endio(raid_bi, 0); 3804 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3805 wake_up(&conf->wait_for_stripe); 3806 return; 3807 } 3808 3809 3810 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3811 3812 add_bio_to_retry(raid_bi, conf); 3813 } 3814 3815 static int bio_fits_rdev(struct bio *bi) 3816 { 3817 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3818 3819 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3820 return 0; 3821 blk_recount_segments(q, bi); 3822 if (bi->bi_phys_segments > queue_max_segments(q)) 3823 return 0; 3824 3825 if (q->merge_bvec_fn) 3826 /* it's too hard to apply the merge_bvec_fn at this stage, 3827 * just just give up 3828 */ 3829 return 0; 3830 3831 return 1; 3832 } 3833 3834 3835 static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) 3836 { 3837 raid5_conf_t *conf = mddev->private; 3838 int dd_idx; 3839 struct bio* align_bi; 3840 mdk_rdev_t *rdev; 3841 3842 if (!in_chunk_boundary(mddev, raid_bio)) { 3843 pr_debug("chunk_aligned_read : non aligned\n"); 3844 return 0; 3845 } 3846 /* 3847 * use bio_clone to make a copy of the bio 3848 */ 3849 align_bi = bio_clone(raid_bio, GFP_NOIO); 3850 if (!align_bi) 3851 return 0; 3852 /* 3853 * set bi_end_io to a new function, and set bi_private to the 3854 * original bio. 
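 * (Illustrative note: bi_private lets raid5_align_endio() find and
 * complete the original bio, while the chosen rdev is stashed - slightly
 * unusually - in the original bio's bi_next pointer:
 *
 *	raid_bio->bi_next = (void *)rdev;
 *
 * raid5_align_endio() casts it back in order to drop the rdev reference
 * once the read completes.)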
3855 */ 3856 align_bi->bi_end_io = raid5_align_endio; 3857 align_bi->bi_private = raid_bio; 3858 /* 3859 * compute position 3860 */ 3861 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3862 0, 3863 &dd_idx, NULL); 3864 3865 rcu_read_lock(); 3866 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3867 if (rdev && test_bit(In_sync, &rdev->flags)) { 3868 atomic_inc(&rdev->nr_pending); 3869 rcu_read_unlock(); 3870 raid_bio->bi_next = (void*)rdev; 3871 align_bi->bi_bdev = rdev->bdev; 3872 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3873 align_bi->bi_sector += rdev->data_offset; 3874 3875 if (!bio_fits_rdev(align_bi)) { 3876 /* too big in some way */ 3877 bio_put(align_bi); 3878 rdev_dec_pending(rdev, mddev); 3879 return 0; 3880 } 3881 3882 spin_lock_irq(&conf->device_lock); 3883 wait_event_lock_irq(conf->wait_for_stripe, 3884 conf->quiesce == 0, 3885 conf->device_lock, /* nothing */); 3886 atomic_inc(&conf->active_aligned_reads); 3887 spin_unlock_irq(&conf->device_lock); 3888 3889 generic_make_request(align_bi); 3890 return 1; 3891 } else { 3892 rcu_read_unlock(); 3893 bio_put(align_bi); 3894 return 0; 3895 } 3896 } 3897 3898 /* __get_priority_stripe - get the next stripe to process 3899 * 3900 * Full stripe writes are allowed to pass preread active stripes up until 3901 * the bypass_threshold is exceeded. In general the bypass_count 3902 * increments when the handle_list is handled before the hold_list; however, it 3903 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3904 * stripe with in flight i/o. The bypass_count will be reset when the 3905 * head of the hold_list has changed, i.e. the head was promoted to the 3906 * handle_list. 3907 */ 3908 static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) 3909 { 3910 struct stripe_head *sh; 3911 3912 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3913 __func__, 3914 list_empty(&conf->handle_list) ? "empty" : "busy", 3915 list_empty(&conf->hold_list) ? "empty" : "busy", 3916 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3917 3918 if (!list_empty(&conf->handle_list)) { 3919 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3920 3921 if (list_empty(&conf->hold_list)) 3922 conf->bypass_count = 0; 3923 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3924 if (conf->hold_list.next == conf->last_hold) 3925 conf->bypass_count++; 3926 else { 3927 conf->last_hold = conf->hold_list.next; 3928 conf->bypass_count -= conf->bypass_threshold; 3929 if (conf->bypass_count < 0) 3930 conf->bypass_count = 0; 3931 } 3932 } 3933 } else if (!list_empty(&conf->hold_list) && 3934 ((conf->bypass_threshold && 3935 conf->bypass_count > conf->bypass_threshold) || 3936 atomic_read(&conf->pending_full_writes) == 0)) { 3937 sh = list_entry(conf->hold_list.next, 3938 typeof(*sh), lru); 3939 conf->bypass_count -= conf->bypass_threshold; 3940 if (conf->bypass_count < 0) 3941 conf->bypass_count = 0; 3942 } else 3943 return NULL; 3944 3945 list_del_init(&sh->lru); 3946 atomic_inc(&sh->count); 3947 BUG_ON(atomic_read(&sh->count) != 1); 3948 return sh; 3949 } 3950 3951 static int make_request(mddev_t *mddev, struct bio * bi) 3952 { 3953 raid5_conf_t *conf = mddev->private; 3954 int dd_idx; 3955 sector_t new_sector; 3956 sector_t logical_sector, last_sector; 3957 struct stripe_head *sh; 3958 const int rw = bio_data_dir(bi); 3959 int remaining; 3960 3961 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { 3962 /* Drain all pending writes. 
We only really need 3963 * to ensure they have been submitted, but this is 3964 * easier. 3965 */ 3966 mddev->pers->quiesce(mddev, 1); 3967 mddev->pers->quiesce(mddev, 0); 3968 md_barrier_request(mddev, bi); 3969 return 0; 3970 } 3971 3972 md_write_start(mddev, bi); 3973 3974 if (rw == READ && 3975 mddev->reshape_position == MaxSector && 3976 chunk_aligned_read(mddev,bi)) 3977 return 0; 3978 3979 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3980 last_sector = bi->bi_sector + (bi->bi_size>>9); 3981 bi->bi_next = NULL; 3982 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3983 3984 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3985 DEFINE_WAIT(w); 3986 int disks, data_disks; 3987 int previous; 3988 3989 retry: 3990 previous = 0; 3991 disks = conf->raid_disks; 3992 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3993 if (unlikely(conf->reshape_progress != MaxSector)) { 3994 /* spinlock is needed as reshape_progress may be 3995 * 64bit on a 32bit platform, and so it might be 3996 * possible to see a half-updated value 3997 * Ofcourse reshape_progress could change after 3998 * the lock is dropped, so once we get a reference 3999 * to the stripe that we think it is, we will have 4000 * to check again. 4001 */ 4002 spin_lock_irq(&conf->device_lock); 4003 if (mddev->delta_disks < 0 4004 ? logical_sector < conf->reshape_progress 4005 : logical_sector >= conf->reshape_progress) { 4006 disks = conf->previous_raid_disks; 4007 previous = 1; 4008 } else { 4009 if (mddev->delta_disks < 0 4010 ? logical_sector < conf->reshape_safe 4011 : logical_sector >= conf->reshape_safe) { 4012 spin_unlock_irq(&conf->device_lock); 4013 schedule(); 4014 goto retry; 4015 } 4016 } 4017 spin_unlock_irq(&conf->device_lock); 4018 } 4019 data_disks = disks - conf->max_degraded; 4020 4021 new_sector = raid5_compute_sector(conf, logical_sector, 4022 previous, 4023 &dd_idx, NULL); 4024 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4025 (unsigned long long)new_sector, 4026 (unsigned long long)logical_sector); 4027 4028 sh = get_active_stripe(conf, new_sector, previous, 4029 (bi->bi_rw&RWA_MASK), 0); 4030 if (sh) { 4031 if (unlikely(previous)) { 4032 /* expansion might have moved on while waiting for a 4033 * stripe, so we must do the range check again. 4034 * Expansion could still move past after this 4035 * test, but as we are holding a reference to 4036 * 'sh', we know that if that happens, 4037 * STRIPE_EXPANDING will get set and the expansion 4038 * won't proceed until we finish with the stripe. 4039 */ 4040 int must_retry = 0; 4041 spin_lock_irq(&conf->device_lock); 4042 if (mddev->delta_disks < 0 4043 ? logical_sector >= conf->reshape_progress 4044 : logical_sector < conf->reshape_progress) 4045 /* mismatch, need to try again */ 4046 must_retry = 1; 4047 spin_unlock_irq(&conf->device_lock); 4048 if (must_retry) { 4049 release_stripe(sh); 4050 schedule(); 4051 goto retry; 4052 } 4053 } 4054 4055 if (bio_data_dir(bi) == WRITE && 4056 logical_sector >= mddev->suspend_lo && 4057 logical_sector < mddev->suspend_hi) { 4058 release_stripe(sh); 4059 /* As the suspend_* range is controlled by 4060 * userspace, we want an interruptible 4061 * wait. 
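 * (Illustrative note: suspend_lo/suspend_hi are driven from user space -
 * md exposes them as sysfs attributes - so the range can move at any
 * moment.  The interruptible sleep below, together with the re-check of
 * the range after waking, ensures that either a change to the range or a
 * signal gets this write moving again instead of sleeping forever.)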
4062 */ 4063 flush_signals(current); 4064 prepare_to_wait(&conf->wait_for_overlap, 4065 &w, TASK_INTERRUPTIBLE); 4066 if (logical_sector >= mddev->suspend_lo && 4067 logical_sector < mddev->suspend_hi) 4068 schedule(); 4069 goto retry; 4070 } 4071 4072 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4073 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 4074 /* Stripe is busy expanding or 4075 * add failed due to overlap. Flush everything 4076 * and wait a while 4077 */ 4078 raid5_unplug_device(mddev->queue); 4079 release_stripe(sh); 4080 schedule(); 4081 goto retry; 4082 } 4083 finish_wait(&conf->wait_for_overlap, &w); 4084 set_bit(STRIPE_HANDLE, &sh->state); 4085 clear_bit(STRIPE_DELAYED, &sh->state); 4086 if (mddev->barrier && 4087 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4088 atomic_inc(&conf->preread_active_stripes); 4089 release_stripe(sh); 4090 } else { 4091 /* cannot get stripe for read-ahead, just give-up */ 4092 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4093 finish_wait(&conf->wait_for_overlap, &w); 4094 break; 4095 } 4096 4097 } 4098 spin_lock_irq(&conf->device_lock); 4099 remaining = raid5_dec_bi_phys_segments(bi); 4100 spin_unlock_irq(&conf->device_lock); 4101 if (remaining == 0) { 4102 4103 if ( rw == WRITE ) 4104 md_write_end(mddev); 4105 4106 bio_endio(bi, 0); 4107 } 4108 4109 if (mddev->barrier) { 4110 /* We need to wait for the stripes to all be handled. 4111 * So: wait for preread_active_stripes to drop to 0. 4112 */ 4113 wait_event(mddev->thread->wqueue, 4114 atomic_read(&conf->preread_active_stripes) == 0); 4115 } 4116 return 0; 4117 } 4118 4119 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); 4120 4121 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) 4122 { 4123 /* reshaping is quite different to recovery/resync so it is 4124 * handled quite separately ... here. 4125 * 4126 * On each call to sync_request, we gather one chunk worth of 4127 * destination stripes and flag them as expanding. 4128 * Then we find all the source stripes and request reads. 4129 * As the reads complete, handle_stripe will copy the data 4130 * into the destination stripe and release that stripe. 4131 */ 4132 raid5_conf_t *conf = mddev->private; 4133 struct stripe_head *sh; 4134 sector_t first_sector, last_sector; 4135 int raid_disks = conf->previous_raid_disks; 4136 int data_disks = raid_disks - conf->max_degraded; 4137 int new_data_disks = conf->raid_disks - conf->max_degraded; 4138 int i; 4139 int dd_idx; 4140 sector_t writepos, readpos, safepos; 4141 sector_t stripe_addr; 4142 int reshape_sectors; 4143 struct list_head stripes; 4144 4145 if (sector_nr == 0) { 4146 /* If restarting in the middle, skip the initial sectors */ 4147 if (mddev->delta_disks < 0 && 4148 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4149 sector_nr = raid5_size(mddev, 0, 0) 4150 - conf->reshape_progress; 4151 } else if (mddev->delta_disks >= 0 && 4152 conf->reshape_progress > 0) 4153 sector_nr = conf->reshape_progress; 4154 sector_div(sector_nr, new_data_disks); 4155 if (sector_nr) { 4156 mddev->curr_resync_completed = sector_nr; 4157 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4158 *skipped = 1; 4159 return sector_nr; 4160 } 4161 } 4162 4163 /* We need to process a full chunk at a time. 
4164 * If old and new chunk sizes differ, we need to process the 4165 * largest of these 4166 */ 4167 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4168 reshape_sectors = mddev->new_chunk_sectors; 4169 else 4170 reshape_sectors = mddev->chunk_sectors; 4171 4172 /* we update the metadata when there is more than 3Meg 4173 * in the block range (that is rather arbitrary, should 4174 * probably be time based) or when the data about to be 4175 * copied would over-write the source of the data at 4176 * the front of the range. 4177 * i.e. one new_stripe along from reshape_progress new_maps 4178 * to after where reshape_safe old_maps to 4179 */ 4180 writepos = conf->reshape_progress; 4181 sector_div(writepos, new_data_disks); 4182 readpos = conf->reshape_progress; 4183 sector_div(readpos, data_disks); 4184 safepos = conf->reshape_safe; 4185 sector_div(safepos, data_disks); 4186 if (mddev->delta_disks < 0) { 4187 writepos -= min_t(sector_t, reshape_sectors, writepos); 4188 readpos += reshape_sectors; 4189 safepos += reshape_sectors; 4190 } else { 4191 writepos += reshape_sectors; 4192 readpos -= min_t(sector_t, reshape_sectors, readpos); 4193 safepos -= min_t(sector_t, reshape_sectors, safepos); 4194 } 4195 4196 /* 'writepos' is the most advanced device address we might write. 4197 * 'readpos' is the least advanced device address we might read. 4198 * 'safepos' is the least address recorded in the metadata as having 4199 * been reshaped. 4200 * If 'readpos' is behind 'writepos', then there is no way that we can 4201 * ensure safety in the face of a crash - that must be done by userspace 4202 * making a backup of the data. So in that case there is no particular 4203 * rush to update metadata. 4204 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4205 * update the metadata to advance 'safepos' to match 'readpos' so that 4206 * we can be safe in the event of a crash. 4207 * So we insist on updating metadata if safepos is behind writepos and 4208 * readpos is beyond writepos. 4209 * In any case, update the metadata every 10 seconds. 4210 * Maybe that number should be configurable, but I'm not sure it is 4211 * worth it.... maybe it could be a multiple of safemode_delay??? 4212 */ 4213 if ((mddev->delta_disks < 0 4214 ? (safepos > writepos && readpos < writepos) 4215 : (safepos < writepos && readpos > writepos)) || 4216 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4217 /* Cannot proceed until we've updated the superblock... 
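 *
 * (For orientation, the checkpoint below boils down to:
 *
 *	wait_event(conf->wait_for_overlap,
 *		   atomic_read(&conf->reshape_stripes) == 0);
 *	mddev->reshape_position = conf->reshape_progress;
 *	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 *	wait_event(mddev->sb_wait, ...);	- superblock hits disk
 *	conf->reshape_safe = mddev->reshape_position;
 *
 * so reshape_safe only advances once the new reshape_position is known
 * to be on stable storage.)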
*/ 4218 wait_event(conf->wait_for_overlap, 4219 atomic_read(&conf->reshape_stripes)==0); 4220 mddev->reshape_position = conf->reshape_progress; 4221 mddev->curr_resync_completed = mddev->curr_resync; 4222 conf->reshape_checkpoint = jiffies; 4223 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4224 md_wakeup_thread(mddev->thread); 4225 wait_event(mddev->sb_wait, mddev->flags == 0 || 4226 kthread_should_stop()); 4227 spin_lock_irq(&conf->device_lock); 4228 conf->reshape_safe = mddev->reshape_position; 4229 spin_unlock_irq(&conf->device_lock); 4230 wake_up(&conf->wait_for_overlap); 4231 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4232 } 4233 4234 if (mddev->delta_disks < 0) { 4235 BUG_ON(conf->reshape_progress == 0); 4236 stripe_addr = writepos; 4237 BUG_ON((mddev->dev_sectors & 4238 ~((sector_t)reshape_sectors - 1)) 4239 - reshape_sectors - stripe_addr 4240 != sector_nr); 4241 } else { 4242 BUG_ON(writepos != sector_nr + reshape_sectors); 4243 stripe_addr = sector_nr; 4244 } 4245 INIT_LIST_HEAD(&stripes); 4246 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4247 int j; 4248 int skipped_disk = 0; 4249 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4250 set_bit(STRIPE_EXPANDING, &sh->state); 4251 atomic_inc(&conf->reshape_stripes); 4252 /* If any of this stripe is beyond the end of the old 4253 * array, then we need to zero those blocks 4254 */ 4255 for (j=sh->disks; j--;) { 4256 sector_t s; 4257 if (j == sh->pd_idx) 4258 continue; 4259 if (conf->level == 6 && 4260 j == sh->qd_idx) 4261 continue; 4262 s = compute_blocknr(sh, j, 0); 4263 if (s < raid5_size(mddev, 0, 0)) { 4264 skipped_disk = 1; 4265 continue; 4266 } 4267 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4268 set_bit(R5_Expanded, &sh->dev[j].flags); 4269 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4270 } 4271 if (!skipped_disk) { 4272 set_bit(STRIPE_EXPAND_READY, &sh->state); 4273 set_bit(STRIPE_HANDLE, &sh->state); 4274 } 4275 list_add(&sh->lru, &stripes); 4276 } 4277 spin_lock_irq(&conf->device_lock); 4278 if (mddev->delta_disks < 0) 4279 conf->reshape_progress -= reshape_sectors * new_data_disks; 4280 else 4281 conf->reshape_progress += reshape_sectors * new_data_disks; 4282 spin_unlock_irq(&conf->device_lock); 4283 /* Ok, those stripe are ready. We can start scheduling 4284 * reads on the source stripes. 4285 * The source stripes are determined by mapping the first and last 4286 * block on the destination stripes. 4287 */ 4288 first_sector = 4289 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4290 1, &dd_idx, NULL); 4291 last_sector = 4292 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4293 * new_data_disks - 1), 4294 1, &dd_idx, NULL); 4295 if (last_sector >= mddev->dev_sectors) 4296 last_sector = mddev->dev_sectors - 1; 4297 while (first_sector <= last_sector) { 4298 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4299 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4300 set_bit(STRIPE_HANDLE, &sh->state); 4301 release_stripe(sh); 4302 first_sector += STRIPE_SECTORS; 4303 } 4304 /* Now that the sources are clearly marked, we can release 4305 * the destination stripes 4306 */ 4307 while (!list_empty(&stripes)) { 4308 sh = list_entry(stripes.next, struct stripe_head, lru); 4309 list_del_init(&sh->lru); 4310 release_stripe(sh); 4311 } 4312 /* If this takes us to the resync_max point where we have to pause, 4313 * then we need to write out the superblock. 
4314 */ 4315 sector_nr += reshape_sectors; 4316 if ((sector_nr - mddev->curr_resync_completed) * 2 4317 >= mddev->resync_max - mddev->curr_resync_completed) { 4318 /* Cannot proceed until we've updated the superblock... */ 4319 wait_event(conf->wait_for_overlap, 4320 atomic_read(&conf->reshape_stripes) == 0); 4321 mddev->reshape_position = conf->reshape_progress; 4322 mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; 4323 conf->reshape_checkpoint = jiffies; 4324 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4325 md_wakeup_thread(mddev->thread); 4326 wait_event(mddev->sb_wait, 4327 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4328 || kthread_should_stop()); 4329 spin_lock_irq(&conf->device_lock); 4330 conf->reshape_safe = mddev->reshape_position; 4331 spin_unlock_irq(&conf->device_lock); 4332 wake_up(&conf->wait_for_overlap); 4333 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4334 } 4335 return reshape_sectors; 4336 } 4337 4338 /* FIXME go_faster isn't used */ 4339 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4340 { 4341 raid5_conf_t *conf = mddev->private; 4342 struct stripe_head *sh; 4343 sector_t max_sector = mddev->dev_sectors; 4344 int sync_blocks; 4345 int still_degraded = 0; 4346 int i; 4347 4348 if (sector_nr >= max_sector) { 4349 /* just being told to finish up .. nothing much to do */ 4350 unplug_slaves(mddev); 4351 4352 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4353 end_reshape(conf); 4354 return 0; 4355 } 4356 4357 if (mddev->curr_resync < max_sector) /* aborted */ 4358 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4359 &sync_blocks, 1); 4360 else /* completed sync */ 4361 conf->fullsync = 0; 4362 bitmap_close_sync(mddev->bitmap); 4363 4364 return 0; 4365 } 4366 4367 /* Allow raid5_quiesce to complete */ 4368 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4369 4370 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4371 return reshape_request(mddev, sector_nr, skipped); 4372 4373 /* No need to check resync_max as we never do more than one 4374 * stripe, and as resync_max will always be on a chunk boundary, 4375 * if the check in md_do_sync didn't fire, there is no chance 4376 * of overstepping resync_max here 4377 */ 4378 4379 /* if there is too many failed drives and we are trying 4380 * to resync, then assert that we are finished, because there is 4381 * nothing we can do. 4382 */ 4383 if (mddev->degraded >= conf->max_degraded && 4384 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4385 sector_t rv = mddev->dev_sectors - sector_nr; 4386 *skipped = 1; 4387 return rv; 4388 } 4389 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4390 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4391 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4392 /* we can skip this block, and probably more */ 4393 sync_blocks /= STRIPE_SECTORS; 4394 *skipped = 1; 4395 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4396 } 4397 4398 4399 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4400 4401 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4402 if (sh == NULL) { 4403 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4404 /* make sure we don't swamp the stripe cache if someone else 4405 * is trying to get access 4406 */ 4407 schedule_timeout_uninterruptible(1); 4408 } 4409 /* Need to check if array will still be degraded after recovery/resync 4410 * We don't need to check the 'failed' flag as when that gets set, 4411 * recovery aborts. 
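 * (Illustrative note: the loop below only looks for empty slots; the
 * result is fed to bitmap_start_sync() as its 'degraded' argument, which
 * - as I read the bitmap code - keeps the "needed" state on this range so
 * that a degraded resync cannot clear intent bits that a later full
 * recovery will still want.)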
4412 */ 4413 for (i = 0; i < conf->raid_disks; i++) 4414 if (conf->disks[i].rdev == NULL) 4415 still_degraded = 1; 4416 4417 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4418 4419 spin_lock(&sh->lock); 4420 set_bit(STRIPE_SYNCING, &sh->state); 4421 clear_bit(STRIPE_INSYNC, &sh->state); 4422 spin_unlock(&sh->lock); 4423 4424 handle_stripe(sh); 4425 release_stripe(sh); 4426 4427 return STRIPE_SECTORS; 4428 } 4429 4430 static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) 4431 { 4432 /* We may not be able to submit a whole bio at once as there 4433 * may not be enough stripe_heads available. 4434 * We cannot pre-allocate enough stripe_heads as we may need 4435 * more than exist in the cache (if we allow ever large chunks). 4436 * So we do one stripe head at a time and record in 4437 * ->bi_hw_segments how many have been done. 4438 * 4439 * We *know* that this entire raid_bio is in one chunk, so 4440 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 4441 */ 4442 struct stripe_head *sh; 4443 int dd_idx; 4444 sector_t sector, logical_sector, last_sector; 4445 int scnt = 0; 4446 int remaining; 4447 int handled = 0; 4448 4449 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4450 sector = raid5_compute_sector(conf, logical_sector, 4451 0, &dd_idx, NULL); 4452 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4453 4454 for (; logical_sector < last_sector; 4455 logical_sector += STRIPE_SECTORS, 4456 sector += STRIPE_SECTORS, 4457 scnt++) { 4458 4459 if (scnt < raid5_bi_hw_segments(raid_bio)) 4460 /* already done this stripe */ 4461 continue; 4462 4463 sh = get_active_stripe(conf, sector, 0, 1, 0); 4464 4465 if (!sh) { 4466 /* failed to get a stripe - must wait */ 4467 raid5_set_bi_hw_segments(raid_bio, scnt); 4468 conf->retry_read_aligned = raid_bio; 4469 return handled; 4470 } 4471 4472 set_bit(R5_ReadError, &sh->dev[dd_idx].flags); 4473 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4474 release_stripe(sh); 4475 raid5_set_bi_hw_segments(raid_bio, scnt); 4476 conf->retry_read_aligned = raid_bio; 4477 return handled; 4478 } 4479 4480 handle_stripe(sh); 4481 release_stripe(sh); 4482 handled++; 4483 } 4484 spin_lock_irq(&conf->device_lock); 4485 remaining = raid5_dec_bi_phys_segments(raid_bio); 4486 spin_unlock_irq(&conf->device_lock); 4487 if (remaining == 0) 4488 bio_endio(raid_bio, 0); 4489 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4490 wake_up(&conf->wait_for_stripe); 4491 return handled; 4492 } 4493 4494 4495 /* 4496 * This is our raid5 kernel thread. 4497 * 4498 * We scan the hash table for stripes which can be handled now. 4499 * During the scan, completed stripes are saved for us by the interrupt 4500 * handler, so that they will not have to wait for our next wakeup. 
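 *
 * (A simplified outline of the loop below, for orientation only:
 *
 *	spin_lock_irq(&conf->device_lock);
 *	while (1) {
 *		if (conf->seq_flush != conf->seq_write)
 *			unplug the bitmap and activate_bit_delay();
 *		drain aligned-read retries via remove_bio_from_retry();
 *		sh = __get_priority_stripe(conf);
 *		if (!sh)
 *			break;
 *		spin_unlock_irq(&conf->device_lock);
 *		handle_stripe(sh); release_stripe(sh); cond_resched();
 *		spin_lock_irq(&conf->device_lock);
 *	}
 *	spin_unlock_irq(&conf->device_lock);
 *
 * All the real work runs with device_lock dropped; the lock only covers
 * the list handling.)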
4501 */ 4502 static void raid5d(mddev_t *mddev) 4503 { 4504 struct stripe_head *sh; 4505 raid5_conf_t *conf = mddev->private; 4506 int handled; 4507 4508 pr_debug("+++ raid5d active\n"); 4509 4510 md_check_recovery(mddev); 4511 4512 handled = 0; 4513 spin_lock_irq(&conf->device_lock); 4514 while (1) { 4515 struct bio *bio; 4516 4517 if (conf->seq_flush != conf->seq_write) { 4518 int seq = conf->seq_flush; 4519 spin_unlock_irq(&conf->device_lock); 4520 bitmap_unplug(mddev->bitmap); 4521 spin_lock_irq(&conf->device_lock); 4522 conf->seq_write = seq; 4523 activate_bit_delay(conf); 4524 } 4525 4526 while ((bio = remove_bio_from_retry(conf))) { 4527 int ok; 4528 spin_unlock_irq(&conf->device_lock); 4529 ok = retry_aligned_read(conf, bio); 4530 spin_lock_irq(&conf->device_lock); 4531 if (!ok) 4532 break; 4533 handled++; 4534 } 4535 4536 sh = __get_priority_stripe(conf); 4537 4538 if (!sh) 4539 break; 4540 spin_unlock_irq(&conf->device_lock); 4541 4542 handled++; 4543 handle_stripe(sh); 4544 release_stripe(sh); 4545 cond_resched(); 4546 4547 spin_lock_irq(&conf->device_lock); 4548 } 4549 pr_debug("%d stripes handled\n", handled); 4550 4551 spin_unlock_irq(&conf->device_lock); 4552 4553 async_tx_issue_pending_all(); 4554 unplug_slaves(mddev); 4555 4556 pr_debug("--- raid5d inactive\n"); 4557 } 4558 4559 static ssize_t 4560 raid5_show_stripe_cache_size(mddev_t *mddev, char *page) 4561 { 4562 raid5_conf_t *conf = mddev->private; 4563 if (conf) 4564 return sprintf(page, "%d\n", conf->max_nr_stripes); 4565 else 4566 return 0; 4567 } 4568 4569 static ssize_t 4570 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4571 { 4572 raid5_conf_t *conf = mddev->private; 4573 unsigned long new; 4574 int err; 4575 4576 if (len >= PAGE_SIZE) 4577 return -EINVAL; 4578 if (!conf) 4579 return -ENODEV; 4580 4581 if (strict_strtoul(page, 10, &new)) 4582 return -EINVAL; 4583 if (new <= 16 || new > 32768) 4584 return -EINVAL; 4585 while (new < conf->max_nr_stripes) { 4586 if (drop_one_stripe(conf)) 4587 conf->max_nr_stripes--; 4588 else 4589 break; 4590 } 4591 err = md_allow_write(mddev); 4592 if (err) 4593 return err; 4594 while (new > conf->max_nr_stripes) { 4595 if (grow_one_stripe(conf)) 4596 conf->max_nr_stripes++; 4597 else break; 4598 } 4599 return len; 4600 } 4601 4602 static struct md_sysfs_entry 4603 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4604 raid5_show_stripe_cache_size, 4605 raid5_store_stripe_cache_size); 4606 4607 static ssize_t 4608 raid5_show_preread_threshold(mddev_t *mddev, char *page) 4609 { 4610 raid5_conf_t *conf = mddev->private; 4611 if (conf) 4612 return sprintf(page, "%d\n", conf->bypass_threshold); 4613 else 4614 return 0; 4615 } 4616 4617 static ssize_t 4618 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) 4619 { 4620 raid5_conf_t *conf = mddev->private; 4621 unsigned long new; 4622 if (len >= PAGE_SIZE) 4623 return -EINVAL; 4624 if (!conf) 4625 return -ENODEV; 4626 4627 if (strict_strtoul(page, 10, &new)) 4628 return -EINVAL; 4629 if (new > conf->max_nr_stripes) 4630 return -EINVAL; 4631 conf->bypass_threshold = new; 4632 return len; 4633 } 4634 4635 static struct md_sysfs_entry 4636 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4637 S_IRUGO | S_IWUSR, 4638 raid5_show_preread_threshold, 4639 raid5_store_preread_threshold); 4640 4641 static ssize_t 4642 stripe_cache_active_show(mddev_t *mddev, char *page) 4643 { 4644 raid5_conf_t *conf = mddev->private; 4645 if (conf) 4646 return 
sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4647 else 4648 return 0; 4649 } 4650 4651 static struct md_sysfs_entry 4652 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4653 4654 static struct attribute *raid5_attrs[] = { 4655 &raid5_stripecache_size.attr, 4656 &raid5_stripecache_active.attr, 4657 &raid5_preread_bypass_threshold.attr, 4658 NULL, 4659 }; 4660 static struct attribute_group raid5_attrs_group = { 4661 .name = NULL, 4662 .attrs = raid5_attrs, 4663 }; 4664 4665 static sector_t 4666 raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) 4667 { 4668 raid5_conf_t *conf = mddev->private; 4669 4670 if (!sectors) 4671 sectors = mddev->dev_sectors; 4672 if (!raid_disks) 4673 /* size is defined by the smallest of previous and new size */ 4674 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4675 4676 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4677 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4678 return sectors * (raid_disks - conf->max_degraded); 4679 } 4680 4681 static void raid5_free_percpu(raid5_conf_t *conf) 4682 { 4683 struct raid5_percpu *percpu; 4684 unsigned long cpu; 4685 4686 if (!conf->percpu) 4687 return; 4688 4689 get_online_cpus(); 4690 for_each_possible_cpu(cpu) { 4691 percpu = per_cpu_ptr(conf->percpu, cpu); 4692 safe_put_page(percpu->spare_page); 4693 kfree(percpu->scribble); 4694 } 4695 #ifdef CONFIG_HOTPLUG_CPU 4696 unregister_cpu_notifier(&conf->cpu_notify); 4697 #endif 4698 put_online_cpus(); 4699 4700 free_percpu(conf->percpu); 4701 } 4702 4703 static void free_conf(raid5_conf_t *conf) 4704 { 4705 shrink_stripes(conf); 4706 raid5_free_percpu(conf); 4707 kfree(conf->disks); 4708 kfree(conf->stripe_hashtbl); 4709 kfree(conf); 4710 } 4711 4712 #ifdef CONFIG_HOTPLUG_CPU 4713 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4714 void *hcpu) 4715 { 4716 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); 4717 long cpu = (long)hcpu; 4718 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4719 4720 switch (action) { 4721 case CPU_UP_PREPARE: 4722 case CPU_UP_PREPARE_FROZEN: 4723 if (conf->level == 6 && !percpu->spare_page) 4724 percpu->spare_page = alloc_page(GFP_KERNEL); 4725 if (!percpu->scribble) 4726 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4727 4728 if (!percpu->scribble || 4729 (conf->level == 6 && !percpu->spare_page)) { 4730 safe_put_page(percpu->spare_page); 4731 kfree(percpu->scribble); 4732 pr_err("%s: failed memory allocation for cpu%ld\n", 4733 __func__, cpu); 4734 return notifier_from_errno(-ENOMEM); 4735 } 4736 break; 4737 case CPU_DEAD: 4738 case CPU_DEAD_FROZEN: 4739 safe_put_page(percpu->spare_page); 4740 kfree(percpu->scribble); 4741 percpu->spare_page = NULL; 4742 percpu->scribble = NULL; 4743 break; 4744 default: 4745 break; 4746 } 4747 return NOTIFY_OK; 4748 } 4749 #endif 4750 4751 static int raid5_alloc_percpu(raid5_conf_t *conf) 4752 { 4753 unsigned long cpu; 4754 struct page *spare_page; 4755 struct raid5_percpu __percpu *allcpus; 4756 void *scribble; 4757 int err; 4758 4759 allcpus = alloc_percpu(struct raid5_percpu); 4760 if (!allcpus) 4761 return -ENOMEM; 4762 conf->percpu = allcpus; 4763 4764 get_online_cpus(); 4765 err = 0; 4766 for_each_present_cpu(cpu) { 4767 if (conf->level == 6) { 4768 spare_page = alloc_page(GFP_KERNEL); 4769 if (!spare_page) { 4770 err = -ENOMEM; 4771 break; 4772 } 4773 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4774 } 4775 scribble = kmalloc(conf->scribble_len, 
GFP_KERNEL); 4776 if (!scribble) { 4777 err = -ENOMEM; 4778 break; 4779 } 4780 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4781 } 4782 #ifdef CONFIG_HOTPLUG_CPU 4783 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4784 conf->cpu_notify.priority = 0; 4785 if (err == 0) 4786 err = register_cpu_notifier(&conf->cpu_notify); 4787 #endif 4788 put_online_cpus(); 4789 4790 return err; 4791 } 4792 4793 static raid5_conf_t *setup_conf(mddev_t *mddev) 4794 { 4795 raid5_conf_t *conf; 4796 int raid_disk, memory, max_disks; 4797 mdk_rdev_t *rdev; 4798 struct disk_info *disk; 4799 4800 if (mddev->new_level != 5 4801 && mddev->new_level != 4 4802 && mddev->new_level != 6) { 4803 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4804 mdname(mddev), mddev->new_level); 4805 return ERR_PTR(-EIO); 4806 } 4807 if ((mddev->new_level == 5 4808 && !algorithm_valid_raid5(mddev->new_layout)) || 4809 (mddev->new_level == 6 4810 && !algorithm_valid_raid6(mddev->new_layout))) { 4811 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4812 mdname(mddev), mddev->new_layout); 4813 return ERR_PTR(-EIO); 4814 } 4815 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4816 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4817 mdname(mddev), mddev->raid_disks); 4818 return ERR_PTR(-EINVAL); 4819 } 4820 4821 if (!mddev->new_chunk_sectors || 4822 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4823 !is_power_of_2(mddev->new_chunk_sectors)) { 4824 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4825 mdname(mddev), mddev->new_chunk_sectors << 9); 4826 return ERR_PTR(-EINVAL); 4827 } 4828 4829 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); 4830 if (conf == NULL) 4831 goto abort; 4832 spin_lock_init(&conf->device_lock); 4833 init_waitqueue_head(&conf->wait_for_stripe); 4834 init_waitqueue_head(&conf->wait_for_overlap); 4835 INIT_LIST_HEAD(&conf->handle_list); 4836 INIT_LIST_HEAD(&conf->hold_list); 4837 INIT_LIST_HEAD(&conf->delayed_list); 4838 INIT_LIST_HEAD(&conf->bitmap_list); 4839 INIT_LIST_HEAD(&conf->inactive_list); 4840 atomic_set(&conf->active_stripes, 0); 4841 atomic_set(&conf->preread_active_stripes, 0); 4842 atomic_set(&conf->active_aligned_reads, 0); 4843 conf->bypass_threshold = BYPASS_THRESHOLD; 4844 4845 conf->raid_disks = mddev->raid_disks; 4846 if (mddev->reshape_position == MaxSector) 4847 conf->previous_raid_disks = mddev->raid_disks; 4848 else 4849 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4850 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 4851 conf->scribble_len = scribble_len(max_disks); 4852 4853 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 4854 GFP_KERNEL); 4855 if (!conf->disks) 4856 goto abort; 4857 4858 conf->mddev = mddev; 4859 4860 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4861 goto abort; 4862 4863 conf->level = mddev->new_level; 4864 if (raid5_alloc_percpu(conf) != 0) 4865 goto abort; 4866 4867 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4868 4869 list_for_each_entry(rdev, &mddev->disks, same_set) { 4870 raid_disk = rdev->raid_disk; 4871 if (raid_disk >= max_disks 4872 || raid_disk < 0) 4873 continue; 4874 disk = conf->disks + raid_disk; 4875 4876 disk->rdev = rdev; 4877 4878 if (test_bit(In_sync, &rdev->flags)) { 4879 char b[BDEVNAME_SIZE]; 4880 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4881 " disk %d\n", 4882 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4883 } else 4884 /* Cannot rely on bitmap to complete 
recovery */ 4885 conf->fullsync = 1; 4886 } 4887 4888 conf->chunk_sectors = mddev->new_chunk_sectors; 4889 conf->level = mddev->new_level; 4890 if (conf->level == 6) 4891 conf->max_degraded = 2; 4892 else 4893 conf->max_degraded = 1; 4894 conf->algorithm = mddev->new_layout; 4895 conf->max_nr_stripes = NR_STRIPES; 4896 conf->reshape_progress = mddev->reshape_position; 4897 if (conf->reshape_progress != MaxSector) { 4898 conf->prev_chunk_sectors = mddev->chunk_sectors; 4899 conf->prev_algo = mddev->layout; 4900 } 4901 4902 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4903 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4904 if (grow_stripes(conf, conf->max_nr_stripes)) { 4905 printk(KERN_ERR 4906 "md/raid:%s: couldn't allocate %dkB for buffers\n", 4907 mdname(mddev), memory); 4908 goto abort; 4909 } else 4910 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4911 mdname(mddev), memory); 4912 4913 conf->thread = md_register_thread(raid5d, mddev, NULL); 4914 if (!conf->thread) { 4915 printk(KERN_ERR 4916 "md/raid:%s: couldn't allocate thread.\n", 4917 mdname(mddev)); 4918 goto abort; 4919 } 4920 4921 return conf; 4922 4923 abort: 4924 if (conf) { 4925 free_conf(conf); 4926 return ERR_PTR(-EIO); 4927 } else 4928 return ERR_PTR(-ENOMEM); 4929 } 4930 4931 4932 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 4933 { 4934 switch (algo) { 4935 case ALGORITHM_PARITY_0: 4936 if (raid_disk < max_degraded) 4937 return 1; 4938 break; 4939 case ALGORITHM_PARITY_N: 4940 if (raid_disk >= raid_disks - max_degraded) 4941 return 1; 4942 break; 4943 case ALGORITHM_PARITY_0_6: 4944 if (raid_disk == 0 || 4945 raid_disk == raid_disks - 1) 4946 return 1; 4947 break; 4948 case ALGORITHM_LEFT_ASYMMETRIC_6: 4949 case ALGORITHM_RIGHT_ASYMMETRIC_6: 4950 case ALGORITHM_LEFT_SYMMETRIC_6: 4951 case ALGORITHM_RIGHT_SYMMETRIC_6: 4952 if (raid_disk == raid_disks - 1) 4953 return 1; 4954 } 4955 return 0; 4956 } 4957 4958 static int run(mddev_t *mddev) 4959 { 4960 raid5_conf_t *conf; 4961 int working_disks = 0, chunk_size; 4962 int dirty_parity_disks = 0; 4963 mdk_rdev_t *rdev; 4964 sector_t reshape_offset = 0; 4965 4966 if (mddev->recovery_cp != MaxSector) 4967 printk(KERN_NOTICE "md/raid:%s: not clean" 4968 " -- starting background reconstruction\n", 4969 mdname(mddev)); 4970 if (mddev->reshape_position != MaxSector) { 4971 /* Check that we can continue the reshape. 4972 * Currently only disks can change, it must 4973 * increase, and we must be past the point where 4974 * a stripe over-writes itself 4975 */ 4976 sector_t here_new, here_old; 4977 int old_disks; 4978 int max_degraded = (mddev->level == 6 ? 2 : 1); 4979 4980 if (mddev->new_level != mddev->level) { 4981 printk(KERN_ERR "md/raid:%s: unsupported reshape " 4982 "required - aborting.\n", 4983 mdname(mddev)); 4984 return -EINVAL; 4985 } 4986 old_disks = mddev->raid_disks - mddev->delta_disks; 4987 /* reshape_position must be on a new-stripe boundary, and one 4988 * further up in new geometry must map after here in old 4989 * geometry. 
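 * A purely illustrative example (numbers invented): growing a raid5 from
 * 4 to 5 devices with 128-sector chunks (max_degraded = 1) gives
 *
 *	here_new = reshape_position / (128 * (5 - 1))
 *	here_old = reshape_position / (128 * (4 - 1))
 *
 * so at reshape_position = 6144, here_new = 12 and here_old = 16; the
 * here_new division is exact (a new-stripe boundary) and
 * 12 * 128 < 16 * 128, i.e. the stripe being written to still lies below
 * the first stripe that may need to be read, so the reshape can continue.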
4990 */ 4991 here_new = mddev->reshape_position; 4992 if (sector_div(here_new, mddev->new_chunk_sectors * 4993 (mddev->raid_disks - max_degraded))) { 4994 printk(KERN_ERR "md/raid:%s: reshape_position not " 4995 "on a stripe boundary\n", mdname(mddev)); 4996 return -EINVAL; 4997 } 4998 reshape_offset = here_new * mddev->new_chunk_sectors; 4999 /* here_new is the stripe we will write to */ 5000 here_old = mddev->reshape_position; 5001 sector_div(here_old, mddev->chunk_sectors * 5002 (old_disks-max_degraded)); 5003 /* here_old is the first stripe that we might need to read 5004 * from */ 5005 if (mddev->delta_disks == 0) { 5006 /* We cannot be sure it is safe to start an in-place 5007 * reshape. It is only safe if user-space if monitoring 5008 * and taking constant backups. 5009 * mdadm always starts a situation like this in 5010 * readonly mode so it can take control before 5011 * allowing any writes. So just check for that. 5012 */ 5013 if ((here_new * mddev->new_chunk_sectors != 5014 here_old * mddev->chunk_sectors) || 5015 mddev->ro == 0) { 5016 printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 5017 " in read-only mode - aborting\n", 5018 mdname(mddev)); 5019 return -EINVAL; 5020 } 5021 } else if (mddev->delta_disks < 0 5022 ? (here_new * mddev->new_chunk_sectors <= 5023 here_old * mddev->chunk_sectors) 5024 : (here_new * mddev->new_chunk_sectors >= 5025 here_old * mddev->chunk_sectors)) { 5026 /* Reading from the same stripe as writing to - bad */ 5027 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5028 "auto-recovery - aborting.\n", 5029 mdname(mddev)); 5030 return -EINVAL; 5031 } 5032 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5033 mdname(mddev)); 5034 /* OK, we should be able to continue; */ 5035 } else { 5036 BUG_ON(mddev->level != mddev->new_level); 5037 BUG_ON(mddev->layout != mddev->new_layout); 5038 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5039 BUG_ON(mddev->delta_disks != 0); 5040 } 5041 5042 if (mddev->private == NULL) 5043 conf = setup_conf(mddev); 5044 else 5045 conf = mddev->private; 5046 5047 if (IS_ERR(conf)) 5048 return PTR_ERR(conf); 5049 5050 mddev->thread = conf->thread; 5051 conf->thread = NULL; 5052 mddev->private = conf; 5053 5054 /* 5055 * 0 for a fully functional array, 1 or 2 for a degraded array. 5056 */ 5057 list_for_each_entry(rdev, &mddev->disks, same_set) { 5058 if (rdev->raid_disk < 0) 5059 continue; 5060 if (test_bit(In_sync, &rdev->flags)) { 5061 working_disks++; 5062 continue; 5063 } 5064 /* This disc is not fully in-sync. However if it 5065 * just stored parity (beyond the recovery_offset), 5066 * when we don't need to be concerned about the 5067 * array being dirty. 5068 * When reshape goes 'backwards', we never have 5069 * partially completed devices, so we only need 5070 * to worry about reshape going forwards. 5071 */ 5072 /* Hack because v0.91 doesn't store recovery_offset properly. 
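 * (Best-effort explanation: 0.91 is the transient minor version used for
 * a 0.90 superblock while a reshape is in progress, and that format has
 * no usable per-device recovery_offset.  Forcing recovery_offset to
 * reshape_offset below makes the dirty-parity checks that follow treat
 * such a device as valid up to the reshape point.)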
*/ 5073 if (mddev->major_version == 0 && 5074 mddev->minor_version > 90) 5075 rdev->recovery_offset = reshape_offset; 5076 5077 if (rdev->recovery_offset < reshape_offset) { 5078 /* We need to check old and new layout */ 5079 if (!only_parity(rdev->raid_disk, 5080 conf->algorithm, 5081 conf->raid_disks, 5082 conf->max_degraded)) 5083 continue; 5084 } 5085 if (!only_parity(rdev->raid_disk, 5086 conf->prev_algo, 5087 conf->previous_raid_disks, 5088 conf->max_degraded)) 5089 continue; 5090 dirty_parity_disks++; 5091 } 5092 5093 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) 5094 - working_disks); 5095 5096 if (has_failed(conf)) { 5097 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5098 " (%d/%d failed)\n", 5099 mdname(mddev), mddev->degraded, conf->raid_disks); 5100 goto abort; 5101 } 5102 5103 /* device size must be a multiple of chunk size */ 5104 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5105 mddev->resync_max_sectors = mddev->dev_sectors; 5106 5107 if (mddev->degraded > dirty_parity_disks && 5108 mddev->recovery_cp != MaxSector) { 5109 if (mddev->ok_start_degraded) 5110 printk(KERN_WARNING 5111 "md/raid:%s: starting dirty degraded array" 5112 " - data corruption possible.\n", 5113 mdname(mddev)); 5114 else { 5115 printk(KERN_ERR 5116 "md/raid:%s: cannot start dirty degraded array.\n", 5117 mdname(mddev)); 5118 goto abort; 5119 } 5120 } 5121 5122 if (mddev->degraded == 0) 5123 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5124 " devices, algorithm %d\n", mdname(mddev), conf->level, 5125 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5126 mddev->new_layout); 5127 else 5128 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5129 " out of %d devices, algorithm %d\n", 5130 mdname(mddev), conf->level, 5131 mddev->raid_disks - mddev->degraded, 5132 mddev->raid_disks, mddev->new_layout); 5133 5134 print_raid5_conf(conf); 5135 5136 if (conf->reshape_progress != MaxSector) { 5137 conf->reshape_safe = conf->reshape_progress; 5138 atomic_set(&conf->reshape_stripes, 0); 5139 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5140 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5141 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5142 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5143 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5144 "reshape"); 5145 } 5146 5147 /* read-ahead size must cover two whole stripes, which is 5148 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5149 */ 5150 { 5151 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5152 int stripe = data_disks * 5153 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5154 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5155 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5156 } 5157 5158 /* Ok, everything is just fine now */ 5159 if (mddev->to_remove == &raid5_attrs_group) 5160 mddev->to_remove = NULL; 5161 else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5162 printk(KERN_WARNING 5163 "md/raid:%s: failed to create sysfs attributes.\n", 5164 mdname(mddev)); 5165 5166 mddev->queue->queue_lock = &conf->device_lock; 5167 5168 mddev->queue->unplug_fn = raid5_unplug_device; 5169 mddev->queue->backing_dev_info.congested_data = mddev; 5170 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5171 5172 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5173 5174 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5175 chunk_size = mddev->chunk_sectors << 9; 5176 
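	/*
	 * The two queue hints below export the obvious raid456 geometry:
	 * minimum I/O is one chunk, optimal I/O is one full stripe, i.e.
	 * chunk size times the number of data disks.  For example a
	 * 5-device raid6 with 64KiB chunks advertises io_min = 64KiB and
	 * io_opt = 64KiB * (5 - 2) = 192KiB.
	 */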
blk_queue_io_min(mddev->queue, chunk_size); 5177 blk_queue_io_opt(mddev->queue, chunk_size * 5178 (conf->raid_disks - conf->max_degraded)); 5179 5180 list_for_each_entry(rdev, &mddev->disks, same_set) 5181 disk_stack_limits(mddev->gendisk, rdev->bdev, 5182 rdev->data_offset << 9); 5183 5184 return 0; 5185 abort: 5186 md_unregister_thread(mddev->thread); 5187 mddev->thread = NULL; 5188 if (conf) { 5189 print_raid5_conf(conf); 5190 free_conf(conf); 5191 } 5192 mddev->private = NULL; 5193 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5194 return -EIO; 5195 } 5196 5197 static int stop(mddev_t *mddev) 5198 { 5199 raid5_conf_t *conf = mddev->private; 5200 5201 md_unregister_thread(mddev->thread); 5202 mddev->thread = NULL; 5203 mddev->queue->backing_dev_info.congested_fn = NULL; 5204 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5205 free_conf(conf); 5206 mddev->private = NULL; 5207 mddev->to_remove = &raid5_attrs_group; 5208 return 0; 5209 } 5210 5211 #ifdef DEBUG 5212 static void print_sh(struct seq_file *seq, struct stripe_head *sh) 5213 { 5214 int i; 5215 5216 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", 5217 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 5218 seq_printf(seq, "sh %llu, count %d.\n", 5219 (unsigned long long)sh->sector, atomic_read(&sh->count)); 5220 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); 5221 for (i = 0; i < sh->disks; i++) { 5222 seq_printf(seq, "(cache%d: %p %ld) ", 5223 i, sh->dev[i].page, sh->dev[i].flags); 5224 } 5225 seq_printf(seq, "\n"); 5226 } 5227 5228 static void printall(struct seq_file *seq, raid5_conf_t *conf) 5229 { 5230 struct stripe_head *sh; 5231 struct hlist_node *hn; 5232 int i; 5233 5234 spin_lock_irq(&conf->device_lock); 5235 for (i = 0; i < NR_HASH; i++) { 5236 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 5237 if (sh->raid_conf != conf) 5238 continue; 5239 print_sh(seq, sh); 5240 } 5241 } 5242 spin_unlock_irq(&conf->device_lock); 5243 } 5244 #endif 5245 5246 static void status(struct seq_file *seq, mddev_t *mddev) 5247 { 5248 raid5_conf_t *conf = mddev->private; 5249 int i; 5250 5251 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5252 mddev->chunk_sectors / 2, mddev->layout); 5253 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5254 for (i = 0; i < conf->raid_disks; i++) 5255 seq_printf (seq, "%s", 5256 conf->disks[i].rdev && 5257 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 5258 seq_printf (seq, "]"); 5259 #ifdef DEBUG 5260 seq_printf (seq, "\n"); 5261 printall(seq, conf); 5262 #endif 5263 } 5264 5265 static void print_raid5_conf (raid5_conf_t *conf) 5266 { 5267 int i; 5268 struct disk_info *tmp; 5269 5270 printk(KERN_DEBUG "RAID conf printout:\n"); 5271 if (!conf) { 5272 printk("(conf==NULL)\n"); 5273 return; 5274 } 5275 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5276 conf->raid_disks, 5277 conf->raid_disks - conf->mddev->degraded); 5278 5279 for (i = 0; i < conf->raid_disks; i++) { 5280 char b[BDEVNAME_SIZE]; 5281 tmp = conf->disks + i; 5282 if (tmp->rdev) 5283 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5284 i, !test_bit(Faulty, &tmp->rdev->flags), 5285 bdevname(tmp->rdev->bdev, b)); 5286 } 5287 } 5288 5289 static int raid5_spare_active(mddev_t *mddev) 5290 { 5291 int i; 5292 raid5_conf_t *conf = mddev->private; 5293 struct disk_info *tmp; 5294 5295 for (i = 0; i < conf->raid_disks; i++) { 5296 tmp = conf->disks + i; 5297 if (tmp->rdev 5298 && tmp->rdev->recovery_offset == MaxSector 5299 && !test_bit(Faulty, &tmp->rdev->flags) 5300 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5301 unsigned long flags; 5302 spin_lock_irqsave(&conf->device_lock, flags); 5303 mddev->degraded--; 5304 spin_unlock_irqrestore(&conf->device_lock, flags); 5305 } 5306 } 5307 print_raid5_conf(conf); 5308 return 0; 5309 } 5310 5311 static int raid5_remove_disk(mddev_t *mddev, int number) 5312 { 5313 raid5_conf_t *conf = mddev->private; 5314 int err = 0; 5315 mdk_rdev_t *rdev; 5316 struct disk_info *p = conf->disks + number; 5317 5318 print_raid5_conf(conf); 5319 rdev = p->rdev; 5320 if (rdev) { 5321 if (number >= conf->raid_disks && 5322 conf->reshape_progress == MaxSector) 5323 clear_bit(In_sync, &rdev->flags); 5324 5325 if (test_bit(In_sync, &rdev->flags) || 5326 atomic_read(&rdev->nr_pending)) { 5327 err = -EBUSY; 5328 goto abort; 5329 } 5330 /* Only remove non-faulty devices if recovery 5331 * isn't possible. 5332 */ 5333 if (!test_bit(Faulty, &rdev->flags) && 5334 !has_failed(conf) && 5335 number < conf->raid_disks) { 5336 err = -EBUSY; 5337 goto abort; 5338 } 5339 p->rdev = NULL; 5340 synchronize_rcu(); 5341 if (atomic_read(&rdev->nr_pending)) { 5342 /* lost the race, try later */ 5343 err = -EBUSY; 5344 p->rdev = rdev; 5345 } 5346 } 5347 abort: 5348 5349 print_raid5_conf(conf); 5350 return err; 5351 } 5352 5353 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 5354 { 5355 raid5_conf_t *conf = mddev->private; 5356 int err = -EEXIST; 5357 int disk; 5358 struct disk_info *p; 5359 int first = 0; 5360 int last = conf->raid_disks - 1; 5361 5362 if (has_failed(conf)) 5363 /* no point adding a device */ 5364 return -EINVAL; 5365 5366 if (rdev->raid_disk >= 0) 5367 first = last = rdev->raid_disk; 5368 5369 /* 5370 * find the disk ... but prefer rdev->saved_raid_disk 5371 * if possible. 
5372 */ 5373 if (rdev->saved_raid_disk >= 0 && 5374 rdev->saved_raid_disk >= first && 5375 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5376 disk = rdev->saved_raid_disk; 5377 else 5378 disk = first; 5379 for ( ; disk <= last ; disk++) 5380 if ((p=conf->disks + disk)->rdev == NULL) { 5381 clear_bit(In_sync, &rdev->flags); 5382 rdev->raid_disk = disk; 5383 err = 0; 5384 if (rdev->saved_raid_disk != disk) 5385 conf->fullsync = 1; 5386 rcu_assign_pointer(p->rdev, rdev); 5387 break; 5388 } 5389 print_raid5_conf(conf); 5390 return err; 5391 } 5392 5393 static int raid5_resize(mddev_t *mddev, sector_t sectors) 5394 { 5395 /* no resync is happening, and there is enough space 5396 * on all devices, so we can resize. 5397 * We need to make sure resync covers any new space. 5398 * If the array is shrinking we should possibly wait until 5399 * any io in the removed space completes, but it hardly seems 5400 * worth it. 5401 */ 5402 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5403 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5404 mddev->raid_disks)); 5405 if (mddev->array_sectors > 5406 raid5_size(mddev, sectors, mddev->raid_disks)) 5407 return -EINVAL; 5408 set_capacity(mddev->gendisk, mddev->array_sectors); 5409 revalidate_disk(mddev->gendisk); 5410 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5411 mddev->recovery_cp = mddev->dev_sectors; 5412 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5413 } 5414 mddev->dev_sectors = sectors; 5415 mddev->resync_max_sectors = sectors; 5416 return 0; 5417 } 5418 5419 static int check_stripe_cache(mddev_t *mddev) 5420 { 5421 /* Can only proceed if there are plenty of stripe_heads. 5422 * We need a minimum of one full stripe,, and for sensible progress 5423 * it is best to have about 4 times that. 5424 * If we require 4 times, then the default 256 4K stripe_heads will 5425 * allow for chunk sizes up to 256K, which is probably OK. 5426 * If the chunk size is greater, user-space should request more 5427 * stripe_heads first. 5428 */ 5429 raid5_conf_t *conf = mddev->private; 5430 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5431 > conf->max_nr_stripes || 5432 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5433 > conf->max_nr_stripes) { 5434 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5435 mdname(mddev), 5436 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5437 / STRIPE_SIZE)*4); 5438 return 0; 5439 } 5440 return 1; 5441 } 5442 5443 static int check_reshape(mddev_t *mddev) 5444 { 5445 raid5_conf_t *conf = mddev->private; 5446 5447 if (mddev->delta_disks == 0 && 5448 mddev->new_layout == mddev->layout && 5449 mddev->new_chunk_sectors == mddev->chunk_sectors) 5450 return 0; /* nothing to do */ 5451 if (mddev->bitmap) 5452 /* Cannot grow a bitmap yet */ 5453 return -EBUSY; 5454 if (has_failed(conf)) 5455 return -EINVAL; 5456 if (mddev->delta_disks < 0) { 5457 /* We might be able to shrink, but the devices must 5458 * be made bigger first. 5459 * For raid6, 4 is the minimum size. 
5460 * Otherwise 2 is the minimum 5461 */ 5462 int min = 2; 5463 if (mddev->level == 6) 5464 min = 4; 5465 if (mddev->raid_disks + mddev->delta_disks < min) 5466 return -EINVAL; 5467 } 5468 5469 if (!check_stripe_cache(mddev)) 5470 return -ENOSPC; 5471 5472 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5473 } 5474 5475 static int raid5_start_reshape(mddev_t *mddev) 5476 { 5477 raid5_conf_t *conf = mddev->private; 5478 mdk_rdev_t *rdev; 5479 int spares = 0; 5480 int added_devices = 0; 5481 unsigned long flags; 5482 5483 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5484 return -EBUSY; 5485 5486 if (!check_stripe_cache(mddev)) 5487 return -ENOSPC; 5488 5489 list_for_each_entry(rdev, &mddev->disks, same_set) 5490 if (rdev->raid_disk < 0 && 5491 !test_bit(Faulty, &rdev->flags)) 5492 spares++; 5493 5494 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5495 /* Not enough devices even to make a degraded array 5496 * of that size 5497 */ 5498 return -EINVAL; 5499 5500 /* Refuse to reduce size of the array. Any reductions in 5501 * array size must be through explicit setting of array_size 5502 * attribute. 5503 */ 5504 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5505 < mddev->array_sectors) { 5506 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5507 "before number of disks\n", mdname(mddev)); 5508 return -EINVAL; 5509 } 5510 5511 atomic_set(&conf->reshape_stripes, 0); 5512 spin_lock_irq(&conf->device_lock); 5513 conf->previous_raid_disks = conf->raid_disks; 5514 conf->raid_disks += mddev->delta_disks; 5515 conf->prev_chunk_sectors = conf->chunk_sectors; 5516 conf->chunk_sectors = mddev->new_chunk_sectors; 5517 conf->prev_algo = conf->algorithm; 5518 conf->algorithm = mddev->new_layout; 5519 if (mddev->delta_disks < 0) 5520 conf->reshape_progress = raid5_size(mddev, 0, 0); 5521 else 5522 conf->reshape_progress = 0; 5523 conf->reshape_safe = conf->reshape_progress; 5524 conf->generation++; 5525 spin_unlock_irq(&conf->device_lock); 5526 5527 /* Add some new drives, as many as will fit. 5528 * We know there are enough to make the newly sized array work. 5529 * Don't add devices if we are reducing the number of 5530 * devices in the array. This is because it is not possible 5531 * to correctly record the "partially reconstructed" state of 5532 * such devices during the reshape and confusion could result. 
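 *
 * (Note on the loop below: a spare that lands in a slot beyond the old
 * geometry (raid_disk >= previous_raid_disks) holds no live data yet -
 * the reshape itself will fill it - so it is marked In_sync immediately.
 * A spare that fills a hole in the old geometry instead gets
 * recovery_offset = 0 and is rebuilt normally.)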
5533 */ 5534 if (mddev->delta_disks >= 0) 5535 list_for_each_entry(rdev, &mddev->disks, same_set) 5536 if (rdev->raid_disk < 0 && 5537 !test_bit(Faulty, &rdev->flags)) { 5538 if (raid5_add_disk(mddev, rdev) == 0) { 5539 char nm[20]; 5540 if (rdev->raid_disk >= conf->previous_raid_disks) { 5541 set_bit(In_sync, &rdev->flags); 5542 added_devices++; 5543 } else 5544 rdev->recovery_offset = 0; 5545 sprintf(nm, "rd%d", rdev->raid_disk); 5546 if (sysfs_create_link(&mddev->kobj, 5547 &rdev->kobj, nm)) 5548 printk(KERN_WARNING 5549 "md/raid:%s: failed to create " 5550 " link %s\n", 5551 mdname(mddev), nm); 5552 } else 5553 break; 5554 } 5555 5556 /* When a reshape changes the number of devices, ->degraded 5557 * is measured against the larger of the pre and post number of 5558 * devices.*/ 5559 if (mddev->delta_disks > 0) { 5560 spin_lock_irqsave(&conf->device_lock, flags); 5561 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) 5562 - added_devices; 5563 spin_unlock_irqrestore(&conf->device_lock, flags); 5564 } 5565 mddev->raid_disks = conf->raid_disks; 5566 mddev->reshape_position = conf->reshape_progress; 5567 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5568 5569 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5570 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5571 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5572 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5573 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5574 "reshape"); 5575 if (!mddev->sync_thread) { 5576 mddev->recovery = 0; 5577 spin_lock_irq(&conf->device_lock); 5578 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5579 conf->reshape_progress = MaxSector; 5580 spin_unlock_irq(&conf->device_lock); 5581 return -EAGAIN; 5582 } 5583 conf->reshape_checkpoint = jiffies; 5584 md_wakeup_thread(mddev->sync_thread); 5585 md_new_event(mddev); 5586 return 0; 5587 } 5588 5589 /* This is called from the reshape thread and should make any 5590 * changes needed in 'conf' 5591 */ 5592 static void end_reshape(raid5_conf_t *conf) 5593 { 5594 5595 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5596 5597 spin_lock_irq(&conf->device_lock); 5598 conf->previous_raid_disks = conf->raid_disks; 5599 conf->reshape_progress = MaxSector; 5600 spin_unlock_irq(&conf->device_lock); 5601 wake_up(&conf->wait_for_overlap); 5602 5603 /* read-ahead size must cover two whole stripes, which is 5604 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5605 */ 5606 { 5607 int data_disks = conf->raid_disks - conf->max_degraded; 5608 int stripe = data_disks * ((conf->chunk_sectors << 9) 5609 / PAGE_SIZE); 5610 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5611 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5612 } 5613 } 5614 } 5615 5616 /* This is called from the raid5d thread with mddev_lock held. 5617 * It makes config changes to the device. 

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			mddev->degraded = conf->raid_disks;
			for (d = 0; d < conf->raid_disks ; d++)
				if (conf->disks[d].rdev &&
				    test_bit(In_sync,
					     &conf->disks[d].rdev->flags))
					mddev->degraded--;
			for (d = conf->raid_disks ;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				mdk_rdev_t *rdev = conf->disks[d].rdev;
				if (rdev && raid5_remove_disk(mddev, d) == 0) {
					char nm[20];
					sprintf(nm, "rd%d", rdev->raid_disk);
					sysfs_remove_link(&mddev->kobj, nm);
					rdev->raid_disk = -1;
				}
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
	}
}

static void raid5_quiesce(mddev_t *mddev, int state)
{
	raid5_conf_t *conf = mddev->private;

	switch(state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		spin_lock_irq(&conf->device_lock);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_lock_irq(conf->wait_for_stripe,
				    atomic_read(&conf->active_stripes) == 0 &&
				    atomic_read(&conf->active_aligned_reads) == 0,
				    conf->device_lock, /* nothing */);
		conf->quiesce = 1;
		spin_unlock_irq(&conf->device_lock);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		spin_lock_irq(&conf->device_lock);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		spin_unlock_irq(&conf->device_lock);
		break;
	}
}

static void *raid45_takeover_raid0(mddev_t *mddev, int level)
{
	struct raid0_private_data *raid0_priv = mddev->private;

	/* for raid0 takeover only one zone is supported */
	if (raid0_priv->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid1(mddev_t *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}
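
/* Worked example (editorial): the loop above starts at a 64 KiB chunk
 * (128 sectors) and halves it until it divides the array size exactly.
 * A 1 GiB RAID1 (2097152 sectors) keeps the full 64 KiB chunk; an array of
 * 2097104 sectors (a multiple of 16 sectors but not of 32) ends up with a
 * 16-sector (8 KiB) chunk, which still passes the STRIPE_SIZE check on
 * systems with 4 KiB pages.
 */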

static void *raid5_takeover_raid6(mddev_t *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}

static int raid5_check_reshape(mddev_t *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately, as no restriping is needed.
	 * For larger arrays we record the new value - after validation -
	 * to be used by a reshape pass.
	 */
	raid5_conf_t *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(mddev_t *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}
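
/* Worked example (editorial, assuming 4 KiB pages): PAGE_SIZE >> 9 == 8, so
 * the smallest chunk either check above accepts is 4 KiB.  A requested
 * 96 KiB chunk (192 sectors) is rejected because 192 is not a power of two,
 * while a 128 KiB chunk (256 sectors) is accepted provided it divides the
 * array size exactly.
 */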

static void *raid5_takeover(mddev_t *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Provided it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(mddev_t *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct mdk_personality raid5_personality;

static void *raid6_takeover(mddev_t *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * layout to an equivalent raid6 layout with the Q block
	 * at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}
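
/* Editorial note: nothing in this file calls the ->takeover methods above
 * directly.  The md core invokes the target personality's ->takeover when
 * the array level is changed (for example, assuming typical usage, via
 * "mdadm --grow /dev/md0 --level=6" or by writing the new level name to the
 * md "level" sysfs attribute), and only switches personalities if a valid
 * configuration is returned.
 */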

static struct mdk_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
};

static struct mdk_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
};

static struct mdk_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

static int __init raid5_init(void)
{
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules; they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");
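
/* Editorial note: the "md-level-*" and "md-personality-*" aliases above allow
 * the md core to autoload this module by alias when an array of level 4, 5
 * or 6 is assembled while raid456 is built as a module, in addition to the
 * historical "raid5"/"raid6" module names kept for compatibility.
 */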