/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
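/*
 * Illustrative sketch only (not part of the driver): the per-device bio
 * chain built by add_stripe_bio is walked with r5_next_bio(), as
 * ops_run_biofill() and ops_complete_biofill() do further down.  Here
 * 'dev' stands for a struct r5dev and 'rbi' for the head of its toread
 * chain; handle_this_bio() is a hypothetical placeholder for the per-bio
 * work:
 *
 *	struct bio *rbi = dev->toread;
 *	while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 *		handle_this_bio(rbi);
 *		rbi = r5_next_bio(rbi, dev->sector);
 *	}
 */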
/*
 * The following can be used to debug the driver
 */
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

#ifdef DEBUG
#define inline
#define __inline__
#endif

#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline int raid5_dec_bi_hw_segments(struct bio *bio)
{
	unsigned short val = raid5_bi_hw_segments(bio);

	--val;
	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
	return val;
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
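 * For example (md layouts, not DDF), with disks = 6, pd_idx = 4 and
 * qd_idx = 5: raid6_d0() is 0, and walking devices 0..5 through
 * raid6_idx_to_slot() gives slots 0,1,2,3 for the data devices,
 * slot 4 (syndrome_disks) for P and slot 5 (syndrome_disks + 1) for Q.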
157 */ 158 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 159 int *count, int syndrome_disks) 160 { 161 int slot = *count; 162 163 if (sh->ddf_layout) 164 (*count)++; 165 if (idx == sh->pd_idx) 166 return syndrome_disks; 167 if (idx == sh->qd_idx) 168 return syndrome_disks + 1; 169 if (!sh->ddf_layout) 170 (*count)++; 171 return slot; 172 } 173 174 static void return_io(struct bio *return_bi) 175 { 176 struct bio *bi = return_bi; 177 while (bi) { 178 179 return_bi = bi->bi_next; 180 bi->bi_next = NULL; 181 bi->bi_size = 0; 182 bio_endio(bi, 0); 183 bi = return_bi; 184 } 185 } 186 187 static void print_raid5_conf (raid5_conf_t *conf); 188 189 static int stripe_operations_active(struct stripe_head *sh) 190 { 191 return sh->check_state || sh->reconstruct_state || 192 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 193 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 194 } 195 196 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 197 { 198 if (atomic_dec_and_test(&sh->count)) { 199 BUG_ON(!list_empty(&sh->lru)); 200 BUG_ON(atomic_read(&conf->active_stripes)==0); 201 if (test_bit(STRIPE_HANDLE, &sh->state)) { 202 if (test_bit(STRIPE_DELAYED, &sh->state)) 203 list_add_tail(&sh->lru, &conf->delayed_list); 204 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 205 sh->bm_seq - conf->seq_write > 0) 206 list_add_tail(&sh->lru, &conf->bitmap_list); 207 else { 208 clear_bit(STRIPE_BIT_DELAY, &sh->state); 209 list_add_tail(&sh->lru, &conf->handle_list); 210 } 211 md_wakeup_thread(conf->mddev->thread); 212 } else { 213 BUG_ON(stripe_operations_active(sh)); 214 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 215 atomic_dec(&conf->preread_active_stripes); 216 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 217 md_wakeup_thread(conf->mddev->thread); 218 } 219 atomic_dec(&conf->active_stripes); 220 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 221 list_add_tail(&sh->lru, &conf->inactive_list); 222 wake_up(&conf->wait_for_stripe); 223 if (conf->retry_read_aligned) 224 md_wakeup_thread(conf->mddev->thread); 225 } 226 } 227 } 228 } 229 230 static void release_stripe(struct stripe_head *sh) 231 { 232 raid5_conf_t *conf = sh->raid_conf; 233 unsigned long flags; 234 235 spin_lock_irqsave(&conf->device_lock, flags); 236 __release_stripe(conf, sh); 237 spin_unlock_irqrestore(&conf->device_lock, flags); 238 } 239 240 static inline void remove_hash(struct stripe_head *sh) 241 { 242 pr_debug("remove_hash(), stripe %llu\n", 243 (unsigned long long)sh->sector); 244 245 hlist_del_init(&sh->hash); 246 } 247 248 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 249 { 250 struct hlist_head *hp = stripe_hash(conf, sh->sector); 251 252 pr_debug("insert_hash(), stripe %llu\n", 253 (unsigned long long)sh->sector); 254 255 CHECK_DEVLOCK(); 256 hlist_add_head(&sh->hash, hp); 257 } 258 259 260 /* find an idle stripe, make sure it is unhashed, and return it. 
*/ 261 static struct stripe_head *get_free_stripe(raid5_conf_t *conf) 262 { 263 struct stripe_head *sh = NULL; 264 struct list_head *first; 265 266 CHECK_DEVLOCK(); 267 if (list_empty(&conf->inactive_list)) 268 goto out; 269 first = conf->inactive_list.next; 270 sh = list_entry(first, struct stripe_head, lru); 271 list_del_init(first); 272 remove_hash(sh); 273 atomic_inc(&conf->active_stripes); 274 out: 275 return sh; 276 } 277 278 static void shrink_buffers(struct stripe_head *sh) 279 { 280 struct page *p; 281 int i; 282 int num = sh->raid_conf->pool_size; 283 284 for (i = 0; i < num ; i++) { 285 p = sh->dev[i].page; 286 if (!p) 287 continue; 288 sh->dev[i].page = NULL; 289 put_page(p); 290 } 291 } 292 293 static int grow_buffers(struct stripe_head *sh) 294 { 295 int i; 296 int num = sh->raid_conf->pool_size; 297 298 for (i = 0; i < num; i++) { 299 struct page *page; 300 301 if (!(page = alloc_page(GFP_KERNEL))) { 302 return 1; 303 } 304 sh->dev[i].page = page; 305 } 306 return 0; 307 } 308 309 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 310 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 311 struct stripe_head *sh); 312 313 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 314 { 315 raid5_conf_t *conf = sh->raid_conf; 316 int i; 317 318 BUG_ON(atomic_read(&sh->count) != 0); 319 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 320 BUG_ON(stripe_operations_active(sh)); 321 322 CHECK_DEVLOCK(); 323 pr_debug("init_stripe called, stripe %llu\n", 324 (unsigned long long)sh->sector); 325 326 remove_hash(sh); 327 328 sh->generation = conf->generation - previous; 329 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 330 sh->sector = sector; 331 stripe_set_idx(sector, conf, previous, sh); 332 sh->state = 0; 333 334 335 for (i = sh->disks; i--; ) { 336 struct r5dev *dev = &sh->dev[i]; 337 338 if (dev->toread || dev->read || dev->towrite || dev->written || 339 test_bit(R5_LOCKED, &dev->flags)) { 340 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 341 (unsigned long long)sh->sector, i, dev->toread, 342 dev->read, dev->towrite, dev->written, 343 test_bit(R5_LOCKED, &dev->flags)); 344 BUG(); 345 } 346 dev->flags = 0; 347 raid5_build_block(sh, i, previous); 348 } 349 insert_hash(conf, sh); 350 } 351 352 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, 353 short generation) 354 { 355 struct stripe_head *sh; 356 struct hlist_node *hn; 357 358 CHECK_DEVLOCK(); 359 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 360 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 361 if (sh->sector == sector && sh->generation == generation) 362 return sh; 363 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 364 return NULL; 365 } 366 367 /* 368 * Need to check if array has failed when deciding whether to: 369 * - start an array 370 * - remove non-faulty devices 371 * - add a spare 372 * - allow a reshape 373 * This determination is simple when no reshape is happening. 374 * However if there is a reshape, we need to carefully check 375 * both the before and after sections. 376 * This is because some failed devices may only affect one 377 * of the two sections, and some non-in_sync devices may 378 * be insync in the section most affected by failed devices. 
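 * For example, when growing a 4-device RAID5 to 5 devices, the newly
 * added device only exists in the 'after' geometry, so losing it only
 * affects the second check below, while an old device still being
 * recovered by the reshape counts as missing in the 'before' geometry
 * but not in the 'after' one.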
379 */ 380 static int has_failed(raid5_conf_t *conf) 381 { 382 int degraded; 383 int i; 384 if (conf->mddev->reshape_position == MaxSector) 385 return conf->mddev->degraded > conf->max_degraded; 386 387 rcu_read_lock(); 388 degraded = 0; 389 for (i = 0; i < conf->previous_raid_disks; i++) { 390 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 391 if (!rdev || test_bit(Faulty, &rdev->flags)) 392 degraded++; 393 else if (test_bit(In_sync, &rdev->flags)) 394 ; 395 else 396 /* not in-sync or faulty. 397 * If the reshape increases the number of devices, 398 * this is being recovered by the reshape, so 399 * this 'previous' section is not in_sync. 400 * If the number of devices is being reduced however, 401 * the device can only be part of the array if 402 * we are reverting a reshape, so this section will 403 * be in-sync. 404 */ 405 if (conf->raid_disks >= conf->previous_raid_disks) 406 degraded++; 407 } 408 rcu_read_unlock(); 409 if (degraded > conf->max_degraded) 410 return 1; 411 rcu_read_lock(); 412 degraded = 0; 413 for (i = 0; i < conf->raid_disks; i++) { 414 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 415 if (!rdev || test_bit(Faulty, &rdev->flags)) 416 degraded++; 417 else if (test_bit(In_sync, &rdev->flags)) 418 ; 419 else 420 /* not in-sync or faulty. 421 * If reshape increases the number of devices, this 422 * section has already been recovered, else it 423 * almost certainly hasn't. 424 */ 425 if (conf->raid_disks <= conf->previous_raid_disks) 426 degraded++; 427 } 428 rcu_read_unlock(); 429 if (degraded > conf->max_degraded) 430 return 1; 431 return 0; 432 } 433 434 static struct stripe_head * 435 get_active_stripe(raid5_conf_t *conf, sector_t sector, 436 int previous, int noblock, int noquiesce) 437 { 438 struct stripe_head *sh; 439 440 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 441 442 spin_lock_irq(&conf->device_lock); 443 444 do { 445 wait_event_lock_irq(conf->wait_for_stripe, 446 conf->quiesce == 0 || noquiesce, 447 conf->device_lock, /* nothing */); 448 sh = __find_stripe(conf, sector, conf->generation - previous); 449 if (!sh) { 450 if (!conf->inactive_blocked) 451 sh = get_free_stripe(conf); 452 if (noblock && sh == NULL) 453 break; 454 if (!sh) { 455 conf->inactive_blocked = 1; 456 wait_event_lock_irq(conf->wait_for_stripe, 457 !list_empty(&conf->inactive_list) && 458 (atomic_read(&conf->active_stripes) 459 < (conf->max_nr_stripes *3/4) 460 || !conf->inactive_blocked), 461 conf->device_lock, 462 ); 463 conf->inactive_blocked = 0; 464 } else 465 init_stripe(sh, sector, previous); 466 } else { 467 if (atomic_read(&sh->count)) { 468 BUG_ON(!list_empty(&sh->lru) 469 && !test_bit(STRIPE_EXPANDING, &sh->state)); 470 } else { 471 if (!test_bit(STRIPE_HANDLE, &sh->state)) 472 atomic_inc(&conf->active_stripes); 473 if (list_empty(&sh->lru) && 474 !test_bit(STRIPE_EXPANDING, &sh->state)) 475 BUG(); 476 list_del_init(&sh->lru); 477 } 478 } 479 } while (sh == NULL); 480 481 if (sh) 482 atomic_inc(&sh->count); 483 484 spin_unlock_irq(&conf->device_lock); 485 return sh; 486 } 487 488 static void 489 raid5_end_read_request(struct bio *bi, int error); 490 static void 491 raid5_end_write_request(struct bio *bi, int error); 492 493 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 494 { 495 raid5_conf_t *conf = sh->raid_conf; 496 int i, disks = sh->disks; 497 498 might_sleep(); 499 500 for (i = disks; i--; ) { 501 int rw; 502 struct bio *bi; 503 mdk_rdev_t *rdev; 504 if (test_and_clear_bit(R5_Wantwrite, 
&sh->dev[i].flags)) { 505 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 506 rw = WRITE_FUA; 507 else 508 rw = WRITE; 509 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 510 rw = READ; 511 else 512 continue; 513 514 bi = &sh->dev[i].req; 515 516 bi->bi_rw = rw; 517 if (rw == WRITE) 518 bi->bi_end_io = raid5_end_write_request; 519 else 520 bi->bi_end_io = raid5_end_read_request; 521 522 rcu_read_lock(); 523 rdev = rcu_dereference(conf->disks[i].rdev); 524 if (rdev && test_bit(Faulty, &rdev->flags)) 525 rdev = NULL; 526 if (rdev) 527 atomic_inc(&rdev->nr_pending); 528 rcu_read_unlock(); 529 530 if (rdev) { 531 if (s->syncing || s->expanding || s->expanded) 532 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 533 534 set_bit(STRIPE_IO_STARTED, &sh->state); 535 536 bi->bi_bdev = rdev->bdev; 537 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 538 __func__, (unsigned long long)sh->sector, 539 bi->bi_rw, i); 540 atomic_inc(&sh->count); 541 bi->bi_sector = sh->sector + rdev->data_offset; 542 bi->bi_flags = 1 << BIO_UPTODATE; 543 bi->bi_vcnt = 1; 544 bi->bi_max_vecs = 1; 545 bi->bi_idx = 0; 546 bi->bi_io_vec = &sh->dev[i].vec; 547 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 548 bi->bi_io_vec[0].bv_offset = 0; 549 bi->bi_size = STRIPE_SIZE; 550 bi->bi_next = NULL; 551 if (rw == WRITE && 552 test_bit(R5_ReWrite, &sh->dev[i].flags)) 553 atomic_add(STRIPE_SECTORS, 554 &rdev->corrected_errors); 555 generic_make_request(bi); 556 } else { 557 if (rw == WRITE) 558 set_bit(STRIPE_DEGRADED, &sh->state); 559 pr_debug("skip op %ld on disc %d for sector %llu\n", 560 bi->bi_rw, i, (unsigned long long)sh->sector); 561 clear_bit(R5_LOCKED, &sh->dev[i].flags); 562 set_bit(STRIPE_HANDLE, &sh->state); 563 } 564 } 565 } 566 567 static struct dma_async_tx_descriptor * 568 async_copy_data(int frombio, struct bio *bio, struct page *page, 569 sector_t sector, struct dma_async_tx_descriptor *tx) 570 { 571 struct bio_vec *bvl; 572 struct page *bio_page; 573 int i; 574 int page_offset; 575 struct async_submit_ctl submit; 576 enum async_tx_flags flags = 0; 577 578 if (bio->bi_sector >= sector) 579 page_offset = (signed)(bio->bi_sector - sector) * 512; 580 else 581 page_offset = (signed)(sector - bio->bi_sector) * -512; 582 583 if (frombio) 584 flags |= ASYNC_TX_FENCE; 585 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 586 587 bio_for_each_segment(bvl, bio, i) { 588 int len = bio_iovec_idx(bio, i)->bv_len; 589 int clen; 590 int b_offset = 0; 591 592 if (page_offset < 0) { 593 b_offset = -page_offset; 594 page_offset += b_offset; 595 len -= b_offset; 596 } 597 598 if (len > 0 && page_offset + len > STRIPE_SIZE) 599 clen = STRIPE_SIZE - page_offset; 600 else 601 clen = len; 602 603 if (clen > 0) { 604 b_offset += bio_iovec_idx(bio, i)->bv_offset; 605 bio_page = bio_iovec_idx(bio, i)->bv_page; 606 if (frombio) 607 tx = async_memcpy(page, bio_page, page_offset, 608 b_offset, clen, &submit); 609 else 610 tx = async_memcpy(bio_page, page, b_offset, 611 page_offset, clen, &submit); 612 } 613 /* chain the operations */ 614 submit.depend_tx = tx; 615 616 if (clen < len) /* hit end of page */ 617 break; 618 page_offset += len; 619 } 620 621 return tx; 622 } 623 624 static void ops_complete_biofill(void *stripe_head_ref) 625 { 626 struct stripe_head *sh = stripe_head_ref; 627 struct bio *return_bi = NULL; 628 raid5_conf_t *conf = sh->raid_conf; 629 int i; 630 631 pr_debug("%s: stripe %llu\n", __func__, 632 (unsigned long long)sh->sector); 633 634 /* clear completed biofills */ 635 
spin_lock_irq(&conf->device_lock); 636 for (i = sh->disks; i--; ) { 637 struct r5dev *dev = &sh->dev[i]; 638 639 /* acknowledge completion of a biofill operation */ 640 /* and check if we need to reply to a read request, 641 * new R5_Wantfill requests are held off until 642 * !STRIPE_BIOFILL_RUN 643 */ 644 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 645 struct bio *rbi, *rbi2; 646 647 BUG_ON(!dev->read); 648 rbi = dev->read; 649 dev->read = NULL; 650 while (rbi && rbi->bi_sector < 651 dev->sector + STRIPE_SECTORS) { 652 rbi2 = r5_next_bio(rbi, dev->sector); 653 if (!raid5_dec_bi_phys_segments(rbi)) { 654 rbi->bi_next = return_bi; 655 return_bi = rbi; 656 } 657 rbi = rbi2; 658 } 659 } 660 } 661 spin_unlock_irq(&conf->device_lock); 662 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 663 664 return_io(return_bi); 665 666 set_bit(STRIPE_HANDLE, &sh->state); 667 release_stripe(sh); 668 } 669 670 static void ops_run_biofill(struct stripe_head *sh) 671 { 672 struct dma_async_tx_descriptor *tx = NULL; 673 raid5_conf_t *conf = sh->raid_conf; 674 struct async_submit_ctl submit; 675 int i; 676 677 pr_debug("%s: stripe %llu\n", __func__, 678 (unsigned long long)sh->sector); 679 680 for (i = sh->disks; i--; ) { 681 struct r5dev *dev = &sh->dev[i]; 682 if (test_bit(R5_Wantfill, &dev->flags)) { 683 struct bio *rbi; 684 spin_lock_irq(&conf->device_lock); 685 dev->read = rbi = dev->toread; 686 dev->toread = NULL; 687 spin_unlock_irq(&conf->device_lock); 688 while (rbi && rbi->bi_sector < 689 dev->sector + STRIPE_SECTORS) { 690 tx = async_copy_data(0, rbi, dev->page, 691 dev->sector, tx); 692 rbi = r5_next_bio(rbi, dev->sector); 693 } 694 } 695 } 696 697 atomic_inc(&sh->count); 698 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 699 async_trigger_callback(&submit); 700 } 701 702 static void mark_target_uptodate(struct stripe_head *sh, int target) 703 { 704 struct r5dev *tgt; 705 706 if (target < 0) 707 return; 708 709 tgt = &sh->dev[target]; 710 set_bit(R5_UPTODATE, &tgt->flags); 711 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 712 clear_bit(R5_Wantcompute, &tgt->flags); 713 } 714 715 static void ops_complete_compute(void *stripe_head_ref) 716 { 717 struct stripe_head *sh = stripe_head_ref; 718 719 pr_debug("%s: stripe %llu\n", __func__, 720 (unsigned long long)sh->sector); 721 722 /* mark the computed target(s) as uptodate */ 723 mark_target_uptodate(sh, sh->ops.target); 724 mark_target_uptodate(sh, sh->ops.target2); 725 726 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 727 if (sh->check_state == check_state_compute_run) 728 sh->check_state = check_state_compute_result; 729 set_bit(STRIPE_HANDLE, &sh->state); 730 release_stripe(sh); 731 } 732 733 /* return a pointer to the address conversion region of the scribble buffer */ 734 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 735 struct raid5_percpu *percpu) 736 { 737 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 738 } 739 740 static struct dma_async_tx_descriptor * 741 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 742 { 743 int disks = sh->disks; 744 struct page **xor_srcs = percpu->scribble; 745 int target = sh->ops.target; 746 struct r5dev *tgt = &sh->dev[target]; 747 struct page *xor_dest = tgt->page; 748 int count = 0; 749 struct dma_async_tx_descriptor *tx; 750 struct async_submit_ctl submit; 751 int i; 752 753 pr_debug("%s: stripe %llu block: %d\n", 754 __func__, (unsigned long long)sh->sector, target); 755 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 756 757 
for (i = disks; i--; ) 758 if (i != target) 759 xor_srcs[count++] = sh->dev[i].page; 760 761 atomic_inc(&sh->count); 762 763 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 764 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 765 if (unlikely(count == 1)) 766 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 767 else 768 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 769 770 return tx; 771 } 772 773 /* set_syndrome_sources - populate source buffers for gen_syndrome 774 * @srcs - (struct page *) array of size sh->disks 775 * @sh - stripe_head to parse 776 * 777 * Populates srcs in proper layout order for the stripe and returns the 778 * 'count' of sources to be used in a call to async_gen_syndrome. The P 779 * destination buffer is recorded in srcs[count] and the Q destination 780 * is recorded in srcs[count+1]]. 781 */ 782 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 783 { 784 int disks = sh->disks; 785 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 786 int d0_idx = raid6_d0(sh); 787 int count; 788 int i; 789 790 for (i = 0; i < disks; i++) 791 srcs[i] = NULL; 792 793 count = 0; 794 i = d0_idx; 795 do { 796 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 797 798 srcs[slot] = sh->dev[i].page; 799 i = raid6_next_disk(i, disks); 800 } while (i != d0_idx); 801 802 return syndrome_disks; 803 } 804 805 static struct dma_async_tx_descriptor * 806 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 807 { 808 int disks = sh->disks; 809 struct page **blocks = percpu->scribble; 810 int target; 811 int qd_idx = sh->qd_idx; 812 struct dma_async_tx_descriptor *tx; 813 struct async_submit_ctl submit; 814 struct r5dev *tgt; 815 struct page *dest; 816 int i; 817 int count; 818 819 if (sh->ops.target < 0) 820 target = sh->ops.target2; 821 else if (sh->ops.target2 < 0) 822 target = sh->ops.target; 823 else 824 /* we should only have one valid target */ 825 BUG(); 826 BUG_ON(target < 0); 827 pr_debug("%s: stripe %llu block: %d\n", 828 __func__, (unsigned long long)sh->sector, target); 829 830 tgt = &sh->dev[target]; 831 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 832 dest = tgt->page; 833 834 atomic_inc(&sh->count); 835 836 if (target == qd_idx) { 837 count = set_syndrome_sources(blocks, sh); 838 blocks[count] = NULL; /* regenerating p is not necessary */ 839 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 840 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 841 ops_complete_compute, sh, 842 to_addr_conv(sh, percpu)); 843 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 844 } else { 845 /* Compute any data- or p-drive using XOR */ 846 count = 0; 847 for (i = disks; i-- ; ) { 848 if (i == target || i == qd_idx) 849 continue; 850 blocks[count++] = sh->dev[i].page; 851 } 852 853 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 854 NULL, ops_complete_compute, sh, 855 to_addr_conv(sh, percpu)); 856 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 857 } 858 859 return tx; 860 } 861 862 static struct dma_async_tx_descriptor * 863 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 864 { 865 int i, count, disks = sh->disks; 866 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 867 int d0_idx = raid6_d0(sh); 868 int faila = -1, failb = -1; 869 int target = sh->ops.target; 870 int target2 = sh->ops.target2; 871 struct r5dev *tgt = &sh->dev[target]; 872 struct r5dev *tgt2 = &sh->dev[target2]; 873 struct dma_async_tx_descriptor *tx; 874 struct page **blocks = percpu->scribble; 875 struct async_submit_ctl submit; 876 877 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 878 __func__, (unsigned long long)sh->sector, target, target2); 879 BUG_ON(target < 0 || target2 < 0); 880 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 881 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 882 883 /* we need to open-code set_syndrome_sources to handle the 884 * slot number conversion for 'faila' and 'failb' 885 */ 886 for (i = 0; i < disks ; i++) 887 blocks[i] = NULL; 888 count = 0; 889 i = d0_idx; 890 do { 891 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 892 893 blocks[slot] = sh->dev[i].page; 894 895 if (i == target) 896 faila = slot; 897 if (i == target2) 898 failb = slot; 899 i = raid6_next_disk(i, disks); 900 } while (i != d0_idx); 901 902 BUG_ON(faila == failb); 903 if (failb < faila) 904 swap(faila, failb); 905 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 906 __func__, (unsigned long long)sh->sector, faila, failb); 907 908 atomic_inc(&sh->count); 909 910 if (failb == syndrome_disks+1) { 911 /* Q disk is one of the missing disks */ 912 if (faila == syndrome_disks) { 913 /* Missing P+Q, just recompute */ 914 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 915 ops_complete_compute, sh, 916 to_addr_conv(sh, percpu)); 917 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 918 STRIPE_SIZE, &submit); 919 } else { 920 struct page *dest; 921 int data_target; 922 int qd_idx = sh->qd_idx; 923 924 /* Missing D+Q: recompute D from P, then recompute Q */ 925 if (target == qd_idx) 926 data_target = target2; 927 else 928 data_target = target; 929 930 count = 0; 931 for (i = disks; i-- ; ) { 932 if (i == data_target || i == qd_idx) 933 continue; 934 blocks[count++] = sh->dev[i].page; 935 } 936 dest = sh->dev[data_target].page; 937 init_async_submit(&submit, 938 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 939 NULL, NULL, NULL, 940 to_addr_conv(sh, percpu)); 941 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 942 &submit); 943 944 count = set_syndrome_sources(blocks, sh); 945 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 946 ops_complete_compute, sh, 947 to_addr_conv(sh, percpu)); 948 return async_gen_syndrome(blocks, 0, count+2, 949 STRIPE_SIZE, &submit); 950 } 951 } else { 952 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 953 ops_complete_compute, sh, 954 to_addr_conv(sh, percpu)); 955 if (failb == syndrome_disks) { 956 /* We're missing D+P. */ 957 return async_raid6_datap_recov(syndrome_disks+2, 958 STRIPE_SIZE, faila, 959 blocks, &submit); 960 } else { 961 /* We're missing D+D. 
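			 * Both failed slots hold data blocks, so recover
			 * both of them directly from P and Q.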
*/ 962 return async_raid6_2data_recov(syndrome_disks+2, 963 STRIPE_SIZE, faila, failb, 964 blocks, &submit); 965 } 966 } 967 } 968 969 970 static void ops_complete_prexor(void *stripe_head_ref) 971 { 972 struct stripe_head *sh = stripe_head_ref; 973 974 pr_debug("%s: stripe %llu\n", __func__, 975 (unsigned long long)sh->sector); 976 } 977 978 static struct dma_async_tx_descriptor * 979 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 980 struct dma_async_tx_descriptor *tx) 981 { 982 int disks = sh->disks; 983 struct page **xor_srcs = percpu->scribble; 984 int count = 0, pd_idx = sh->pd_idx, i; 985 struct async_submit_ctl submit; 986 987 /* existing parity data subtracted */ 988 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 989 990 pr_debug("%s: stripe %llu\n", __func__, 991 (unsigned long long)sh->sector); 992 993 for (i = disks; i--; ) { 994 struct r5dev *dev = &sh->dev[i]; 995 /* Only process blocks that are known to be uptodate */ 996 if (test_bit(R5_Wantdrain, &dev->flags)) 997 xor_srcs[count++] = dev->page; 998 } 999 1000 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1001 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1002 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1003 1004 return tx; 1005 } 1006 1007 static struct dma_async_tx_descriptor * 1008 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1009 { 1010 int disks = sh->disks; 1011 int i; 1012 1013 pr_debug("%s: stripe %llu\n", __func__, 1014 (unsigned long long)sh->sector); 1015 1016 for (i = disks; i--; ) { 1017 struct r5dev *dev = &sh->dev[i]; 1018 struct bio *chosen; 1019 1020 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1021 struct bio *wbi; 1022 1023 spin_lock(&sh->lock); 1024 chosen = dev->towrite; 1025 dev->towrite = NULL; 1026 BUG_ON(dev->written); 1027 wbi = dev->written = chosen; 1028 spin_unlock(&sh->lock); 1029 1030 while (wbi && wbi->bi_sector < 1031 dev->sector + STRIPE_SECTORS) { 1032 if (wbi->bi_rw & REQ_FUA) 1033 set_bit(R5_WantFUA, &dev->flags); 1034 tx = async_copy_data(1, wbi, dev->page, 1035 dev->sector, tx); 1036 wbi = r5_next_bio(wbi, dev->sector); 1037 } 1038 } 1039 } 1040 1041 return tx; 1042 } 1043 1044 static void ops_complete_reconstruct(void *stripe_head_ref) 1045 { 1046 struct stripe_head *sh = stripe_head_ref; 1047 int disks = sh->disks; 1048 int pd_idx = sh->pd_idx; 1049 int qd_idx = sh->qd_idx; 1050 int i; 1051 bool fua = false; 1052 1053 pr_debug("%s: stripe %llu\n", __func__, 1054 (unsigned long long)sh->sector); 1055 1056 for (i = disks; i--; ) 1057 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1058 1059 for (i = disks; i--; ) { 1060 struct r5dev *dev = &sh->dev[i]; 1061 1062 if (dev->written || i == pd_idx || i == qd_idx) { 1063 set_bit(R5_UPTODATE, &dev->flags); 1064 if (fua) 1065 set_bit(R5_WantFUA, &dev->flags); 1066 } 1067 } 1068 1069 if (sh->reconstruct_state == reconstruct_state_drain_run) 1070 sh->reconstruct_state = reconstruct_state_drain_result; 1071 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1072 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1073 else { 1074 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1075 sh->reconstruct_state = reconstruct_state_result; 1076 } 1077 1078 set_bit(STRIPE_HANDLE, &sh->state); 1079 release_stripe(sh); 1080 } 1081 1082 static void 1083 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1084 struct dma_async_tx_descriptor *tx) 1085 { 1086 int disks = 
sh->disks; 1087 struct page **xor_srcs = percpu->scribble; 1088 struct async_submit_ctl submit; 1089 int count = 0, pd_idx = sh->pd_idx, i; 1090 struct page *xor_dest; 1091 int prexor = 0; 1092 unsigned long flags; 1093 1094 pr_debug("%s: stripe %llu\n", __func__, 1095 (unsigned long long)sh->sector); 1096 1097 /* check if prexor is active which means only process blocks 1098 * that are part of a read-modify-write (written) 1099 */ 1100 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1101 prexor = 1; 1102 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1103 for (i = disks; i--; ) { 1104 struct r5dev *dev = &sh->dev[i]; 1105 if (dev->written) 1106 xor_srcs[count++] = dev->page; 1107 } 1108 } else { 1109 xor_dest = sh->dev[pd_idx].page; 1110 for (i = disks; i--; ) { 1111 struct r5dev *dev = &sh->dev[i]; 1112 if (i != pd_idx) 1113 xor_srcs[count++] = dev->page; 1114 } 1115 } 1116 1117 /* 1/ if we prexor'd then the dest is reused as a source 1118 * 2/ if we did not prexor then we are redoing the parity 1119 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1120 * for the synchronous xor case 1121 */ 1122 flags = ASYNC_TX_ACK | 1123 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1124 1125 atomic_inc(&sh->count); 1126 1127 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1128 to_addr_conv(sh, percpu)); 1129 if (unlikely(count == 1)) 1130 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1131 else 1132 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1133 } 1134 1135 static void 1136 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1137 struct dma_async_tx_descriptor *tx) 1138 { 1139 struct async_submit_ctl submit; 1140 struct page **blocks = percpu->scribble; 1141 int count; 1142 1143 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1144 1145 count = set_syndrome_sources(blocks, sh); 1146 1147 atomic_inc(&sh->count); 1148 1149 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1150 sh, to_addr_conv(sh, percpu)); 1151 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1152 } 1153 1154 static void ops_complete_check(void *stripe_head_ref) 1155 { 1156 struct stripe_head *sh = stripe_head_ref; 1157 1158 pr_debug("%s: stripe %llu\n", __func__, 1159 (unsigned long long)sh->sector); 1160 1161 sh->check_state = check_state_check_result; 1162 set_bit(STRIPE_HANDLE, &sh->state); 1163 release_stripe(sh); 1164 } 1165 1166 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1167 { 1168 int disks = sh->disks; 1169 int pd_idx = sh->pd_idx; 1170 int qd_idx = sh->qd_idx; 1171 struct page *xor_dest; 1172 struct page **xor_srcs = percpu->scribble; 1173 struct dma_async_tx_descriptor *tx; 1174 struct async_submit_ctl submit; 1175 int count; 1176 int i; 1177 1178 pr_debug("%s: stripe %llu\n", __func__, 1179 (unsigned long long)sh->sector); 1180 1181 count = 0; 1182 xor_dest = sh->dev[pd_idx].page; 1183 xor_srcs[count++] = xor_dest; 1184 for (i = disks; i--; ) { 1185 if (i == pd_idx || i == qd_idx) 1186 continue; 1187 xor_srcs[count++] = sh->dev[i].page; 1188 } 1189 1190 init_async_submit(&submit, 0, NULL, NULL, NULL, 1191 to_addr_conv(sh, percpu)); 1192 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1193 &sh->ops.zero_sum_result, &submit); 1194 1195 atomic_inc(&sh->count); 1196 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1197 tx = async_trigger_callback(&submit); 1198 
} 1199 1200 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1201 { 1202 struct page **srcs = percpu->scribble; 1203 struct async_submit_ctl submit; 1204 int count; 1205 1206 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1207 (unsigned long long)sh->sector, checkp); 1208 1209 count = set_syndrome_sources(srcs, sh); 1210 if (!checkp) 1211 srcs[count] = NULL; 1212 1213 atomic_inc(&sh->count); 1214 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1215 sh, to_addr_conv(sh, percpu)); 1216 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1217 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1218 } 1219 1220 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1221 { 1222 int overlap_clear = 0, i, disks = sh->disks; 1223 struct dma_async_tx_descriptor *tx = NULL; 1224 raid5_conf_t *conf = sh->raid_conf; 1225 int level = conf->level; 1226 struct raid5_percpu *percpu; 1227 unsigned long cpu; 1228 1229 cpu = get_cpu(); 1230 percpu = per_cpu_ptr(conf->percpu, cpu); 1231 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1232 ops_run_biofill(sh); 1233 overlap_clear++; 1234 } 1235 1236 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1237 if (level < 6) 1238 tx = ops_run_compute5(sh, percpu); 1239 else { 1240 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1241 tx = ops_run_compute6_1(sh, percpu); 1242 else 1243 tx = ops_run_compute6_2(sh, percpu); 1244 } 1245 /* terminate the chain if reconstruct is not set to be run */ 1246 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1247 async_tx_ack(tx); 1248 } 1249 1250 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1251 tx = ops_run_prexor(sh, percpu, tx); 1252 1253 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1254 tx = ops_run_biodrain(sh, tx); 1255 overlap_clear++; 1256 } 1257 1258 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1259 if (level < 6) 1260 ops_run_reconstruct5(sh, percpu, tx); 1261 else 1262 ops_run_reconstruct6(sh, percpu, tx); 1263 } 1264 1265 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1266 if (sh->check_state == check_state_run) 1267 ops_run_check_p(sh, percpu); 1268 else if (sh->check_state == check_state_run_q) 1269 ops_run_check_pq(sh, percpu, 0); 1270 else if (sh->check_state == check_state_run_pq) 1271 ops_run_check_pq(sh, percpu, 1); 1272 else 1273 BUG(); 1274 } 1275 1276 if (overlap_clear) 1277 for (i = disks; i--; ) { 1278 struct r5dev *dev = &sh->dev[i]; 1279 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1280 wake_up(&sh->raid_conf->wait_for_overlap); 1281 } 1282 put_cpu(); 1283 } 1284 1285 #ifdef CONFIG_MULTICORE_RAID456 1286 static void async_run_ops(void *param, async_cookie_t cookie) 1287 { 1288 struct stripe_head *sh = param; 1289 unsigned long ops_request = sh->ops.request; 1290 1291 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); 1292 wake_up(&sh->ops.wait_for_ops); 1293 1294 __raid_run_ops(sh, ops_request); 1295 release_stripe(sh); 1296 } 1297 1298 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1299 { 1300 /* since handle_stripe can be called outside of raid5d context 1301 * we need to ensure sh->ops.request is de-staged before another 1302 * request arrives 1303 */ 1304 wait_event(sh->ops.wait_for_ops, 1305 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); 1306 sh->ops.request = ops_request; 1307 1308 atomic_inc(&sh->count); 1309 async_schedule(async_run_ops, sh); 1310 } 1311 #else 1312 #define raid_run_ops __raid_run_ops 1313 #endif 1314 1315 static 
int grow_one_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;
	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
	sh->raid_conf = conf;
	spin_lock_init(&sh->lock);
#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
#endif

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(raid5_conf_t *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}

static int resize_stripes(raid5_conf_t *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
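	 *
	 * A minimal usage sketch (hypothetical caller shown only for
	 * illustration; the real reshape path performs additional checks
	 * before resizing):
	 *
	 *	err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
	 *	if (err)
	 *		return err;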
1414 */ 1415 struct stripe_head *osh, *nsh; 1416 LIST_HEAD(newstripes); 1417 struct disk_info *ndisks; 1418 unsigned long cpu; 1419 int err; 1420 struct kmem_cache *sc; 1421 int i; 1422 1423 if (newsize <= conf->pool_size) 1424 return 0; /* never bother to shrink */ 1425 1426 err = md_allow_write(conf->mddev); 1427 if (err) 1428 return err; 1429 1430 /* Step 1 */ 1431 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1432 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1433 0, 0, NULL); 1434 if (!sc) 1435 return -ENOMEM; 1436 1437 for (i = conf->max_nr_stripes; i; i--) { 1438 nsh = kmem_cache_alloc(sc, GFP_KERNEL); 1439 if (!nsh) 1440 break; 1441 1442 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); 1443 1444 nsh->raid_conf = conf; 1445 spin_lock_init(&nsh->lock); 1446 #ifdef CONFIG_MULTICORE_RAID456 1447 init_waitqueue_head(&nsh->ops.wait_for_ops); 1448 #endif 1449 1450 list_add(&nsh->lru, &newstripes); 1451 } 1452 if (i) { 1453 /* didn't get enough, give up */ 1454 while (!list_empty(&newstripes)) { 1455 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1456 list_del(&nsh->lru); 1457 kmem_cache_free(sc, nsh); 1458 } 1459 kmem_cache_destroy(sc); 1460 return -ENOMEM; 1461 } 1462 /* Step 2 - Must use GFP_NOIO now. 1463 * OK, we have enough stripes, start collecting inactive 1464 * stripes and copying them over 1465 */ 1466 list_for_each_entry(nsh, &newstripes, lru) { 1467 spin_lock_irq(&conf->device_lock); 1468 wait_event_lock_irq(conf->wait_for_stripe, 1469 !list_empty(&conf->inactive_list), 1470 conf->device_lock, 1471 ); 1472 osh = get_free_stripe(conf); 1473 spin_unlock_irq(&conf->device_lock); 1474 atomic_set(&nsh->count, 1); 1475 for(i=0; i<conf->pool_size; i++) 1476 nsh->dev[i].page = osh->dev[i].page; 1477 for( ; i<newsize; i++) 1478 nsh->dev[i].page = NULL; 1479 kmem_cache_free(conf->slab_cache, osh); 1480 } 1481 kmem_cache_destroy(conf->slab_cache); 1482 1483 /* Step 3. 
1484 * At this point, we are holding all the stripes so the array 1485 * is completely stalled, so now is a good time to resize 1486 * conf->disks and the scribble region 1487 */ 1488 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1489 if (ndisks) { 1490 for (i=0; i<conf->raid_disks; i++) 1491 ndisks[i] = conf->disks[i]; 1492 kfree(conf->disks); 1493 conf->disks = ndisks; 1494 } else 1495 err = -ENOMEM; 1496 1497 get_online_cpus(); 1498 conf->scribble_len = scribble_len(newsize); 1499 for_each_present_cpu(cpu) { 1500 struct raid5_percpu *percpu; 1501 void *scribble; 1502 1503 percpu = per_cpu_ptr(conf->percpu, cpu); 1504 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1505 1506 if (scribble) { 1507 kfree(percpu->scribble); 1508 percpu->scribble = scribble; 1509 } else { 1510 err = -ENOMEM; 1511 break; 1512 } 1513 } 1514 put_online_cpus(); 1515 1516 /* Step 4, return new stripes to service */ 1517 while(!list_empty(&newstripes)) { 1518 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1519 list_del_init(&nsh->lru); 1520 1521 for (i=conf->raid_disks; i < newsize; i++) 1522 if (nsh->dev[i].page == NULL) { 1523 struct page *p = alloc_page(GFP_NOIO); 1524 nsh->dev[i].page = p; 1525 if (!p) 1526 err = -ENOMEM; 1527 } 1528 release_stripe(nsh); 1529 } 1530 /* critical section pass, GFP_NOIO no longer needed */ 1531 1532 conf->slab_cache = sc; 1533 conf->active_name = 1-conf->active_name; 1534 conf->pool_size = newsize; 1535 return err; 1536 } 1537 1538 static int drop_one_stripe(raid5_conf_t *conf) 1539 { 1540 struct stripe_head *sh; 1541 1542 spin_lock_irq(&conf->device_lock); 1543 sh = get_free_stripe(conf); 1544 spin_unlock_irq(&conf->device_lock); 1545 if (!sh) 1546 return 0; 1547 BUG_ON(atomic_read(&sh->count)); 1548 shrink_buffers(sh); 1549 kmem_cache_free(conf->slab_cache, sh); 1550 atomic_dec(&conf->active_stripes); 1551 return 1; 1552 } 1553 1554 static void shrink_stripes(raid5_conf_t *conf) 1555 { 1556 while (drop_one_stripe(conf)) 1557 ; 1558 1559 if (conf->slab_cache) 1560 kmem_cache_destroy(conf->slab_cache); 1561 conf->slab_cache = NULL; 1562 } 1563 1564 static void raid5_end_read_request(struct bio * bi, int error) 1565 { 1566 struct stripe_head *sh = bi->bi_private; 1567 raid5_conf_t *conf = sh->raid_conf; 1568 int disks = sh->disks, i; 1569 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1570 char b[BDEVNAME_SIZE]; 1571 mdk_rdev_t *rdev; 1572 1573 1574 for (i=0 ; i<disks; i++) 1575 if (bi == &sh->dev[i].req) 1576 break; 1577 1578 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1579 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1580 uptodate); 1581 if (i == disks) { 1582 BUG(); 1583 return; 1584 } 1585 1586 if (uptodate) { 1587 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1588 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1589 rdev = conf->disks[i].rdev; 1590 printk_rl(KERN_INFO "md/raid:%s: read error corrected" 1591 " (%lu sectors at %llu on %s)\n", 1592 mdname(conf->mddev), STRIPE_SECTORS, 1593 (unsigned long long)(sh->sector 1594 + rdev->data_offset), 1595 bdevname(rdev->bdev, b)); 1596 clear_bit(R5_ReadError, &sh->dev[i].flags); 1597 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1598 } 1599 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1600 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1601 } else { 1602 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1603 int retry = 0; 1604 rdev = conf->disks[i].rdev; 1605 1606 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1607 atomic_inc(&rdev->read_errors); 1608 
if (conf->mddev->degraded >= conf->max_degraded) 1609 printk_rl(KERN_WARNING 1610 "md/raid:%s: read error not correctable " 1611 "(sector %llu on %s).\n", 1612 mdname(conf->mddev), 1613 (unsigned long long)(sh->sector 1614 + rdev->data_offset), 1615 bdn); 1616 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1617 /* Oh, no!!! */ 1618 printk_rl(KERN_WARNING 1619 "md/raid:%s: read error NOT corrected!! " 1620 "(sector %llu on %s).\n", 1621 mdname(conf->mddev), 1622 (unsigned long long)(sh->sector 1623 + rdev->data_offset), 1624 bdn); 1625 else if (atomic_read(&rdev->read_errors) 1626 > conf->max_nr_stripes) 1627 printk(KERN_WARNING 1628 "md/raid:%s: Too many read errors, failing device %s.\n", 1629 mdname(conf->mddev), bdn); 1630 else 1631 retry = 1; 1632 if (retry) 1633 set_bit(R5_ReadError, &sh->dev[i].flags); 1634 else { 1635 clear_bit(R5_ReadError, &sh->dev[i].flags); 1636 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1637 md_error(conf->mddev, rdev); 1638 } 1639 } 1640 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1641 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1642 set_bit(STRIPE_HANDLE, &sh->state); 1643 release_stripe(sh); 1644 } 1645 1646 static void raid5_end_write_request(struct bio *bi, int error) 1647 { 1648 struct stripe_head *sh = bi->bi_private; 1649 raid5_conf_t *conf = sh->raid_conf; 1650 int disks = sh->disks, i; 1651 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1652 1653 for (i=0 ; i<disks; i++) 1654 if (bi == &sh->dev[i].req) 1655 break; 1656 1657 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1658 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1659 uptodate); 1660 if (i == disks) { 1661 BUG(); 1662 return; 1663 } 1664 1665 if (!uptodate) 1666 md_error(conf->mddev, conf->disks[i].rdev); 1667 1668 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1669 1670 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1671 set_bit(STRIPE_HANDLE, &sh->state); 1672 release_stripe(sh); 1673 } 1674 1675 1676 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1677 1678 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1679 { 1680 struct r5dev *dev = &sh->dev[i]; 1681 1682 bio_init(&dev->req); 1683 dev->req.bi_io_vec = &dev->vec; 1684 dev->req.bi_vcnt++; 1685 dev->req.bi_max_vecs++; 1686 dev->vec.bv_page = dev->page; 1687 dev->vec.bv_len = STRIPE_SIZE; 1688 dev->vec.bv_offset = 0; 1689 1690 dev->req.bi_sector = sh->sector; 1691 dev->req.bi_private = sh; 1692 1693 dev->flags = 0; 1694 dev->sector = compute_blocknr(sh, i, previous); 1695 } 1696 1697 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1698 { 1699 char b[BDEVNAME_SIZE]; 1700 raid5_conf_t *conf = mddev->private; 1701 pr_debug("raid456: error called\n"); 1702 1703 if (!test_bit(Faulty, &rdev->flags)) { 1704 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1705 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1706 unsigned long flags; 1707 spin_lock_irqsave(&conf->device_lock, flags); 1708 mddev->degraded++; 1709 spin_unlock_irqrestore(&conf->device_lock, flags); 1710 /* 1711 * if recovery was running, make sure it aborts. 
1712 */ 1713 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1714 } 1715 set_bit(Faulty, &rdev->flags); 1716 printk(KERN_ALERT 1717 "md/raid:%s: Disk failure on %s, disabling device.\n" 1718 "md/raid:%s: Operation continuing on %d devices.\n", 1719 mdname(mddev), 1720 bdevname(rdev->bdev, b), 1721 mdname(mddev), 1722 conf->raid_disks - mddev->degraded); 1723 } 1724 } 1725 1726 /* 1727 * Input: a 'big' sector number, 1728 * Output: index of the data and parity disk, and the sector # in them. 1729 */ 1730 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, 1731 int previous, int *dd_idx, 1732 struct stripe_head *sh) 1733 { 1734 sector_t stripe, stripe2; 1735 sector_t chunk_number; 1736 unsigned int chunk_offset; 1737 int pd_idx, qd_idx; 1738 int ddf_layout = 0; 1739 sector_t new_sector; 1740 int algorithm = previous ? conf->prev_algo 1741 : conf->algorithm; 1742 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1743 : conf->chunk_sectors; 1744 int raid_disks = previous ? conf->previous_raid_disks 1745 : conf->raid_disks; 1746 int data_disks = raid_disks - conf->max_degraded; 1747 1748 /* First compute the information on this sector */ 1749 1750 /* 1751 * Compute the chunk number and the sector offset inside the chunk 1752 */ 1753 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1754 chunk_number = r_sector; 1755 1756 /* 1757 * Compute the stripe number 1758 */ 1759 stripe = chunk_number; 1760 *dd_idx = sector_div(stripe, data_disks); 1761 stripe2 = stripe; 1762 /* 1763 * Select the parity disk based on the user selected algorithm. 1764 */ 1765 pd_idx = qd_idx = ~0; 1766 switch(conf->level) { 1767 case 4: 1768 pd_idx = data_disks; 1769 break; 1770 case 5: 1771 switch (algorithm) { 1772 case ALGORITHM_LEFT_ASYMMETRIC: 1773 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1774 if (*dd_idx >= pd_idx) 1775 (*dd_idx)++; 1776 break; 1777 case ALGORITHM_RIGHT_ASYMMETRIC: 1778 pd_idx = sector_div(stripe2, raid_disks); 1779 if (*dd_idx >= pd_idx) 1780 (*dd_idx)++; 1781 break; 1782 case ALGORITHM_LEFT_SYMMETRIC: 1783 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1784 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1785 break; 1786 case ALGORITHM_RIGHT_SYMMETRIC: 1787 pd_idx = sector_div(stripe2, raid_disks); 1788 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1789 break; 1790 case ALGORITHM_PARITY_0: 1791 pd_idx = 0; 1792 (*dd_idx)++; 1793 break; 1794 case ALGORITHM_PARITY_N: 1795 pd_idx = data_disks; 1796 break; 1797 default: 1798 BUG(); 1799 } 1800 break; 1801 case 6: 1802 1803 switch (algorithm) { 1804 case ALGORITHM_LEFT_ASYMMETRIC: 1805 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1806 qd_idx = pd_idx + 1; 1807 if (pd_idx == raid_disks-1) { 1808 (*dd_idx)++; /* Q D D D P */ 1809 qd_idx = 0; 1810 } else if (*dd_idx >= pd_idx) 1811 (*dd_idx) += 2; /* D D P Q D */ 1812 break; 1813 case ALGORITHM_RIGHT_ASYMMETRIC: 1814 pd_idx = sector_div(stripe2, raid_disks); 1815 qd_idx = pd_idx + 1; 1816 if (pd_idx == raid_disks-1) { 1817 (*dd_idx)++; /* Q D D D P */ 1818 qd_idx = 0; 1819 } else if (*dd_idx >= pd_idx) 1820 (*dd_idx) += 2; /* D D P Q D */ 1821 break; 1822 case ALGORITHM_LEFT_SYMMETRIC: 1823 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1824 qd_idx = (pd_idx + 1) % raid_disks; 1825 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1826 break; 1827 case ALGORITHM_RIGHT_SYMMETRIC: 1828 pd_idx = sector_div(stripe2, raid_disks); 1829 qd_idx = (pd_idx + 1) % raid_disks; 1830 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1831 
			break;

		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			qd_idx = 1;
			(*dd_idx) += 2;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			qd_idx = data_disks + 1;
			break;

		case ALGORITHM_ROTATING_ZERO_RESTART:
			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
			 * of blocks for computing Q is different.
			 */
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_RESTART:
			/* Same as left_asymmetric, but the first stripe is
			 * D D D P Q  rather than
			 * Q D D D P
			 */
			stripe2 += 1;
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_CONTINUE:
			/* Same as left_symmetric but Q is before P */
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			ddf_layout = 1;
			break;

		case ALGORITHM_LEFT_ASYMMETRIC_6:
			/* RAID5 left_asymmetric, with Q on last device */
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_ASYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_LEFT_SYMMETRIC_6:
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_SYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_PARITY_0_6:
			pd_idx = 0;
			(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		default:
			BUG();
		}
		break;
	}

	if (sh) {
		sh->pd_idx = pd_idx;
		sh->qd_idx = qd_idx;
		sh->ddf_layout = ddf_layout;
	}
	/*
	 * Finally, compute the new sector number
	 */
	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}


static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = sh->disks;
	int data_disks = raid_disks - conf->max_degraded;
	sector_t new_sector = sh->sector, check;
	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
					 : conf->chunk_sectors;
	int algorithm = previous ?
conf->prev_algo 1942 : conf->algorithm; 1943 sector_t stripe; 1944 int chunk_offset; 1945 sector_t chunk_number; 1946 int dummy1, dd_idx = i; 1947 sector_t r_sector; 1948 struct stripe_head sh2; 1949 1950 1951 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1952 stripe = new_sector; 1953 1954 if (i == sh->pd_idx) 1955 return 0; 1956 switch(conf->level) { 1957 case 4: break; 1958 case 5: 1959 switch (algorithm) { 1960 case ALGORITHM_LEFT_ASYMMETRIC: 1961 case ALGORITHM_RIGHT_ASYMMETRIC: 1962 if (i > sh->pd_idx) 1963 i--; 1964 break; 1965 case ALGORITHM_LEFT_SYMMETRIC: 1966 case ALGORITHM_RIGHT_SYMMETRIC: 1967 if (i < sh->pd_idx) 1968 i += raid_disks; 1969 i -= (sh->pd_idx + 1); 1970 break; 1971 case ALGORITHM_PARITY_0: 1972 i -= 1; 1973 break; 1974 case ALGORITHM_PARITY_N: 1975 break; 1976 default: 1977 BUG(); 1978 } 1979 break; 1980 case 6: 1981 if (i == sh->qd_idx) 1982 return 0; /* It is the Q disk */ 1983 switch (algorithm) { 1984 case ALGORITHM_LEFT_ASYMMETRIC: 1985 case ALGORITHM_RIGHT_ASYMMETRIC: 1986 case ALGORITHM_ROTATING_ZERO_RESTART: 1987 case ALGORITHM_ROTATING_N_RESTART: 1988 if (sh->pd_idx == raid_disks-1) 1989 i--; /* Q D D D P */ 1990 else if (i > sh->pd_idx) 1991 i -= 2; /* D D P Q D */ 1992 break; 1993 case ALGORITHM_LEFT_SYMMETRIC: 1994 case ALGORITHM_RIGHT_SYMMETRIC: 1995 if (sh->pd_idx == raid_disks-1) 1996 i--; /* Q D D D P */ 1997 else { 1998 /* D D P Q D */ 1999 if (i < sh->pd_idx) 2000 i += raid_disks; 2001 i -= (sh->pd_idx + 2); 2002 } 2003 break; 2004 case ALGORITHM_PARITY_0: 2005 i -= 2; 2006 break; 2007 case ALGORITHM_PARITY_N: 2008 break; 2009 case ALGORITHM_ROTATING_N_CONTINUE: 2010 /* Like left_symmetric, but P is before Q */ 2011 if (sh->pd_idx == 0) 2012 i--; /* P D D D Q */ 2013 else { 2014 /* D D Q P D */ 2015 if (i < sh->pd_idx) 2016 i += raid_disks; 2017 i -= (sh->pd_idx + 1); 2018 } 2019 break; 2020 case ALGORITHM_LEFT_ASYMMETRIC_6: 2021 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2022 if (i > sh->pd_idx) 2023 i--; 2024 break; 2025 case ALGORITHM_LEFT_SYMMETRIC_6: 2026 case ALGORITHM_RIGHT_SYMMETRIC_6: 2027 if (i < sh->pd_idx) 2028 i += data_disks + 1; 2029 i -= (sh->pd_idx + 1); 2030 break; 2031 case ALGORITHM_PARITY_0_6: 2032 i -= 1; 2033 break; 2034 default: 2035 BUG(); 2036 } 2037 break; 2038 } 2039 2040 chunk_number = stripe * data_disks + i; 2041 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2042 2043 check = raid5_compute_sector(conf, r_sector, 2044 previous, &dummy1, &sh2); 2045 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2046 || sh2.qd_idx != sh->qd_idx) { 2047 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2048 mdname(conf->mddev)); 2049 return 0; 2050 } 2051 return r_sector; 2052 } 2053 2054 2055 static void 2056 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2057 int rcw, int expand) 2058 { 2059 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2060 raid5_conf_t *conf = sh->raid_conf; 2061 int level = conf->level; 2062 2063 if (rcw) { 2064 /* if we are not expanding this is a proper write request, and 2065 * there will be bios with new data to be drained into the 2066 * stripe cache 2067 */ 2068 if (!expand) { 2069 sh->reconstruct_state = reconstruct_state_drain_run; 2070 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2071 } else 2072 sh->reconstruct_state = reconstruct_state_run; 2073 2074 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2075 2076 for (i = disks; i--; ) { 2077 struct r5dev *dev = &sh->dev[i]; 2078 2079 if (dev->towrite) { 2080 
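/* new data will be drained into this block, so lock it; unless we are
 * expanding, the old contents are about to be overwritten and can no
 * longer be treated as uptodate
 */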
set_bit(R5_LOCKED, &dev->flags); 2081 set_bit(R5_Wantdrain, &dev->flags); 2082 if (!expand) 2083 clear_bit(R5_UPTODATE, &dev->flags); 2084 s->locked++; 2085 } 2086 } 2087 if (s->locked + conf->max_degraded == disks) 2088 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2089 atomic_inc(&conf->pending_full_writes); 2090 } else { 2091 BUG_ON(level == 6); 2092 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2093 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2094 2095 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2096 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2097 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2098 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2099 2100 for (i = disks; i--; ) { 2101 struct r5dev *dev = &sh->dev[i]; 2102 if (i == pd_idx) 2103 continue; 2104 2105 if (dev->towrite && 2106 (test_bit(R5_UPTODATE, &dev->flags) || 2107 test_bit(R5_Wantcompute, &dev->flags))) { 2108 set_bit(R5_Wantdrain, &dev->flags); 2109 set_bit(R5_LOCKED, &dev->flags); 2110 clear_bit(R5_UPTODATE, &dev->flags); 2111 s->locked++; 2112 } 2113 } 2114 } 2115 2116 /* keep the parity disk(s) locked while asynchronous operations 2117 * are in flight 2118 */ 2119 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2120 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2121 s->locked++; 2122 2123 if (level == 6) { 2124 int qd_idx = sh->qd_idx; 2125 struct r5dev *dev = &sh->dev[qd_idx]; 2126 2127 set_bit(R5_LOCKED, &dev->flags); 2128 clear_bit(R5_UPTODATE, &dev->flags); 2129 s->locked++; 2130 } 2131 2132 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2133 __func__, (unsigned long long)sh->sector, 2134 s->locked, s->ops_request); 2135 } 2136 2137 /* 2138 * Each stripe/dev can have one or more bion attached. 2139 * toread/towrite point to the first in a chain. 2140 * The bi_next chain must be in order. 
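 * add_stripe_bio() below returns 1 on success; if a new bio would overlap one
 * already queued it sets R5_Overlap and returns 0 so the submitter can wait on
 * wait_for_overlap and retry.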
2141 */ 2142 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2143 { 2144 struct bio **bip; 2145 raid5_conf_t *conf = sh->raid_conf; 2146 int firstwrite=0; 2147 2148 pr_debug("adding bh b#%llu to stripe s#%llu\n", 2149 (unsigned long long)bi->bi_sector, 2150 (unsigned long long)sh->sector); 2151 2152 2153 spin_lock(&sh->lock); 2154 spin_lock_irq(&conf->device_lock); 2155 if (forwrite) { 2156 bip = &sh->dev[dd_idx].towrite; 2157 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2158 firstwrite = 1; 2159 } else 2160 bip = &sh->dev[dd_idx].toread; 2161 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2162 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2163 goto overlap; 2164 bip = & (*bip)->bi_next; 2165 } 2166 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2167 goto overlap; 2168 2169 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2170 if (*bip) 2171 bi->bi_next = *bip; 2172 *bip = bi; 2173 bi->bi_phys_segments++; 2174 spin_unlock_irq(&conf->device_lock); 2175 spin_unlock(&sh->lock); 2176 2177 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2178 (unsigned long long)bi->bi_sector, 2179 (unsigned long long)sh->sector, dd_idx); 2180 2181 if (conf->mddev->bitmap && firstwrite) { 2182 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2183 STRIPE_SECTORS, 0); 2184 sh->bm_seq = conf->seq_flush+1; 2185 set_bit(STRIPE_BIT_DELAY, &sh->state); 2186 } 2187 2188 if (forwrite) { 2189 /* check if page is covered */ 2190 sector_t sector = sh->dev[dd_idx].sector; 2191 for (bi=sh->dev[dd_idx].towrite; 2192 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2193 bi && bi->bi_sector <= sector; 2194 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2195 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2196 sector = bi->bi_sector + (bi->bi_size>>9); 2197 } 2198 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2199 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2200 } 2201 return 1; 2202 2203 overlap: 2204 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2205 spin_unlock_irq(&conf->device_lock); 2206 spin_unlock(&sh->lock); 2207 return 0; 2208 } 2209 2210 static void end_reshape(raid5_conf_t *conf); 2211 2212 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2213 struct stripe_head *sh) 2214 { 2215 int sectors_per_chunk = 2216 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2217 int dd_idx; 2218 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2219 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2220 2221 raid5_compute_sector(conf, 2222 stripe * (disks - conf->max_degraded) 2223 *sectors_per_chunk + chunk_offset, 2224 previous, 2225 &dd_idx, sh); 2226 } 2227 2228 static void 2229 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, 2230 struct stripe_head_state *s, int disks, 2231 struct bio **return_bi) 2232 { 2233 int i; 2234 for (i = disks; i--; ) { 2235 struct bio *bi; 2236 int bitmap_end = 0; 2237 2238 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2239 mdk_rdev_t *rdev; 2240 rcu_read_lock(); 2241 rdev = rcu_dereference(conf->disks[i].rdev); 2242 if (rdev && test_bit(In_sync, &rdev->flags)) 2243 /* multiple read failures in one stripe */ 2244 md_error(conf->mddev, rdev); 2245 rcu_read_unlock(); 2246 } 2247 spin_lock_irq(&conf->device_lock); 2248 /* fail all writes first */ 2249 bi = sh->dev[i].towrite; 2250 sh->dev[i].towrite = NULL; 2251 if (bi) { 2252 s->to_write--; 2253 bitmap_end = 1; 2254 } 2255 2256 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2257 wake_up(&conf->wait_for_overlap); 2258 2259 while (bi && bi->bi_sector < 2260 sh->dev[i].sector + STRIPE_SECTORS) { 2261 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2262 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2263 if (!raid5_dec_bi_phys_segments(bi)) { 2264 md_write_end(conf->mddev); 2265 bi->bi_next = *return_bi; 2266 *return_bi = bi; 2267 } 2268 bi = nextbi; 2269 } 2270 /* and fail all 'written' */ 2271 bi = sh->dev[i].written; 2272 sh->dev[i].written = NULL; 2273 if (bi) bitmap_end = 1; 2274 while (bi && bi->bi_sector < 2275 sh->dev[i].sector + STRIPE_SECTORS) { 2276 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2277 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2278 if (!raid5_dec_bi_phys_segments(bi)) { 2279 md_write_end(conf->mddev); 2280 bi->bi_next = *return_bi; 2281 *return_bi = bi; 2282 } 2283 bi = bi2; 2284 } 2285 2286 /* fail any reads if this device is non-operational and 2287 * the data has not reached the cache yet. 2288 */ 2289 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2290 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2291 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2292 bi = sh->dev[i].toread; 2293 sh->dev[i].toread = NULL; 2294 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2295 wake_up(&conf->wait_for_overlap); 2296 if (bi) s->to_read--; 2297 while (bi && bi->bi_sector < 2298 sh->dev[i].sector + STRIPE_SECTORS) { 2299 struct bio *nextbi = 2300 r5_next_bio(bi, sh->dev[i].sector); 2301 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2302 if (!raid5_dec_bi_phys_segments(bi)) { 2303 bi->bi_next = *return_bi; 2304 *return_bi = bi; 2305 } 2306 bi = nextbi; 2307 } 2308 } 2309 spin_unlock_irq(&conf->device_lock); 2310 if (bitmap_end) 2311 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2312 STRIPE_SECTORS, 0, 0); 2313 } 2314 2315 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2316 if (atomic_dec_and_test(&conf->pending_full_writes)) 2317 md_wakeup_thread(conf->mddev->thread); 2318 } 2319 2320 /* fetch_block5 - checks the given member device to see if its data needs 2321 * to be read or computed to satisfy a request. 
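 * A block is computed rather than read only when it belongs to the failed
 * device and every other block in the stripe is already uptodate
 * (uptodate == disks - 1); otherwise it is simply read from an in-sync disk.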
2322 * 2323 * Returns 1 when no more member devices need to be checked, otherwise returns 2324 * 0 to tell the loop in handle_stripe_fill5 to continue 2325 */ 2326 static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, 2327 int disk_idx, int disks) 2328 { 2329 struct r5dev *dev = &sh->dev[disk_idx]; 2330 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 2331 2332 /* is the data in this block needed, and can we get it? */ 2333 if (!test_bit(R5_LOCKED, &dev->flags) && 2334 !test_bit(R5_UPTODATE, &dev->flags) && 2335 (dev->toread || 2336 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2337 s->syncing || s->expanding || 2338 (s->failed && 2339 (failed_dev->toread || 2340 (failed_dev->towrite && 2341 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { 2342 /* We would like to get this block, possibly by computing it, 2343 * otherwise read it if the backing disk is insync 2344 */ 2345 if ((s->uptodate == disks - 1) && 2346 (s->failed && disk_idx == s->failed_num)) { 2347 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2348 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2349 set_bit(R5_Wantcompute, &dev->flags); 2350 sh->ops.target = disk_idx; 2351 sh->ops.target2 = -1; 2352 s->req_compute = 1; 2353 /* Careful: from this point on 'uptodate' is in the eye 2354 * of raid_run_ops which services 'compute' operations 2355 * before writes. R5_Wantcompute flags a block that will 2356 * be R5_UPTODATE by the time it is needed for a 2357 * subsequent operation. 2358 */ 2359 s->uptodate++; 2360 return 1; /* uptodate + compute == disks */ 2361 } else if (test_bit(R5_Insync, &dev->flags)) { 2362 set_bit(R5_LOCKED, &dev->flags); 2363 set_bit(R5_Wantread, &dev->flags); 2364 s->locked++; 2365 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 2366 s->syncing); 2367 } 2368 } 2369 2370 return 0; 2371 } 2372 2373 /** 2374 * handle_stripe_fill5 - read or compute data to satisfy pending requests. 2375 */ 2376 static void handle_stripe_fill5(struct stripe_head *sh, 2377 struct stripe_head_state *s, int disks) 2378 { 2379 int i; 2380 2381 /* look for blocks to read/compute, skip this if a compute 2382 * is already in flight, or if the stripe contents are in the 2383 * midst of changing due to a write 2384 */ 2385 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2386 !sh->reconstruct_state) 2387 for (i = disks; i--; ) 2388 if (fetch_block5(sh, s, i, disks)) 2389 break; 2390 set_bit(STRIPE_HANDLE, &sh->state); 2391 } 2392 2393 /* fetch_block6 - checks the given member device to see if its data needs 2394 * to be read or computed to satisfy a request. 
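 * Unlike the raid5 version this can also schedule a two-target compute
 * (ops.target and ops.target2) when two blocks are missing and the
 * remaining disks - 2 blocks are already uptodate.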
2395 * 2396 * Returns 1 when no more member devices need to be checked, otherwise returns 2397 * 0 to tell the loop in handle_stripe_fill6 to continue 2398 */ 2399 static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, 2400 struct r6_state *r6s, int disk_idx, int disks) 2401 { 2402 struct r5dev *dev = &sh->dev[disk_idx]; 2403 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], 2404 &sh->dev[r6s->failed_num[1]] }; 2405 2406 if (!test_bit(R5_LOCKED, &dev->flags) && 2407 !test_bit(R5_UPTODATE, &dev->flags) && 2408 (dev->toread || 2409 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2410 s->syncing || s->expanding || 2411 (s->failed >= 1 && 2412 (fdev[0]->toread || s->to_write)) || 2413 (s->failed >= 2 && 2414 (fdev[1]->toread || s->to_write)))) { 2415 /* we would like to get this block, possibly by computing it, 2416 * otherwise read it if the backing disk is insync 2417 */ 2418 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2419 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2420 if ((s->uptodate == disks - 1) && 2421 (s->failed && (disk_idx == r6s->failed_num[0] || 2422 disk_idx == r6s->failed_num[1]))) { 2423 /* have disk failed, and we're requested to fetch it; 2424 * do compute it 2425 */ 2426 pr_debug("Computing stripe %llu block %d\n", 2427 (unsigned long long)sh->sector, disk_idx); 2428 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2429 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2430 set_bit(R5_Wantcompute, &dev->flags); 2431 sh->ops.target = disk_idx; 2432 sh->ops.target2 = -1; /* no 2nd target */ 2433 s->req_compute = 1; 2434 s->uptodate++; 2435 return 1; 2436 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2437 /* Computing 2-failure is *very* expensive; only 2438 * do it if failed >= 2 2439 */ 2440 int other; 2441 for (other = disks; other--; ) { 2442 if (other == disk_idx) 2443 continue; 2444 if (!test_bit(R5_UPTODATE, 2445 &sh->dev[other].flags)) 2446 break; 2447 } 2448 BUG_ON(other < 0); 2449 pr_debug("Computing stripe %llu blocks %d,%d\n", 2450 (unsigned long long)sh->sector, 2451 disk_idx, other); 2452 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2453 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2454 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2455 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2456 sh->ops.target = disk_idx; 2457 sh->ops.target2 = other; 2458 s->uptodate += 2; 2459 s->req_compute = 1; 2460 return 1; 2461 } else if (test_bit(R5_Insync, &dev->flags)) { 2462 set_bit(R5_LOCKED, &dev->flags); 2463 set_bit(R5_Wantread, &dev->flags); 2464 s->locked++; 2465 pr_debug("Reading block %d (sync=%d)\n", 2466 disk_idx, s->syncing); 2467 } 2468 } 2469 2470 return 0; 2471 } 2472 2473 /** 2474 * handle_stripe_fill6 - read or compute data to satisfy pending requests. 2475 */ 2476 static void handle_stripe_fill6(struct stripe_head *sh, 2477 struct stripe_head_state *s, struct r6_state *r6s, 2478 int disks) 2479 { 2480 int i; 2481 2482 /* look for blocks to read/compute, skip this if a compute 2483 * is already in flight, or if the stripe contents are in the 2484 * midst of changing due to a write 2485 */ 2486 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2487 !sh->reconstruct_state) 2488 for (i = disks; i--; ) 2489 if (fetch_block6(sh, s, r6s, i, disks)) 2490 break; 2491 set_bit(STRIPE_HANDLE, &sh->state); 2492 } 2493 2494 2495 /* handle_stripe_clean_event 2496 * any written block on an uptodate or failed drive can be returned. 
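 * Completed write bios are moved onto *return_bi, and once a device has
 * nothing further queued to write, bitmap_endwrite() is called for the stripe.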
2497 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2498 * never LOCKED, so we don't need to test 'failed' directly. 2499 */ 2500 static void handle_stripe_clean_event(raid5_conf_t *conf, 2501 struct stripe_head *sh, int disks, struct bio **return_bi) 2502 { 2503 int i; 2504 struct r5dev *dev; 2505 2506 for (i = disks; i--; ) 2507 if (sh->dev[i].written) { 2508 dev = &sh->dev[i]; 2509 if (!test_bit(R5_LOCKED, &dev->flags) && 2510 test_bit(R5_UPTODATE, &dev->flags)) { 2511 /* We can return any write requests */ 2512 struct bio *wbi, *wbi2; 2513 int bitmap_end = 0; 2514 pr_debug("Return write for disc %d\n", i); 2515 spin_lock_irq(&conf->device_lock); 2516 wbi = dev->written; 2517 dev->written = NULL; 2518 while (wbi && wbi->bi_sector < 2519 dev->sector + STRIPE_SECTORS) { 2520 wbi2 = r5_next_bio(wbi, dev->sector); 2521 if (!raid5_dec_bi_phys_segments(wbi)) { 2522 md_write_end(conf->mddev); 2523 wbi->bi_next = *return_bi; 2524 *return_bi = wbi; 2525 } 2526 wbi = wbi2; 2527 } 2528 if (dev->towrite == NULL) 2529 bitmap_end = 1; 2530 spin_unlock_irq(&conf->device_lock); 2531 if (bitmap_end) 2532 bitmap_endwrite(conf->mddev->bitmap, 2533 sh->sector, 2534 STRIPE_SECTORS, 2535 !test_bit(STRIPE_DEGRADED, &sh->state), 2536 0); 2537 } 2538 } 2539 2540 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2541 if (atomic_dec_and_test(&conf->pending_full_writes)) 2542 md_wakeup_thread(conf->mddev->thread); 2543 } 2544 2545 static void handle_stripe_dirtying5(raid5_conf_t *conf, 2546 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2547 { 2548 int rmw = 0, rcw = 0, i; 2549 for (i = disks; i--; ) { 2550 /* would I have to read this buffer for read_modify_write */ 2551 struct r5dev *dev = &sh->dev[i]; 2552 if ((dev->towrite || i == sh->pd_idx) && 2553 !test_bit(R5_LOCKED, &dev->flags) && 2554 !(test_bit(R5_UPTODATE, &dev->flags) || 2555 test_bit(R5_Wantcompute, &dev->flags))) { 2556 if (test_bit(R5_Insync, &dev->flags)) 2557 rmw++; 2558 else 2559 rmw += 2*disks; /* cannot read it */ 2560 } 2561 /* Would I have to read this buffer for reconstruct_write */ 2562 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2563 !test_bit(R5_LOCKED, &dev->flags) && 2564 !(test_bit(R5_UPTODATE, &dev->flags) || 2565 test_bit(R5_Wantcompute, &dev->flags))) { 2566 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2567 else 2568 rcw += 2*disks; 2569 } 2570 } 2571 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2572 (unsigned long long)sh->sector, rmw, rcw); 2573 set_bit(STRIPE_HANDLE, &sh->state); 2574 if (rmw < rcw && rmw > 0) 2575 /* prefer read-modify-write, but need to get some data */ 2576 for (i = disks; i--; ) { 2577 struct r5dev *dev = &sh->dev[i]; 2578 if ((dev->towrite || i == sh->pd_idx) && 2579 !test_bit(R5_LOCKED, &dev->flags) && 2580 !(test_bit(R5_UPTODATE, &dev->flags) || 2581 test_bit(R5_Wantcompute, &dev->flags)) && 2582 test_bit(R5_Insync, &dev->flags)) { 2583 if ( 2584 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2585 pr_debug("Read_old block " 2586 "%d for r-m-w\n", i); 2587 set_bit(R5_LOCKED, &dev->flags); 2588 set_bit(R5_Wantread, &dev->flags); 2589 s->locked++; 2590 } else { 2591 set_bit(STRIPE_DELAYED, &sh->state); 2592 set_bit(STRIPE_HANDLE, &sh->state); 2593 } 2594 } 2595 } 2596 if (rcw <= rmw && rcw > 0) 2597 /* want reconstruct write, but need to get some data */ 2598 for (i = disks; i--; ) { 2599 struct r5dev *dev = &sh->dev[i]; 2600 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2601 i != sh->pd_idx && 2602 !test_bit(R5_LOCKED, &dev->flags) && 2603 
!(test_bit(R5_UPTODATE, &dev->flags) || 2604 test_bit(R5_Wantcompute, &dev->flags)) && 2605 test_bit(R5_Insync, &dev->flags)) { 2606 if ( 2607 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2608 pr_debug("Read_old block " 2609 "%d for Reconstruct\n", i); 2610 set_bit(R5_LOCKED, &dev->flags); 2611 set_bit(R5_Wantread, &dev->flags); 2612 s->locked++; 2613 } else { 2614 set_bit(STRIPE_DELAYED, &sh->state); 2615 set_bit(STRIPE_HANDLE, &sh->state); 2616 } 2617 } 2618 } 2619 /* now if nothing is locked, and if we have enough data, 2620 * we can start a write request 2621 */ 2622 /* since handle_stripe can be called at any time we need to handle the 2623 * case where a compute block operation has been submitted and then a 2624 * subsequent call wants to start a write request. raid_run_ops only 2625 * handles the case where compute block and reconstruct are requested 2626 * simultaneously. If this is not the case then new writes need to be 2627 * held off until the compute completes. 2628 */ 2629 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2630 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2631 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2632 schedule_reconstruction(sh, s, rcw == 0, 0); 2633 } 2634 2635 static void handle_stripe_dirtying6(raid5_conf_t *conf, 2636 struct stripe_head *sh, struct stripe_head_state *s, 2637 struct r6_state *r6s, int disks) 2638 { 2639 int rcw = 0, pd_idx = sh->pd_idx, i; 2640 int qd_idx = sh->qd_idx; 2641 2642 set_bit(STRIPE_HANDLE, &sh->state); 2643 for (i = disks; i--; ) { 2644 struct r5dev *dev = &sh->dev[i]; 2645 /* check if we haven't enough data */ 2646 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2647 i != pd_idx && i != qd_idx && 2648 !test_bit(R5_LOCKED, &dev->flags) && 2649 !(test_bit(R5_UPTODATE, &dev->flags) || 2650 test_bit(R5_Wantcompute, &dev->flags))) { 2651 rcw++; 2652 if (!test_bit(R5_Insync, &dev->flags)) 2653 continue; /* it's a failed drive */ 2654 2655 if ( 2656 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2657 pr_debug("Read_old stripe %llu " 2658 "block %d for Reconstruct\n", 2659 (unsigned long long)sh->sector, i); 2660 set_bit(R5_LOCKED, &dev->flags); 2661 set_bit(R5_Wantread, &dev->flags); 2662 s->locked++; 2663 } else { 2664 pr_debug("Request delayed stripe %llu " 2665 "block %d for Reconstruct\n", 2666 (unsigned long long)sh->sector, i); 2667 set_bit(STRIPE_DELAYED, &sh->state); 2668 set_bit(STRIPE_HANDLE, &sh->state); 2669 } 2670 } 2671 } 2672 /* now if nothing is locked, and if we have enough data, we can start a 2673 * write request 2674 */ 2675 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2676 s->locked == 0 && rcw == 0 && 2677 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2678 schedule_reconstruction(sh, s, 1, 0); 2679 } 2680 } 2681 2682 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2683 struct stripe_head_state *s, int disks) 2684 { 2685 struct r5dev *dev = NULL; 2686 2687 set_bit(STRIPE_HANDLE, &sh->state); 2688 2689 switch (sh->check_state) { 2690 case check_state_idle: 2691 /* start a new check operation if there are no failures */ 2692 if (s->failed == 0) { 2693 BUG_ON(s->uptodate != disks); 2694 sh->check_state = check_state_run; 2695 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2696 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2697 s->uptodate--; 2698 break; 2699 } 2700 dev = &sh->dev[s->failed_num]; 2701 /* fall through */ 2702 case check_state_compute_result: 2703 sh->check_state = check_state_idle; 2704 if (!dev) 2705 dev = &sh->dev[sh->pd_idx]; 
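/* dev is now the block to write back: the failed device's block if one has
 * failed, otherwise the freshly recomputed parity block
 */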
2706 2707 /* check that a write has not made the stripe insync */ 2708 if (test_bit(STRIPE_INSYNC, &sh->state)) 2709 break; 2710 2711 /* either failed parity check, or recovery is happening */ 2712 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2713 BUG_ON(s->uptodate != disks); 2714 2715 set_bit(R5_LOCKED, &dev->flags); 2716 s->locked++; 2717 set_bit(R5_Wantwrite, &dev->flags); 2718 2719 clear_bit(STRIPE_DEGRADED, &sh->state); 2720 set_bit(STRIPE_INSYNC, &sh->state); 2721 break; 2722 case check_state_run: 2723 break; /* we will be called again upon completion */ 2724 case check_state_check_result: 2725 sh->check_state = check_state_idle; 2726 2727 /* if a failure occurred during the check operation, leave 2728 * STRIPE_INSYNC not set and let the stripe be handled again 2729 */ 2730 if (s->failed) 2731 break; 2732 2733 /* handle a successful check operation, if parity is correct 2734 * we are done. Otherwise update the mismatch count and repair 2735 * parity if !MD_RECOVERY_CHECK 2736 */ 2737 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2738 /* parity is correct (on disc, 2739 * not in buffer any more) 2740 */ 2741 set_bit(STRIPE_INSYNC, &sh->state); 2742 else { 2743 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2744 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2745 /* don't try to repair!! */ 2746 set_bit(STRIPE_INSYNC, &sh->state); 2747 else { 2748 sh->check_state = check_state_compute_run; 2749 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2750 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2751 set_bit(R5_Wantcompute, 2752 &sh->dev[sh->pd_idx].flags); 2753 sh->ops.target = sh->pd_idx; 2754 sh->ops.target2 = -1; 2755 s->uptodate++; 2756 } 2757 } 2758 break; 2759 case check_state_compute_run: 2760 break; 2761 default: 2762 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2763 __func__, sh->check_state, 2764 (unsigned long long) sh->sector); 2765 BUG(); 2766 } 2767 } 2768 2769 2770 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2771 struct stripe_head_state *s, 2772 struct r6_state *r6s, int disks) 2773 { 2774 int pd_idx = sh->pd_idx; 2775 int qd_idx = sh->qd_idx; 2776 struct r5dev *dev; 2777 2778 set_bit(STRIPE_HANDLE, &sh->state); 2779 2780 BUG_ON(s->failed > 2); 2781 2782 /* Want to check and possibly repair P and Q. 2783 * However there could be one 'failed' device, in which 2784 * case we can only check one of them, possibly using the 2785 * other to generate missing data 2786 */ 2787 2788 switch (sh->check_state) { 2789 case check_state_idle: 2790 /* start a new check operation if there are < 2 failures */ 2791 if (s->failed == r6s->q_failed) { 2792 /* The only possible failed device holds Q, so it 2793 * makes sense to check P (If anything else were failed, 2794 * we would have used P to recreate it). 
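 * If Q has not failed it is added to the check below as well.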
2795 */ 2796 sh->check_state = check_state_run; 2797 } 2798 if (!r6s->q_failed && s->failed < 2) { 2799 /* Q is not failed, and we didn't use it to generate 2800 * anything, so it makes sense to check it 2801 */ 2802 if (sh->check_state == check_state_run) 2803 sh->check_state = check_state_run_pq; 2804 else 2805 sh->check_state = check_state_run_q; 2806 } 2807 2808 /* discard potentially stale zero_sum_result */ 2809 sh->ops.zero_sum_result = 0; 2810 2811 if (sh->check_state == check_state_run) { 2812 /* async_xor_zero_sum destroys the contents of P */ 2813 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2814 s->uptodate--; 2815 } 2816 if (sh->check_state >= check_state_run && 2817 sh->check_state <= check_state_run_pq) { 2818 /* async_syndrome_zero_sum preserves P and Q, so 2819 * no need to mark them !uptodate here 2820 */ 2821 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2822 break; 2823 } 2824 2825 /* we have 2-disk failure */ 2826 BUG_ON(s->failed != 2); 2827 /* fall through */ 2828 case check_state_compute_result: 2829 sh->check_state = check_state_idle; 2830 2831 /* check that a write has not made the stripe insync */ 2832 if (test_bit(STRIPE_INSYNC, &sh->state)) 2833 break; 2834 2835 /* now write out any block on a failed drive, 2836 * or P or Q if they were recomputed 2837 */ 2838 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2839 if (s->failed == 2) { 2840 dev = &sh->dev[r6s->failed_num[1]]; 2841 s->locked++; 2842 set_bit(R5_LOCKED, &dev->flags); 2843 set_bit(R5_Wantwrite, &dev->flags); 2844 } 2845 if (s->failed >= 1) { 2846 dev = &sh->dev[r6s->failed_num[0]]; 2847 s->locked++; 2848 set_bit(R5_LOCKED, &dev->flags); 2849 set_bit(R5_Wantwrite, &dev->flags); 2850 } 2851 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2852 dev = &sh->dev[pd_idx]; 2853 s->locked++; 2854 set_bit(R5_LOCKED, &dev->flags); 2855 set_bit(R5_Wantwrite, &dev->flags); 2856 } 2857 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2858 dev = &sh->dev[qd_idx]; 2859 s->locked++; 2860 set_bit(R5_LOCKED, &dev->flags); 2861 set_bit(R5_Wantwrite, &dev->flags); 2862 } 2863 clear_bit(STRIPE_DEGRADED, &sh->state); 2864 2865 set_bit(STRIPE_INSYNC, &sh->state); 2866 break; 2867 case check_state_run: 2868 case check_state_run_q: 2869 case check_state_run_pq: 2870 break; /* we will be called again upon completion */ 2871 case check_state_check_result: 2872 sh->check_state = check_state_idle; 2873 2874 /* handle a successful check operation, if parity is correct 2875 * we are done. Otherwise update the mismatch count and repair 2876 * parity if !MD_RECOVERY_CHECK 2877 */ 2878 if (sh->ops.zero_sum_result == 0) { 2879 /* both parities are correct */ 2880 if (!s->failed) 2881 set_bit(STRIPE_INSYNC, &sh->state); 2882 else { 2883 /* in contrast to the raid5 case we can validate 2884 * parity, but still have a failure to write 2885 * back 2886 */ 2887 sh->check_state = check_state_compute_result; 2888 /* Returning at this point means that we may go 2889 * off and bring p and/or q uptodate again so 2890 * we make sure to check zero_sum_result again 2891 * to verify if p or q need writeback 2892 */ 2893 } 2894 } else { 2895 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2896 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2897 /* don't try to repair!! 
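 * a plain 'check' pass only records the mismatch count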
*/ 2898 set_bit(STRIPE_INSYNC, &sh->state); 2899 else { 2900 int *target = &sh->ops.target; 2901 2902 sh->ops.target = -1; 2903 sh->ops.target2 = -1; 2904 sh->check_state = check_state_compute_run; 2905 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2906 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2907 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2908 set_bit(R5_Wantcompute, 2909 &sh->dev[pd_idx].flags); 2910 *target = pd_idx; 2911 target = &sh->ops.target2; 2912 s->uptodate++; 2913 } 2914 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2915 set_bit(R5_Wantcompute, 2916 &sh->dev[qd_idx].flags); 2917 *target = qd_idx; 2918 s->uptodate++; 2919 } 2920 } 2921 } 2922 break; 2923 case check_state_compute_run: 2924 break; 2925 default: 2926 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2927 __func__, sh->check_state, 2928 (unsigned long long) sh->sector); 2929 BUG(); 2930 } 2931 } 2932 2933 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2934 struct r6_state *r6s) 2935 { 2936 int i; 2937 2938 /* We have read all the blocks in this stripe and now we need to 2939 * copy some of them into a target stripe for expand. 2940 */ 2941 struct dma_async_tx_descriptor *tx = NULL; 2942 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2943 for (i = 0; i < sh->disks; i++) 2944 if (i != sh->pd_idx && i != sh->qd_idx) { 2945 int dd_idx, j; 2946 struct stripe_head *sh2; 2947 struct async_submit_ctl submit; 2948 2949 sector_t bn = compute_blocknr(sh, i, 1); 2950 sector_t s = raid5_compute_sector(conf, bn, 0, 2951 &dd_idx, NULL); 2952 sh2 = get_active_stripe(conf, s, 0, 1, 1); 2953 if (sh2 == NULL) 2954 /* so far only the early blocks of this stripe 2955 * have been requested. When later blocks 2956 * get requested, we will try again 2957 */ 2958 continue; 2959 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 2960 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 2961 /* must have already done this block */ 2962 release_stripe(sh2); 2963 continue; 2964 } 2965 2966 /* place all the copies on one channel */ 2967 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 2968 tx = async_memcpy(sh2->dev[dd_idx].page, 2969 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2970 &submit); 2971 2972 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2973 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2974 for (j = 0; j < conf->raid_disks; j++) 2975 if (j != sh2->pd_idx && 2976 (!r6s || j != sh2->qd_idx) && 2977 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2978 break; 2979 if (j == conf->raid_disks) { 2980 set_bit(STRIPE_EXPAND_READY, &sh2->state); 2981 set_bit(STRIPE_HANDLE, &sh2->state); 2982 } 2983 release_stripe(sh2); 2984 2985 } 2986 /* done submitting copies, wait for them to complete */ 2987 if (tx) { 2988 async_tx_ack(tx); 2989 dma_wait_for_async_tx(tx); 2990 } 2991 } 2992 2993 2994 /* 2995 * handle_stripe - do things to a stripe. 2996 * 2997 * We lock the stripe and then examine the state of various bits 2998 * to see what needs to be done. 2999 * Possible results: 3000 * return some read request which now have data 3001 * return some write requests which are safely on disc 3002 * schedule a read on some buffers 3003 * schedule a write of some buffers 3004 * return confirmation of parity correctness 3005 * 3006 * buffers are taken off read_list or write_list, and bh_cache buffers 3007 * get BH_Lock set before the stripe lock is released. 
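 * The read_list/write_list/bh_cache wording above predates the current
 * per-device toread/towrite/written lists.
 * handle_stripe() itself only dispatches: level 6 arrays are handled by
 * handle_stripe6(), everything else by handle_stripe5().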
3008 * 3009 */ 3010 3011 static void handle_stripe5(struct stripe_head *sh) 3012 { 3013 raid5_conf_t *conf = sh->raid_conf; 3014 int disks = sh->disks, i; 3015 struct bio *return_bi = NULL; 3016 struct stripe_head_state s; 3017 struct r5dev *dev; 3018 mdk_rdev_t *blocked_rdev = NULL; 3019 int prexor; 3020 int dec_preread_active = 0; 3021 3022 memset(&s, 0, sizeof(s)); 3023 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 3024 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, 3025 atomic_read(&sh->count), sh->pd_idx, sh->check_state, 3026 sh->reconstruct_state); 3027 3028 spin_lock(&sh->lock); 3029 clear_bit(STRIPE_HANDLE, &sh->state); 3030 clear_bit(STRIPE_DELAYED, &sh->state); 3031 3032 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 3033 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3034 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3035 3036 /* Now to look around and see what can be done */ 3037 rcu_read_lock(); 3038 for (i=disks; i--; ) { 3039 mdk_rdev_t *rdev; 3040 3041 dev = &sh->dev[i]; 3042 3043 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 3044 "written %p\n", i, dev->flags, dev->toread, dev->read, 3045 dev->towrite, dev->written); 3046 3047 /* maybe we can request a biofill operation 3048 * 3049 * new wantfill requests are only permitted while 3050 * ops_complete_biofill is guaranteed to be inactive 3051 */ 3052 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3053 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3054 set_bit(R5_Wantfill, &dev->flags); 3055 3056 /* now count some things */ 3057 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3058 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3059 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 3060 3061 if (test_bit(R5_Wantfill, &dev->flags)) 3062 s.to_fill++; 3063 else if (dev->toread) 3064 s.to_read++; 3065 if (dev->towrite) { 3066 s.to_write++; 3067 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3068 s.non_overwrite++; 3069 } 3070 if (dev->written) 3071 s.written++; 3072 rdev = rcu_dereference(conf->disks[i].rdev); 3073 if (blocked_rdev == NULL && 3074 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3075 blocked_rdev = rdev; 3076 atomic_inc(&rdev->nr_pending); 3077 } 3078 clear_bit(R5_Insync, &dev->flags); 3079 if (!rdev) 3080 /* Not in-sync */; 3081 else if (test_bit(In_sync, &rdev->flags)) 3082 set_bit(R5_Insync, &dev->flags); 3083 else { 3084 /* could be in-sync depending on recovery/reshape status */ 3085 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3086 set_bit(R5_Insync, &dev->flags); 3087 } 3088 if (!test_bit(R5_Insync, &dev->flags)) { 3089 /* The ReadError flag will just be confusing now */ 3090 clear_bit(R5_ReadError, &dev->flags); 3091 clear_bit(R5_ReWrite, &dev->flags); 3092 } 3093 if (test_bit(R5_ReadError, &dev->flags)) 3094 clear_bit(R5_Insync, &dev->flags); 3095 if (!test_bit(R5_Insync, &dev->flags)) { 3096 s.failed++; 3097 s.failed_num = i; 3098 } 3099 } 3100 rcu_read_unlock(); 3101 3102 if (unlikely(blocked_rdev)) { 3103 if (s.syncing || s.expanding || s.expanded || 3104 s.to_write || s.written) { 3105 set_bit(STRIPE_HANDLE, &sh->state); 3106 goto unlock; 3107 } 3108 /* There is nothing for the blocked_rdev to block */ 3109 rdev_dec_pending(blocked_rdev, conf->mddev); 3110 blocked_rdev = NULL; 3111 } 3112 3113 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3114 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3115 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3116 } 3117 3118 pr_debug("locked=%d 
uptodate=%d to_read=%d" 3119 " to_write=%d failed=%d failed_num=%d\n", 3120 s.locked, s.uptodate, s.to_read, s.to_write, 3121 s.failed, s.failed_num); 3122 /* check if the array has lost two devices and, if so, some requests might 3123 * need to be failed 3124 */ 3125 if (s.failed > 1 && s.to_read+s.to_write+s.written) 3126 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3127 if (s.failed > 1 && s.syncing) { 3128 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3129 clear_bit(STRIPE_SYNCING, &sh->state); 3130 s.syncing = 0; 3131 } 3132 3133 /* might be able to return some write requests if the parity block 3134 * is safe, or on a failed drive 3135 */ 3136 dev = &sh->dev[sh->pd_idx]; 3137 if ( s.written && 3138 ((test_bit(R5_Insync, &dev->flags) && 3139 !test_bit(R5_LOCKED, &dev->flags) && 3140 test_bit(R5_UPTODATE, &dev->flags)) || 3141 (s.failed == 1 && s.failed_num == sh->pd_idx))) 3142 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3143 3144 /* Now we might consider reading some blocks, either to check/generate 3145 * parity, or to satisfy requests 3146 * or to load a block that is being partially written. 3147 */ 3148 if (s.to_read || s.non_overwrite || 3149 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3150 handle_stripe_fill5(sh, &s, disks); 3151 3152 /* Now we check to see if any write operations have recently 3153 * completed 3154 */ 3155 prexor = 0; 3156 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3157 prexor = 1; 3158 if (sh->reconstruct_state == reconstruct_state_drain_result || 3159 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3160 sh->reconstruct_state = reconstruct_state_idle; 3161 3162 /* All the 'written' buffers and the parity block are ready to 3163 * be written back to disk 3164 */ 3165 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3166 for (i = disks; i--; ) { 3167 dev = &sh->dev[i]; 3168 if (test_bit(R5_LOCKED, &dev->flags) && 3169 (i == sh->pd_idx || dev->written)) { 3170 pr_debug("Writing block %d\n", i); 3171 set_bit(R5_Wantwrite, &dev->flags); 3172 if (prexor) 3173 continue; 3174 if (!test_bit(R5_Insync, &dev->flags) || 3175 (i == sh->pd_idx && s.failed == 0)) 3176 set_bit(STRIPE_INSYNC, &sh->state); 3177 } 3178 } 3179 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3180 dec_preread_active = 1; 3181 } 3182 3183 /* Now to consider new write requests and what else, if anything 3184 * should be read. We do not handle new writes when: 3185 * 1/ A 'write' operation (copy+xor) is already in flight. 3186 * 2/ A 'check' operation is in flight, as it may clobber the parity 3187 * block. 3188 */ 3189 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3190 handle_stripe_dirtying5(conf, sh, &s, disks); 3191 3192 /* maybe we need to check and possibly fix the parity for this stripe 3193 * Any reads will already have been scheduled, so we just see if enough 3194 * data is available. The parity check is held off while parity 3195 * dependent operations are in flight. 
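 * A new check is only started while syncing, with nothing locked, no
 * compute in flight and the stripe not already marked in-sync.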
3196 */ 3197 if (sh->check_state || 3198 (s.syncing && s.locked == 0 && 3199 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3200 !test_bit(STRIPE_INSYNC, &sh->state))) 3201 handle_parity_checks5(conf, sh, &s, disks); 3202 3203 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3204 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3205 clear_bit(STRIPE_SYNCING, &sh->state); 3206 } 3207 3208 /* If the failed drive is just a ReadError, then we might need to progress 3209 * the repair/check process 3210 */ 3211 if (s.failed == 1 && !conf->mddev->ro && 3212 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) 3213 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) 3214 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) 3215 ) { 3216 dev = &sh->dev[s.failed_num]; 3217 if (!test_bit(R5_ReWrite, &dev->flags)) { 3218 set_bit(R5_Wantwrite, &dev->flags); 3219 set_bit(R5_ReWrite, &dev->flags); 3220 set_bit(R5_LOCKED, &dev->flags); 3221 s.locked++; 3222 } else { 3223 /* let's read it back */ 3224 set_bit(R5_Wantread, &dev->flags); 3225 set_bit(R5_LOCKED, &dev->flags); 3226 s.locked++; 3227 } 3228 } 3229 3230 /* Finish reconstruct operations initiated by the expansion process */ 3231 if (sh->reconstruct_state == reconstruct_state_result) { 3232 struct stripe_head *sh2 3233 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3234 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3235 /* sh cannot be written until sh2 has been read. 3236 * so arrange for sh to be delayed a little 3237 */ 3238 set_bit(STRIPE_DELAYED, &sh->state); 3239 set_bit(STRIPE_HANDLE, &sh->state); 3240 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3241 &sh2->state)) 3242 atomic_inc(&conf->preread_active_stripes); 3243 release_stripe(sh2); 3244 goto unlock; 3245 } 3246 if (sh2) 3247 release_stripe(sh2); 3248 3249 sh->reconstruct_state = reconstruct_state_idle; 3250 clear_bit(STRIPE_EXPANDING, &sh->state); 3251 for (i = conf->raid_disks; i--; ) { 3252 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3253 set_bit(R5_LOCKED, &sh->dev[i].flags); 3254 s.locked++; 3255 } 3256 } 3257 3258 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3259 !sh->reconstruct_state) { 3260 /* Need to write out all blocks after computing parity */ 3261 sh->disks = conf->raid_disks; 3262 stripe_set_idx(sh->sector, conf, 0, sh); 3263 schedule_reconstruction(sh, &s, 1, 1); 3264 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3265 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3266 atomic_dec(&conf->reshape_stripes); 3267 wake_up(&conf->wait_for_overlap); 3268 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3269 } 3270 3271 if (s.expanding && s.locked == 0 && 3272 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3273 handle_stripe_expansion(conf, sh, NULL); 3274 3275 unlock: 3276 spin_unlock(&sh->lock); 3277 3278 /* wait for this device to become unblocked */ 3279 if (unlikely(blocked_rdev)) 3280 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3281 3282 if (s.ops_request) 3283 raid_run_ops(sh, s.ops_request); 3284 3285 ops_run_io(sh, &s); 3286 3287 if (dec_preread_active) { 3288 /* We delay this until after ops_run_io so that if make_request 3289 * is waiting on a flush, it won't continue until the writes 3290 * have actually been submitted. 
3291 */ 3292 atomic_dec(&conf->preread_active_stripes); 3293 if (atomic_read(&conf->preread_active_stripes) < 3294 IO_THRESHOLD) 3295 md_wakeup_thread(conf->mddev->thread); 3296 } 3297 return_io(return_bi); 3298 } 3299 3300 static void handle_stripe6(struct stripe_head *sh) 3301 { 3302 raid5_conf_t *conf = sh->raid_conf; 3303 int disks = sh->disks; 3304 struct bio *return_bi = NULL; 3305 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; 3306 struct stripe_head_state s; 3307 struct r6_state r6s; 3308 struct r5dev *dev, *pdev, *qdev; 3309 mdk_rdev_t *blocked_rdev = NULL; 3310 int dec_preread_active = 0; 3311 3312 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3313 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3314 (unsigned long long)sh->sector, sh->state, 3315 atomic_read(&sh->count), pd_idx, qd_idx, 3316 sh->check_state, sh->reconstruct_state); 3317 memset(&s, 0, sizeof(s)); 3318 3319 spin_lock(&sh->lock); 3320 clear_bit(STRIPE_HANDLE, &sh->state); 3321 clear_bit(STRIPE_DELAYED, &sh->state); 3322 3323 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 3324 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3325 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3326 /* Now to look around and see what can be done */ 3327 3328 rcu_read_lock(); 3329 for (i=disks; i--; ) { 3330 mdk_rdev_t *rdev; 3331 dev = &sh->dev[i]; 3332 3333 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3334 i, dev->flags, dev->toread, dev->towrite, dev->written); 3335 /* maybe we can reply to a read 3336 * 3337 * new wantfill requests are only permitted while 3338 * ops_complete_biofill is guaranteed to be inactive 3339 */ 3340 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3341 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3342 set_bit(R5_Wantfill, &dev->flags); 3343 3344 /* now count some things */ 3345 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3346 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3347 if (test_bit(R5_Wantcompute, &dev->flags)) { 3348 s.compute++; 3349 BUG_ON(s.compute > 2); 3350 } 3351 3352 if (test_bit(R5_Wantfill, &dev->flags)) { 3353 s.to_fill++; 3354 } else if (dev->toread) 3355 s.to_read++; 3356 if (dev->towrite) { 3357 s.to_write++; 3358 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3359 s.non_overwrite++; 3360 } 3361 if (dev->written) 3362 s.written++; 3363 rdev = rcu_dereference(conf->disks[i].rdev); 3364 if (blocked_rdev == NULL && 3365 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3366 blocked_rdev = rdev; 3367 atomic_inc(&rdev->nr_pending); 3368 } 3369 clear_bit(R5_Insync, &dev->flags); 3370 if (!rdev) 3371 /* Not in-sync */; 3372 else if (test_bit(In_sync, &rdev->flags)) 3373 set_bit(R5_Insync, &dev->flags); 3374 else { 3375 /* in sync if before recovery_offset */ 3376 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3377 set_bit(R5_Insync, &dev->flags); 3378 } 3379 if (!test_bit(R5_Insync, &dev->flags)) { 3380 /* The ReadError flag will just be confusing now */ 3381 clear_bit(R5_ReadError, &dev->flags); 3382 clear_bit(R5_ReWrite, &dev->flags); 3383 } 3384 if (test_bit(R5_ReadError, &dev->flags)) 3385 clear_bit(R5_Insync, &dev->flags); 3386 if (!test_bit(R5_Insync, &dev->flags)) { 3387 if (s.failed < 2) 3388 r6s.failed_num[s.failed] = i; 3389 s.failed++; 3390 } 3391 } 3392 rcu_read_unlock(); 3393 3394 if (unlikely(blocked_rdev)) { 3395 if (s.syncing || s.expanding || s.expanded || 3396 s.to_write || s.written) { 3397 set_bit(STRIPE_HANDLE, &sh->state); 3398 goto unlock; 3399 } 3400 /* There is nothing for the 
blocked_rdev to block */ 3401 rdev_dec_pending(blocked_rdev, conf->mddev); 3402 blocked_rdev = NULL; 3403 } 3404 3405 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3406 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3407 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3408 } 3409 3410 pr_debug("locked=%d uptodate=%d to_read=%d" 3411 " to_write=%d failed=%d failed_num=%d,%d\n", 3412 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3413 r6s.failed_num[0], r6s.failed_num[1]); 3414 /* check if the array has lost >2 devices and, if so, some requests 3415 * might need to be failed 3416 */ 3417 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3418 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3419 if (s.failed > 2 && s.syncing) { 3420 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3421 clear_bit(STRIPE_SYNCING, &sh->state); 3422 s.syncing = 0; 3423 } 3424 3425 /* 3426 * might be able to return some write requests if the parity blocks 3427 * are safe, or on a failed drive 3428 */ 3429 pdev = &sh->dev[pd_idx]; 3430 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3431 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3432 qdev = &sh->dev[qd_idx]; 3433 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) 3434 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); 3435 3436 if ( s.written && 3437 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3438 && !test_bit(R5_LOCKED, &pdev->flags) 3439 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3440 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3441 && !test_bit(R5_LOCKED, &qdev->flags) 3442 && test_bit(R5_UPTODATE, &qdev->flags))))) 3443 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3444 3445 /* Now we might consider reading some blocks, either to check/generate 3446 * parity, or to satisfy requests 3447 * or to load a block that is being partially written. 3448 */ 3449 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3450 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3451 handle_stripe_fill6(sh, &s, &r6s, disks); 3452 3453 /* Now we check to see if any write operations have recently 3454 * completed 3455 */ 3456 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3457 3458 sh->reconstruct_state = reconstruct_state_idle; 3459 /* All the 'written' buffers and the parity blocks are ready to 3460 * be written back to disk 3461 */ 3462 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3463 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); 3464 for (i = disks; i--; ) { 3465 dev = &sh->dev[i]; 3466 if (test_bit(R5_LOCKED, &dev->flags) && 3467 (i == sh->pd_idx || i == qd_idx || 3468 dev->written)) { 3469 pr_debug("Writing block %d\n", i); 3470 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3471 set_bit(R5_Wantwrite, &dev->flags); 3472 if (!test_bit(R5_Insync, &dev->flags) || 3473 ((i == sh->pd_idx || i == qd_idx) && 3474 s.failed == 0)) 3475 set_bit(STRIPE_INSYNC, &sh->state); 3476 } 3477 } 3478 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3479 dec_preread_active = 1; 3480 } 3481 3482 /* Now to consider new write requests and what else, if anything 3483 * should be read. We do not handle new writes when: 3484 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. 3485 * 2/ A 'check' operation is in flight, as it may clobber the parity 3486 * block. 
3487 */ 3488 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3489 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3490 3491 /* maybe we need to check and possibly fix the parity for this stripe 3492 * Any reads will already have been scheduled, so we just see if enough 3493 * data is available. The parity check is held off while parity 3494 * dependent operations are in flight. 3495 */ 3496 if (sh->check_state || 3497 (s.syncing && s.locked == 0 && 3498 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3499 !test_bit(STRIPE_INSYNC, &sh->state))) 3500 handle_parity_checks6(conf, sh, &s, &r6s, disks); 3501 3502 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3503 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3504 clear_bit(STRIPE_SYNCING, &sh->state); 3505 } 3506 3507 /* If the failed drives are just a ReadError, then we might need 3508 * to progress the repair/check process 3509 */ 3510 if (s.failed <= 2 && !conf->mddev->ro) 3511 for (i = 0; i < s.failed; i++) { 3512 dev = &sh->dev[r6s.failed_num[i]]; 3513 if (test_bit(R5_ReadError, &dev->flags) 3514 && !test_bit(R5_LOCKED, &dev->flags) 3515 && test_bit(R5_UPTODATE, &dev->flags) 3516 ) { 3517 if (!test_bit(R5_ReWrite, &dev->flags)) { 3518 set_bit(R5_Wantwrite, &dev->flags); 3519 set_bit(R5_ReWrite, &dev->flags); 3520 set_bit(R5_LOCKED, &dev->flags); 3521 s.locked++; 3522 } else { 3523 /* let's read it back */ 3524 set_bit(R5_Wantread, &dev->flags); 3525 set_bit(R5_LOCKED, &dev->flags); 3526 s.locked++; 3527 } 3528 } 3529 } 3530 3531 /* Finish reconstruct operations initiated by the expansion process */ 3532 if (sh->reconstruct_state == reconstruct_state_result) { 3533 sh->reconstruct_state = reconstruct_state_idle; 3534 clear_bit(STRIPE_EXPANDING, &sh->state); 3535 for (i = conf->raid_disks; i--; ) { 3536 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3537 set_bit(R5_LOCKED, &sh->dev[i].flags); 3538 s.locked++; 3539 } 3540 } 3541 3542 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3543 !sh->reconstruct_state) { 3544 struct stripe_head *sh2 3545 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3546 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3547 /* sh cannot be written until sh2 has been read. 
3548 * so arrange for sh to be delayed a little 3549 */ 3550 set_bit(STRIPE_DELAYED, &sh->state); 3551 set_bit(STRIPE_HANDLE, &sh->state); 3552 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3553 &sh2->state)) 3554 atomic_inc(&conf->preread_active_stripes); 3555 release_stripe(sh2); 3556 goto unlock; 3557 } 3558 if (sh2) 3559 release_stripe(sh2); 3560 3561 /* Need to write out all blocks after computing P&Q */ 3562 sh->disks = conf->raid_disks; 3563 stripe_set_idx(sh->sector, conf, 0, sh); 3564 schedule_reconstruction(sh, &s, 1, 1); 3565 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3566 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3567 atomic_dec(&conf->reshape_stripes); 3568 wake_up(&conf->wait_for_overlap); 3569 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3570 } 3571 3572 if (s.expanding && s.locked == 0 && 3573 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3574 handle_stripe_expansion(conf, sh, &r6s); 3575 3576 unlock: 3577 spin_unlock(&sh->lock); 3578 3579 /* wait for this device to become unblocked */ 3580 if (unlikely(blocked_rdev)) 3581 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3582 3583 if (s.ops_request) 3584 raid_run_ops(sh, s.ops_request); 3585 3586 ops_run_io(sh, &s); 3587 3588 3589 if (dec_preread_active) { 3590 /* We delay this until after ops_run_io so that if make_request 3591 * is waiting on a flush, it won't continue until the writes 3592 * have actually been submitted. 3593 */ 3594 atomic_dec(&conf->preread_active_stripes); 3595 if (atomic_read(&conf->preread_active_stripes) < 3596 IO_THRESHOLD) 3597 md_wakeup_thread(conf->mddev->thread); 3598 } 3599 3600 return_io(return_bi); 3601 } 3602 3603 static void handle_stripe(struct stripe_head *sh) 3604 { 3605 if (sh->raid_conf->level == 6) 3606 handle_stripe6(sh); 3607 else 3608 handle_stripe5(sh); 3609 } 3610 3611 static void raid5_activate_delayed(raid5_conf_t *conf) 3612 { 3613 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3614 while (!list_empty(&conf->delayed_list)) { 3615 struct list_head *l = conf->delayed_list.next; 3616 struct stripe_head *sh; 3617 sh = list_entry(l, struct stripe_head, lru); 3618 list_del_init(l); 3619 clear_bit(STRIPE_DELAYED, &sh->state); 3620 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3621 atomic_inc(&conf->preread_active_stripes); 3622 list_add_tail(&sh->lru, &conf->hold_list); 3623 } 3624 } 3625 } 3626 3627 static void activate_bit_delay(raid5_conf_t *conf) 3628 { 3629 /* device_lock is held */ 3630 struct list_head head; 3631 list_add(&head, &conf->bitmap_list); 3632 list_del_init(&conf->bitmap_list); 3633 while (!list_empty(&head)) { 3634 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3635 list_del_init(&sh->lru); 3636 atomic_inc(&sh->count); 3637 __release_stripe(conf, sh); 3638 } 3639 } 3640 3641 int md_raid5_congested(mddev_t *mddev, int bits) 3642 { 3643 raid5_conf_t *conf = mddev->private; 3644 3645 /* No difference between reads and writes. 
Just check 3646 * how busy the stripe_cache is 3647 */ 3648 3649 if (conf->inactive_blocked) 3650 return 1; 3651 if (conf->quiesce) 3652 return 1; 3653 if (list_empty_careful(&conf->inactive_list)) 3654 return 1; 3655 3656 return 0; 3657 } 3658 EXPORT_SYMBOL_GPL(md_raid5_congested); 3659 3660 static int raid5_congested(void *data, int bits) 3661 { 3662 mddev_t *mddev = data; 3663 3664 return mddev_congested(mddev, bits) || 3665 md_raid5_congested(mddev, bits); 3666 } 3667 3668 /* We want read requests to align with chunks where possible, 3669 * but write requests don't need to. 3670 */ 3671 static int raid5_mergeable_bvec(struct request_queue *q, 3672 struct bvec_merge_data *bvm, 3673 struct bio_vec *biovec) 3674 { 3675 mddev_t *mddev = q->queuedata; 3676 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3677 int max; 3678 unsigned int chunk_sectors = mddev->chunk_sectors; 3679 unsigned int bio_sectors = bvm->bi_size >> 9; 3680 3681 if ((bvm->bi_rw & 1) == WRITE) 3682 return biovec->bv_len; /* always allow writes to be mergeable */ 3683 3684 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3685 chunk_sectors = mddev->new_chunk_sectors; 3686 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3687 if (max < 0) max = 0; 3688 if (max <= biovec->bv_len && bio_sectors == 0) 3689 return biovec->bv_len; 3690 else 3691 return max; 3692 } 3693 3694 3695 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) 3696 { 3697 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3698 unsigned int chunk_sectors = mddev->chunk_sectors; 3699 unsigned int bio_sectors = bio->bi_size >> 9; 3700 3701 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3702 chunk_sectors = mddev->new_chunk_sectors; 3703 return chunk_sectors >= 3704 ((sector & (chunk_sectors - 1)) + bio_sectors); 3705 } 3706 3707 /* 3708 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3709 * later sampled by raid5d. 3710 */ 3711 static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) 3712 { 3713 unsigned long flags; 3714 3715 spin_lock_irqsave(&conf->device_lock, flags); 3716 3717 bi->bi_next = conf->retry_read_aligned_list; 3718 conf->retry_read_aligned_list = bi; 3719 3720 spin_unlock_irqrestore(&conf->device_lock, flags); 3721 md_wakeup_thread(conf->mddev->thread); 3722 } 3723 3724 3725 static struct bio *remove_bio_from_retry(raid5_conf_t *conf) 3726 { 3727 struct bio *bi; 3728 3729 bi = conf->retry_read_aligned; 3730 if (bi) { 3731 conf->retry_read_aligned = NULL; 3732 return bi; 3733 } 3734 bi = conf->retry_read_aligned_list; 3735 if(bi) { 3736 conf->retry_read_aligned_list = bi->bi_next; 3737 bi->bi_next = NULL; 3738 /* 3739 * this sets the active strip count to 1 and the processed 3740 * strip count to zero (upper 8 bits) 3741 */ 3742 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3743 } 3744 3745 return bi; 3746 } 3747 3748 3749 /* 3750 * The "raid5_align_endio" should check if the read succeeded and if it 3751 * did, call bio_endio on the original bio (having bio_put the new bio 3752 * first). 3753 * If the read failed.. 
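 * the bio is queued with add_bio_to_retry() for raid5d to retry later.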
3754 */ 3755 static void raid5_align_endio(struct bio *bi, int error) 3756 { 3757 struct bio* raid_bi = bi->bi_private; 3758 mddev_t *mddev; 3759 raid5_conf_t *conf; 3760 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3761 mdk_rdev_t *rdev; 3762 3763 bio_put(bi); 3764 3765 rdev = (void*)raid_bi->bi_next; 3766 raid_bi->bi_next = NULL; 3767 mddev = rdev->mddev; 3768 conf = mddev->private; 3769 3770 rdev_dec_pending(rdev, conf->mddev); 3771 3772 if (!error && uptodate) { 3773 bio_endio(raid_bi, 0); 3774 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3775 wake_up(&conf->wait_for_stripe); 3776 return; 3777 } 3778 3779 3780 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3781 3782 add_bio_to_retry(raid_bi, conf); 3783 } 3784 3785 static int bio_fits_rdev(struct bio *bi) 3786 { 3787 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3788 3789 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3790 return 0; 3791 blk_recount_segments(q, bi); 3792 if (bi->bi_phys_segments > queue_max_segments(q)) 3793 return 0; 3794 3795 if (q->merge_bvec_fn) 3796 /* it's too hard to apply the merge_bvec_fn at this stage, 3797 * just just give up 3798 */ 3799 return 0; 3800 3801 return 1; 3802 } 3803 3804 3805 static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) 3806 { 3807 raid5_conf_t *conf = mddev->private; 3808 int dd_idx; 3809 struct bio* align_bi; 3810 mdk_rdev_t *rdev; 3811 3812 if (!in_chunk_boundary(mddev, raid_bio)) { 3813 pr_debug("chunk_aligned_read : non aligned\n"); 3814 return 0; 3815 } 3816 /* 3817 * use bio_clone_mddev to make a copy of the bio 3818 */ 3819 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3820 if (!align_bi) 3821 return 0; 3822 /* 3823 * set bi_end_io to a new function, and set bi_private to the 3824 * original bio. 3825 */ 3826 align_bi->bi_end_io = raid5_align_endio; 3827 align_bi->bi_private = raid_bio; 3828 /* 3829 * compute position 3830 */ 3831 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3832 0, 3833 &dd_idx, NULL); 3834 3835 rcu_read_lock(); 3836 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3837 if (rdev && test_bit(In_sync, &rdev->flags)) { 3838 atomic_inc(&rdev->nr_pending); 3839 rcu_read_unlock(); 3840 raid_bio->bi_next = (void*)rdev; 3841 align_bi->bi_bdev = rdev->bdev; 3842 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3843 align_bi->bi_sector += rdev->data_offset; 3844 3845 if (!bio_fits_rdev(align_bi)) { 3846 /* too big in some way */ 3847 bio_put(align_bi); 3848 rdev_dec_pending(rdev, mddev); 3849 return 0; 3850 } 3851 3852 spin_lock_irq(&conf->device_lock); 3853 wait_event_lock_irq(conf->wait_for_stripe, 3854 conf->quiesce == 0, 3855 conf->device_lock, /* nothing */); 3856 atomic_inc(&conf->active_aligned_reads); 3857 spin_unlock_irq(&conf->device_lock); 3858 3859 generic_make_request(align_bi); 3860 return 1; 3861 } else { 3862 rcu_read_unlock(); 3863 bio_put(align_bi); 3864 return 0; 3865 } 3866 } 3867 3868 /* __get_priority_stripe - get the next stripe to process 3869 * 3870 * Full stripe writes are allowed to pass preread active stripes up until 3871 * the bypass_threshold is exceeded. In general the bypass_count 3872 * increments when the handle_list is handled before the hold_list; however, it 3873 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3874 * stripe with in flight i/o. The bypass_count will be reset when the 3875 * head of the hold_list has changed, i.e. the head was promoted to the 3876 * handle_list. 
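 *
 * Rough illustration (not from the code; the default BYPASS_THRESHOLD
 * of 1 is assumed): the stripe at the head of hold_list is promoted
 * either as soon as there are no pending full-stripe writes, or once
 * bypass_count has climbed above the threshold, i.e. after that same
 * head has been passed over at least twice by handle_list stripes that
 * had not yet started I/O.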
3877 */ 3878 static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) 3879 { 3880 struct stripe_head *sh; 3881 3882 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3883 __func__, 3884 list_empty(&conf->handle_list) ? "empty" : "busy", 3885 list_empty(&conf->hold_list) ? "empty" : "busy", 3886 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3887 3888 if (!list_empty(&conf->handle_list)) { 3889 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3890 3891 if (list_empty(&conf->hold_list)) 3892 conf->bypass_count = 0; 3893 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3894 if (conf->hold_list.next == conf->last_hold) 3895 conf->bypass_count++; 3896 else { 3897 conf->last_hold = conf->hold_list.next; 3898 conf->bypass_count -= conf->bypass_threshold; 3899 if (conf->bypass_count < 0) 3900 conf->bypass_count = 0; 3901 } 3902 } 3903 } else if (!list_empty(&conf->hold_list) && 3904 ((conf->bypass_threshold && 3905 conf->bypass_count > conf->bypass_threshold) || 3906 atomic_read(&conf->pending_full_writes) == 0)) { 3907 sh = list_entry(conf->hold_list.next, 3908 typeof(*sh), lru); 3909 conf->bypass_count -= conf->bypass_threshold; 3910 if (conf->bypass_count < 0) 3911 conf->bypass_count = 0; 3912 } else 3913 return NULL; 3914 3915 list_del_init(&sh->lru); 3916 atomic_inc(&sh->count); 3917 BUG_ON(atomic_read(&sh->count) != 1); 3918 return sh; 3919 } 3920 3921 static int make_request(mddev_t *mddev, struct bio * bi) 3922 { 3923 raid5_conf_t *conf = mddev->private; 3924 int dd_idx; 3925 sector_t new_sector; 3926 sector_t logical_sector, last_sector; 3927 struct stripe_head *sh; 3928 const int rw = bio_data_dir(bi); 3929 int remaining; 3930 int plugged; 3931 3932 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 3933 md_flush_request(mddev, bi); 3934 return 0; 3935 } 3936 3937 md_write_start(mddev, bi); 3938 3939 if (rw == READ && 3940 mddev->reshape_position == MaxSector && 3941 chunk_aligned_read(mddev,bi)) 3942 return 0; 3943 3944 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3945 last_sector = bi->bi_sector + (bi->bi_size>>9); 3946 bi->bi_next = NULL; 3947 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3948 3949 plugged = mddev_check_plugged(mddev); 3950 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3951 DEFINE_WAIT(w); 3952 int disks, data_disks; 3953 int previous; 3954 3955 retry: 3956 previous = 0; 3957 disks = conf->raid_disks; 3958 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3959 if (unlikely(conf->reshape_progress != MaxSector)) { 3960 /* spinlock is needed as reshape_progress may be 3961 * 64bit on a 32bit platform, and so it might be 3962 * possible to see a half-updated value 3963 * Ofcourse reshape_progress could change after 3964 * the lock is dropped, so once we get a reference 3965 * to the stripe that we think it is, we will have 3966 * to check again. 3967 */ 3968 spin_lock_irq(&conf->device_lock); 3969 if (mddev->delta_disks < 0 3970 ? logical_sector < conf->reshape_progress 3971 : logical_sector >= conf->reshape_progress) { 3972 disks = conf->previous_raid_disks; 3973 previous = 1; 3974 } else { 3975 if (mddev->delta_disks < 0 3976 ? 
logical_sector < conf->reshape_safe 3977 : logical_sector >= conf->reshape_safe) { 3978 spin_unlock_irq(&conf->device_lock); 3979 schedule(); 3980 goto retry; 3981 } 3982 } 3983 spin_unlock_irq(&conf->device_lock); 3984 } 3985 data_disks = disks - conf->max_degraded; 3986 3987 new_sector = raid5_compute_sector(conf, logical_sector, 3988 previous, 3989 &dd_idx, NULL); 3990 pr_debug("raid456: make_request, sector %llu logical %llu\n", 3991 (unsigned long long)new_sector, 3992 (unsigned long long)logical_sector); 3993 3994 sh = get_active_stripe(conf, new_sector, previous, 3995 (bi->bi_rw&RWA_MASK), 0); 3996 if (sh) { 3997 if (unlikely(previous)) { 3998 /* expansion might have moved on while waiting for a 3999 * stripe, so we must do the range check again. 4000 * Expansion could still move past after this 4001 * test, but as we are holding a reference to 4002 * 'sh', we know that if that happens, 4003 * STRIPE_EXPANDING will get set and the expansion 4004 * won't proceed until we finish with the stripe. 4005 */ 4006 int must_retry = 0; 4007 spin_lock_irq(&conf->device_lock); 4008 if (mddev->delta_disks < 0 4009 ? logical_sector >= conf->reshape_progress 4010 : logical_sector < conf->reshape_progress) 4011 /* mismatch, need to try again */ 4012 must_retry = 1; 4013 spin_unlock_irq(&conf->device_lock); 4014 if (must_retry) { 4015 release_stripe(sh); 4016 schedule(); 4017 goto retry; 4018 } 4019 } 4020 4021 if (bio_data_dir(bi) == WRITE && 4022 logical_sector >= mddev->suspend_lo && 4023 logical_sector < mddev->suspend_hi) { 4024 release_stripe(sh); 4025 /* As the suspend_* range is controlled by 4026 * userspace, we want an interruptible 4027 * wait. 4028 */ 4029 flush_signals(current); 4030 prepare_to_wait(&conf->wait_for_overlap, 4031 &w, TASK_INTERRUPTIBLE); 4032 if (logical_sector >= mddev->suspend_lo && 4033 logical_sector < mddev->suspend_hi) 4034 schedule(); 4035 goto retry; 4036 } 4037 4038 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4039 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 4040 /* Stripe is busy expanding or 4041 * add failed due to overlap. Flush everything 4042 * and wait a while 4043 */ 4044 md_wakeup_thread(mddev->thread); 4045 release_stripe(sh); 4046 schedule(); 4047 goto retry; 4048 } 4049 finish_wait(&conf->wait_for_overlap, &w); 4050 set_bit(STRIPE_HANDLE, &sh->state); 4051 clear_bit(STRIPE_DELAYED, &sh->state); 4052 if ((bi->bi_rw & REQ_SYNC) && 4053 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4054 atomic_inc(&conf->preread_active_stripes); 4055 release_stripe(sh); 4056 } else { 4057 /* cannot get stripe for read-ahead, just give-up */ 4058 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4059 finish_wait(&conf->wait_for_overlap, &w); 4060 break; 4061 } 4062 4063 } 4064 if (!plugged) 4065 md_wakeup_thread(mddev->thread); 4066 4067 spin_lock_irq(&conf->device_lock); 4068 remaining = raid5_dec_bi_phys_segments(bi); 4069 spin_unlock_irq(&conf->device_lock); 4070 if (remaining == 0) { 4071 4072 if ( rw == WRITE ) 4073 md_write_end(mddev); 4074 4075 bio_endio(bi, 0); 4076 } 4077 4078 return 0; 4079 } 4080 4081 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); 4082 4083 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) 4084 { 4085 /* reshaping is quite different to recovery/resync so it is 4086 * handled quite separately ... here. 4087 * 4088 * On each call to sync_request, we gather one chunk worth of 4089 * destination stripes and flag them as expanding. 
4090 * Then we find all the source stripes and request reads. 4091 * As the reads complete, handle_stripe will copy the data 4092 * into the destination stripe and release that stripe. 4093 */ 4094 raid5_conf_t *conf = mddev->private; 4095 struct stripe_head *sh; 4096 sector_t first_sector, last_sector; 4097 int raid_disks = conf->previous_raid_disks; 4098 int data_disks = raid_disks - conf->max_degraded; 4099 int new_data_disks = conf->raid_disks - conf->max_degraded; 4100 int i; 4101 int dd_idx; 4102 sector_t writepos, readpos, safepos; 4103 sector_t stripe_addr; 4104 int reshape_sectors; 4105 struct list_head stripes; 4106 4107 if (sector_nr == 0) { 4108 /* If restarting in the middle, skip the initial sectors */ 4109 if (mddev->delta_disks < 0 && 4110 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4111 sector_nr = raid5_size(mddev, 0, 0) 4112 - conf->reshape_progress; 4113 } else if (mddev->delta_disks >= 0 && 4114 conf->reshape_progress > 0) 4115 sector_nr = conf->reshape_progress; 4116 sector_div(sector_nr, new_data_disks); 4117 if (sector_nr) { 4118 mddev->curr_resync_completed = sector_nr; 4119 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4120 *skipped = 1; 4121 return sector_nr; 4122 } 4123 } 4124 4125 /* We need to process a full chunk at a time. 4126 * If old and new chunk sizes differ, we need to process the 4127 * largest of these 4128 */ 4129 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4130 reshape_sectors = mddev->new_chunk_sectors; 4131 else 4132 reshape_sectors = mddev->chunk_sectors; 4133 4134 /* we update the metadata when there is more than 3Meg 4135 * in the block range (that is rather arbitrary, should 4136 * probably be time based) or when the data about to be 4137 * copied would over-write the source of the data at 4138 * the front of the range. 4139 * i.e. one new_stripe along from reshape_progress new_maps 4140 * to after where reshape_safe old_maps to 4141 */ 4142 writepos = conf->reshape_progress; 4143 sector_div(writepos, new_data_disks); 4144 readpos = conf->reshape_progress; 4145 sector_div(readpos, data_disks); 4146 safepos = conf->reshape_safe; 4147 sector_div(safepos, data_disks); 4148 if (mddev->delta_disks < 0) { 4149 writepos -= min_t(sector_t, reshape_sectors, writepos); 4150 readpos += reshape_sectors; 4151 safepos += reshape_sectors; 4152 } else { 4153 writepos += reshape_sectors; 4154 readpos -= min_t(sector_t, reshape_sectors, readpos); 4155 safepos -= min_t(sector_t, reshape_sectors, safepos); 4156 } 4157 4158 /* 'writepos' is the most advanced device address we might write. 4159 * 'readpos' is the least advanced device address we might read. 4160 * 'safepos' is the least address recorded in the metadata as having 4161 * been reshaped. 4162 * If 'readpos' is behind 'writepos', then there is no way that we can 4163 * ensure safety in the face of a crash - that must be done by userspace 4164 * making a backup of the data. So in that case there is no particular 4165 * rush to update metadata. 4166 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4167 * update the metadata to advance 'safepos' to match 'readpos' so that 4168 * we can be safe in the event of a crash. 4169 * So we insist on updating metadata if safepos is behind writepos and 4170 * readpos is beyond writepos. 4171 * In any case, update the metadata every 10 seconds. 4172 * Maybe that number should be configurable, but I'm not sure it is 4173 * worth it.... maybe it could be a multiple of safemode_delay??? 
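 *
 * In terms of the test just below: for a grow we checkpoint when
 * safepos < writepos && readpos > writepos, for a shrink the
 * comparisons are reversed, and in either case at least once every
 * 10*HZ jiffies since the last reshape_checkpoint.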
4174 */ 4175 if ((mddev->delta_disks < 0 4176 ? (safepos > writepos && readpos < writepos) 4177 : (safepos < writepos && readpos > writepos)) || 4178 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4179 /* Cannot proceed until we've updated the superblock... */ 4180 wait_event(conf->wait_for_overlap, 4181 atomic_read(&conf->reshape_stripes)==0); 4182 mddev->reshape_position = conf->reshape_progress; 4183 mddev->curr_resync_completed = sector_nr; 4184 conf->reshape_checkpoint = jiffies; 4185 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4186 md_wakeup_thread(mddev->thread); 4187 wait_event(mddev->sb_wait, mddev->flags == 0 || 4188 kthread_should_stop()); 4189 spin_lock_irq(&conf->device_lock); 4190 conf->reshape_safe = mddev->reshape_position; 4191 spin_unlock_irq(&conf->device_lock); 4192 wake_up(&conf->wait_for_overlap); 4193 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4194 } 4195 4196 if (mddev->delta_disks < 0) { 4197 BUG_ON(conf->reshape_progress == 0); 4198 stripe_addr = writepos; 4199 BUG_ON((mddev->dev_sectors & 4200 ~((sector_t)reshape_sectors - 1)) 4201 - reshape_sectors - stripe_addr 4202 != sector_nr); 4203 } else { 4204 BUG_ON(writepos != sector_nr + reshape_sectors); 4205 stripe_addr = sector_nr; 4206 } 4207 INIT_LIST_HEAD(&stripes); 4208 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4209 int j; 4210 int skipped_disk = 0; 4211 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4212 set_bit(STRIPE_EXPANDING, &sh->state); 4213 atomic_inc(&conf->reshape_stripes); 4214 /* If any of this stripe is beyond the end of the old 4215 * array, then we need to zero those blocks 4216 */ 4217 for (j=sh->disks; j--;) { 4218 sector_t s; 4219 if (j == sh->pd_idx) 4220 continue; 4221 if (conf->level == 6 && 4222 j == sh->qd_idx) 4223 continue; 4224 s = compute_blocknr(sh, j, 0); 4225 if (s < raid5_size(mddev, 0, 0)) { 4226 skipped_disk = 1; 4227 continue; 4228 } 4229 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4230 set_bit(R5_Expanded, &sh->dev[j].flags); 4231 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4232 } 4233 if (!skipped_disk) { 4234 set_bit(STRIPE_EXPAND_READY, &sh->state); 4235 set_bit(STRIPE_HANDLE, &sh->state); 4236 } 4237 list_add(&sh->lru, &stripes); 4238 } 4239 spin_lock_irq(&conf->device_lock); 4240 if (mddev->delta_disks < 0) 4241 conf->reshape_progress -= reshape_sectors * new_data_disks; 4242 else 4243 conf->reshape_progress += reshape_sectors * new_data_disks; 4244 spin_unlock_irq(&conf->device_lock); 4245 /* Ok, those stripe are ready. We can start scheduling 4246 * reads on the source stripes. 4247 * The source stripes are determined by mapping the first and last 4248 * block on the destination stripes. 
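 *
 * Concretely: the logical blocks mapped are stripe_addr * new_data_disks
 * and (stripe_addr + reshape_sectors) * new_data_disks - 1, each fed to
 * raid5_compute_sector() with 'previous' set so that they resolve to
 * device sectors in the old geometry.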
4249 */ 4250 first_sector = 4251 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4252 1, &dd_idx, NULL); 4253 last_sector = 4254 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4255 * new_data_disks - 1), 4256 1, &dd_idx, NULL); 4257 if (last_sector >= mddev->dev_sectors) 4258 last_sector = mddev->dev_sectors - 1; 4259 while (first_sector <= last_sector) { 4260 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4261 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4262 set_bit(STRIPE_HANDLE, &sh->state); 4263 release_stripe(sh); 4264 first_sector += STRIPE_SECTORS; 4265 } 4266 /* Now that the sources are clearly marked, we can release 4267 * the destination stripes 4268 */ 4269 while (!list_empty(&stripes)) { 4270 sh = list_entry(stripes.next, struct stripe_head, lru); 4271 list_del_init(&sh->lru); 4272 release_stripe(sh); 4273 } 4274 /* If this takes us to the resync_max point where we have to pause, 4275 * then we need to write out the superblock. 4276 */ 4277 sector_nr += reshape_sectors; 4278 if ((sector_nr - mddev->curr_resync_completed) * 2 4279 >= mddev->resync_max - mddev->curr_resync_completed) { 4280 /* Cannot proceed until we've updated the superblock... */ 4281 wait_event(conf->wait_for_overlap, 4282 atomic_read(&conf->reshape_stripes) == 0); 4283 mddev->reshape_position = conf->reshape_progress; 4284 mddev->curr_resync_completed = sector_nr; 4285 conf->reshape_checkpoint = jiffies; 4286 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4287 md_wakeup_thread(mddev->thread); 4288 wait_event(mddev->sb_wait, 4289 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4290 || kthread_should_stop()); 4291 spin_lock_irq(&conf->device_lock); 4292 conf->reshape_safe = mddev->reshape_position; 4293 spin_unlock_irq(&conf->device_lock); 4294 wake_up(&conf->wait_for_overlap); 4295 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4296 } 4297 return reshape_sectors; 4298 } 4299 4300 /* FIXME go_faster isn't used */ 4301 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4302 { 4303 raid5_conf_t *conf = mddev->private; 4304 struct stripe_head *sh; 4305 sector_t max_sector = mddev->dev_sectors; 4306 sector_t sync_blocks; 4307 int still_degraded = 0; 4308 int i; 4309 4310 if (sector_nr >= max_sector) { 4311 /* just being told to finish up .. nothing much to do */ 4312 4313 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4314 end_reshape(conf); 4315 return 0; 4316 } 4317 4318 if (mddev->curr_resync < max_sector) /* aborted */ 4319 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4320 &sync_blocks, 1); 4321 else /* completed sync */ 4322 conf->fullsync = 0; 4323 bitmap_close_sync(mddev->bitmap); 4324 4325 return 0; 4326 } 4327 4328 /* Allow raid5_quiesce to complete */ 4329 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4330 4331 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4332 return reshape_request(mddev, sector_nr, skipped); 4333 4334 /* No need to check resync_max as we never do more than one 4335 * stripe, and as resync_max will always be on a chunk boundary, 4336 * if the check in md_do_sync didn't fire, there is no chance 4337 * of overstepping resync_max here 4338 */ 4339 4340 /* if there is too many failed drives and we are trying 4341 * to resync, then assert that we are finished, because there is 4342 * nothing we can do. 
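 *
 * "Too many" here means mddev->degraded >= conf->max_degraded, i.e. at
 * least one failed device for raid4/5 or at least two for raid6; the
 * untouched remainder of the range is simply reported back as skipped.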
4343 */ 4344 if (mddev->degraded >= conf->max_degraded && 4345 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4346 sector_t rv = mddev->dev_sectors - sector_nr; 4347 *skipped = 1; 4348 return rv; 4349 } 4350 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4351 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4352 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4353 /* we can skip this block, and probably more */ 4354 sync_blocks /= STRIPE_SECTORS; 4355 *skipped = 1; 4356 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4357 } 4358 4359 4360 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4361 4362 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4363 if (sh == NULL) { 4364 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4365 /* make sure we don't swamp the stripe cache if someone else 4366 * is trying to get access 4367 */ 4368 schedule_timeout_uninterruptible(1); 4369 } 4370 /* Need to check if array will still be degraded after recovery/resync. 4371 * We don't need to check the 'failed' flag as when that gets set, 4372 * recovery aborts. 4373 */ 4374 for (i = 0; i < conf->raid_disks; i++) 4375 if (conf->disks[i].rdev == NULL) 4376 still_degraded = 1; 4377 4378 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4379 4380 spin_lock(&sh->lock); 4381 set_bit(STRIPE_SYNCING, &sh->state); 4382 clear_bit(STRIPE_INSYNC, &sh->state); 4383 spin_unlock(&sh->lock); 4384 4385 handle_stripe(sh); 4386 release_stripe(sh); 4387 4388 return STRIPE_SECTORS; 4389 } 4390 4391 static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) 4392 { 4393 /* We may not be able to submit a whole bio at once as there 4394 * may not be enough stripe_heads available. 4395 * We cannot pre-allocate enough stripe_heads as we may need 4396 * more than exist in the cache (if we allow arbitrarily large chunks). 4397 * So we do one stripe head at a time and record in 4398 * ->bi_hw_segments how many have been done. 4399 * 4400 * We *know* that this entire raid_bio is in one chunk, so 4401 * it will use only one 'dd_idx' and need only one call to raid5_compute_sector.
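 *
 * The bookkeeping: scnt counts the stripes walked on this pass, while
 * the hw_segments field of the bio records how many were already
 * completed on an earlier pass, so a partially handled bio can be
 * parked in conf->retry_read_aligned and resumed later without
 * repeating work.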
4402 */ 4403 struct stripe_head *sh; 4404 int dd_idx; 4405 sector_t sector, logical_sector, last_sector; 4406 int scnt = 0; 4407 int remaining; 4408 int handled = 0; 4409 4410 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4411 sector = raid5_compute_sector(conf, logical_sector, 4412 0, &dd_idx, NULL); 4413 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4414 4415 for (; logical_sector < last_sector; 4416 logical_sector += STRIPE_SECTORS, 4417 sector += STRIPE_SECTORS, 4418 scnt++) { 4419 4420 if (scnt < raid5_bi_hw_segments(raid_bio)) 4421 /* already done this stripe */ 4422 continue; 4423 4424 sh = get_active_stripe(conf, sector, 0, 1, 0); 4425 4426 if (!sh) { 4427 /* failed to get a stripe - must wait */ 4428 raid5_set_bi_hw_segments(raid_bio, scnt); 4429 conf->retry_read_aligned = raid_bio; 4430 return handled; 4431 } 4432 4433 set_bit(R5_ReadError, &sh->dev[dd_idx].flags); 4434 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4435 release_stripe(sh); 4436 raid5_set_bi_hw_segments(raid_bio, scnt); 4437 conf->retry_read_aligned = raid_bio; 4438 return handled; 4439 } 4440 4441 handle_stripe(sh); 4442 release_stripe(sh); 4443 handled++; 4444 } 4445 spin_lock_irq(&conf->device_lock); 4446 remaining = raid5_dec_bi_phys_segments(raid_bio); 4447 spin_unlock_irq(&conf->device_lock); 4448 if (remaining == 0) 4449 bio_endio(raid_bio, 0); 4450 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4451 wake_up(&conf->wait_for_stripe); 4452 return handled; 4453 } 4454 4455 4456 /* 4457 * This is our raid5 kernel thread. 4458 * 4459 * We scan the hash table for stripes which can be handled now. 4460 * During the scan, completed stripes are saved for us by the interrupt 4461 * handler, so that they will not have to wait for our next wakeup. 
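 *
 * Each pass of the loop below, in order: flushes batched bitmap
 * updates once the array is no longer plugged, activates delayed
 * stripes, resubmits any queued aligned-read retries, and finally
 * pulls the next stripe from __get_priority_stripe() and handles it.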
4462 */ 4463 static void raid5d(mddev_t *mddev) 4464 { 4465 struct stripe_head *sh; 4466 raid5_conf_t *conf = mddev->private; 4467 int handled; 4468 struct blk_plug plug; 4469 4470 pr_debug("+++ raid5d active\n"); 4471 4472 md_check_recovery(mddev); 4473 4474 blk_start_plug(&plug); 4475 handled = 0; 4476 spin_lock_irq(&conf->device_lock); 4477 while (1) { 4478 struct bio *bio; 4479 4480 if (atomic_read(&mddev->plug_cnt) == 0 && 4481 !list_empty(&conf->bitmap_list)) { 4482 /* Now is a good time to flush some bitmap updates */ 4483 conf->seq_flush++; 4484 spin_unlock_irq(&conf->device_lock); 4485 bitmap_unplug(mddev->bitmap); 4486 spin_lock_irq(&conf->device_lock); 4487 conf->seq_write = conf->seq_flush; 4488 activate_bit_delay(conf); 4489 } 4490 if (atomic_read(&mddev->plug_cnt) == 0) 4491 raid5_activate_delayed(conf); 4492 4493 while ((bio = remove_bio_from_retry(conf))) { 4494 int ok; 4495 spin_unlock_irq(&conf->device_lock); 4496 ok = retry_aligned_read(conf, bio); 4497 spin_lock_irq(&conf->device_lock); 4498 if (!ok) 4499 break; 4500 handled++; 4501 } 4502 4503 sh = __get_priority_stripe(conf); 4504 4505 if (!sh) 4506 break; 4507 spin_unlock_irq(&conf->device_lock); 4508 4509 handled++; 4510 handle_stripe(sh); 4511 release_stripe(sh); 4512 cond_resched(); 4513 4514 spin_lock_irq(&conf->device_lock); 4515 } 4516 pr_debug("%d stripes handled\n", handled); 4517 4518 spin_unlock_irq(&conf->device_lock); 4519 4520 async_tx_issue_pending_all(); 4521 blk_finish_plug(&plug); 4522 4523 pr_debug("--- raid5d inactive\n"); 4524 } 4525 4526 static ssize_t 4527 raid5_show_stripe_cache_size(mddev_t *mddev, char *page) 4528 { 4529 raid5_conf_t *conf = mddev->private; 4530 if (conf) 4531 return sprintf(page, "%d\n", conf->max_nr_stripes); 4532 else 4533 return 0; 4534 } 4535 4536 int 4537 raid5_set_cache_size(mddev_t *mddev, int size) 4538 { 4539 raid5_conf_t *conf = mddev->private; 4540 int err; 4541 4542 if (size <= 16 || size > 32768) 4543 return -EINVAL; 4544 while (size < conf->max_nr_stripes) { 4545 if (drop_one_stripe(conf)) 4546 conf->max_nr_stripes--; 4547 else 4548 break; 4549 } 4550 err = md_allow_write(mddev); 4551 if (err) 4552 return err; 4553 while (size > conf->max_nr_stripes) { 4554 if (grow_one_stripe(conf)) 4555 conf->max_nr_stripes++; 4556 else break; 4557 } 4558 return 0; 4559 } 4560 EXPORT_SYMBOL(raid5_set_cache_size); 4561 4562 static ssize_t 4563 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4564 { 4565 raid5_conf_t *conf = mddev->private; 4566 unsigned long new; 4567 int err; 4568 4569 if (len >= PAGE_SIZE) 4570 return -EINVAL; 4571 if (!conf) 4572 return -ENODEV; 4573 4574 if (strict_strtoul(page, 10, &new)) 4575 return -EINVAL; 4576 err = raid5_set_cache_size(mddev, new); 4577 if (err) 4578 return err; 4579 return len; 4580 } 4581 4582 static struct md_sysfs_entry 4583 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4584 raid5_show_stripe_cache_size, 4585 raid5_store_stripe_cache_size); 4586 4587 static ssize_t 4588 raid5_show_preread_threshold(mddev_t *mddev, char *page) 4589 { 4590 raid5_conf_t *conf = mddev->private; 4591 if (conf) 4592 return sprintf(page, "%d\n", conf->bypass_threshold); 4593 else 4594 return 0; 4595 } 4596 4597 static ssize_t 4598 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) 4599 { 4600 raid5_conf_t *conf = mddev->private; 4601 unsigned long new; 4602 if (len >= PAGE_SIZE) 4603 return -EINVAL; 4604 if (!conf) 4605 return -ENODEV; 4606 4607 if (strict_strtoul(page, 10, 
&new)) 4608 return -EINVAL; 4609 if (new > conf->max_nr_stripes) 4610 return -EINVAL; 4611 conf->bypass_threshold = new; 4612 return len; 4613 } 4614 4615 static struct md_sysfs_entry 4616 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4617 S_IRUGO | S_IWUSR, 4618 raid5_show_preread_threshold, 4619 raid5_store_preread_threshold); 4620 4621 static ssize_t 4622 stripe_cache_active_show(mddev_t *mddev, char *page) 4623 { 4624 raid5_conf_t *conf = mddev->private; 4625 if (conf) 4626 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4627 else 4628 return 0; 4629 } 4630 4631 static struct md_sysfs_entry 4632 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4633 4634 static struct attribute *raid5_attrs[] = { 4635 &raid5_stripecache_size.attr, 4636 &raid5_stripecache_active.attr, 4637 &raid5_preread_bypass_threshold.attr, 4638 NULL, 4639 }; 4640 static struct attribute_group raid5_attrs_group = { 4641 .name = NULL, 4642 .attrs = raid5_attrs, 4643 }; 4644 4645 static sector_t 4646 raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) 4647 { 4648 raid5_conf_t *conf = mddev->private; 4649 4650 if (!sectors) 4651 sectors = mddev->dev_sectors; 4652 if (!raid_disks) 4653 /* size is defined by the smallest of previous and new size */ 4654 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4655 4656 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4657 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4658 return sectors * (raid_disks - conf->max_degraded); 4659 } 4660 4661 static void raid5_free_percpu(raid5_conf_t *conf) 4662 { 4663 struct raid5_percpu *percpu; 4664 unsigned long cpu; 4665 4666 if (!conf->percpu) 4667 return; 4668 4669 get_online_cpus(); 4670 for_each_possible_cpu(cpu) { 4671 percpu = per_cpu_ptr(conf->percpu, cpu); 4672 safe_put_page(percpu->spare_page); 4673 kfree(percpu->scribble); 4674 } 4675 #ifdef CONFIG_HOTPLUG_CPU 4676 unregister_cpu_notifier(&conf->cpu_notify); 4677 #endif 4678 put_online_cpus(); 4679 4680 free_percpu(conf->percpu); 4681 } 4682 4683 static void free_conf(raid5_conf_t *conf) 4684 { 4685 shrink_stripes(conf); 4686 raid5_free_percpu(conf); 4687 kfree(conf->disks); 4688 kfree(conf->stripe_hashtbl); 4689 kfree(conf); 4690 } 4691 4692 #ifdef CONFIG_HOTPLUG_CPU 4693 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4694 void *hcpu) 4695 { 4696 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); 4697 long cpu = (long)hcpu; 4698 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4699 4700 switch (action) { 4701 case CPU_UP_PREPARE: 4702 case CPU_UP_PREPARE_FROZEN: 4703 if (conf->level == 6 && !percpu->spare_page) 4704 percpu->spare_page = alloc_page(GFP_KERNEL); 4705 if (!percpu->scribble) 4706 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4707 4708 if (!percpu->scribble || 4709 (conf->level == 6 && !percpu->spare_page)) { 4710 safe_put_page(percpu->spare_page); 4711 kfree(percpu->scribble); 4712 pr_err("%s: failed memory allocation for cpu%ld\n", 4713 __func__, cpu); 4714 return notifier_from_errno(-ENOMEM); 4715 } 4716 break; 4717 case CPU_DEAD: 4718 case CPU_DEAD_FROZEN: 4719 safe_put_page(percpu->spare_page); 4720 kfree(percpu->scribble); 4721 percpu->spare_page = NULL; 4722 percpu->scribble = NULL; 4723 break; 4724 default: 4725 break; 4726 } 4727 return NOTIFY_OK; 4728 } 4729 #endif 4730 4731 static int raid5_alloc_percpu(raid5_conf_t *conf) 4732 { 4733 unsigned long cpu; 4734 struct page *spare_page; 4735 struct raid5_percpu 
__percpu *allcpus; 4736 void *scribble; 4737 int err; 4738 4739 allcpus = alloc_percpu(struct raid5_percpu); 4740 if (!allcpus) 4741 return -ENOMEM; 4742 conf->percpu = allcpus; 4743 4744 get_online_cpus(); 4745 err = 0; 4746 for_each_present_cpu(cpu) { 4747 if (conf->level == 6) { 4748 spare_page = alloc_page(GFP_KERNEL); 4749 if (!spare_page) { 4750 err = -ENOMEM; 4751 break; 4752 } 4753 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4754 } 4755 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4756 if (!scribble) { 4757 err = -ENOMEM; 4758 break; 4759 } 4760 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4761 } 4762 #ifdef CONFIG_HOTPLUG_CPU 4763 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4764 conf->cpu_notify.priority = 0; 4765 if (err == 0) 4766 err = register_cpu_notifier(&conf->cpu_notify); 4767 #endif 4768 put_online_cpus(); 4769 4770 return err; 4771 } 4772 4773 static raid5_conf_t *setup_conf(mddev_t *mddev) 4774 { 4775 raid5_conf_t *conf; 4776 int raid_disk, memory, max_disks; 4777 mdk_rdev_t *rdev; 4778 struct disk_info *disk; 4779 4780 if (mddev->new_level != 5 4781 && mddev->new_level != 4 4782 && mddev->new_level != 6) { 4783 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4784 mdname(mddev), mddev->new_level); 4785 return ERR_PTR(-EIO); 4786 } 4787 if ((mddev->new_level == 5 4788 && !algorithm_valid_raid5(mddev->new_layout)) || 4789 (mddev->new_level == 6 4790 && !algorithm_valid_raid6(mddev->new_layout))) { 4791 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4792 mdname(mddev), mddev->new_layout); 4793 return ERR_PTR(-EIO); 4794 } 4795 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4796 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4797 mdname(mddev), mddev->raid_disks); 4798 return ERR_PTR(-EINVAL); 4799 } 4800 4801 if (!mddev->new_chunk_sectors || 4802 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4803 !is_power_of_2(mddev->new_chunk_sectors)) { 4804 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4805 mdname(mddev), mddev->new_chunk_sectors << 9); 4806 return ERR_PTR(-EINVAL); 4807 } 4808 4809 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); 4810 if (conf == NULL) 4811 goto abort; 4812 spin_lock_init(&conf->device_lock); 4813 init_waitqueue_head(&conf->wait_for_stripe); 4814 init_waitqueue_head(&conf->wait_for_overlap); 4815 INIT_LIST_HEAD(&conf->handle_list); 4816 INIT_LIST_HEAD(&conf->hold_list); 4817 INIT_LIST_HEAD(&conf->delayed_list); 4818 INIT_LIST_HEAD(&conf->bitmap_list); 4819 INIT_LIST_HEAD(&conf->inactive_list); 4820 atomic_set(&conf->active_stripes, 0); 4821 atomic_set(&conf->preread_active_stripes, 0); 4822 atomic_set(&conf->active_aligned_reads, 0); 4823 conf->bypass_threshold = BYPASS_THRESHOLD; 4824 4825 conf->raid_disks = mddev->raid_disks; 4826 if (mddev->reshape_position == MaxSector) 4827 conf->previous_raid_disks = mddev->raid_disks; 4828 else 4829 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4830 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 4831 conf->scribble_len = scribble_len(max_disks); 4832 4833 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 4834 GFP_KERNEL); 4835 if (!conf->disks) 4836 goto abort; 4837 4838 conf->mddev = mddev; 4839 4840 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4841 goto abort; 4842 4843 conf->level = mddev->new_level; 4844 if (raid5_alloc_percpu(conf) != 0) 4845 goto abort; 4846 4847 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4848 
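/* Bind every device that already owns a raid_disk slot to its entry in
 * conf->disks[]; a member that is not In_sync means the bitmap cannot
 * be relied on, so a full resync is forced via conf->fullsync.
 */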
4849 list_for_each_entry(rdev, &mddev->disks, same_set) { 4850 raid_disk = rdev->raid_disk; 4851 if (raid_disk >= max_disks 4852 || raid_disk < 0) 4853 continue; 4854 disk = conf->disks + raid_disk; 4855 4856 disk->rdev = rdev; 4857 4858 if (test_bit(In_sync, &rdev->flags)) { 4859 char b[BDEVNAME_SIZE]; 4860 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4861 " disk %d\n", 4862 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4863 } else 4864 /* Cannot rely on bitmap to complete recovery */ 4865 conf->fullsync = 1; 4866 } 4867 4868 conf->chunk_sectors = mddev->new_chunk_sectors; 4869 conf->level = mddev->new_level; 4870 if (conf->level == 6) 4871 conf->max_degraded = 2; 4872 else 4873 conf->max_degraded = 1; 4874 conf->algorithm = mddev->new_layout; 4875 conf->max_nr_stripes = NR_STRIPES; 4876 conf->reshape_progress = mddev->reshape_position; 4877 if (conf->reshape_progress != MaxSector) { 4878 conf->prev_chunk_sectors = mddev->chunk_sectors; 4879 conf->prev_algo = mddev->layout; 4880 } 4881 4882 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4883 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4884 if (grow_stripes(conf, conf->max_nr_stripes)) { 4885 printk(KERN_ERR 4886 "md/raid:%s: couldn't allocate %dkB for buffers\n", 4887 mdname(mddev), memory); 4888 goto abort; 4889 } else 4890 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4891 mdname(mddev), memory); 4892 4893 conf->thread = md_register_thread(raid5d, mddev, NULL); 4894 if (!conf->thread) { 4895 printk(KERN_ERR 4896 "md/raid:%s: couldn't allocate thread.\n", 4897 mdname(mddev)); 4898 goto abort; 4899 } 4900 4901 return conf; 4902 4903 abort: 4904 if (conf) { 4905 free_conf(conf); 4906 return ERR_PTR(-EIO); 4907 } else 4908 return ERR_PTR(-ENOMEM); 4909 } 4910 4911 4912 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 4913 { 4914 switch (algo) { 4915 case ALGORITHM_PARITY_0: 4916 if (raid_disk < max_degraded) 4917 return 1; 4918 break; 4919 case ALGORITHM_PARITY_N: 4920 if (raid_disk >= raid_disks - max_degraded) 4921 return 1; 4922 break; 4923 case ALGORITHM_PARITY_0_6: 4924 if (raid_disk == 0 || 4925 raid_disk == raid_disks - 1) 4926 return 1; 4927 break; 4928 case ALGORITHM_LEFT_ASYMMETRIC_6: 4929 case ALGORITHM_RIGHT_ASYMMETRIC_6: 4930 case ALGORITHM_LEFT_SYMMETRIC_6: 4931 case ALGORITHM_RIGHT_SYMMETRIC_6: 4932 if (raid_disk == raid_disks - 1) 4933 return 1; 4934 } 4935 return 0; 4936 } 4937 4938 static int run(mddev_t *mddev) 4939 { 4940 raid5_conf_t *conf; 4941 int working_disks = 0; 4942 int dirty_parity_disks = 0; 4943 mdk_rdev_t *rdev; 4944 sector_t reshape_offset = 0; 4945 4946 if (mddev->recovery_cp != MaxSector) 4947 printk(KERN_NOTICE "md/raid:%s: not clean" 4948 " -- starting background reconstruction\n", 4949 mdname(mddev)); 4950 if (mddev->reshape_position != MaxSector) { 4951 /* Check that we can continue the reshape. 4952 * Currently only disks can change, it must 4953 * increase, and we must be past the point where 4954 * a stripe over-writes itself 4955 */ 4956 sector_t here_new, here_old; 4957 int old_disks; 4958 int max_degraded = (mddev->level == 6 ? 
2 : 1); 4959 4960 if (mddev->new_level != mddev->level) { 4961 printk(KERN_ERR "md/raid:%s: unsupported reshape " 4962 "required - aborting.\n", 4963 mdname(mddev)); 4964 return -EINVAL; 4965 } 4966 old_disks = mddev->raid_disks - mddev->delta_disks; 4967 /* reshape_position must be on a new-stripe boundary, and one 4968 * further up in new geometry must map after here in old 4969 * geometry. 4970 */ 4971 here_new = mddev->reshape_position; 4972 if (sector_div(here_new, mddev->new_chunk_sectors * 4973 (mddev->raid_disks - max_degraded))) { 4974 printk(KERN_ERR "md/raid:%s: reshape_position not " 4975 "on a stripe boundary\n", mdname(mddev)); 4976 return -EINVAL; 4977 } 4978 reshape_offset = here_new * mddev->new_chunk_sectors; 4979 /* here_new is the stripe we will write to */ 4980 here_old = mddev->reshape_position; 4981 sector_div(here_old, mddev->chunk_sectors * 4982 (old_disks-max_degraded)); 4983 /* here_old is the first stripe that we might need to read 4984 * from */ 4985 if (mddev->delta_disks == 0) { 4986 /* We cannot be sure it is safe to start an in-place 4987 * reshape. It is only safe if user-space if monitoring 4988 * and taking constant backups. 4989 * mdadm always starts a situation like this in 4990 * readonly mode so it can take control before 4991 * allowing any writes. So just check for that. 4992 */ 4993 if ((here_new * mddev->new_chunk_sectors != 4994 here_old * mddev->chunk_sectors) || 4995 mddev->ro == 0) { 4996 printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 4997 " in read-only mode - aborting\n", 4998 mdname(mddev)); 4999 return -EINVAL; 5000 } 5001 } else if (mddev->delta_disks < 0 5002 ? (here_new * mddev->new_chunk_sectors <= 5003 here_old * mddev->chunk_sectors) 5004 : (here_new * mddev->new_chunk_sectors >= 5005 here_old * mddev->chunk_sectors)) { 5006 /* Reading from the same stripe as writing to - bad */ 5007 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5008 "auto-recovery - aborting.\n", 5009 mdname(mddev)); 5010 return -EINVAL; 5011 } 5012 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5013 mdname(mddev)); 5014 /* OK, we should be able to continue; */ 5015 } else { 5016 BUG_ON(mddev->level != mddev->new_level); 5017 BUG_ON(mddev->layout != mddev->new_layout); 5018 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5019 BUG_ON(mddev->delta_disks != 0); 5020 } 5021 5022 if (mddev->private == NULL) 5023 conf = setup_conf(mddev); 5024 else 5025 conf = mddev->private; 5026 5027 if (IS_ERR(conf)) 5028 return PTR_ERR(conf); 5029 5030 mddev->thread = conf->thread; 5031 conf->thread = NULL; 5032 mddev->private = conf; 5033 5034 /* 5035 * 0 for a fully functional array, 1 or 2 for a degraded array. 5036 */ 5037 list_for_each_entry(rdev, &mddev->disks, same_set) { 5038 if (rdev->raid_disk < 0) 5039 continue; 5040 if (test_bit(In_sync, &rdev->flags)) { 5041 working_disks++; 5042 continue; 5043 } 5044 /* This disc is not fully in-sync. However if it 5045 * just stored parity (beyond the recovery_offset), 5046 * when we don't need to be concerned about the 5047 * array being dirty. 5048 * When reshape goes 'backwards', we never have 5049 * partially completed devices, so we only need 5050 * to worry about reshape going forwards. 5051 */ 5052 /* Hack because v0.91 doesn't store recovery_offset properly. 
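 * For such superblocks (major 0, minor > 90) the code below simply
 * assumes the device has been recovered up to the current reshape
 * position.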
*/ 5053 if (mddev->major_version == 0 && 5054 mddev->minor_version > 90) 5055 rdev->recovery_offset = reshape_offset; 5056 5057 if (rdev->recovery_offset < reshape_offset) { 5058 /* We need to check old and new layout */ 5059 if (!only_parity(rdev->raid_disk, 5060 conf->algorithm, 5061 conf->raid_disks, 5062 conf->max_degraded)) 5063 continue; 5064 } 5065 if (!only_parity(rdev->raid_disk, 5066 conf->prev_algo, 5067 conf->previous_raid_disks, 5068 conf->max_degraded)) 5069 continue; 5070 dirty_parity_disks++; 5071 } 5072 5073 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) 5074 - working_disks); 5075 5076 if (has_failed(conf)) { 5077 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5078 " (%d/%d failed)\n", 5079 mdname(mddev), mddev->degraded, conf->raid_disks); 5080 goto abort; 5081 } 5082 5083 /* device size must be a multiple of chunk size */ 5084 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5085 mddev->resync_max_sectors = mddev->dev_sectors; 5086 5087 if (mddev->degraded > dirty_parity_disks && 5088 mddev->recovery_cp != MaxSector) { 5089 if (mddev->ok_start_degraded) 5090 printk(KERN_WARNING 5091 "md/raid:%s: starting dirty degraded array" 5092 " - data corruption possible.\n", 5093 mdname(mddev)); 5094 else { 5095 printk(KERN_ERR 5096 "md/raid:%s: cannot start dirty degraded array.\n", 5097 mdname(mddev)); 5098 goto abort; 5099 } 5100 } 5101 5102 if (mddev->degraded == 0) 5103 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5104 " devices, algorithm %d\n", mdname(mddev), conf->level, 5105 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5106 mddev->new_layout); 5107 else 5108 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5109 " out of %d devices, algorithm %d\n", 5110 mdname(mddev), conf->level, 5111 mddev->raid_disks - mddev->degraded, 5112 mddev->raid_disks, mddev->new_layout); 5113 5114 print_raid5_conf(conf); 5115 5116 if (conf->reshape_progress != MaxSector) { 5117 conf->reshape_safe = conf->reshape_progress; 5118 atomic_set(&conf->reshape_stripes, 0); 5119 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5120 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5121 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5122 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5123 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5124 "reshape"); 5125 } 5126 5127 5128 /* Ok, everything is just fine now */ 5129 if (mddev->to_remove == &raid5_attrs_group) 5130 mddev->to_remove = NULL; 5131 else if (mddev->kobj.sd && 5132 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5133 printk(KERN_WARNING 5134 "raid5: failed to create sysfs attributes for %s\n", 5135 mdname(mddev)); 5136 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5137 5138 if (mddev->queue) { 5139 int chunk_size; 5140 /* read-ahead size must cover two whole stripes, which 5141 * is 2 * (datadisks) * chunksize where 'n' is the 5142 * number of raid devices 5143 */ 5144 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5145 int stripe = data_disks * 5146 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5147 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5148 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5149 5150 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5151 5152 mddev->queue->backing_dev_info.congested_data = mddev; 5153 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5154 5155 chunk_size = mddev->chunk_sectors << 9; 5156 blk_queue_io_min(mddev->queue, chunk_size); 5157 
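/* Export the chunk as the minimum preferred I/O size and one full data
 * stripe (chunk size times the number of data disks) as the optimal
 * I/O size.
 */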
blk_queue_io_opt(mddev->queue, chunk_size * 5158 (conf->raid_disks - conf->max_degraded)); 5159 5160 list_for_each_entry(rdev, &mddev->disks, same_set) 5161 disk_stack_limits(mddev->gendisk, rdev->bdev, 5162 rdev->data_offset << 9); 5163 } 5164 5165 return 0; 5166 abort: 5167 md_unregister_thread(mddev->thread); 5168 mddev->thread = NULL; 5169 if (conf) { 5170 print_raid5_conf(conf); 5171 free_conf(conf); 5172 } 5173 mddev->private = NULL; 5174 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5175 return -EIO; 5176 } 5177 5178 static int stop(mddev_t *mddev) 5179 { 5180 raid5_conf_t *conf = mddev->private; 5181 5182 md_unregister_thread(mddev->thread); 5183 mddev->thread = NULL; 5184 if (mddev->queue) 5185 mddev->queue->backing_dev_info.congested_fn = NULL; 5186 free_conf(conf); 5187 mddev->private = NULL; 5188 mddev->to_remove = &raid5_attrs_group; 5189 return 0; 5190 } 5191 5192 #ifdef DEBUG 5193 static void print_sh(struct seq_file *seq, struct stripe_head *sh) 5194 { 5195 int i; 5196 5197 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", 5198 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 5199 seq_printf(seq, "sh %llu, count %d.\n", 5200 (unsigned long long)sh->sector, atomic_read(&sh->count)); 5201 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); 5202 for (i = 0; i < sh->disks; i++) { 5203 seq_printf(seq, "(cache%d: %p %ld) ", 5204 i, sh->dev[i].page, sh->dev[i].flags); 5205 } 5206 seq_printf(seq, "\n"); 5207 } 5208 5209 static void printall(struct seq_file *seq, raid5_conf_t *conf) 5210 { 5211 struct stripe_head *sh; 5212 struct hlist_node *hn; 5213 int i; 5214 5215 spin_lock_irq(&conf->device_lock); 5216 for (i = 0; i < NR_HASH; i++) { 5217 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 5218 if (sh->raid_conf != conf) 5219 continue; 5220 print_sh(seq, sh); 5221 } 5222 } 5223 spin_unlock_irq(&conf->device_lock); 5224 } 5225 #endif 5226 5227 static void status(struct seq_file *seq, mddev_t *mddev) 5228 { 5229 raid5_conf_t *conf = mddev->private; 5230 int i; 5231 5232 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5233 mddev->chunk_sectors / 2, mddev->layout); 5234 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5235 for (i = 0; i < conf->raid_disks; i++) 5236 seq_printf (seq, "%s", 5237 conf->disks[i].rdev && 5238 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 5239 seq_printf (seq, "]"); 5240 #ifdef DEBUG 5241 seq_printf (seq, "\n"); 5242 printall(seq, conf); 5243 #endif 5244 } 5245 5246 static void print_raid5_conf (raid5_conf_t *conf) 5247 { 5248 int i; 5249 struct disk_info *tmp; 5250 5251 printk(KERN_DEBUG "RAID conf printout:\n"); 5252 if (!conf) { 5253 printk("(conf==NULL)\n"); 5254 return; 5255 } 5256 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5257 conf->raid_disks, 5258 conf->raid_disks - conf->mddev->degraded); 5259 5260 for (i = 0; i < conf->raid_disks; i++) { 5261 char b[BDEVNAME_SIZE]; 5262 tmp = conf->disks + i; 5263 if (tmp->rdev) 5264 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5265 i, !test_bit(Faulty, &tmp->rdev->flags), 5266 bdevname(tmp->rdev->bdev, b)); 5267 } 5268 } 5269 5270 static int raid5_spare_active(mddev_t *mddev) 5271 { 5272 int i; 5273 raid5_conf_t *conf = mddev->private; 5274 struct disk_info *tmp; 5275 int count = 0; 5276 unsigned long flags; 5277 5278 for (i = 0; i < conf->raid_disks; i++) { 5279 tmp = conf->disks + i; 5280 if (tmp->rdev 5281 && tmp->rdev->recovery_offset == MaxSector 5282 && !test_bit(Faulty, &tmp->rdev->flags) 5283 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5284 count++; 5285 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5286 } 5287 } 5288 spin_lock_irqsave(&conf->device_lock, flags); 5289 mddev->degraded -= count; 5290 spin_unlock_irqrestore(&conf->device_lock, flags); 5291 print_raid5_conf(conf); 5292 return count; 5293 } 5294 5295 static int raid5_remove_disk(mddev_t *mddev, int number) 5296 { 5297 raid5_conf_t *conf = mddev->private; 5298 int err = 0; 5299 mdk_rdev_t *rdev; 5300 struct disk_info *p = conf->disks + number; 5301 5302 print_raid5_conf(conf); 5303 rdev = p->rdev; 5304 if (rdev) { 5305 if (number >= conf->raid_disks && 5306 conf->reshape_progress == MaxSector) 5307 clear_bit(In_sync, &rdev->flags); 5308 5309 if (test_bit(In_sync, &rdev->flags) || 5310 atomic_read(&rdev->nr_pending)) { 5311 err = -EBUSY; 5312 goto abort; 5313 } 5314 /* Only remove non-faulty devices if recovery 5315 * isn't possible. 5316 */ 5317 if (!test_bit(Faulty, &rdev->flags) && 5318 !has_failed(conf) && 5319 number < conf->raid_disks) { 5320 err = -EBUSY; 5321 goto abort; 5322 } 5323 p->rdev = NULL; 5324 synchronize_rcu(); 5325 if (atomic_read(&rdev->nr_pending)) { 5326 /* lost the race, try later */ 5327 err = -EBUSY; 5328 p->rdev = rdev; 5329 } 5330 } 5331 abort: 5332 5333 print_raid5_conf(conf); 5334 return err; 5335 } 5336 5337 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 5338 { 5339 raid5_conf_t *conf = mddev->private; 5340 int err = -EEXIST; 5341 int disk; 5342 struct disk_info *p; 5343 int first = 0; 5344 int last = conf->raid_disks - 1; 5345 5346 if (has_failed(conf)) 5347 /* no point adding a device */ 5348 return -EINVAL; 5349 5350 if (rdev->raid_disk >= 0) 5351 first = last = rdev->raid_disk; 5352 5353 /* 5354 * find the disk ... but prefer rdev->saved_raid_disk 5355 * if possible. 
5356 */ 5357 if (rdev->saved_raid_disk >= 0 && 5358 rdev->saved_raid_disk >= first && 5359 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5360 disk = rdev->saved_raid_disk; 5361 else 5362 disk = first; 5363 for ( ; disk <= last ; disk++) 5364 if ((p=conf->disks + disk)->rdev == NULL) { 5365 clear_bit(In_sync, &rdev->flags); 5366 rdev->raid_disk = disk; 5367 err = 0; 5368 if (rdev->saved_raid_disk != disk) 5369 conf->fullsync = 1; 5370 rcu_assign_pointer(p->rdev, rdev); 5371 break; 5372 } 5373 print_raid5_conf(conf); 5374 return err; 5375 } 5376 5377 static int raid5_resize(mddev_t *mddev, sector_t sectors) 5378 { 5379 /* no resync is happening, and there is enough space 5380 * on all devices, so we can resize. 5381 * We need to make sure resync covers any new space. 5382 * If the array is shrinking we should possibly wait until 5383 * any io in the removed space completes, but it hardly seems 5384 * worth it. 5385 */ 5386 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5387 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5388 mddev->raid_disks)); 5389 if (mddev->array_sectors > 5390 raid5_size(mddev, sectors, mddev->raid_disks)) 5391 return -EINVAL; 5392 set_capacity(mddev->gendisk, mddev->array_sectors); 5393 revalidate_disk(mddev->gendisk); 5394 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5395 mddev->recovery_cp = mddev->dev_sectors; 5396 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5397 } 5398 mddev->dev_sectors = sectors; 5399 mddev->resync_max_sectors = sectors; 5400 return 0; 5401 } 5402 5403 static int check_stripe_cache(mddev_t *mddev) 5404 { 5405 /* Can only proceed if there are plenty of stripe_heads. 5406 * We need a minimum of one full stripe,, and for sensible progress 5407 * it is best to have about 4 times that. 5408 * If we require 4 times, then the default 256 4K stripe_heads will 5409 * allow for chunk sizes up to 256K, which is probably OK. 5410 * If the chunk size is greater, user-space should request more 5411 * stripe_heads first. 5412 */ 5413 raid5_conf_t *conf = mddev->private; 5414 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5415 > conf->max_nr_stripes || 5416 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5417 > conf->max_nr_stripes) { 5418 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5419 mdname(mddev), 5420 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5421 / STRIPE_SIZE)*4); 5422 return 0; 5423 } 5424 return 1; 5425 } 5426 5427 static int check_reshape(mddev_t *mddev) 5428 { 5429 raid5_conf_t *conf = mddev->private; 5430 5431 if (mddev->delta_disks == 0 && 5432 mddev->new_layout == mddev->layout && 5433 mddev->new_chunk_sectors == mddev->chunk_sectors) 5434 return 0; /* nothing to do */ 5435 if (mddev->bitmap) 5436 /* Cannot grow a bitmap yet */ 5437 return -EBUSY; 5438 if (has_failed(conf)) 5439 return -EINVAL; 5440 if (mddev->delta_disks < 0) { 5441 /* We might be able to shrink, but the devices must 5442 * be made bigger first. 5443 * For raid6, 4 is the minimum size. 
5444 * Otherwise 2 is the minimum 5445 */ 5446 int min = 2; 5447 if (mddev->level == 6) 5448 min = 4; 5449 if (mddev->raid_disks + mddev->delta_disks < min) 5450 return -EINVAL; 5451 } 5452 5453 if (!check_stripe_cache(mddev)) 5454 return -ENOSPC; 5455 5456 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5457 } 5458 5459 static int raid5_start_reshape(mddev_t *mddev) 5460 { 5461 raid5_conf_t *conf = mddev->private; 5462 mdk_rdev_t *rdev; 5463 int spares = 0; 5464 unsigned long flags; 5465 5466 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5467 return -EBUSY; 5468 5469 if (!check_stripe_cache(mddev)) 5470 return -ENOSPC; 5471 5472 list_for_each_entry(rdev, &mddev->disks, same_set) 5473 if (!test_bit(In_sync, &rdev->flags) 5474 && !test_bit(Faulty, &rdev->flags)) 5475 spares++; 5476 5477 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5478 /* Not enough devices even to make a degraded array 5479 * of that size 5480 */ 5481 return -EINVAL; 5482 5483 /* Refuse to reduce size of the array. Any reductions in 5484 * array size must be through explicit setting of array_size 5485 * attribute. 5486 */ 5487 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5488 < mddev->array_sectors) { 5489 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5490 "before number of disks\n", mdname(mddev)); 5491 return -EINVAL; 5492 } 5493 5494 atomic_set(&conf->reshape_stripes, 0); 5495 spin_lock_irq(&conf->device_lock); 5496 conf->previous_raid_disks = conf->raid_disks; 5497 conf->raid_disks += mddev->delta_disks; 5498 conf->prev_chunk_sectors = conf->chunk_sectors; 5499 conf->chunk_sectors = mddev->new_chunk_sectors; 5500 conf->prev_algo = conf->algorithm; 5501 conf->algorithm = mddev->new_layout; 5502 if (mddev->delta_disks < 0) 5503 conf->reshape_progress = raid5_size(mddev, 0, 0); 5504 else 5505 conf->reshape_progress = 0; 5506 conf->reshape_safe = conf->reshape_progress; 5507 conf->generation++; 5508 spin_unlock_irq(&conf->device_lock); 5509 5510 /* Add some new drives, as many as will fit. 5511 * We know there are enough to make the newly sized array work. 5512 * Don't add devices if we are reducing the number of 5513 * devices in the array. This is because it is not possible 5514 * to correctly record the "partially reconstructed" state of 5515 * such devices during the reshape and confusion could result. 5516 */ 5517 if (mddev->delta_disks >= 0) { 5518 int added_devices = 0; 5519 list_for_each_entry(rdev, &mddev->disks, same_set) 5520 if (rdev->raid_disk < 0 && 5521 !test_bit(Faulty, &rdev->flags)) { 5522 if (raid5_add_disk(mddev, rdev) == 0) { 5523 char nm[20]; 5524 if (rdev->raid_disk 5525 >= conf->previous_raid_disks) { 5526 set_bit(In_sync, &rdev->flags); 5527 added_devices++; 5528 } else 5529 rdev->recovery_offset = 0; 5530 sprintf(nm, "rd%d", rdev->raid_disk); 5531 if (sysfs_create_link(&mddev->kobj, 5532 &rdev->kobj, nm)) 5533 /* Failure here is OK */; 5534 } 5535 } else if (rdev->raid_disk >= conf->previous_raid_disks 5536 && !test_bit(Faulty, &rdev->flags)) { 5537 /* This is a spare that was manually added */ 5538 set_bit(In_sync, &rdev->flags); 5539 added_devices++; 5540 } 5541 5542 /* When a reshape changes the number of devices, 5543 * ->degraded is measured against the larger of the 5544 * pre and post number of devices. 
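 *
 * Example with illustrative numbers: growing a healthy 4-disk array to
 * 6 with two spares successfully added gives degraded += (6 - 4) - 2,
 * i.e. 0; with only one spare it becomes 1 until recovery completes.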
5545 */ 5546 spin_lock_irqsave(&conf->device_lock, flags); 5547 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) 5548 - added_devices; 5549 spin_unlock_irqrestore(&conf->device_lock, flags); 5550 } 5551 mddev->raid_disks = conf->raid_disks; 5552 mddev->reshape_position = conf->reshape_progress; 5553 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5554 5555 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5556 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5557 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5558 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5559 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5560 "reshape"); 5561 if (!mddev->sync_thread) { 5562 mddev->recovery = 0; 5563 spin_lock_irq(&conf->device_lock); 5564 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5565 conf->reshape_progress = MaxSector; 5566 spin_unlock_irq(&conf->device_lock); 5567 return -EAGAIN; 5568 } 5569 conf->reshape_checkpoint = jiffies; 5570 md_wakeup_thread(mddev->sync_thread); 5571 md_new_event(mddev); 5572 return 0; 5573 } 5574 5575 /* This is called from the reshape thread and should make any 5576 * changes needed in 'conf' 5577 */ 5578 static void end_reshape(raid5_conf_t *conf) 5579 { 5580 5581 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5582 5583 spin_lock_irq(&conf->device_lock); 5584 conf->previous_raid_disks = conf->raid_disks; 5585 conf->reshape_progress = MaxSector; 5586 spin_unlock_irq(&conf->device_lock); 5587 wake_up(&conf->wait_for_overlap); 5588 5589 /* read-ahead size must cover two whole stripes, which is 5590 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5591 */ 5592 if (conf->mddev->queue) { 5593 int data_disks = conf->raid_disks - conf->max_degraded; 5594 int stripe = data_disks * ((conf->chunk_sectors << 9) 5595 / PAGE_SIZE); 5596 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5597 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5598 } 5599 } 5600 } 5601 5602 /* This is called from the raid5d thread with mddev_lock held. 5603 * It makes config changes to the device. 
 */
static void raid5_finish_reshape(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			mddev->degraded = conf->raid_disks;
			for (d = 0; d < conf->raid_disks; d++)
				if (conf->disks[d].rdev &&
				    test_bit(In_sync,
					     &conf->disks[d].rdev->flags))
					mddev->degraded--;
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				mdk_rdev_t *rdev = conf->disks[d].rdev;
				if (rdev && raid5_remove_disk(mddev, d) == 0) {
					char nm[20];
					sprintf(nm, "rd%d", rdev->raid_disk);
					sysfs_remove_link(&mddev->kobj, nm);
					rdev->raid_disk = -1;
				}
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
	}
}

static void raid5_quiesce(mddev_t *mddev, int state)
{
	raid5_conf_t *conf = mddev->private;

	switch(state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		spin_lock_irq(&conf->device_lock);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_lock_irq(conf->wait_for_stripe,
				    atomic_read(&conf->active_stripes) == 0 &&
				    atomic_read(&conf->active_aligned_reads) == 0,
				    conf->device_lock, /* nothing */);
		conf->quiesce = 1;
		spin_unlock_irq(&conf->device_lock);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		spin_lock_irq(&conf->device_lock);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		spin_unlock_irq(&conf->device_lock);
		break;
	}
}

static void *raid45_takeover_raid0(mddev_t *mddev, int level)
{
	struct raid0_private_data *raid0_priv = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_priv->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_priv->strip_zone[0].zone_end;
	sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid1(mddev_t *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices?
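	 * (Write-behind is a raid1-specific feature with no raid5
	 * equivalent, so that behaviour would be silently dropped by
	 * this takeover.)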
	 */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid6(mddev_t *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}

static int raid5_check_reshape(mddev_t *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation -
	 * to be used by a reshape pass.
	 */
	raid5_conf_t *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(mddev_t *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

static void *raid5_takeover(mddev_t *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(mddev_t *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct mdk_personality raid5_personality;

static void *raid6_takeover(mddev_t *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}

static struct mdk_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
};
static struct mdk_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
};

static struct mdk_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

static int __init raid5_init(void)
{
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");
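
/*
 * Illustrative sketch only (not part of the driver, not compiled): the
 * hypothetical helper below restates the chunk-size selection rule used
 * by raid5_takeover_raid1() above - the largest power-of-two chunk,
 * starting at 64K, that exactly divides the array.  For a hypothetical
 * 1000-sector array the loop halves 128 -> 64 -> 32 -> 16 -> 8 sectors,
 * i.e. a 4K chunk.
 */
#if 0
static int example_pick_chunksect(sector_t array_sectors)
{
	int chunksect = 64*2;	/* start at 64K, expressed in sectors */

	/* halve until the chunk size exactly divides the array size */
	while (chunksect && (array_sectors & (chunksect-1)))
		chunksect >>= 1;

	/* caller must still verify (chunksect<<9) >= STRIPE_SIZE */
	return chunksect;
}
#endif

/*
 * The reshape/takeover entry points registered above are normally
 * driven from user space; for example (illustrative mdadm invocations,
 * not taken from this file):
 *	mdadm --grow /dev/md0 --raid-devices=5	-> check_reshape/start_reshape
 *	mdadm --grow /dev/md0 --level=6		-> raid6_takeover()
 */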