/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
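
/*
 * Illustrative sketch only (not part of the driver): given a struct r5dev
 * *dev, its bio list is typically walked with r5_next_bio(), stopping once
 * the next bio no longer overlaps this stripe+device, e.g.:
 *
 *	struct bio *rbi = dev->toread;
 *	while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 *		... copy data to or from dev->page ...
 *		rbi = r5_next_bio(rbi, dev->sector);
 *	}
 */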
/*
 * The following can be used to debug the driver
 */
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

#ifdef DEBUG
#define inline
#define __inline__
#endif

#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline int raid5_dec_bi_hw_segments(struct bio *bio)
{
	unsigned short val = raid5_bi_hw_segments(bio);

	--val;
	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
	return val;
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
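 *
 * For example, with the md layout on 5 devices, pd_idx == 1 and qd_idx == 2:
 * raid6_d0() is 3, and walking with raid6_next_disk() maps the data disks
 * 3, 4, 0 to slots 0, 1, 2, disk 1 (P) to slot 3 and disk 2 (Q) to slot 4.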
157 */ 158 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 159 int *count, int syndrome_disks) 160 { 161 int slot = *count; 162 163 if (sh->ddf_layout) 164 (*count)++; 165 if (idx == sh->pd_idx) 166 return syndrome_disks; 167 if (idx == sh->qd_idx) 168 return syndrome_disks + 1; 169 if (!sh->ddf_layout) 170 (*count)++; 171 return slot; 172 } 173 174 static void return_io(struct bio *return_bi) 175 { 176 struct bio *bi = return_bi; 177 while (bi) { 178 179 return_bi = bi->bi_next; 180 bi->bi_next = NULL; 181 bi->bi_size = 0; 182 bio_endio(bi, 0); 183 bi = return_bi; 184 } 185 } 186 187 static void print_raid5_conf (raid5_conf_t *conf); 188 189 static int stripe_operations_active(struct stripe_head *sh) 190 { 191 return sh->check_state || sh->reconstruct_state || 192 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 193 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 194 } 195 196 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 197 { 198 if (atomic_dec_and_test(&sh->count)) { 199 BUG_ON(!list_empty(&sh->lru)); 200 BUG_ON(atomic_read(&conf->active_stripes)==0); 201 if (test_bit(STRIPE_HANDLE, &sh->state)) { 202 if (test_bit(STRIPE_DELAYED, &sh->state)) 203 list_add_tail(&sh->lru, &conf->delayed_list); 204 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 205 sh->bm_seq - conf->seq_write > 0) 206 list_add_tail(&sh->lru, &conf->bitmap_list); 207 else { 208 clear_bit(STRIPE_BIT_DELAY, &sh->state); 209 list_add_tail(&sh->lru, &conf->handle_list); 210 } 211 md_wakeup_thread(conf->mddev->thread); 212 } else { 213 BUG_ON(stripe_operations_active(sh)); 214 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 215 atomic_dec(&conf->preread_active_stripes); 216 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 217 md_wakeup_thread(conf->mddev->thread); 218 } 219 atomic_dec(&conf->active_stripes); 220 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 221 list_add_tail(&sh->lru, &conf->inactive_list); 222 wake_up(&conf->wait_for_stripe); 223 if (conf->retry_read_aligned) 224 md_wakeup_thread(conf->mddev->thread); 225 } 226 } 227 } 228 } 229 230 static void release_stripe(struct stripe_head *sh) 231 { 232 raid5_conf_t *conf = sh->raid_conf; 233 unsigned long flags; 234 235 spin_lock_irqsave(&conf->device_lock, flags); 236 __release_stripe(conf, sh); 237 spin_unlock_irqrestore(&conf->device_lock, flags); 238 } 239 240 static inline void remove_hash(struct stripe_head *sh) 241 { 242 pr_debug("remove_hash(), stripe %llu\n", 243 (unsigned long long)sh->sector); 244 245 hlist_del_init(&sh->hash); 246 } 247 248 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 249 { 250 struct hlist_head *hp = stripe_hash(conf, sh->sector); 251 252 pr_debug("insert_hash(), stripe %llu\n", 253 (unsigned long long)sh->sector); 254 255 CHECK_DEVLOCK(); 256 hlist_add_head(&sh->hash, hp); 257 } 258 259 260 /* find an idle stripe, make sure it is unhashed, and return it. 
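 * The caller must hold conf->device_lock; CHECK_DEVLOCK() below asserts this.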
*/ 261 static struct stripe_head *get_free_stripe(raid5_conf_t *conf) 262 { 263 struct stripe_head *sh = NULL; 264 struct list_head *first; 265 266 CHECK_DEVLOCK(); 267 if (list_empty(&conf->inactive_list)) 268 goto out; 269 first = conf->inactive_list.next; 270 sh = list_entry(first, struct stripe_head, lru); 271 list_del_init(first); 272 remove_hash(sh); 273 atomic_inc(&conf->active_stripes); 274 out: 275 return sh; 276 } 277 278 static void shrink_buffers(struct stripe_head *sh) 279 { 280 struct page *p; 281 int i; 282 int num = sh->raid_conf->pool_size; 283 284 for (i = 0; i < num ; i++) { 285 p = sh->dev[i].page; 286 if (!p) 287 continue; 288 sh->dev[i].page = NULL; 289 put_page(p); 290 } 291 } 292 293 static int grow_buffers(struct stripe_head *sh) 294 { 295 int i; 296 int num = sh->raid_conf->pool_size; 297 298 for (i = 0; i < num; i++) { 299 struct page *page; 300 301 if (!(page = alloc_page(GFP_KERNEL))) { 302 return 1; 303 } 304 sh->dev[i].page = page; 305 } 306 return 0; 307 } 308 309 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 310 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 311 struct stripe_head *sh); 312 313 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 314 { 315 raid5_conf_t *conf = sh->raid_conf; 316 int i; 317 318 BUG_ON(atomic_read(&sh->count) != 0); 319 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 320 BUG_ON(stripe_operations_active(sh)); 321 322 CHECK_DEVLOCK(); 323 pr_debug("init_stripe called, stripe %llu\n", 324 (unsigned long long)sh->sector); 325 326 remove_hash(sh); 327 328 sh->generation = conf->generation - previous; 329 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 330 sh->sector = sector; 331 stripe_set_idx(sector, conf, previous, sh); 332 sh->state = 0; 333 334 335 for (i = sh->disks; i--; ) { 336 struct r5dev *dev = &sh->dev[i]; 337 338 if (dev->toread || dev->read || dev->towrite || dev->written || 339 test_bit(R5_LOCKED, &dev->flags)) { 340 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 341 (unsigned long long)sh->sector, i, dev->toread, 342 dev->read, dev->towrite, dev->written, 343 test_bit(R5_LOCKED, &dev->flags)); 344 BUG(); 345 } 346 dev->flags = 0; 347 raid5_build_block(sh, i, previous); 348 } 349 insert_hash(conf, sh); 350 } 351 352 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, 353 short generation) 354 { 355 struct stripe_head *sh; 356 struct hlist_node *hn; 357 358 CHECK_DEVLOCK(); 359 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 360 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 361 if (sh->sector == sector && sh->generation == generation) 362 return sh; 363 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 364 return NULL; 365 } 366 367 /* 368 * Need to check if array has failed when deciding whether to: 369 * - start an array 370 * - remove non-faulty devices 371 * - add a spare 372 * - allow a reshape 373 * This determination is simple when no reshape is happening. 374 * However if there is a reshape, we need to carefully check 375 * both the before and after sections. 376 * This is because some failed devices may only affect one 377 * of the two sections, and some non-in_sync devices may 378 * be insync in the section most affected by failed devices. 
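 * Hence the code below computes a 'degraded' count twice: once over the
 * 'previous' geometry (conf->previous_raid_disks) and once over the new
 * geometry (conf->raid_disks), and reports failure if either count exceeds
 * conf->max_degraded.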
379 */ 380 static int has_failed(raid5_conf_t *conf) 381 { 382 int degraded; 383 int i; 384 if (conf->mddev->reshape_position == MaxSector) 385 return conf->mddev->degraded > conf->max_degraded; 386 387 rcu_read_lock(); 388 degraded = 0; 389 for (i = 0; i < conf->previous_raid_disks; i++) { 390 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 391 if (!rdev || test_bit(Faulty, &rdev->flags)) 392 degraded++; 393 else if (test_bit(In_sync, &rdev->flags)) 394 ; 395 else 396 /* not in-sync or faulty. 397 * If the reshape increases the number of devices, 398 * this is being recovered by the reshape, so 399 * this 'previous' section is not in_sync. 400 * If the number of devices is being reduced however, 401 * the device can only be part of the array if 402 * we are reverting a reshape, so this section will 403 * be in-sync. 404 */ 405 if (conf->raid_disks >= conf->previous_raid_disks) 406 degraded++; 407 } 408 rcu_read_unlock(); 409 if (degraded > conf->max_degraded) 410 return 1; 411 rcu_read_lock(); 412 degraded = 0; 413 for (i = 0; i < conf->raid_disks; i++) { 414 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 415 if (!rdev || test_bit(Faulty, &rdev->flags)) 416 degraded++; 417 else if (test_bit(In_sync, &rdev->flags)) 418 ; 419 else 420 /* not in-sync or faulty. 421 * If reshape increases the number of devices, this 422 * section has already been recovered, else it 423 * almost certainly hasn't. 424 */ 425 if (conf->raid_disks <= conf->previous_raid_disks) 426 degraded++; 427 } 428 rcu_read_unlock(); 429 if (degraded > conf->max_degraded) 430 return 1; 431 return 0; 432 } 433 434 static struct stripe_head * 435 get_active_stripe(raid5_conf_t *conf, sector_t sector, 436 int previous, int noblock, int noquiesce) 437 { 438 struct stripe_head *sh; 439 440 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 441 442 spin_lock_irq(&conf->device_lock); 443 444 do { 445 wait_event_lock_irq(conf->wait_for_stripe, 446 conf->quiesce == 0 || noquiesce, 447 conf->device_lock, /* nothing */); 448 sh = __find_stripe(conf, sector, conf->generation - previous); 449 if (!sh) { 450 if (!conf->inactive_blocked) 451 sh = get_free_stripe(conf); 452 if (noblock && sh == NULL) 453 break; 454 if (!sh) { 455 conf->inactive_blocked = 1; 456 wait_event_lock_irq(conf->wait_for_stripe, 457 !list_empty(&conf->inactive_list) && 458 (atomic_read(&conf->active_stripes) 459 < (conf->max_nr_stripes *3/4) 460 || !conf->inactive_blocked), 461 conf->device_lock, 462 ); 463 conf->inactive_blocked = 0; 464 } else 465 init_stripe(sh, sector, previous); 466 } else { 467 if (atomic_read(&sh->count)) { 468 BUG_ON(!list_empty(&sh->lru) 469 && !test_bit(STRIPE_EXPANDING, &sh->state)); 470 } else { 471 if (!test_bit(STRIPE_HANDLE, &sh->state)) 472 atomic_inc(&conf->active_stripes); 473 if (list_empty(&sh->lru) && 474 !test_bit(STRIPE_EXPANDING, &sh->state)) 475 BUG(); 476 list_del_init(&sh->lru); 477 } 478 } 479 } while (sh == NULL); 480 481 if (sh) 482 atomic_inc(&sh->count); 483 484 spin_unlock_irq(&conf->device_lock); 485 return sh; 486 } 487 488 static void 489 raid5_end_read_request(struct bio *bi, int error); 490 static void 491 raid5_end_write_request(struct bio *bi, int error); 492 493 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 494 { 495 raid5_conf_t *conf = sh->raid_conf; 496 int i, disks = sh->disks; 497 498 might_sleep(); 499 500 for (i = disks; i--; ) { 501 int rw; 502 struct bio *bi; 503 mdk_rdev_t *rdev; 504 if (test_and_clear_bit(R5_Wantwrite, 
&sh->dev[i].flags)) { 505 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 506 rw = WRITE_FUA; 507 else 508 rw = WRITE; 509 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 510 rw = READ; 511 else 512 continue; 513 514 bi = &sh->dev[i].req; 515 516 bi->bi_rw = rw; 517 if (rw & WRITE) 518 bi->bi_end_io = raid5_end_write_request; 519 else 520 bi->bi_end_io = raid5_end_read_request; 521 522 rcu_read_lock(); 523 rdev = rcu_dereference(conf->disks[i].rdev); 524 if (rdev && test_bit(Faulty, &rdev->flags)) 525 rdev = NULL; 526 if (rdev) 527 atomic_inc(&rdev->nr_pending); 528 rcu_read_unlock(); 529 530 if (rdev) { 531 if (s->syncing || s->expanding || s->expanded) 532 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 533 534 set_bit(STRIPE_IO_STARTED, &sh->state); 535 536 bi->bi_bdev = rdev->bdev; 537 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 538 __func__, (unsigned long long)sh->sector, 539 bi->bi_rw, i); 540 atomic_inc(&sh->count); 541 bi->bi_sector = sh->sector + rdev->data_offset; 542 bi->bi_flags = 1 << BIO_UPTODATE; 543 bi->bi_vcnt = 1; 544 bi->bi_max_vecs = 1; 545 bi->bi_idx = 0; 546 bi->bi_io_vec = &sh->dev[i].vec; 547 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 548 bi->bi_io_vec[0].bv_offset = 0; 549 bi->bi_size = STRIPE_SIZE; 550 bi->bi_next = NULL; 551 if ((rw & WRITE) && 552 test_bit(R5_ReWrite, &sh->dev[i].flags)) 553 atomic_add(STRIPE_SECTORS, 554 &rdev->corrected_errors); 555 generic_make_request(bi); 556 } else { 557 if (rw & WRITE) 558 set_bit(STRIPE_DEGRADED, &sh->state); 559 pr_debug("skip op %ld on disc %d for sector %llu\n", 560 bi->bi_rw, i, (unsigned long long)sh->sector); 561 clear_bit(R5_LOCKED, &sh->dev[i].flags); 562 set_bit(STRIPE_HANDLE, &sh->state); 563 } 564 } 565 } 566 567 static struct dma_async_tx_descriptor * 568 async_copy_data(int frombio, struct bio *bio, struct page *page, 569 sector_t sector, struct dma_async_tx_descriptor *tx) 570 { 571 struct bio_vec *bvl; 572 struct page *bio_page; 573 int i; 574 int page_offset; 575 struct async_submit_ctl submit; 576 enum async_tx_flags flags = 0; 577 578 if (bio->bi_sector >= sector) 579 page_offset = (signed)(bio->bi_sector - sector) * 512; 580 else 581 page_offset = (signed)(sector - bio->bi_sector) * -512; 582 583 if (frombio) 584 flags |= ASYNC_TX_FENCE; 585 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 586 587 bio_for_each_segment(bvl, bio, i) { 588 int len = bvl->bv_len; 589 int clen; 590 int b_offset = 0; 591 592 if (page_offset < 0) { 593 b_offset = -page_offset; 594 page_offset += b_offset; 595 len -= b_offset; 596 } 597 598 if (len > 0 && page_offset + len > STRIPE_SIZE) 599 clen = STRIPE_SIZE - page_offset; 600 else 601 clen = len; 602 603 if (clen > 0) { 604 b_offset += bvl->bv_offset; 605 bio_page = bvl->bv_page; 606 if (frombio) 607 tx = async_memcpy(page, bio_page, page_offset, 608 b_offset, clen, &submit); 609 else 610 tx = async_memcpy(bio_page, page, b_offset, 611 page_offset, clen, &submit); 612 } 613 /* chain the operations */ 614 submit.depend_tx = tx; 615 616 if (clen < len) /* hit end of page */ 617 break; 618 page_offset += len; 619 } 620 621 return tx; 622 } 623 624 static void ops_complete_biofill(void *stripe_head_ref) 625 { 626 struct stripe_head *sh = stripe_head_ref; 627 struct bio *return_bi = NULL; 628 raid5_conf_t *conf = sh->raid_conf; 629 int i; 630 631 pr_debug("%s: stripe %llu\n", __func__, 632 (unsigned long long)sh->sector); 633 634 /* clear completed biofills */ 635 spin_lock_irq(&conf->device_lock); 636 for (i = sh->disks; i--; ) { 637 struct r5dev 
*dev = &sh->dev[i]; 638 639 /* acknowledge completion of a biofill operation */ 640 /* and check if we need to reply to a read request, 641 * new R5_Wantfill requests are held off until 642 * !STRIPE_BIOFILL_RUN 643 */ 644 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 645 struct bio *rbi, *rbi2; 646 647 BUG_ON(!dev->read); 648 rbi = dev->read; 649 dev->read = NULL; 650 while (rbi && rbi->bi_sector < 651 dev->sector + STRIPE_SECTORS) { 652 rbi2 = r5_next_bio(rbi, dev->sector); 653 if (!raid5_dec_bi_phys_segments(rbi)) { 654 rbi->bi_next = return_bi; 655 return_bi = rbi; 656 } 657 rbi = rbi2; 658 } 659 } 660 } 661 spin_unlock_irq(&conf->device_lock); 662 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 663 664 return_io(return_bi); 665 666 set_bit(STRIPE_HANDLE, &sh->state); 667 release_stripe(sh); 668 } 669 670 static void ops_run_biofill(struct stripe_head *sh) 671 { 672 struct dma_async_tx_descriptor *tx = NULL; 673 raid5_conf_t *conf = sh->raid_conf; 674 struct async_submit_ctl submit; 675 int i; 676 677 pr_debug("%s: stripe %llu\n", __func__, 678 (unsigned long long)sh->sector); 679 680 for (i = sh->disks; i--; ) { 681 struct r5dev *dev = &sh->dev[i]; 682 if (test_bit(R5_Wantfill, &dev->flags)) { 683 struct bio *rbi; 684 spin_lock_irq(&conf->device_lock); 685 dev->read = rbi = dev->toread; 686 dev->toread = NULL; 687 spin_unlock_irq(&conf->device_lock); 688 while (rbi && rbi->bi_sector < 689 dev->sector + STRIPE_SECTORS) { 690 tx = async_copy_data(0, rbi, dev->page, 691 dev->sector, tx); 692 rbi = r5_next_bio(rbi, dev->sector); 693 } 694 } 695 } 696 697 atomic_inc(&sh->count); 698 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 699 async_trigger_callback(&submit); 700 } 701 702 static void mark_target_uptodate(struct stripe_head *sh, int target) 703 { 704 struct r5dev *tgt; 705 706 if (target < 0) 707 return; 708 709 tgt = &sh->dev[target]; 710 set_bit(R5_UPTODATE, &tgt->flags); 711 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 712 clear_bit(R5_Wantcompute, &tgt->flags); 713 } 714 715 static void ops_complete_compute(void *stripe_head_ref) 716 { 717 struct stripe_head *sh = stripe_head_ref; 718 719 pr_debug("%s: stripe %llu\n", __func__, 720 (unsigned long long)sh->sector); 721 722 /* mark the computed target(s) as uptodate */ 723 mark_target_uptodate(sh, sh->ops.target); 724 mark_target_uptodate(sh, sh->ops.target2); 725 726 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 727 if (sh->check_state == check_state_compute_run) 728 sh->check_state = check_state_compute_result; 729 set_bit(STRIPE_HANDLE, &sh->state); 730 release_stripe(sh); 731 } 732 733 /* return a pointer to the address conversion region of the scribble buffer */ 734 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 735 struct raid5_percpu *percpu) 736 { 737 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 738 } 739 740 static struct dma_async_tx_descriptor * 741 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 742 { 743 int disks = sh->disks; 744 struct page **xor_srcs = percpu->scribble; 745 int target = sh->ops.target; 746 struct r5dev *tgt = &sh->dev[target]; 747 struct page *xor_dest = tgt->page; 748 int count = 0; 749 struct dma_async_tx_descriptor *tx; 750 struct async_submit_ctl submit; 751 int i; 752 753 pr_debug("%s: stripe %llu block: %d\n", 754 __func__, (unsigned long long)sh->sector, target); 755 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 756 757 for (i = disks; i--; ) 758 if (i != target) 759 xor_srcs[count++] = sh->dev[i].page; 
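	/*
	 * For RAID-4/5 the missing block is simply the XOR of every other
	 * block in the stripe, so all remaining pages are fed into one xor.
	 * With a single source this degenerates into a copy, hence the
	 * async_memcpy() special case below.
	 */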
760 761 atomic_inc(&sh->count); 762 763 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 764 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 765 if (unlikely(count == 1)) 766 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 767 else 768 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 769 770 return tx; 771 } 772 773 /* set_syndrome_sources - populate source buffers for gen_syndrome 774 * @srcs - (struct page *) array of size sh->disks 775 * @sh - stripe_head to parse 776 * 777 * Populates srcs in proper layout order for the stripe and returns the 778 * 'count' of sources to be used in a call to async_gen_syndrome. The P 779 * destination buffer is recorded in srcs[count] and the Q destination 780 * is recorded in srcs[count+1]]. 781 */ 782 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 783 { 784 int disks = sh->disks; 785 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 786 int d0_idx = raid6_d0(sh); 787 int count; 788 int i; 789 790 for (i = 0; i < disks; i++) 791 srcs[i] = NULL; 792 793 count = 0; 794 i = d0_idx; 795 do { 796 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 797 798 srcs[slot] = sh->dev[i].page; 799 i = raid6_next_disk(i, disks); 800 } while (i != d0_idx); 801 802 return syndrome_disks; 803 } 804 805 static struct dma_async_tx_descriptor * 806 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 807 { 808 int disks = sh->disks; 809 struct page **blocks = percpu->scribble; 810 int target; 811 int qd_idx = sh->qd_idx; 812 struct dma_async_tx_descriptor *tx; 813 struct async_submit_ctl submit; 814 struct r5dev *tgt; 815 struct page *dest; 816 int i; 817 int count; 818 819 if (sh->ops.target < 0) 820 target = sh->ops.target2; 821 else if (sh->ops.target2 < 0) 822 target = sh->ops.target; 823 else 824 /* we should only have one valid target */ 825 BUG(); 826 BUG_ON(target < 0); 827 pr_debug("%s: stripe %llu block: %d\n", 828 __func__, (unsigned long long)sh->sector, target); 829 830 tgt = &sh->dev[target]; 831 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 832 dest = tgt->page; 833 834 atomic_inc(&sh->count); 835 836 if (target == qd_idx) { 837 count = set_syndrome_sources(blocks, sh); 838 blocks[count] = NULL; /* regenerating p is not necessary */ 839 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 840 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 841 ops_complete_compute, sh, 842 to_addr_conv(sh, percpu)); 843 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 844 } else { 845 /* Compute any data- or p-drive using XOR */ 846 count = 0; 847 for (i = disks; i-- ; ) { 848 if (i == target || i == qd_idx) 849 continue; 850 blocks[count++] = sh->dev[i].page; 851 } 852 853 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 854 NULL, ops_complete_compute, sh, 855 to_addr_conv(sh, percpu)); 856 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 857 } 858 859 return tx; 860 } 861 862 static struct dma_async_tx_descriptor * 863 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 864 { 865 int i, count, disks = sh->disks; 866 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 867 int d0_idx = raid6_d0(sh); 868 int faila = -1, failb = -1; 869 int target = sh->ops.target; 870 int target2 = sh->ops.target2; 871 struct r5dev *tgt = &sh->dev[target]; 872 struct r5dev *tgt2 = &sh->dev[target2]; 873 struct dma_async_tx_descriptor *tx; 874 struct page **blocks = percpu->scribble; 875 struct async_submit_ctl submit; 876 877 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 878 __func__, (unsigned long long)sh->sector, target, target2); 879 BUG_ON(target < 0 || target2 < 0); 880 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 881 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 882 883 /* we need to open-code set_syndrome_sources to handle the 884 * slot number conversion for 'faila' and 'failb' 885 */ 886 for (i = 0; i < disks ; i++) 887 blocks[i] = NULL; 888 count = 0; 889 i = d0_idx; 890 do { 891 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 892 893 blocks[slot] = sh->dev[i].page; 894 895 if (i == target) 896 faila = slot; 897 if (i == target2) 898 failb = slot; 899 i = raid6_next_disk(i, disks); 900 } while (i != d0_idx); 901 902 BUG_ON(faila == failb); 903 if (failb < faila) 904 swap(faila, failb); 905 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 906 __func__, (unsigned long long)sh->sector, faila, failb); 907 908 atomic_inc(&sh->count); 909 910 if (failb == syndrome_disks+1) { 911 /* Q disk is one of the missing disks */ 912 if (faila == syndrome_disks) { 913 /* Missing P+Q, just recompute */ 914 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 915 ops_complete_compute, sh, 916 to_addr_conv(sh, percpu)); 917 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 918 STRIPE_SIZE, &submit); 919 } else { 920 struct page *dest; 921 int data_target; 922 int qd_idx = sh->qd_idx; 923 924 /* Missing D+Q: recompute D from P, then recompute Q */ 925 if (target == qd_idx) 926 data_target = target2; 927 else 928 data_target = target; 929 930 count = 0; 931 for (i = disks; i-- ; ) { 932 if (i == data_target || i == qd_idx) 933 continue; 934 blocks[count++] = sh->dev[i].page; 935 } 936 dest = sh->dev[data_target].page; 937 init_async_submit(&submit, 938 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 939 NULL, NULL, NULL, 940 to_addr_conv(sh, percpu)); 941 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 942 &submit); 943 944 count = set_syndrome_sources(blocks, sh); 945 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 946 ops_complete_compute, sh, 947 to_addr_conv(sh, percpu)); 948 return async_gen_syndrome(blocks, 0, count+2, 949 STRIPE_SIZE, &submit); 950 } 951 } else { 952 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 953 ops_complete_compute, sh, 954 to_addr_conv(sh, percpu)); 955 if (failb == syndrome_disks) { 956 /* We're missing D+P. */ 957 return async_raid6_datap_recov(syndrome_disks+2, 958 STRIPE_SIZE, faila, 959 blocks, &submit); 960 } else { 961 /* We're missing D+D. 
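			 * async_raid6_2data_recov() rebuilds both data blocks
			 * from the surviving data plus P and Q.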
*/ 962 return async_raid6_2data_recov(syndrome_disks+2, 963 STRIPE_SIZE, faila, failb, 964 blocks, &submit); 965 } 966 } 967 } 968 969 970 static void ops_complete_prexor(void *stripe_head_ref) 971 { 972 struct stripe_head *sh = stripe_head_ref; 973 974 pr_debug("%s: stripe %llu\n", __func__, 975 (unsigned long long)sh->sector); 976 } 977 978 static struct dma_async_tx_descriptor * 979 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 980 struct dma_async_tx_descriptor *tx) 981 { 982 int disks = sh->disks; 983 struct page **xor_srcs = percpu->scribble; 984 int count = 0, pd_idx = sh->pd_idx, i; 985 struct async_submit_ctl submit; 986 987 /* existing parity data subtracted */ 988 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 989 990 pr_debug("%s: stripe %llu\n", __func__, 991 (unsigned long long)sh->sector); 992 993 for (i = disks; i--; ) { 994 struct r5dev *dev = &sh->dev[i]; 995 /* Only process blocks that are known to be uptodate */ 996 if (test_bit(R5_Wantdrain, &dev->flags)) 997 xor_srcs[count++] = dev->page; 998 } 999 1000 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1001 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1002 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1003 1004 return tx; 1005 } 1006 1007 static struct dma_async_tx_descriptor * 1008 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1009 { 1010 int disks = sh->disks; 1011 int i; 1012 1013 pr_debug("%s: stripe %llu\n", __func__, 1014 (unsigned long long)sh->sector); 1015 1016 for (i = disks; i--; ) { 1017 struct r5dev *dev = &sh->dev[i]; 1018 struct bio *chosen; 1019 1020 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1021 struct bio *wbi; 1022 1023 spin_lock(&sh->lock); 1024 chosen = dev->towrite; 1025 dev->towrite = NULL; 1026 BUG_ON(dev->written); 1027 wbi = dev->written = chosen; 1028 spin_unlock(&sh->lock); 1029 1030 while (wbi && wbi->bi_sector < 1031 dev->sector + STRIPE_SECTORS) { 1032 if (wbi->bi_rw & REQ_FUA) 1033 set_bit(R5_WantFUA, &dev->flags); 1034 tx = async_copy_data(1, wbi, dev->page, 1035 dev->sector, tx); 1036 wbi = r5_next_bio(wbi, dev->sector); 1037 } 1038 } 1039 } 1040 1041 return tx; 1042 } 1043 1044 static void ops_complete_reconstruct(void *stripe_head_ref) 1045 { 1046 struct stripe_head *sh = stripe_head_ref; 1047 int disks = sh->disks; 1048 int pd_idx = sh->pd_idx; 1049 int qd_idx = sh->qd_idx; 1050 int i; 1051 bool fua = false; 1052 1053 pr_debug("%s: stripe %llu\n", __func__, 1054 (unsigned long long)sh->sector); 1055 1056 for (i = disks; i--; ) 1057 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1058 1059 for (i = disks; i--; ) { 1060 struct r5dev *dev = &sh->dev[i]; 1061 1062 if (dev->written || i == pd_idx || i == qd_idx) { 1063 set_bit(R5_UPTODATE, &dev->flags); 1064 if (fua) 1065 set_bit(R5_WantFUA, &dev->flags); 1066 } 1067 } 1068 1069 if (sh->reconstruct_state == reconstruct_state_drain_run) 1070 sh->reconstruct_state = reconstruct_state_drain_result; 1071 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1072 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1073 else { 1074 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1075 sh->reconstruct_state = reconstruct_state_result; 1076 } 1077 1078 set_bit(STRIPE_HANDLE, &sh->state); 1079 release_stripe(sh); 1080 } 1081 1082 static void 1083 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1084 struct dma_async_tx_descriptor *tx) 1085 { 1086 int disks = 
sh->disks; 1087 struct page **xor_srcs = percpu->scribble; 1088 struct async_submit_ctl submit; 1089 int count = 0, pd_idx = sh->pd_idx, i; 1090 struct page *xor_dest; 1091 int prexor = 0; 1092 unsigned long flags; 1093 1094 pr_debug("%s: stripe %llu\n", __func__, 1095 (unsigned long long)sh->sector); 1096 1097 /* check if prexor is active which means only process blocks 1098 * that are part of a read-modify-write (written) 1099 */ 1100 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1101 prexor = 1; 1102 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1103 for (i = disks; i--; ) { 1104 struct r5dev *dev = &sh->dev[i]; 1105 if (dev->written) 1106 xor_srcs[count++] = dev->page; 1107 } 1108 } else { 1109 xor_dest = sh->dev[pd_idx].page; 1110 for (i = disks; i--; ) { 1111 struct r5dev *dev = &sh->dev[i]; 1112 if (i != pd_idx) 1113 xor_srcs[count++] = dev->page; 1114 } 1115 } 1116 1117 /* 1/ if we prexor'd then the dest is reused as a source 1118 * 2/ if we did not prexor then we are redoing the parity 1119 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1120 * for the synchronous xor case 1121 */ 1122 flags = ASYNC_TX_ACK | 1123 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1124 1125 atomic_inc(&sh->count); 1126 1127 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1128 to_addr_conv(sh, percpu)); 1129 if (unlikely(count == 1)) 1130 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1131 else 1132 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1133 } 1134 1135 static void 1136 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1137 struct dma_async_tx_descriptor *tx) 1138 { 1139 struct async_submit_ctl submit; 1140 struct page **blocks = percpu->scribble; 1141 int count; 1142 1143 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1144 1145 count = set_syndrome_sources(blocks, sh); 1146 1147 atomic_inc(&sh->count); 1148 1149 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1150 sh, to_addr_conv(sh, percpu)); 1151 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1152 } 1153 1154 static void ops_complete_check(void *stripe_head_ref) 1155 { 1156 struct stripe_head *sh = stripe_head_ref; 1157 1158 pr_debug("%s: stripe %llu\n", __func__, 1159 (unsigned long long)sh->sector); 1160 1161 sh->check_state = check_state_check_result; 1162 set_bit(STRIPE_HANDLE, &sh->state); 1163 release_stripe(sh); 1164 } 1165 1166 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1167 { 1168 int disks = sh->disks; 1169 int pd_idx = sh->pd_idx; 1170 int qd_idx = sh->qd_idx; 1171 struct page *xor_dest; 1172 struct page **xor_srcs = percpu->scribble; 1173 struct dma_async_tx_descriptor *tx; 1174 struct async_submit_ctl submit; 1175 int count; 1176 int i; 1177 1178 pr_debug("%s: stripe %llu\n", __func__, 1179 (unsigned long long)sh->sector); 1180 1181 count = 0; 1182 xor_dest = sh->dev[pd_idx].page; 1183 xor_srcs[count++] = xor_dest; 1184 for (i = disks; i--; ) { 1185 if (i == pd_idx || i == qd_idx) 1186 continue; 1187 xor_srcs[count++] = sh->dev[i].page; 1188 } 1189 1190 init_async_submit(&submit, 0, NULL, NULL, NULL, 1191 to_addr_conv(sh, percpu)); 1192 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1193 &sh->ops.zero_sum_result, &submit); 1194 1195 atomic_inc(&sh->count); 1196 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1197 tx = async_trigger_callback(&submit); 1198 
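	/*
	 * The parity-check result lands in sh->ops.zero_sum_result (non-zero
	 * indicates a mismatch); ops_complete_check() then moves the stripe
	 * to check_state_check_result so the result is acted on later.
	 */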
} 1199 1200 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1201 { 1202 struct page **srcs = percpu->scribble; 1203 struct async_submit_ctl submit; 1204 int count; 1205 1206 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1207 (unsigned long long)sh->sector, checkp); 1208 1209 count = set_syndrome_sources(srcs, sh); 1210 if (!checkp) 1211 srcs[count] = NULL; 1212 1213 atomic_inc(&sh->count); 1214 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1215 sh, to_addr_conv(sh, percpu)); 1216 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1217 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1218 } 1219 1220 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1221 { 1222 int overlap_clear = 0, i, disks = sh->disks; 1223 struct dma_async_tx_descriptor *tx = NULL; 1224 raid5_conf_t *conf = sh->raid_conf; 1225 int level = conf->level; 1226 struct raid5_percpu *percpu; 1227 unsigned long cpu; 1228 1229 cpu = get_cpu(); 1230 percpu = per_cpu_ptr(conf->percpu, cpu); 1231 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1232 ops_run_biofill(sh); 1233 overlap_clear++; 1234 } 1235 1236 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1237 if (level < 6) 1238 tx = ops_run_compute5(sh, percpu); 1239 else { 1240 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1241 tx = ops_run_compute6_1(sh, percpu); 1242 else 1243 tx = ops_run_compute6_2(sh, percpu); 1244 } 1245 /* terminate the chain if reconstruct is not set to be run */ 1246 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1247 async_tx_ack(tx); 1248 } 1249 1250 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1251 tx = ops_run_prexor(sh, percpu, tx); 1252 1253 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1254 tx = ops_run_biodrain(sh, tx); 1255 overlap_clear++; 1256 } 1257 1258 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1259 if (level < 6) 1260 ops_run_reconstruct5(sh, percpu, tx); 1261 else 1262 ops_run_reconstruct6(sh, percpu, tx); 1263 } 1264 1265 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1266 if (sh->check_state == check_state_run) 1267 ops_run_check_p(sh, percpu); 1268 else if (sh->check_state == check_state_run_q) 1269 ops_run_check_pq(sh, percpu, 0); 1270 else if (sh->check_state == check_state_run_pq) 1271 ops_run_check_pq(sh, percpu, 1); 1272 else 1273 BUG(); 1274 } 1275 1276 if (overlap_clear) 1277 for (i = disks; i--; ) { 1278 struct r5dev *dev = &sh->dev[i]; 1279 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1280 wake_up(&sh->raid_conf->wait_for_overlap); 1281 } 1282 put_cpu(); 1283 } 1284 1285 #ifdef CONFIG_MULTICORE_RAID456 1286 static void async_run_ops(void *param, async_cookie_t cookie) 1287 { 1288 struct stripe_head *sh = param; 1289 unsigned long ops_request = sh->ops.request; 1290 1291 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); 1292 wake_up(&sh->ops.wait_for_ops); 1293 1294 __raid_run_ops(sh, ops_request); 1295 release_stripe(sh); 1296 } 1297 1298 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1299 { 1300 /* since handle_stripe can be called outside of raid5d context 1301 * we need to ensure sh->ops.request is de-staged before another 1302 * request arrives 1303 */ 1304 wait_event(sh->ops.wait_for_ops, 1305 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); 1306 sh->ops.request = ops_request; 1307 1308 atomic_inc(&sh->count); 1309 async_schedule(async_run_ops, sh); 1310 } 1311 #else 1312 #define raid_run_ops __raid_run_ops 1313 #endif 1314 1315 static 
int grow_one_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;
	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
	sh->raid_conf = conf;
	spin_lock_init(&sh->lock);
#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
#endif

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(raid5_conf_t *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}

static int resize_stripes(raid5_conf_t *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step 2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
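 * (A GFP_KERNEL allocation here could recurse into writeback against this
 * same array, which would need stripes we are currently holding - deadlock.)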
1414 */ 1415 struct stripe_head *osh, *nsh; 1416 LIST_HEAD(newstripes); 1417 struct disk_info *ndisks; 1418 unsigned long cpu; 1419 int err; 1420 struct kmem_cache *sc; 1421 int i; 1422 1423 if (newsize <= conf->pool_size) 1424 return 0; /* never bother to shrink */ 1425 1426 err = md_allow_write(conf->mddev); 1427 if (err) 1428 return err; 1429 1430 /* Step 1 */ 1431 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1432 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1433 0, 0, NULL); 1434 if (!sc) 1435 return -ENOMEM; 1436 1437 for (i = conf->max_nr_stripes; i; i--) { 1438 nsh = kmem_cache_alloc(sc, GFP_KERNEL); 1439 if (!nsh) 1440 break; 1441 1442 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); 1443 1444 nsh->raid_conf = conf; 1445 spin_lock_init(&nsh->lock); 1446 #ifdef CONFIG_MULTICORE_RAID456 1447 init_waitqueue_head(&nsh->ops.wait_for_ops); 1448 #endif 1449 1450 list_add(&nsh->lru, &newstripes); 1451 } 1452 if (i) { 1453 /* didn't get enough, give up */ 1454 while (!list_empty(&newstripes)) { 1455 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1456 list_del(&nsh->lru); 1457 kmem_cache_free(sc, nsh); 1458 } 1459 kmem_cache_destroy(sc); 1460 return -ENOMEM; 1461 } 1462 /* Step 2 - Must use GFP_NOIO now. 1463 * OK, we have enough stripes, start collecting inactive 1464 * stripes and copying them over 1465 */ 1466 list_for_each_entry(nsh, &newstripes, lru) { 1467 spin_lock_irq(&conf->device_lock); 1468 wait_event_lock_irq(conf->wait_for_stripe, 1469 !list_empty(&conf->inactive_list), 1470 conf->device_lock, 1471 ); 1472 osh = get_free_stripe(conf); 1473 spin_unlock_irq(&conf->device_lock); 1474 atomic_set(&nsh->count, 1); 1475 for(i=0; i<conf->pool_size; i++) 1476 nsh->dev[i].page = osh->dev[i].page; 1477 for( ; i<newsize; i++) 1478 nsh->dev[i].page = NULL; 1479 kmem_cache_free(conf->slab_cache, osh); 1480 } 1481 kmem_cache_destroy(conf->slab_cache); 1482 1483 /* Step 3. 
1484 * At this point, we are holding all the stripes so the array 1485 * is completely stalled, so now is a good time to resize 1486 * conf->disks and the scribble region 1487 */ 1488 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1489 if (ndisks) { 1490 for (i=0; i<conf->raid_disks; i++) 1491 ndisks[i] = conf->disks[i]; 1492 kfree(conf->disks); 1493 conf->disks = ndisks; 1494 } else 1495 err = -ENOMEM; 1496 1497 get_online_cpus(); 1498 conf->scribble_len = scribble_len(newsize); 1499 for_each_present_cpu(cpu) { 1500 struct raid5_percpu *percpu; 1501 void *scribble; 1502 1503 percpu = per_cpu_ptr(conf->percpu, cpu); 1504 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1505 1506 if (scribble) { 1507 kfree(percpu->scribble); 1508 percpu->scribble = scribble; 1509 } else { 1510 err = -ENOMEM; 1511 break; 1512 } 1513 } 1514 put_online_cpus(); 1515 1516 /* Step 4, return new stripes to service */ 1517 while(!list_empty(&newstripes)) { 1518 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1519 list_del_init(&nsh->lru); 1520 1521 for (i=conf->raid_disks; i < newsize; i++) 1522 if (nsh->dev[i].page == NULL) { 1523 struct page *p = alloc_page(GFP_NOIO); 1524 nsh->dev[i].page = p; 1525 if (!p) 1526 err = -ENOMEM; 1527 } 1528 release_stripe(nsh); 1529 } 1530 /* critical section pass, GFP_NOIO no longer needed */ 1531 1532 conf->slab_cache = sc; 1533 conf->active_name = 1-conf->active_name; 1534 conf->pool_size = newsize; 1535 return err; 1536 } 1537 1538 static int drop_one_stripe(raid5_conf_t *conf) 1539 { 1540 struct stripe_head *sh; 1541 1542 spin_lock_irq(&conf->device_lock); 1543 sh = get_free_stripe(conf); 1544 spin_unlock_irq(&conf->device_lock); 1545 if (!sh) 1546 return 0; 1547 BUG_ON(atomic_read(&sh->count)); 1548 shrink_buffers(sh); 1549 kmem_cache_free(conf->slab_cache, sh); 1550 atomic_dec(&conf->active_stripes); 1551 return 1; 1552 } 1553 1554 static void shrink_stripes(raid5_conf_t *conf) 1555 { 1556 while (drop_one_stripe(conf)) 1557 ; 1558 1559 if (conf->slab_cache) 1560 kmem_cache_destroy(conf->slab_cache); 1561 conf->slab_cache = NULL; 1562 } 1563 1564 static void raid5_end_read_request(struct bio * bi, int error) 1565 { 1566 struct stripe_head *sh = bi->bi_private; 1567 raid5_conf_t *conf = sh->raid_conf; 1568 int disks = sh->disks, i; 1569 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1570 char b[BDEVNAME_SIZE]; 1571 mdk_rdev_t *rdev; 1572 1573 1574 for (i=0 ; i<disks; i++) 1575 if (bi == &sh->dev[i].req) 1576 break; 1577 1578 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1579 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1580 uptodate); 1581 if (i == disks) { 1582 BUG(); 1583 return; 1584 } 1585 1586 if (uptodate) { 1587 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1588 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1589 rdev = conf->disks[i].rdev; 1590 printk_rl(KERN_INFO "md/raid:%s: read error corrected" 1591 " (%lu sectors at %llu on %s)\n", 1592 mdname(conf->mddev), STRIPE_SECTORS, 1593 (unsigned long long)(sh->sector 1594 + rdev->data_offset), 1595 bdevname(rdev->bdev, b)); 1596 clear_bit(R5_ReadError, &sh->dev[i].flags); 1597 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1598 } 1599 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1600 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1601 } else { 1602 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1603 int retry = 0; 1604 rdev = conf->disks[i].rdev; 1605 1606 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1607 atomic_inc(&rdev->read_errors); 1608 
if (conf->mddev->degraded >= conf->max_degraded) 1609 printk_rl(KERN_WARNING 1610 "md/raid:%s: read error not correctable " 1611 "(sector %llu on %s).\n", 1612 mdname(conf->mddev), 1613 (unsigned long long)(sh->sector 1614 + rdev->data_offset), 1615 bdn); 1616 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1617 /* Oh, no!!! */ 1618 printk_rl(KERN_WARNING 1619 "md/raid:%s: read error NOT corrected!! " 1620 "(sector %llu on %s).\n", 1621 mdname(conf->mddev), 1622 (unsigned long long)(sh->sector 1623 + rdev->data_offset), 1624 bdn); 1625 else if (atomic_read(&rdev->read_errors) 1626 > conf->max_nr_stripes) 1627 printk(KERN_WARNING 1628 "md/raid:%s: Too many read errors, failing device %s.\n", 1629 mdname(conf->mddev), bdn); 1630 else 1631 retry = 1; 1632 if (retry) 1633 set_bit(R5_ReadError, &sh->dev[i].flags); 1634 else { 1635 clear_bit(R5_ReadError, &sh->dev[i].flags); 1636 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1637 md_error(conf->mddev, rdev); 1638 } 1639 } 1640 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1641 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1642 set_bit(STRIPE_HANDLE, &sh->state); 1643 release_stripe(sh); 1644 } 1645 1646 static void raid5_end_write_request(struct bio *bi, int error) 1647 { 1648 struct stripe_head *sh = bi->bi_private; 1649 raid5_conf_t *conf = sh->raid_conf; 1650 int disks = sh->disks, i; 1651 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1652 1653 for (i=0 ; i<disks; i++) 1654 if (bi == &sh->dev[i].req) 1655 break; 1656 1657 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1658 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1659 uptodate); 1660 if (i == disks) { 1661 BUG(); 1662 return; 1663 } 1664 1665 if (!uptodate) 1666 md_error(conf->mddev, conf->disks[i].rdev); 1667 1668 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1669 1670 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1671 set_bit(STRIPE_HANDLE, &sh->state); 1672 release_stripe(sh); 1673 } 1674 1675 1676 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1677 1678 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1679 { 1680 struct r5dev *dev = &sh->dev[i]; 1681 1682 bio_init(&dev->req); 1683 dev->req.bi_io_vec = &dev->vec; 1684 dev->req.bi_vcnt++; 1685 dev->req.bi_max_vecs++; 1686 dev->vec.bv_page = dev->page; 1687 dev->vec.bv_len = STRIPE_SIZE; 1688 dev->vec.bv_offset = 0; 1689 1690 dev->req.bi_sector = sh->sector; 1691 dev->req.bi_private = sh; 1692 1693 dev->flags = 0; 1694 dev->sector = compute_blocknr(sh, i, previous); 1695 } 1696 1697 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1698 { 1699 char b[BDEVNAME_SIZE]; 1700 raid5_conf_t *conf = mddev->private; 1701 pr_debug("raid456: error called\n"); 1702 1703 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1704 unsigned long flags; 1705 spin_lock_irqsave(&conf->device_lock, flags); 1706 mddev->degraded++; 1707 spin_unlock_irqrestore(&conf->device_lock, flags); 1708 /* 1709 * if recovery was running, make sure it aborts. 
1710 */ 1711 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1712 } 1713 set_bit(Faulty, &rdev->flags); 1714 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1715 printk(KERN_ALERT 1716 "md/raid:%s: Disk failure on %s, disabling device.\n" 1717 "md/raid:%s: Operation continuing on %d devices.\n", 1718 mdname(mddev), 1719 bdevname(rdev->bdev, b), 1720 mdname(mddev), 1721 conf->raid_disks - mddev->degraded); 1722 } 1723 1724 /* 1725 * Input: a 'big' sector number, 1726 * Output: index of the data and parity disk, and the sector # in them. 1727 */ 1728 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, 1729 int previous, int *dd_idx, 1730 struct stripe_head *sh) 1731 { 1732 sector_t stripe, stripe2; 1733 sector_t chunk_number; 1734 unsigned int chunk_offset; 1735 int pd_idx, qd_idx; 1736 int ddf_layout = 0; 1737 sector_t new_sector; 1738 int algorithm = previous ? conf->prev_algo 1739 : conf->algorithm; 1740 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1741 : conf->chunk_sectors; 1742 int raid_disks = previous ? conf->previous_raid_disks 1743 : conf->raid_disks; 1744 int data_disks = raid_disks - conf->max_degraded; 1745 1746 /* First compute the information on this sector */ 1747 1748 /* 1749 * Compute the chunk number and the sector offset inside the chunk 1750 */ 1751 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1752 chunk_number = r_sector; 1753 1754 /* 1755 * Compute the stripe number 1756 */ 1757 stripe = chunk_number; 1758 *dd_idx = sector_div(stripe, data_disks); 1759 stripe2 = stripe; 1760 /* 1761 * Select the parity disk based on the user selected algorithm. 1762 */ 1763 pd_idx = qd_idx = ~0; 1764 switch(conf->level) { 1765 case 4: 1766 pd_idx = data_disks; 1767 break; 1768 case 5: 1769 switch (algorithm) { 1770 case ALGORITHM_LEFT_ASYMMETRIC: 1771 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1772 if (*dd_idx >= pd_idx) 1773 (*dd_idx)++; 1774 break; 1775 case ALGORITHM_RIGHT_ASYMMETRIC: 1776 pd_idx = sector_div(stripe2, raid_disks); 1777 if (*dd_idx >= pd_idx) 1778 (*dd_idx)++; 1779 break; 1780 case ALGORITHM_LEFT_SYMMETRIC: 1781 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1782 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1783 break; 1784 case ALGORITHM_RIGHT_SYMMETRIC: 1785 pd_idx = sector_div(stripe2, raid_disks); 1786 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1787 break; 1788 case ALGORITHM_PARITY_0: 1789 pd_idx = 0; 1790 (*dd_idx)++; 1791 break; 1792 case ALGORITHM_PARITY_N: 1793 pd_idx = data_disks; 1794 break; 1795 default: 1796 BUG(); 1797 } 1798 break; 1799 case 6: 1800 1801 switch (algorithm) { 1802 case ALGORITHM_LEFT_ASYMMETRIC: 1803 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1804 qd_idx = pd_idx + 1; 1805 if (pd_idx == raid_disks-1) { 1806 (*dd_idx)++; /* Q D D D P */ 1807 qd_idx = 0; 1808 } else if (*dd_idx >= pd_idx) 1809 (*dd_idx) += 2; /* D D P Q D */ 1810 break; 1811 case ALGORITHM_RIGHT_ASYMMETRIC: 1812 pd_idx = sector_div(stripe2, raid_disks); 1813 qd_idx = pd_idx + 1; 1814 if (pd_idx == raid_disks-1) { 1815 (*dd_idx)++; /* Q D D D P */ 1816 qd_idx = 0; 1817 } else if (*dd_idx >= pd_idx) 1818 (*dd_idx) += 2; /* D D P Q D */ 1819 break; 1820 case ALGORITHM_LEFT_SYMMETRIC: 1821 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1822 qd_idx = (pd_idx + 1) % raid_disks; 1823 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1824 break; 1825 case ALGORITHM_RIGHT_SYMMETRIC: 1826 pd_idx = sector_div(stripe2, raid_disks); 1827 qd_idx = (pd_idx + 1) % raid_disks; 1828 *dd_idx = (pd_idx 
			+ 2 + *dd_idx) % raid_disks;
			break;

		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			qd_idx = 1;
			(*dd_idx) += 2;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			qd_idx = data_disks + 1;
			break;

		case ALGORITHM_ROTATING_ZERO_RESTART:
			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
			 * of blocks for computing Q is different.
			 */
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_RESTART:
			/* Same as left_asymmetric, but the first stripe is
			 * D D D P Q  rather than
			 * Q D D D P
			 */
			stripe2 += 1;
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			ddf_layout = 1;
			break;

		case ALGORITHM_ROTATING_N_CONTINUE:
			/* Same as left_symmetric but Q is before P */
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			ddf_layout = 1;
			break;

		case ALGORITHM_LEFT_ASYMMETRIC_6:
			/* RAID5 left_asymmetric, with Q on last device */
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_ASYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_LEFT_SYMMETRIC_6:
			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_RIGHT_SYMMETRIC_6:
			pd_idx = sector_div(stripe2, raid_disks-1);
			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
			qd_idx = raid_disks - 1;
			break;

		case ALGORITHM_PARITY_0_6:
			pd_idx = 0;
			(*dd_idx)++;
			qd_idx = raid_disks - 1;
			break;

		default:
			BUG();
		}
		break;
	}

	if (sh) {
		sh->pd_idx = pd_idx;
		sh->qd_idx = qd_idx;
		sh->ddf_layout = ddf_layout;
	}
	/*
	 * Finally, compute the new sector number
	 */
	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}


static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = sh->disks;
	int data_disks = raid_disks - conf->max_degraded;
	sector_t new_sector = sh->sector, check;
	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
					 : conf->chunk_sectors;
	int algorithm = previous ?
conf->prev_algo 1940 : conf->algorithm; 1941 sector_t stripe; 1942 int chunk_offset; 1943 sector_t chunk_number; 1944 int dummy1, dd_idx = i; 1945 sector_t r_sector; 1946 struct stripe_head sh2; 1947 1948 1949 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1950 stripe = new_sector; 1951 1952 if (i == sh->pd_idx) 1953 return 0; 1954 switch(conf->level) { 1955 case 4: break; 1956 case 5: 1957 switch (algorithm) { 1958 case ALGORITHM_LEFT_ASYMMETRIC: 1959 case ALGORITHM_RIGHT_ASYMMETRIC: 1960 if (i > sh->pd_idx) 1961 i--; 1962 break; 1963 case ALGORITHM_LEFT_SYMMETRIC: 1964 case ALGORITHM_RIGHT_SYMMETRIC: 1965 if (i < sh->pd_idx) 1966 i += raid_disks; 1967 i -= (sh->pd_idx + 1); 1968 break; 1969 case ALGORITHM_PARITY_0: 1970 i -= 1; 1971 break; 1972 case ALGORITHM_PARITY_N: 1973 break; 1974 default: 1975 BUG(); 1976 } 1977 break; 1978 case 6: 1979 if (i == sh->qd_idx) 1980 return 0; /* It is the Q disk */ 1981 switch (algorithm) { 1982 case ALGORITHM_LEFT_ASYMMETRIC: 1983 case ALGORITHM_RIGHT_ASYMMETRIC: 1984 case ALGORITHM_ROTATING_ZERO_RESTART: 1985 case ALGORITHM_ROTATING_N_RESTART: 1986 if (sh->pd_idx == raid_disks-1) 1987 i--; /* Q D D D P */ 1988 else if (i > sh->pd_idx) 1989 i -= 2; /* D D P Q D */ 1990 break; 1991 case ALGORITHM_LEFT_SYMMETRIC: 1992 case ALGORITHM_RIGHT_SYMMETRIC: 1993 if (sh->pd_idx == raid_disks-1) 1994 i--; /* Q D D D P */ 1995 else { 1996 /* D D P Q D */ 1997 if (i < sh->pd_idx) 1998 i += raid_disks; 1999 i -= (sh->pd_idx + 2); 2000 } 2001 break; 2002 case ALGORITHM_PARITY_0: 2003 i -= 2; 2004 break; 2005 case ALGORITHM_PARITY_N: 2006 break; 2007 case ALGORITHM_ROTATING_N_CONTINUE: 2008 /* Like left_symmetric, but P is before Q */ 2009 if (sh->pd_idx == 0) 2010 i--; /* P D D D Q */ 2011 else { 2012 /* D D Q P D */ 2013 if (i < sh->pd_idx) 2014 i += raid_disks; 2015 i -= (sh->pd_idx + 1); 2016 } 2017 break; 2018 case ALGORITHM_LEFT_ASYMMETRIC_6: 2019 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2020 if (i > sh->pd_idx) 2021 i--; 2022 break; 2023 case ALGORITHM_LEFT_SYMMETRIC_6: 2024 case ALGORITHM_RIGHT_SYMMETRIC_6: 2025 if (i < sh->pd_idx) 2026 i += data_disks + 1; 2027 i -= (sh->pd_idx + 1); 2028 break; 2029 case ALGORITHM_PARITY_0_6: 2030 i -= 1; 2031 break; 2032 default: 2033 BUG(); 2034 } 2035 break; 2036 } 2037 2038 chunk_number = stripe * data_disks + i; 2039 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2040 2041 check = raid5_compute_sector(conf, r_sector, 2042 previous, &dummy1, &sh2); 2043 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2044 || sh2.qd_idx != sh->qd_idx) { 2045 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2046 mdname(conf->mddev)); 2047 return 0; 2048 } 2049 return r_sector; 2050 } 2051 2052 2053 static void 2054 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2055 int rcw, int expand) 2056 { 2057 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2058 raid5_conf_t *conf = sh->raid_conf; 2059 int level = conf->level; 2060 2061 if (rcw) { 2062 /* if we are not expanding this is a proper write request, and 2063 * there will be bios with new data to be drained into the 2064 * stripe cache 2065 */ 2066 if (!expand) { 2067 sh->reconstruct_state = reconstruct_state_drain_run; 2068 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2069 } else 2070 sh->reconstruct_state = reconstruct_state_run; 2071 2072 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2073 2074 for (i = disks; i--; ) { 2075 struct r5dev *dev = &sh->dev[i]; 2076 2077 if (dev->towrite) { 2078 
set_bit(R5_LOCKED, &dev->flags); 2079 set_bit(R5_Wantdrain, &dev->flags); 2080 if (!expand) 2081 clear_bit(R5_UPTODATE, &dev->flags); 2082 s->locked++; 2083 } 2084 } 2085 if (s->locked + conf->max_degraded == disks) 2086 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2087 atomic_inc(&conf->pending_full_writes); 2088 } else { 2089 BUG_ON(level == 6); 2090 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2091 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2092 2093 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2094 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2095 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2096 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2097 2098 for (i = disks; i--; ) { 2099 struct r5dev *dev = &sh->dev[i]; 2100 if (i == pd_idx) 2101 continue; 2102 2103 if (dev->towrite && 2104 (test_bit(R5_UPTODATE, &dev->flags) || 2105 test_bit(R5_Wantcompute, &dev->flags))) { 2106 set_bit(R5_Wantdrain, &dev->flags); 2107 set_bit(R5_LOCKED, &dev->flags); 2108 clear_bit(R5_UPTODATE, &dev->flags); 2109 s->locked++; 2110 } 2111 } 2112 } 2113 2114 /* keep the parity disk(s) locked while asynchronous operations 2115 * are in flight 2116 */ 2117 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2118 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2119 s->locked++; 2120 2121 if (level == 6) { 2122 int qd_idx = sh->qd_idx; 2123 struct r5dev *dev = &sh->dev[qd_idx]; 2124 2125 set_bit(R5_LOCKED, &dev->flags); 2126 clear_bit(R5_UPTODATE, &dev->flags); 2127 s->locked++; 2128 } 2129 2130 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2131 __func__, (unsigned long long)sh->sector, 2132 s->locked, s->ops_request); 2133 } 2134 2135 /* 2136 * Each stripe/dev can have one or more bion attached. 2137 * toread/towrite point to the first in a chain. 2138 * The bi_next chain must be in order. 
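 * As a purely illustrative example: if two bios target disjoint sector ranges of the same stripe/device, the one with the lower bi_sector sits first and points at the other through bi_next; add_stripe_bio() below refuses to queue a bio whose range would overlap a bio already on the chain.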
2139 */ 2140 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2141 { 2142 struct bio **bip; 2143 raid5_conf_t *conf = sh->raid_conf; 2144 int firstwrite=0; 2145 2146 pr_debug("adding bh b#%llu to stripe s#%llu\n", 2147 (unsigned long long)bi->bi_sector, 2148 (unsigned long long)sh->sector); 2149 2150 2151 spin_lock(&sh->lock); 2152 spin_lock_irq(&conf->device_lock); 2153 if (forwrite) { 2154 bip = &sh->dev[dd_idx].towrite; 2155 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2156 firstwrite = 1; 2157 } else 2158 bip = &sh->dev[dd_idx].toread; 2159 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2160 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2161 goto overlap; 2162 bip = & (*bip)->bi_next; 2163 } 2164 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2165 goto overlap; 2166 2167 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2168 if (*bip) 2169 bi->bi_next = *bip; 2170 *bip = bi; 2171 bi->bi_phys_segments++; 2172 spin_unlock_irq(&conf->device_lock); 2173 spin_unlock(&sh->lock); 2174 2175 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2176 (unsigned long long)bi->bi_sector, 2177 (unsigned long long)sh->sector, dd_idx); 2178 2179 if (conf->mddev->bitmap && firstwrite) { 2180 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2181 STRIPE_SECTORS, 0); 2182 sh->bm_seq = conf->seq_flush+1; 2183 set_bit(STRIPE_BIT_DELAY, &sh->state); 2184 } 2185 2186 if (forwrite) { 2187 /* check if page is covered */ 2188 sector_t sector = sh->dev[dd_idx].sector; 2189 for (bi=sh->dev[dd_idx].towrite; 2190 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2191 bi && bi->bi_sector <= sector; 2192 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2193 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2194 sector = bi->bi_sector + (bi->bi_size>>9); 2195 } 2196 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2197 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2198 } 2199 return 1; 2200 2201 overlap: 2202 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2203 spin_unlock_irq(&conf->device_lock); 2204 spin_unlock(&sh->lock); 2205 return 0; 2206 } 2207 2208 static void end_reshape(raid5_conf_t *conf); 2209 2210 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2211 struct stripe_head *sh) 2212 { 2213 int sectors_per_chunk = 2214 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2215 int dd_idx; 2216 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2217 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2218 2219 raid5_compute_sector(conf, 2220 stripe * (disks - conf->max_degraded) 2221 *sectors_per_chunk + chunk_offset, 2222 previous, 2223 &dd_idx, sh); 2224 } 2225 2226 static void 2227 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, 2228 struct stripe_head_state *s, int disks, 2229 struct bio **return_bi) 2230 { 2231 int i; 2232 for (i = disks; i--; ) { 2233 struct bio *bi; 2234 int bitmap_end = 0; 2235 2236 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2237 mdk_rdev_t *rdev; 2238 rcu_read_lock(); 2239 rdev = rcu_dereference(conf->disks[i].rdev); 2240 if (rdev && test_bit(In_sync, &rdev->flags)) 2241 /* multiple read failures in one stripe */ 2242 md_error(conf->mddev, rdev); 2243 rcu_read_unlock(); 2244 } 2245 spin_lock_irq(&conf->device_lock); 2246 /* fail all writes first */ 2247 bi = sh->dev[i].towrite; 2248 sh->dev[i].towrite = NULL; 2249 if (bi) { 2250 s->to_write--; 2251 bitmap_end = 1; 2252 } 2253 2254 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2255 wake_up(&conf->wait_for_overlap); 2256 2257 while (bi && bi->bi_sector < 2258 sh->dev[i].sector + STRIPE_SECTORS) { 2259 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2260 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2261 if (!raid5_dec_bi_phys_segments(bi)) { 2262 md_write_end(conf->mddev); 2263 bi->bi_next = *return_bi; 2264 *return_bi = bi; 2265 } 2266 bi = nextbi; 2267 } 2268 /* and fail all 'written' */ 2269 bi = sh->dev[i].written; 2270 sh->dev[i].written = NULL; 2271 if (bi) bitmap_end = 1; 2272 while (bi && bi->bi_sector < 2273 sh->dev[i].sector + STRIPE_SECTORS) { 2274 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2275 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2276 if (!raid5_dec_bi_phys_segments(bi)) { 2277 md_write_end(conf->mddev); 2278 bi->bi_next = *return_bi; 2279 *return_bi = bi; 2280 } 2281 bi = bi2; 2282 } 2283 2284 /* fail any reads if this device is non-operational and 2285 * the data has not reached the cache yet. 2286 */ 2287 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2288 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2289 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2290 bi = sh->dev[i].toread; 2291 sh->dev[i].toread = NULL; 2292 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2293 wake_up(&conf->wait_for_overlap); 2294 if (bi) s->to_read--; 2295 while (bi && bi->bi_sector < 2296 sh->dev[i].sector + STRIPE_SECTORS) { 2297 struct bio *nextbi = 2298 r5_next_bio(bi, sh->dev[i].sector); 2299 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2300 if (!raid5_dec_bi_phys_segments(bi)) { 2301 bi->bi_next = *return_bi; 2302 *return_bi = bi; 2303 } 2304 bi = nextbi; 2305 } 2306 } 2307 spin_unlock_irq(&conf->device_lock); 2308 if (bitmap_end) 2309 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2310 STRIPE_SECTORS, 0, 0); 2311 } 2312 2313 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2314 if (atomic_dec_and_test(&conf->pending_full_writes)) 2315 md_wakeup_thread(conf->mddev->thread); 2316 } 2317 2318 /* fetch_block5 - checks the given member device to see if its data needs 2319 * to be read or computed to satisfy a request. 
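 * In outline (a restatement of the logic below, not extra behaviour): when this is the failed device and every other block in the stripe is already up to date, an asynchronous compute of the missing block is scheduled; otherwise, if the backing disk is in-sync, a read is issued.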
2320 * 2321 * Returns 1 when no more member devices need to be checked, otherwise returns 2322 * 0 to tell the loop in handle_stripe_fill5 to continue 2323 */ 2324 static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, 2325 int disk_idx, int disks) 2326 { 2327 struct r5dev *dev = &sh->dev[disk_idx]; 2328 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 2329 2330 /* is the data in this block needed, and can we get it? */ 2331 if (!test_bit(R5_LOCKED, &dev->flags) && 2332 !test_bit(R5_UPTODATE, &dev->flags) && 2333 (dev->toread || 2334 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2335 s->syncing || s->expanding || 2336 (s->failed && 2337 (failed_dev->toread || 2338 (failed_dev->towrite && 2339 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { 2340 /* We would like to get this block, possibly by computing it, 2341 * otherwise read it if the backing disk is insync 2342 */ 2343 if ((s->uptodate == disks - 1) && 2344 (s->failed && disk_idx == s->failed_num)) { 2345 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2346 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2347 set_bit(R5_Wantcompute, &dev->flags); 2348 sh->ops.target = disk_idx; 2349 sh->ops.target2 = -1; 2350 s->req_compute = 1; 2351 /* Careful: from this point on 'uptodate' is in the eye 2352 * of raid_run_ops which services 'compute' operations 2353 * before writes. R5_Wantcompute flags a block that will 2354 * be R5_UPTODATE by the time it is needed for a 2355 * subsequent operation. 2356 */ 2357 s->uptodate++; 2358 return 1; /* uptodate + compute == disks */ 2359 } else if (test_bit(R5_Insync, &dev->flags)) { 2360 set_bit(R5_LOCKED, &dev->flags); 2361 set_bit(R5_Wantread, &dev->flags); 2362 s->locked++; 2363 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 2364 s->syncing); 2365 } 2366 } 2367 2368 return 0; 2369 } 2370 2371 /** 2372 * handle_stripe_fill5 - read or compute data to satisfy pending requests. 2373 */ 2374 static void handle_stripe_fill5(struct stripe_head *sh, 2375 struct stripe_head_state *s, int disks) 2376 { 2377 int i; 2378 2379 /* look for blocks to read/compute, skip this if a compute 2380 * is already in flight, or if the stripe contents are in the 2381 * midst of changing due to a write 2382 */ 2383 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2384 !sh->reconstruct_state) 2385 for (i = disks; i--; ) 2386 if (fetch_block5(sh, s, i, disks)) 2387 break; 2388 set_bit(STRIPE_HANDLE, &sh->state); 2389 } 2390 2391 /* fetch_block6 - checks the given member device to see if its data needs 2392 * to be read or computed to satisfy a request. 
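 * Compared with fetch_block5(), this variant can also schedule a two-block compute when two devices have failed, but only once disks-2 blocks are up to date, since the double-failure computation is far more expensive than the single-block case.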
2393 * 2394 * Returns 1 when no more member devices need to be checked, otherwise returns 2395 * 0 to tell the loop in handle_stripe_fill6 to continue 2396 */ 2397 static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, 2398 struct r6_state *r6s, int disk_idx, int disks) 2399 { 2400 struct r5dev *dev = &sh->dev[disk_idx]; 2401 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], 2402 &sh->dev[r6s->failed_num[1]] }; 2403 2404 if (!test_bit(R5_LOCKED, &dev->flags) && 2405 !test_bit(R5_UPTODATE, &dev->flags) && 2406 (dev->toread || 2407 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2408 s->syncing || s->expanding || 2409 (s->failed >= 1 && 2410 (fdev[0]->toread || s->to_write)) || 2411 (s->failed >= 2 && 2412 (fdev[1]->toread || s->to_write)))) { 2413 /* we would like to get this block, possibly by computing it, 2414 * otherwise read it if the backing disk is insync 2415 */ 2416 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2417 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2418 if ((s->uptodate == disks - 1) && 2419 (s->failed && (disk_idx == r6s->failed_num[0] || 2420 disk_idx == r6s->failed_num[1]))) { 2421 /* have disk failed, and we're requested to fetch it; 2422 * do compute it 2423 */ 2424 pr_debug("Computing stripe %llu block %d\n", 2425 (unsigned long long)sh->sector, disk_idx); 2426 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2427 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2428 set_bit(R5_Wantcompute, &dev->flags); 2429 sh->ops.target = disk_idx; 2430 sh->ops.target2 = -1; /* no 2nd target */ 2431 s->req_compute = 1; 2432 s->uptodate++; 2433 return 1; 2434 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2435 /* Computing 2-failure is *very* expensive; only 2436 * do it if failed >= 2 2437 */ 2438 int other; 2439 for (other = disks; other--; ) { 2440 if (other == disk_idx) 2441 continue; 2442 if (!test_bit(R5_UPTODATE, 2443 &sh->dev[other].flags)) 2444 break; 2445 } 2446 BUG_ON(other < 0); 2447 pr_debug("Computing stripe %llu blocks %d,%d\n", 2448 (unsigned long long)sh->sector, 2449 disk_idx, other); 2450 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2451 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2452 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2453 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2454 sh->ops.target = disk_idx; 2455 sh->ops.target2 = other; 2456 s->uptodate += 2; 2457 s->req_compute = 1; 2458 return 1; 2459 } else if (test_bit(R5_Insync, &dev->flags)) { 2460 set_bit(R5_LOCKED, &dev->flags); 2461 set_bit(R5_Wantread, &dev->flags); 2462 s->locked++; 2463 pr_debug("Reading block %d (sync=%d)\n", 2464 disk_idx, s->syncing); 2465 } 2466 } 2467 2468 return 0; 2469 } 2470 2471 /** 2472 * handle_stripe_fill6 - read or compute data to satisfy pending requests. 2473 */ 2474 static void handle_stripe_fill6(struct stripe_head *sh, 2475 struct stripe_head_state *s, struct r6_state *r6s, 2476 int disks) 2477 { 2478 int i; 2479 2480 /* look for blocks to read/compute, skip this if a compute 2481 * is already in flight, or if the stripe contents are in the 2482 * midst of changing due to a write 2483 */ 2484 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2485 !sh->reconstruct_state) 2486 for (i = disks; i--; ) 2487 if (fetch_block6(sh, s, r6s, i, disks)) 2488 break; 2489 set_bit(STRIPE_HANDLE, &sh->state); 2490 } 2491 2492 2493 /* handle_stripe_clean_event 2494 * any written block on an uptodate or failed drive can be returned. 
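 * A written bio is completed here only once its device is unlocked and up to date; the corresponding bitmap range is closed out when no further writes remain queued for that device.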
2495 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2496 * never LOCKED, so we don't need to test 'failed' directly. 2497 */ 2498 static void handle_stripe_clean_event(raid5_conf_t *conf, 2499 struct stripe_head *sh, int disks, struct bio **return_bi) 2500 { 2501 int i; 2502 struct r5dev *dev; 2503 2504 for (i = disks; i--; ) 2505 if (sh->dev[i].written) { 2506 dev = &sh->dev[i]; 2507 if (!test_bit(R5_LOCKED, &dev->flags) && 2508 test_bit(R5_UPTODATE, &dev->flags)) { 2509 /* We can return any write requests */ 2510 struct bio *wbi, *wbi2; 2511 int bitmap_end = 0; 2512 pr_debug("Return write for disc %d\n", i); 2513 spin_lock_irq(&conf->device_lock); 2514 wbi = dev->written; 2515 dev->written = NULL; 2516 while (wbi && wbi->bi_sector < 2517 dev->sector + STRIPE_SECTORS) { 2518 wbi2 = r5_next_bio(wbi, dev->sector); 2519 if (!raid5_dec_bi_phys_segments(wbi)) { 2520 md_write_end(conf->mddev); 2521 wbi->bi_next = *return_bi; 2522 *return_bi = wbi; 2523 } 2524 wbi = wbi2; 2525 } 2526 if (dev->towrite == NULL) 2527 bitmap_end = 1; 2528 spin_unlock_irq(&conf->device_lock); 2529 if (bitmap_end) 2530 bitmap_endwrite(conf->mddev->bitmap, 2531 sh->sector, 2532 STRIPE_SECTORS, 2533 !test_bit(STRIPE_DEGRADED, &sh->state), 2534 0); 2535 } 2536 } 2537 2538 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2539 if (atomic_dec_and_test(&conf->pending_full_writes)) 2540 md_wakeup_thread(conf->mddev->thread); 2541 } 2542 2543 static void handle_stripe_dirtying5(raid5_conf_t *conf, 2544 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2545 { 2546 int rmw = 0, rcw = 0, i; 2547 for (i = disks; i--; ) { 2548 /* would I have to read this buffer for read_modify_write */ 2549 struct r5dev *dev = &sh->dev[i]; 2550 if ((dev->towrite || i == sh->pd_idx) && 2551 !test_bit(R5_LOCKED, &dev->flags) && 2552 !(test_bit(R5_UPTODATE, &dev->flags) || 2553 test_bit(R5_Wantcompute, &dev->flags))) { 2554 if (test_bit(R5_Insync, &dev->flags)) 2555 rmw++; 2556 else 2557 rmw += 2*disks; /* cannot read it */ 2558 } 2559 /* Would I have to read this buffer for reconstruct_write */ 2560 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2561 !test_bit(R5_LOCKED, &dev->flags) && 2562 !(test_bit(R5_UPTODATE, &dev->flags) || 2563 test_bit(R5_Wantcompute, &dev->flags))) { 2564 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2565 else 2566 rcw += 2*disks; 2567 } 2568 } 2569 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2570 (unsigned long long)sh->sector, rmw, rcw); 2571 set_bit(STRIPE_HANDLE, &sh->state); 2572 if (rmw < rcw && rmw > 0) 2573 /* prefer read-modify-write, but need to get some data */ 2574 for (i = disks; i--; ) { 2575 struct r5dev *dev = &sh->dev[i]; 2576 if ((dev->towrite || i == sh->pd_idx) && 2577 !test_bit(R5_LOCKED, &dev->flags) && 2578 !(test_bit(R5_UPTODATE, &dev->flags) || 2579 test_bit(R5_Wantcompute, &dev->flags)) && 2580 test_bit(R5_Insync, &dev->flags)) { 2581 if ( 2582 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2583 pr_debug("Read_old block " 2584 "%d for r-m-w\n", i); 2585 set_bit(R5_LOCKED, &dev->flags); 2586 set_bit(R5_Wantread, &dev->flags); 2587 s->locked++; 2588 } else { 2589 set_bit(STRIPE_DELAYED, &sh->state); 2590 set_bit(STRIPE_HANDLE, &sh->state); 2591 } 2592 } 2593 } 2594 if (rcw <= rmw && rcw > 0) 2595 /* want reconstruct write, but need to get some data */ 2596 for (i = disks; i--; ) { 2597 struct r5dev *dev = &sh->dev[i]; 2598 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2599 i != sh->pd_idx && 2600 !test_bit(R5_LOCKED, &dev->flags) && 2601 
!(test_bit(R5_UPTODATE, &dev->flags) || 2602 test_bit(R5_Wantcompute, &dev->flags)) && 2603 test_bit(R5_Insync, &dev->flags)) { 2604 if ( 2605 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2606 pr_debug("Read_old block " 2607 "%d for Reconstruct\n", i); 2608 set_bit(R5_LOCKED, &dev->flags); 2609 set_bit(R5_Wantread, &dev->flags); 2610 s->locked++; 2611 } else { 2612 set_bit(STRIPE_DELAYED, &sh->state); 2613 set_bit(STRIPE_HANDLE, &sh->state); 2614 } 2615 } 2616 } 2617 /* now if nothing is locked, and if we have enough data, 2618 * we can start a write request 2619 */ 2620 /* since handle_stripe can be called at any time we need to handle the 2621 * case where a compute block operation has been submitted and then a 2622 * subsequent call wants to start a write request. raid_run_ops only 2623 * handles the case where compute block and reconstruct are requested 2624 * simultaneously. If this is not the case then new writes need to be 2625 * held off until the compute completes. 2626 */ 2627 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2628 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2629 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2630 schedule_reconstruction(sh, s, rcw == 0, 0); 2631 } 2632 2633 static void handle_stripe_dirtying6(raid5_conf_t *conf, 2634 struct stripe_head *sh, struct stripe_head_state *s, 2635 struct r6_state *r6s, int disks) 2636 { 2637 int rcw = 0, pd_idx = sh->pd_idx, i; 2638 int qd_idx = sh->qd_idx; 2639 2640 set_bit(STRIPE_HANDLE, &sh->state); 2641 for (i = disks; i--; ) { 2642 struct r5dev *dev = &sh->dev[i]; 2643 /* check if we haven't enough data */ 2644 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2645 i != pd_idx && i != qd_idx && 2646 !test_bit(R5_LOCKED, &dev->flags) && 2647 !(test_bit(R5_UPTODATE, &dev->flags) || 2648 test_bit(R5_Wantcompute, &dev->flags))) { 2649 rcw++; 2650 if (!test_bit(R5_Insync, &dev->flags)) 2651 continue; /* it's a failed drive */ 2652 2653 if ( 2654 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2655 pr_debug("Read_old stripe %llu " 2656 "block %d for Reconstruct\n", 2657 (unsigned long long)sh->sector, i); 2658 set_bit(R5_LOCKED, &dev->flags); 2659 set_bit(R5_Wantread, &dev->flags); 2660 s->locked++; 2661 } else { 2662 pr_debug("Request delayed stripe %llu " 2663 "block %d for Reconstruct\n", 2664 (unsigned long long)sh->sector, i); 2665 set_bit(STRIPE_DELAYED, &sh->state); 2666 set_bit(STRIPE_HANDLE, &sh->state); 2667 } 2668 } 2669 } 2670 /* now if nothing is locked, and if we have enough data, we can start a 2671 * write request 2672 */ 2673 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2674 s->locked == 0 && rcw == 0 && 2675 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2676 schedule_reconstruction(sh, s, 1, 0); 2677 } 2678 } 2679 2680 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2681 struct stripe_head_state *s, int disks) 2682 { 2683 struct r5dev *dev = NULL; 2684 2685 set_bit(STRIPE_HANDLE, &sh->state); 2686 2687 switch (sh->check_state) { 2688 case check_state_idle: 2689 /* start a new check operation if there are no failures */ 2690 if (s->failed == 0) { 2691 BUG_ON(s->uptodate != disks); 2692 sh->check_state = check_state_run; 2693 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2694 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2695 s->uptodate--; 2696 break; 2697 } 2698 dev = &sh->dev[s->failed_num]; 2699 /* fall through */ 2700 case check_state_compute_result: 2701 sh->check_state = check_state_idle; 2702 if (!dev) 2703 dev = &sh->dev[sh->pd_idx]; 
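/* at this point dev is the block to write back: the failed device when we fell through from check_state_idle, otherwise the parity block that the compute run just rebuilt */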
2704 2705 /* check that a write has not made the stripe insync */ 2706 if (test_bit(STRIPE_INSYNC, &sh->state)) 2707 break; 2708 2709 /* either failed parity check, or recovery is happening */ 2710 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2711 BUG_ON(s->uptodate != disks); 2712 2713 set_bit(R5_LOCKED, &dev->flags); 2714 s->locked++; 2715 set_bit(R5_Wantwrite, &dev->flags); 2716 2717 clear_bit(STRIPE_DEGRADED, &sh->state); 2718 set_bit(STRIPE_INSYNC, &sh->state); 2719 break; 2720 case check_state_run: 2721 break; /* we will be called again upon completion */ 2722 case check_state_check_result: 2723 sh->check_state = check_state_idle; 2724 2725 /* if a failure occurred during the check operation, leave 2726 * STRIPE_INSYNC not set and let the stripe be handled again 2727 */ 2728 if (s->failed) 2729 break; 2730 2731 /* handle a successful check operation, if parity is correct 2732 * we are done. Otherwise update the mismatch count and repair 2733 * parity if !MD_RECOVERY_CHECK 2734 */ 2735 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2736 /* parity is correct (on disc, 2737 * not in buffer any more) 2738 */ 2739 set_bit(STRIPE_INSYNC, &sh->state); 2740 else { 2741 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2742 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2743 /* don't try to repair!! */ 2744 set_bit(STRIPE_INSYNC, &sh->state); 2745 else { 2746 sh->check_state = check_state_compute_run; 2747 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2748 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2749 set_bit(R5_Wantcompute, 2750 &sh->dev[sh->pd_idx].flags); 2751 sh->ops.target = sh->pd_idx; 2752 sh->ops.target2 = -1; 2753 s->uptodate++; 2754 } 2755 } 2756 break; 2757 case check_state_compute_run: 2758 break; 2759 default: 2760 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2761 __func__, sh->check_state, 2762 (unsigned long long) sh->sector); 2763 BUG(); 2764 } 2765 } 2766 2767 2768 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2769 struct stripe_head_state *s, 2770 struct r6_state *r6s, int disks) 2771 { 2772 int pd_idx = sh->pd_idx; 2773 int qd_idx = sh->qd_idx; 2774 struct r5dev *dev; 2775 2776 set_bit(STRIPE_HANDLE, &sh->state); 2777 2778 BUG_ON(s->failed > 2); 2779 2780 /* Want to check and possibly repair P and Q. 2781 * However there could be one 'failed' device, in which 2782 * case we can only check one of them, possibly using the 2783 * other to generate missing data 2784 */ 2785 2786 switch (sh->check_state) { 2787 case check_state_idle: 2788 /* start a new check operation if there are < 2 failures */ 2789 if (s->failed == r6s->q_failed) { 2790 /* The only possible failed device holds Q, so it 2791 * makes sense to check P (If anything else were failed, 2792 * we would have used P to recreate it). 
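 * Put differently: with a single failure we check whichever of P and Q was not needed to regenerate data, and with no failures we can check both at once.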
2793 */ 2794 sh->check_state = check_state_run; 2795 } 2796 if (!r6s->q_failed && s->failed < 2) { 2797 /* Q is not failed, and we didn't use it to generate 2798 * anything, so it makes sense to check it 2799 */ 2800 if (sh->check_state == check_state_run) 2801 sh->check_state = check_state_run_pq; 2802 else 2803 sh->check_state = check_state_run_q; 2804 } 2805 2806 /* discard potentially stale zero_sum_result */ 2807 sh->ops.zero_sum_result = 0; 2808 2809 if (sh->check_state == check_state_run) { 2810 /* async_xor_zero_sum destroys the contents of P */ 2811 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2812 s->uptodate--; 2813 } 2814 if (sh->check_state >= check_state_run && 2815 sh->check_state <= check_state_run_pq) { 2816 /* async_syndrome_zero_sum preserves P and Q, so 2817 * no need to mark them !uptodate here 2818 */ 2819 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2820 break; 2821 } 2822 2823 /* we have 2-disk failure */ 2824 BUG_ON(s->failed != 2); 2825 /* fall through */ 2826 case check_state_compute_result: 2827 sh->check_state = check_state_idle; 2828 2829 /* check that a write has not made the stripe insync */ 2830 if (test_bit(STRIPE_INSYNC, &sh->state)) 2831 break; 2832 2833 /* now write out any block on a failed drive, 2834 * or P or Q if they were recomputed 2835 */ 2836 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2837 if (s->failed == 2) { 2838 dev = &sh->dev[r6s->failed_num[1]]; 2839 s->locked++; 2840 set_bit(R5_LOCKED, &dev->flags); 2841 set_bit(R5_Wantwrite, &dev->flags); 2842 } 2843 if (s->failed >= 1) { 2844 dev = &sh->dev[r6s->failed_num[0]]; 2845 s->locked++; 2846 set_bit(R5_LOCKED, &dev->flags); 2847 set_bit(R5_Wantwrite, &dev->flags); 2848 } 2849 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2850 dev = &sh->dev[pd_idx]; 2851 s->locked++; 2852 set_bit(R5_LOCKED, &dev->flags); 2853 set_bit(R5_Wantwrite, &dev->flags); 2854 } 2855 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2856 dev = &sh->dev[qd_idx]; 2857 s->locked++; 2858 set_bit(R5_LOCKED, &dev->flags); 2859 set_bit(R5_Wantwrite, &dev->flags); 2860 } 2861 clear_bit(STRIPE_DEGRADED, &sh->state); 2862 2863 set_bit(STRIPE_INSYNC, &sh->state); 2864 break; 2865 case check_state_run: 2866 case check_state_run_q: 2867 case check_state_run_pq: 2868 break; /* we will be called again upon completion */ 2869 case check_state_check_result: 2870 sh->check_state = check_state_idle; 2871 2872 /* handle a successful check operation, if parity is correct 2873 * we are done. Otherwise update the mismatch count and repair 2874 * parity if !MD_RECOVERY_CHECK 2875 */ 2876 if (sh->ops.zero_sum_result == 0) { 2877 /* both parities are correct */ 2878 if (!s->failed) 2879 set_bit(STRIPE_INSYNC, &sh->state); 2880 else { 2881 /* in contrast to the raid5 case we can validate 2882 * parity, but still have a failure to write 2883 * back 2884 */ 2885 sh->check_state = check_state_compute_result; 2886 /* Returning at this point means that we may go 2887 * off and bring p and/or q uptodate again so 2888 * we make sure to check zero_sum_result again 2889 * to verify if p or q need writeback 2890 */ 2891 } 2892 } else { 2893 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2894 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2895 /* don't try to repair!! 
*/ 2896 set_bit(STRIPE_INSYNC, &sh->state); 2897 else { 2898 int *target = &sh->ops.target; 2899 2900 sh->ops.target = -1; 2901 sh->ops.target2 = -1; 2902 sh->check_state = check_state_compute_run; 2903 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2904 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2905 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2906 set_bit(R5_Wantcompute, 2907 &sh->dev[pd_idx].flags); 2908 *target = pd_idx; 2909 target = &sh->ops.target2; 2910 s->uptodate++; 2911 } 2912 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2913 set_bit(R5_Wantcompute, 2914 &sh->dev[qd_idx].flags); 2915 *target = qd_idx; 2916 s->uptodate++; 2917 } 2918 } 2919 } 2920 break; 2921 case check_state_compute_run: 2922 break; 2923 default: 2924 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2925 __func__, sh->check_state, 2926 (unsigned long long) sh->sector); 2927 BUG(); 2928 } 2929 } 2930 2931 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2932 struct r6_state *r6s) 2933 { 2934 int i; 2935 2936 /* We have read all the blocks in this stripe and now we need to 2937 * copy some of them into a target stripe for expand. 2938 */ 2939 struct dma_async_tx_descriptor *tx = NULL; 2940 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2941 for (i = 0; i < sh->disks; i++) 2942 if (i != sh->pd_idx && i != sh->qd_idx) { 2943 int dd_idx, j; 2944 struct stripe_head *sh2; 2945 struct async_submit_ctl submit; 2946 2947 sector_t bn = compute_blocknr(sh, i, 1); 2948 sector_t s = raid5_compute_sector(conf, bn, 0, 2949 &dd_idx, NULL); 2950 sh2 = get_active_stripe(conf, s, 0, 1, 1); 2951 if (sh2 == NULL) 2952 /* so far only the early blocks of this stripe 2953 * have been requested. When later blocks 2954 * get requested, we will try again 2955 */ 2956 continue; 2957 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 2958 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 2959 /* must have already done this block */ 2960 release_stripe(sh2); 2961 continue; 2962 } 2963 2964 /* place all the copies on one channel */ 2965 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 2966 tx = async_memcpy(sh2->dev[dd_idx].page, 2967 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2968 &submit); 2969 2970 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2971 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2972 for (j = 0; j < conf->raid_disks; j++) 2973 if (j != sh2->pd_idx && 2974 (!r6s || j != sh2->qd_idx) && 2975 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2976 break; 2977 if (j == conf->raid_disks) { 2978 set_bit(STRIPE_EXPAND_READY, &sh2->state); 2979 set_bit(STRIPE_HANDLE, &sh2->state); 2980 } 2981 release_stripe(sh2); 2982 2983 } 2984 /* done submitting copies, wait for them to complete */ 2985 if (tx) { 2986 async_tx_ack(tx); 2987 dma_wait_for_async_tx(tx); 2988 } 2989 } 2990 2991 2992 /* 2993 * handle_stripe - do things to a stripe. 2994 * 2995 * We lock the stripe and then examine the state of various bits 2996 * to see what needs to be done. 2997 * Possible results: 2998 * return some read request which now have data 2999 * return some write requests which are safely on disc 3000 * schedule a read on some buffers 3001 * schedule a write of some buffers 3002 * return confirmation of parity correctness 3003 * 3004 * buffers are taken off read_list or write_list, and bh_cache buffers 3005 * get BH_Lock set before the stripe lock is released. 
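 * The actual work is split between handle_stripe5() and handle_stripe6() below, depending on whether the array keeps one parity block or both P and Q.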
3006 * 3007 */ 3008 3009 static void handle_stripe5(struct stripe_head *sh) 3010 { 3011 raid5_conf_t *conf = sh->raid_conf; 3012 int disks = sh->disks, i; 3013 struct bio *return_bi = NULL; 3014 struct stripe_head_state s; 3015 struct r5dev *dev; 3016 mdk_rdev_t *blocked_rdev = NULL; 3017 int prexor; 3018 int dec_preread_active = 0; 3019 3020 memset(&s, 0, sizeof(s)); 3021 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 3022 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, 3023 atomic_read(&sh->count), sh->pd_idx, sh->check_state, 3024 sh->reconstruct_state); 3025 3026 spin_lock(&sh->lock); 3027 clear_bit(STRIPE_HANDLE, &sh->state); 3028 clear_bit(STRIPE_DELAYED, &sh->state); 3029 3030 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 3031 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3032 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3033 3034 /* Now to look around and see what can be done */ 3035 rcu_read_lock(); 3036 for (i=disks; i--; ) { 3037 mdk_rdev_t *rdev; 3038 3039 dev = &sh->dev[i]; 3040 3041 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 3042 "written %p\n", i, dev->flags, dev->toread, dev->read, 3043 dev->towrite, dev->written); 3044 3045 /* maybe we can request a biofill operation 3046 * 3047 * new wantfill requests are only permitted while 3048 * ops_complete_biofill is guaranteed to be inactive 3049 */ 3050 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3051 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3052 set_bit(R5_Wantfill, &dev->flags); 3053 3054 /* now count some things */ 3055 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3056 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3057 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 3058 3059 if (test_bit(R5_Wantfill, &dev->flags)) 3060 s.to_fill++; 3061 else if (dev->toread) 3062 s.to_read++; 3063 if (dev->towrite) { 3064 s.to_write++; 3065 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3066 s.non_overwrite++; 3067 } 3068 if (dev->written) 3069 s.written++; 3070 rdev = rcu_dereference(conf->disks[i].rdev); 3071 if (blocked_rdev == NULL && 3072 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3073 blocked_rdev = rdev; 3074 atomic_inc(&rdev->nr_pending); 3075 } 3076 clear_bit(R5_Insync, &dev->flags); 3077 if (!rdev) 3078 /* Not in-sync */; 3079 else if (test_bit(In_sync, &rdev->flags)) 3080 set_bit(R5_Insync, &dev->flags); 3081 else { 3082 /* could be in-sync depending on recovery/reshape status */ 3083 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3084 set_bit(R5_Insync, &dev->flags); 3085 } 3086 if (!test_bit(R5_Insync, &dev->flags)) { 3087 /* The ReadError flag will just be confusing now */ 3088 clear_bit(R5_ReadError, &dev->flags); 3089 clear_bit(R5_ReWrite, &dev->flags); 3090 } 3091 if (test_bit(R5_ReadError, &dev->flags)) 3092 clear_bit(R5_Insync, &dev->flags); 3093 if (!test_bit(R5_Insync, &dev->flags)) { 3094 s.failed++; 3095 s.failed_num = i; 3096 } 3097 } 3098 rcu_read_unlock(); 3099 3100 if (unlikely(blocked_rdev)) { 3101 if (s.syncing || s.expanding || s.expanded || 3102 s.to_write || s.written) { 3103 set_bit(STRIPE_HANDLE, &sh->state); 3104 goto unlock; 3105 } 3106 /* There is nothing for the blocked_rdev to block */ 3107 rdev_dec_pending(blocked_rdev, conf->mddev); 3108 blocked_rdev = NULL; 3109 } 3110 3111 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3112 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3113 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3114 } 3115 3116 pr_debug("locked=%d 
uptodate=%d to_read=%d" 3117 " to_write=%d failed=%d failed_num=%d\n", 3118 s.locked, s.uptodate, s.to_read, s.to_write, 3119 s.failed, s.failed_num); 3120 /* check if the array has lost two devices and, if so, some requests might 3121 * need to be failed 3122 */ 3123 if (s.failed > 1 && s.to_read+s.to_write+s.written) 3124 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3125 if (s.failed > 1 && s.syncing) { 3126 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3127 clear_bit(STRIPE_SYNCING, &sh->state); 3128 s.syncing = 0; 3129 } 3130 3131 /* might be able to return some write requests if the parity block 3132 * is safe, or on a failed drive 3133 */ 3134 dev = &sh->dev[sh->pd_idx]; 3135 if ( s.written && 3136 ((test_bit(R5_Insync, &dev->flags) && 3137 !test_bit(R5_LOCKED, &dev->flags) && 3138 test_bit(R5_UPTODATE, &dev->flags)) || 3139 (s.failed == 1 && s.failed_num == sh->pd_idx))) 3140 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3141 3142 /* Now we might consider reading some blocks, either to check/generate 3143 * parity, or to satisfy requests 3144 * or to load a block that is being partially written. 3145 */ 3146 if (s.to_read || s.non_overwrite || 3147 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3148 handle_stripe_fill5(sh, &s, disks); 3149 3150 /* Now we check to see if any write operations have recently 3151 * completed 3152 */ 3153 prexor = 0; 3154 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3155 prexor = 1; 3156 if (sh->reconstruct_state == reconstruct_state_drain_result || 3157 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3158 sh->reconstruct_state = reconstruct_state_idle; 3159 3160 /* All the 'written' buffers and the parity block are ready to 3161 * be written back to disk 3162 */ 3163 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3164 for (i = disks; i--; ) { 3165 dev = &sh->dev[i]; 3166 if (test_bit(R5_LOCKED, &dev->flags) && 3167 (i == sh->pd_idx || dev->written)) { 3168 pr_debug("Writing block %d\n", i); 3169 set_bit(R5_Wantwrite, &dev->flags); 3170 if (prexor) 3171 continue; 3172 if (!test_bit(R5_Insync, &dev->flags) || 3173 (i == sh->pd_idx && s.failed == 0)) 3174 set_bit(STRIPE_INSYNC, &sh->state); 3175 } 3176 } 3177 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3178 dec_preread_active = 1; 3179 } 3180 3181 /* Now to consider new write requests and what else, if anything 3182 * should be read. We do not handle new writes when: 3183 * 1/ A 'write' operation (copy+xor) is already in flight. 3184 * 2/ A 'check' operation is in flight, as it may clobber the parity 3185 * block. 3186 */ 3187 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3188 handle_stripe_dirtying5(conf, sh, &s, disks); 3189 3190 /* maybe we need to check and possibly fix the parity for this stripe 3191 * Any reads will already have been scheduled, so we just see if enough 3192 * data is available. The parity check is held off while parity 3193 * dependent operations are in flight. 
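 * (a new parity check is only started when the stripe is syncing, no blocks are locked and no compute operation is in flight; an already running check simply continues).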
3194 */ 3195 if (sh->check_state || 3196 (s.syncing && s.locked == 0 && 3197 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3198 !test_bit(STRIPE_INSYNC, &sh->state))) 3199 handle_parity_checks5(conf, sh, &s, disks); 3200 3201 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3202 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3203 clear_bit(STRIPE_SYNCING, &sh->state); 3204 } 3205 3206 /* If the failed drive is just a ReadError, then we might need to progress 3207 * the repair/check process 3208 */ 3209 if (s.failed == 1 && !conf->mddev->ro && 3210 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) 3211 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) 3212 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) 3213 ) { 3214 dev = &sh->dev[s.failed_num]; 3215 if (!test_bit(R5_ReWrite, &dev->flags)) { 3216 set_bit(R5_Wantwrite, &dev->flags); 3217 set_bit(R5_ReWrite, &dev->flags); 3218 set_bit(R5_LOCKED, &dev->flags); 3219 s.locked++; 3220 } else { 3221 /* let's read it back */ 3222 set_bit(R5_Wantread, &dev->flags); 3223 set_bit(R5_LOCKED, &dev->flags); 3224 s.locked++; 3225 } 3226 } 3227 3228 /* Finish reconstruct operations initiated by the expansion process */ 3229 if (sh->reconstruct_state == reconstruct_state_result) { 3230 struct stripe_head *sh2 3231 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3232 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3233 /* sh cannot be written until sh2 has been read. 3234 * so arrange for sh to be delayed a little 3235 */ 3236 set_bit(STRIPE_DELAYED, &sh->state); 3237 set_bit(STRIPE_HANDLE, &sh->state); 3238 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3239 &sh2->state)) 3240 atomic_inc(&conf->preread_active_stripes); 3241 release_stripe(sh2); 3242 goto unlock; 3243 } 3244 if (sh2) 3245 release_stripe(sh2); 3246 3247 sh->reconstruct_state = reconstruct_state_idle; 3248 clear_bit(STRIPE_EXPANDING, &sh->state); 3249 for (i = conf->raid_disks; i--; ) { 3250 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3251 set_bit(R5_LOCKED, &sh->dev[i].flags); 3252 s.locked++; 3253 } 3254 } 3255 3256 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3257 !sh->reconstruct_state) { 3258 /* Need to write out all blocks after computing parity */ 3259 sh->disks = conf->raid_disks; 3260 stripe_set_idx(sh->sector, conf, 0, sh); 3261 schedule_reconstruction(sh, &s, 1, 1); 3262 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3263 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3264 atomic_dec(&conf->reshape_stripes); 3265 wake_up(&conf->wait_for_overlap); 3266 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3267 } 3268 3269 if (s.expanding && s.locked == 0 && 3270 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3271 handle_stripe_expansion(conf, sh, NULL); 3272 3273 unlock: 3274 spin_unlock(&sh->lock); 3275 3276 /* wait for this device to become unblocked */ 3277 if (unlikely(blocked_rdev)) 3278 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3279 3280 if (s.ops_request) 3281 raid_run_ops(sh, s.ops_request); 3282 3283 ops_run_io(sh, &s); 3284 3285 if (dec_preread_active) { 3286 /* We delay this until after ops_run_io so that if make_request 3287 * is waiting on a flush, it won't continue until the writes 3288 * have actually been submitted. 
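 * (dec_preread_active was set further up in this function when STRIPE_PREREAD_ACTIVE was cleared for this stripe).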
3289 */ 3290 atomic_dec(&conf->preread_active_stripes); 3291 if (atomic_read(&conf->preread_active_stripes) < 3292 IO_THRESHOLD) 3293 md_wakeup_thread(conf->mddev->thread); 3294 } 3295 return_io(return_bi); 3296 } 3297 3298 static void handle_stripe6(struct stripe_head *sh) 3299 { 3300 raid5_conf_t *conf = sh->raid_conf; 3301 int disks = sh->disks; 3302 struct bio *return_bi = NULL; 3303 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; 3304 struct stripe_head_state s; 3305 struct r6_state r6s; 3306 struct r5dev *dev, *pdev, *qdev; 3307 mdk_rdev_t *blocked_rdev = NULL; 3308 int dec_preread_active = 0; 3309 3310 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3311 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3312 (unsigned long long)sh->sector, sh->state, 3313 atomic_read(&sh->count), pd_idx, qd_idx, 3314 sh->check_state, sh->reconstruct_state); 3315 memset(&s, 0, sizeof(s)); 3316 3317 spin_lock(&sh->lock); 3318 clear_bit(STRIPE_HANDLE, &sh->state); 3319 clear_bit(STRIPE_DELAYED, &sh->state); 3320 3321 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 3322 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3323 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3324 /* Now to look around and see what can be done */ 3325 3326 rcu_read_lock(); 3327 for (i=disks; i--; ) { 3328 mdk_rdev_t *rdev; 3329 dev = &sh->dev[i]; 3330 3331 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3332 i, dev->flags, dev->toread, dev->towrite, dev->written); 3333 /* maybe we can reply to a read 3334 * 3335 * new wantfill requests are only permitted while 3336 * ops_complete_biofill is guaranteed to be inactive 3337 */ 3338 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3339 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3340 set_bit(R5_Wantfill, &dev->flags); 3341 3342 /* now count some things */ 3343 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3344 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3345 if (test_bit(R5_Wantcompute, &dev->flags)) { 3346 s.compute++; 3347 BUG_ON(s.compute > 2); 3348 } 3349 3350 if (test_bit(R5_Wantfill, &dev->flags)) { 3351 s.to_fill++; 3352 } else if (dev->toread) 3353 s.to_read++; 3354 if (dev->towrite) { 3355 s.to_write++; 3356 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3357 s.non_overwrite++; 3358 } 3359 if (dev->written) 3360 s.written++; 3361 rdev = rcu_dereference(conf->disks[i].rdev); 3362 if (blocked_rdev == NULL && 3363 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3364 blocked_rdev = rdev; 3365 atomic_inc(&rdev->nr_pending); 3366 } 3367 clear_bit(R5_Insync, &dev->flags); 3368 if (!rdev) 3369 /* Not in-sync */; 3370 else if (test_bit(In_sync, &rdev->flags)) 3371 set_bit(R5_Insync, &dev->flags); 3372 else { 3373 /* in sync if before recovery_offset */ 3374 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3375 set_bit(R5_Insync, &dev->flags); 3376 } 3377 if (!test_bit(R5_Insync, &dev->flags)) { 3378 /* The ReadError flag will just be confusing now */ 3379 clear_bit(R5_ReadError, &dev->flags); 3380 clear_bit(R5_ReWrite, &dev->flags); 3381 } 3382 if (test_bit(R5_ReadError, &dev->flags)) 3383 clear_bit(R5_Insync, &dev->flags); 3384 if (!test_bit(R5_Insync, &dev->flags)) { 3385 if (s.failed < 2) 3386 r6s.failed_num[s.failed] = i; 3387 s.failed++; 3388 } 3389 } 3390 rcu_read_unlock(); 3391 3392 if (unlikely(blocked_rdev)) { 3393 if (s.syncing || s.expanding || s.expanded || 3394 s.to_write || s.written) { 3395 set_bit(STRIPE_HANDLE, &sh->state); 3396 goto unlock; 3397 } 3398 /* There is nothing for the 
blocked_rdev to block */ 3399 rdev_dec_pending(blocked_rdev, conf->mddev); 3400 blocked_rdev = NULL; 3401 } 3402 3403 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3404 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3405 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3406 } 3407 3408 pr_debug("locked=%d uptodate=%d to_read=%d" 3409 " to_write=%d failed=%d failed_num=%d,%d\n", 3410 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3411 r6s.failed_num[0], r6s.failed_num[1]); 3412 /* check if the array has lost >2 devices and, if so, some requests 3413 * might need to be failed 3414 */ 3415 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3416 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3417 if (s.failed > 2 && s.syncing) { 3418 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3419 clear_bit(STRIPE_SYNCING, &sh->state); 3420 s.syncing = 0; 3421 } 3422 3423 /* 3424 * might be able to return some write requests if the parity blocks 3425 * are safe, or on a failed drive 3426 */ 3427 pdev = &sh->dev[pd_idx]; 3428 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3429 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3430 qdev = &sh->dev[qd_idx]; 3431 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) 3432 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); 3433 3434 if ( s.written && 3435 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3436 && !test_bit(R5_LOCKED, &pdev->flags) 3437 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3438 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3439 && !test_bit(R5_LOCKED, &qdev->flags) 3440 && test_bit(R5_UPTODATE, &qdev->flags))))) 3441 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3442 3443 /* Now we might consider reading some blocks, either to check/generate 3444 * parity, or to satisfy requests 3445 * or to load a block that is being partially written. 3446 */ 3447 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3448 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3449 handle_stripe_fill6(sh, &s, &r6s, disks); 3450 3451 /* Now we check to see if any write operations have recently 3452 * completed 3453 */ 3454 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3455 3456 sh->reconstruct_state = reconstruct_state_idle; 3457 /* All the 'written' buffers and the parity blocks are ready to 3458 * be written back to disk 3459 */ 3460 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3461 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); 3462 for (i = disks; i--; ) { 3463 dev = &sh->dev[i]; 3464 if (test_bit(R5_LOCKED, &dev->flags) && 3465 (i == sh->pd_idx || i == qd_idx || 3466 dev->written)) { 3467 pr_debug("Writing block %d\n", i); 3468 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3469 set_bit(R5_Wantwrite, &dev->flags); 3470 if (!test_bit(R5_Insync, &dev->flags) || 3471 ((i == sh->pd_idx || i == qd_idx) && 3472 s.failed == 0)) 3473 set_bit(STRIPE_INSYNC, &sh->state); 3474 } 3475 } 3476 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3477 dec_preread_active = 1; 3478 } 3479 3480 /* Now to consider new write requests and what else, if anything 3481 * should be read. We do not handle new writes when: 3482 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. 3483 * 2/ A 'check' operation is in flight, as it may clobber the parity 3484 * block. 
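 * (the same rule as in handle_stripe5() above, with syndrome generation taking the place of the single parity xor).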
3485 */ 3486 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3487 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3488 3489 /* maybe we need to check and possibly fix the parity for this stripe 3490 * Any reads will already have been scheduled, so we just see if enough 3491 * data is available. The parity check is held off while parity 3492 * dependent operations are in flight. 3493 */ 3494 if (sh->check_state || 3495 (s.syncing && s.locked == 0 && 3496 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3497 !test_bit(STRIPE_INSYNC, &sh->state))) 3498 handle_parity_checks6(conf, sh, &s, &r6s, disks); 3499 3500 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3501 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3502 clear_bit(STRIPE_SYNCING, &sh->state); 3503 } 3504 3505 /* If the failed drives are just a ReadError, then we might need 3506 * to progress the repair/check process 3507 */ 3508 if (s.failed <= 2 && !conf->mddev->ro) 3509 for (i = 0; i < s.failed; i++) { 3510 dev = &sh->dev[r6s.failed_num[i]]; 3511 if (test_bit(R5_ReadError, &dev->flags) 3512 && !test_bit(R5_LOCKED, &dev->flags) 3513 && test_bit(R5_UPTODATE, &dev->flags) 3514 ) { 3515 if (!test_bit(R5_ReWrite, &dev->flags)) { 3516 set_bit(R5_Wantwrite, &dev->flags); 3517 set_bit(R5_ReWrite, &dev->flags); 3518 set_bit(R5_LOCKED, &dev->flags); 3519 s.locked++; 3520 } else { 3521 /* let's read it back */ 3522 set_bit(R5_Wantread, &dev->flags); 3523 set_bit(R5_LOCKED, &dev->flags); 3524 s.locked++; 3525 } 3526 } 3527 } 3528 3529 /* Finish reconstruct operations initiated by the expansion process */ 3530 if (sh->reconstruct_state == reconstruct_state_result) { 3531 sh->reconstruct_state = reconstruct_state_idle; 3532 clear_bit(STRIPE_EXPANDING, &sh->state); 3533 for (i = conf->raid_disks; i--; ) { 3534 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3535 set_bit(R5_LOCKED, &sh->dev[i].flags); 3536 s.locked++; 3537 } 3538 } 3539 3540 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3541 !sh->reconstruct_state) { 3542 struct stripe_head *sh2 3543 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3544 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3545 /* sh cannot be written until sh2 has been read. 
3546 * so arrange for sh to be delayed a little 3547 */ 3548 set_bit(STRIPE_DELAYED, &sh->state); 3549 set_bit(STRIPE_HANDLE, &sh->state); 3550 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3551 &sh2->state)) 3552 atomic_inc(&conf->preread_active_stripes); 3553 release_stripe(sh2); 3554 goto unlock; 3555 } 3556 if (sh2) 3557 release_stripe(sh2); 3558 3559 /* Need to write out all blocks after computing P&Q */ 3560 sh->disks = conf->raid_disks; 3561 stripe_set_idx(sh->sector, conf, 0, sh); 3562 schedule_reconstruction(sh, &s, 1, 1); 3563 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3564 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3565 atomic_dec(&conf->reshape_stripes); 3566 wake_up(&conf->wait_for_overlap); 3567 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3568 } 3569 3570 if (s.expanding && s.locked == 0 && 3571 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3572 handle_stripe_expansion(conf, sh, &r6s); 3573 3574 unlock: 3575 spin_unlock(&sh->lock); 3576 3577 /* wait for this device to become unblocked */ 3578 if (unlikely(blocked_rdev)) 3579 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3580 3581 if (s.ops_request) 3582 raid_run_ops(sh, s.ops_request); 3583 3584 ops_run_io(sh, &s); 3585 3586 3587 if (dec_preread_active) { 3588 /* We delay this until after ops_run_io so that if make_request 3589 * is waiting on a flush, it won't continue until the writes 3590 * have actually been submitted. 3591 */ 3592 atomic_dec(&conf->preread_active_stripes); 3593 if (atomic_read(&conf->preread_active_stripes) < 3594 IO_THRESHOLD) 3595 md_wakeup_thread(conf->mddev->thread); 3596 } 3597 3598 return_io(return_bi); 3599 } 3600 3601 static void handle_stripe(struct stripe_head *sh) 3602 { 3603 if (sh->raid_conf->level == 6) 3604 handle_stripe6(sh); 3605 else 3606 handle_stripe5(sh); 3607 } 3608 3609 static void raid5_activate_delayed(raid5_conf_t *conf) 3610 { 3611 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3612 while (!list_empty(&conf->delayed_list)) { 3613 struct list_head *l = conf->delayed_list.next; 3614 struct stripe_head *sh; 3615 sh = list_entry(l, struct stripe_head, lru); 3616 list_del_init(l); 3617 clear_bit(STRIPE_DELAYED, &sh->state); 3618 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3619 atomic_inc(&conf->preread_active_stripes); 3620 list_add_tail(&sh->lru, &conf->hold_list); 3621 } 3622 } 3623 } 3624 3625 static void activate_bit_delay(raid5_conf_t *conf) 3626 { 3627 /* device_lock is held */ 3628 struct list_head head; 3629 list_add(&head, &conf->bitmap_list); 3630 list_del_init(&conf->bitmap_list); 3631 while (!list_empty(&head)) { 3632 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3633 list_del_init(&sh->lru); 3634 atomic_inc(&sh->count); 3635 __release_stripe(conf, sh); 3636 } 3637 } 3638 3639 int md_raid5_congested(mddev_t *mddev, int bits) 3640 { 3641 raid5_conf_t *conf = mddev->private; 3642 3643 /* No difference between reads and writes. 
Just check 3644 * how busy the stripe_cache is 3645 */ 3646 3647 if (conf->inactive_blocked) 3648 return 1; 3649 if (conf->quiesce) 3650 return 1; 3651 if (list_empty_careful(&conf->inactive_list)) 3652 return 1; 3653 3654 return 0; 3655 } 3656 EXPORT_SYMBOL_GPL(md_raid5_congested); 3657 3658 static int raid5_congested(void *data, int bits) 3659 { 3660 mddev_t *mddev = data; 3661 3662 return mddev_congested(mddev, bits) || 3663 md_raid5_congested(mddev, bits); 3664 } 3665 3666 /* We want read requests to align with chunks where possible, 3667 * but write requests don't need to. 3668 */ 3669 static int raid5_mergeable_bvec(struct request_queue *q, 3670 struct bvec_merge_data *bvm, 3671 struct bio_vec *biovec) 3672 { 3673 mddev_t *mddev = q->queuedata; 3674 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3675 int max; 3676 unsigned int chunk_sectors = mddev->chunk_sectors; 3677 unsigned int bio_sectors = bvm->bi_size >> 9; 3678 3679 if ((bvm->bi_rw & 1) == WRITE) 3680 return biovec->bv_len; /* always allow writes to be mergeable */ 3681 3682 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3683 chunk_sectors = mddev->new_chunk_sectors; 3684 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3685 if (max < 0) max = 0; 3686 if (max <= biovec->bv_len && bio_sectors == 0) 3687 return biovec->bv_len; 3688 else 3689 return max; 3690 } 3691 3692 3693 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) 3694 { 3695 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3696 unsigned int chunk_sectors = mddev->chunk_sectors; 3697 unsigned int bio_sectors = bio->bi_size >> 9; 3698 3699 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3700 chunk_sectors = mddev->new_chunk_sectors; 3701 return chunk_sectors >= 3702 ((sector & (chunk_sectors - 1)) + bio_sectors); 3703 } 3704 3705 /* 3706 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3707 * later sampled by raid5d. 3708 */ 3709 static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) 3710 { 3711 unsigned long flags; 3712 3713 spin_lock_irqsave(&conf->device_lock, flags); 3714 3715 bi->bi_next = conf->retry_read_aligned_list; 3716 conf->retry_read_aligned_list = bi; 3717 3718 spin_unlock_irqrestore(&conf->device_lock, flags); 3719 md_wakeup_thread(conf->mddev->thread); 3720 } 3721 3722 3723 static struct bio *remove_bio_from_retry(raid5_conf_t *conf) 3724 { 3725 struct bio *bi; 3726 3727 bi = conf->retry_read_aligned; 3728 if (bi) { 3729 conf->retry_read_aligned = NULL; 3730 return bi; 3731 } 3732 bi = conf->retry_read_aligned_list; 3733 if(bi) { 3734 conf->retry_read_aligned_list = bi->bi_next; 3735 bi->bi_next = NULL; 3736 /* 3737 * this sets the active stripe count to 1 and the processed 3738 * stripe count to zero (upper 16 bits) 3739 */ 3740 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3741 } 3742 3743 return bi; 3744 } 3745 3746 3747 /* 3748 * The "raid5_align_endio" should check if the read succeeded and if it 3749 * did, call bio_endio on the original bio (having bio_put the new bio 3750 * first). 3751 * If the read failed,
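 * the original bio is handed to add_bio_to_retry() below and raid5d is woken up to retry the read.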
3752 */ 3753 static void raid5_align_endio(struct bio *bi, int error) 3754 { 3755 struct bio* raid_bi = bi->bi_private; 3756 mddev_t *mddev; 3757 raid5_conf_t *conf; 3758 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3759 mdk_rdev_t *rdev; 3760 3761 bio_put(bi); 3762 3763 rdev = (void*)raid_bi->bi_next; 3764 raid_bi->bi_next = NULL; 3765 mddev = rdev->mddev; 3766 conf = mddev->private; 3767 3768 rdev_dec_pending(rdev, conf->mddev); 3769 3770 if (!error && uptodate) { 3771 bio_endio(raid_bi, 0); 3772 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3773 wake_up(&conf->wait_for_stripe); 3774 return; 3775 } 3776 3777 3778 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3779 3780 add_bio_to_retry(raid_bi, conf); 3781 } 3782 3783 static int bio_fits_rdev(struct bio *bi) 3784 { 3785 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3786 3787 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3788 return 0; 3789 blk_recount_segments(q, bi); 3790 if (bi->bi_phys_segments > queue_max_segments(q)) 3791 return 0; 3792 3793 if (q->merge_bvec_fn) 3794 /* it's too hard to apply the merge_bvec_fn at this stage, 3795 * just just give up 3796 */ 3797 return 0; 3798 3799 return 1; 3800 } 3801 3802 3803 static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) 3804 { 3805 raid5_conf_t *conf = mddev->private; 3806 int dd_idx; 3807 struct bio* align_bi; 3808 mdk_rdev_t *rdev; 3809 3810 if (!in_chunk_boundary(mddev, raid_bio)) { 3811 pr_debug("chunk_aligned_read : non aligned\n"); 3812 return 0; 3813 } 3814 /* 3815 * use bio_clone_mddev to make a copy of the bio 3816 */ 3817 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3818 if (!align_bi) 3819 return 0; 3820 /* 3821 * set bi_end_io to a new function, and set bi_private to the 3822 * original bio. 3823 */ 3824 align_bi->bi_end_io = raid5_align_endio; 3825 align_bi->bi_private = raid_bio; 3826 /* 3827 * compute position 3828 */ 3829 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3830 0, 3831 &dd_idx, NULL); 3832 3833 rcu_read_lock(); 3834 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3835 if (rdev && test_bit(In_sync, &rdev->flags)) { 3836 atomic_inc(&rdev->nr_pending); 3837 rcu_read_unlock(); 3838 raid_bio->bi_next = (void*)rdev; 3839 align_bi->bi_bdev = rdev->bdev; 3840 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3841 align_bi->bi_sector += rdev->data_offset; 3842 3843 if (!bio_fits_rdev(align_bi)) { 3844 /* too big in some way */ 3845 bio_put(align_bi); 3846 rdev_dec_pending(rdev, mddev); 3847 return 0; 3848 } 3849 3850 spin_lock_irq(&conf->device_lock); 3851 wait_event_lock_irq(conf->wait_for_stripe, 3852 conf->quiesce == 0, 3853 conf->device_lock, /* nothing */); 3854 atomic_inc(&conf->active_aligned_reads); 3855 spin_unlock_irq(&conf->device_lock); 3856 3857 generic_make_request(align_bi); 3858 return 1; 3859 } else { 3860 rcu_read_unlock(); 3861 bio_put(align_bi); 3862 return 0; 3863 } 3864 } 3865 3866 /* __get_priority_stripe - get the next stripe to process 3867 * 3868 * Full stripe writes are allowed to pass preread active stripes up until 3869 * the bypass_threshold is exceeded. In general the bypass_count 3870 * increments when the handle_list is handled before the hold_list; however, it 3871 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3872 * stripe with in flight i/o. The bypass_count will be reset when the 3873 * head of the hold_list has changed, i.e. the head was promoted to the 3874 * handle_list. 
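 *
 * A rough sketch of the accounting, assuming bypass_threshold == 1 (made
 * up, not from a real trace): each time handle_list is serviced while the
 * head of hold_list stays the same and the chosen stripe has no i/o in
 * flight, bypass_count is incremented; once it exceeds 1, a pass that
 * finds handle_list empty may hand out the head of hold_list even though
 * full stripe writes are still pending, and bypass_count is then reduced
 * by the threshold.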
3875 */ 3876 static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) 3877 { 3878 struct stripe_head *sh; 3879 3880 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3881 __func__, 3882 list_empty(&conf->handle_list) ? "empty" : "busy", 3883 list_empty(&conf->hold_list) ? "empty" : "busy", 3884 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3885 3886 if (!list_empty(&conf->handle_list)) { 3887 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3888 3889 if (list_empty(&conf->hold_list)) 3890 conf->bypass_count = 0; 3891 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3892 if (conf->hold_list.next == conf->last_hold) 3893 conf->bypass_count++; 3894 else { 3895 conf->last_hold = conf->hold_list.next; 3896 conf->bypass_count -= conf->bypass_threshold; 3897 if (conf->bypass_count < 0) 3898 conf->bypass_count = 0; 3899 } 3900 } 3901 } else if (!list_empty(&conf->hold_list) && 3902 ((conf->bypass_threshold && 3903 conf->bypass_count > conf->bypass_threshold) || 3904 atomic_read(&conf->pending_full_writes) == 0)) { 3905 sh = list_entry(conf->hold_list.next, 3906 typeof(*sh), lru); 3907 conf->bypass_count -= conf->bypass_threshold; 3908 if (conf->bypass_count < 0) 3909 conf->bypass_count = 0; 3910 } else 3911 return NULL; 3912 3913 list_del_init(&sh->lru); 3914 atomic_inc(&sh->count); 3915 BUG_ON(atomic_read(&sh->count) != 1); 3916 return sh; 3917 } 3918 3919 static int make_request(mddev_t *mddev, struct bio * bi) 3920 { 3921 raid5_conf_t *conf = mddev->private; 3922 int dd_idx; 3923 sector_t new_sector; 3924 sector_t logical_sector, last_sector; 3925 struct stripe_head *sh; 3926 const int rw = bio_data_dir(bi); 3927 int remaining; 3928 int plugged; 3929 3930 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 3931 md_flush_request(mddev, bi); 3932 return 0; 3933 } 3934 3935 md_write_start(mddev, bi); 3936 3937 if (rw == READ && 3938 mddev->reshape_position == MaxSector && 3939 chunk_aligned_read(mddev,bi)) 3940 return 0; 3941 3942 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3943 last_sector = bi->bi_sector + (bi->bi_size>>9); 3944 bi->bi_next = NULL; 3945 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3946 3947 plugged = mddev_check_plugged(mddev); 3948 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3949 DEFINE_WAIT(w); 3950 int disks, data_disks; 3951 int previous; 3952 3953 retry: 3954 previous = 0; 3955 disks = conf->raid_disks; 3956 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3957 if (unlikely(conf->reshape_progress != MaxSector)) { 3958 /* spinlock is needed as reshape_progress may be 3959 * 64bit on a 32bit platform, and so it might be 3960 * possible to see a half-updated value 3961 * Of course reshape_progress could change after 3962 * the lock is dropped, so once we get a reference 3963 * to the stripe that we think it is, we will have 3964 * to check again. 3965 */ 3966 spin_lock_irq(&conf->device_lock); 3967 if (mddev->delta_disks < 0 3968 ? logical_sector < conf->reshape_progress 3969 : logical_sector >= conf->reshape_progress) { 3970 disks = conf->previous_raid_disks; 3971 previous = 1; 3972 } else { 3973 if (mddev->delta_disks < 0 3974 ? 
logical_sector < conf->reshape_safe 3975 : logical_sector >= conf->reshape_safe) { 3976 spin_unlock_irq(&conf->device_lock); 3977 schedule(); 3978 goto retry; 3979 } 3980 } 3981 spin_unlock_irq(&conf->device_lock); 3982 } 3983 data_disks = disks - conf->max_degraded; 3984 3985 new_sector = raid5_compute_sector(conf, logical_sector, 3986 previous, 3987 &dd_idx, NULL); 3988 pr_debug("raid456: make_request, sector %llu logical %llu\n", 3989 (unsigned long long)new_sector, 3990 (unsigned long long)logical_sector); 3991 3992 sh = get_active_stripe(conf, new_sector, previous, 3993 (bi->bi_rw&RWA_MASK), 0); 3994 if (sh) { 3995 if (unlikely(previous)) { 3996 /* expansion might have moved on while waiting for a 3997 * stripe, so we must do the range check again. 3998 * Expansion could still move past after this 3999 * test, but as we are holding a reference to 4000 * 'sh', we know that if that happens, 4001 * STRIPE_EXPANDING will get set and the expansion 4002 * won't proceed until we finish with the stripe. 4003 */ 4004 int must_retry = 0; 4005 spin_lock_irq(&conf->device_lock); 4006 if (mddev->delta_disks < 0 4007 ? logical_sector >= conf->reshape_progress 4008 : logical_sector < conf->reshape_progress) 4009 /* mismatch, need to try again */ 4010 must_retry = 1; 4011 spin_unlock_irq(&conf->device_lock); 4012 if (must_retry) { 4013 release_stripe(sh); 4014 schedule(); 4015 goto retry; 4016 } 4017 } 4018 4019 if (bio_data_dir(bi) == WRITE && 4020 logical_sector >= mddev->suspend_lo && 4021 logical_sector < mddev->suspend_hi) { 4022 release_stripe(sh); 4023 /* As the suspend_* range is controlled by 4024 * userspace, we want an interruptible 4025 * wait. 4026 */ 4027 flush_signals(current); 4028 prepare_to_wait(&conf->wait_for_overlap, 4029 &w, TASK_INTERRUPTIBLE); 4030 if (logical_sector >= mddev->suspend_lo && 4031 logical_sector < mddev->suspend_hi) 4032 schedule(); 4033 goto retry; 4034 } 4035 4036 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4037 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 4038 /* Stripe is busy expanding or 4039 * add failed due to overlap. Flush everything 4040 * and wait a while 4041 */ 4042 md_wakeup_thread(mddev->thread); 4043 release_stripe(sh); 4044 schedule(); 4045 goto retry; 4046 } 4047 finish_wait(&conf->wait_for_overlap, &w); 4048 set_bit(STRIPE_HANDLE, &sh->state); 4049 clear_bit(STRIPE_DELAYED, &sh->state); 4050 if ((bi->bi_rw & REQ_SYNC) && 4051 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4052 atomic_inc(&conf->preread_active_stripes); 4053 release_stripe(sh); 4054 } else { 4055 /* cannot get stripe for read-ahead, just give-up */ 4056 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4057 finish_wait(&conf->wait_for_overlap, &w); 4058 break; 4059 } 4060 4061 } 4062 if (!plugged) 4063 md_wakeup_thread(mddev->thread); 4064 4065 spin_lock_irq(&conf->device_lock); 4066 remaining = raid5_dec_bi_phys_segments(bi); 4067 spin_unlock_irq(&conf->device_lock); 4068 if (remaining == 0) { 4069 4070 if ( rw == WRITE ) 4071 md_write_end(mddev); 4072 4073 bio_endio(bi, 0); 4074 } 4075 4076 return 0; 4077 } 4078 4079 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); 4080 4081 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) 4082 { 4083 /* reshaping is quite different to recovery/resync so it is 4084 * handled quite separately ... here. 4085 * 4086 * On each call to sync_request, we gather one chunk worth of 4087 * destination stripes and flag them as expanding. 
4088 * Then we find all the source stripes and request reads. 4089 * As the reads complete, handle_stripe will copy the data 4090 * into the destination stripe and release that stripe. 4091 */ 4092 raid5_conf_t *conf = mddev->private; 4093 struct stripe_head *sh; 4094 sector_t first_sector, last_sector; 4095 int raid_disks = conf->previous_raid_disks; 4096 int data_disks = raid_disks - conf->max_degraded; 4097 int new_data_disks = conf->raid_disks - conf->max_degraded; 4098 int i; 4099 int dd_idx; 4100 sector_t writepos, readpos, safepos; 4101 sector_t stripe_addr; 4102 int reshape_sectors; 4103 struct list_head stripes; 4104 4105 if (sector_nr == 0) { 4106 /* If restarting in the middle, skip the initial sectors */ 4107 if (mddev->delta_disks < 0 && 4108 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4109 sector_nr = raid5_size(mddev, 0, 0) 4110 - conf->reshape_progress; 4111 } else if (mddev->delta_disks >= 0 && 4112 conf->reshape_progress > 0) 4113 sector_nr = conf->reshape_progress; 4114 sector_div(sector_nr, new_data_disks); 4115 if (sector_nr) { 4116 mddev->curr_resync_completed = sector_nr; 4117 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4118 *skipped = 1; 4119 return sector_nr; 4120 } 4121 } 4122 4123 /* We need to process a full chunk at a time. 4124 * If old and new chunk sizes differ, we need to process the 4125 * largest of these 4126 */ 4127 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4128 reshape_sectors = mddev->new_chunk_sectors; 4129 else 4130 reshape_sectors = mddev->chunk_sectors; 4131 4132 /* we update the metadata when there is more than 3Meg 4133 * in the block range (that is rather arbitrary, should 4134 * probably be time based) or when the data about to be 4135 * copied would over-write the source of the data at 4136 * the front of the range. 4137 * i.e. one new_stripe along from reshape_progress new_maps 4138 * to after where reshape_safe old_maps to 4139 */ 4140 writepos = conf->reshape_progress; 4141 sector_div(writepos, new_data_disks); 4142 readpos = conf->reshape_progress; 4143 sector_div(readpos, data_disks); 4144 safepos = conf->reshape_safe; 4145 sector_div(safepos, data_disks); 4146 if (mddev->delta_disks < 0) { 4147 writepos -= min_t(sector_t, reshape_sectors, writepos); 4148 readpos += reshape_sectors; 4149 safepos += reshape_sectors; 4150 } else { 4151 writepos += reshape_sectors; 4152 readpos -= min_t(sector_t, reshape_sectors, readpos); 4153 safepos -= min_t(sector_t, reshape_sectors, safepos); 4154 } 4155 4156 /* 'writepos' is the most advanced device address we might write. 4157 * 'readpos' is the least advanced device address we might read. 4158 * 'safepos' is the least address recorded in the metadata as having 4159 * been reshaped. 4160 * If 'readpos' is behind 'writepos', then there is no way that we can 4161 * ensure safety in the face of a crash - that must be done by userspace 4162 * making a backup of the data. So in that case there is no particular 4163 * rush to update metadata. 4164 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4165 * update the metadata to advance 'safepos' to match 'readpos' so that 4166 * we can be safe in the event of a crash. 4167 * So we insist on updating metadata if safepos is behind writepos and 4168 * readpos is beyond writepos. 4169 * In any case, update the metadata every 10 seconds. 4170 * Maybe that number should be configurable, but I'm not sure it is 4171 * worth it.... maybe it could be a multiple of safemode_delay??? 
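 *
 * A worked example with made-up numbers (growing a 4-device raid5 to 5
 * devices, 128-sector chunks, reshape_progress == reshape_safe == 7680):
 * writepos = 7680/4 + 128 = 2048, readpos = 7680/3 - 128 = 2432 and
 * safepos = 2432, so no metadata update is forced at this point
 * (safepos is not behind writepos) and only the 10 second timer would
 * trigger one.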
4172 */ 4173 if ((mddev->delta_disks < 0 4174 ? (safepos > writepos && readpos < writepos) 4175 : (safepos < writepos && readpos > writepos)) || 4176 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4177 /* Cannot proceed until we've updated the superblock... */ 4178 wait_event(conf->wait_for_overlap, 4179 atomic_read(&conf->reshape_stripes)==0); 4180 mddev->reshape_position = conf->reshape_progress; 4181 mddev->curr_resync_completed = sector_nr; 4182 conf->reshape_checkpoint = jiffies; 4183 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4184 md_wakeup_thread(mddev->thread); 4185 wait_event(mddev->sb_wait, mddev->flags == 0 || 4186 kthread_should_stop()); 4187 spin_lock_irq(&conf->device_lock); 4188 conf->reshape_safe = mddev->reshape_position; 4189 spin_unlock_irq(&conf->device_lock); 4190 wake_up(&conf->wait_for_overlap); 4191 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4192 } 4193 4194 if (mddev->delta_disks < 0) { 4195 BUG_ON(conf->reshape_progress == 0); 4196 stripe_addr = writepos; 4197 BUG_ON((mddev->dev_sectors & 4198 ~((sector_t)reshape_sectors - 1)) 4199 - reshape_sectors - stripe_addr 4200 != sector_nr); 4201 } else { 4202 BUG_ON(writepos != sector_nr + reshape_sectors); 4203 stripe_addr = sector_nr; 4204 } 4205 INIT_LIST_HEAD(&stripes); 4206 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4207 int j; 4208 int skipped_disk = 0; 4209 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4210 set_bit(STRIPE_EXPANDING, &sh->state); 4211 atomic_inc(&conf->reshape_stripes); 4212 /* If any of this stripe is beyond the end of the old 4213 * array, then we need to zero those blocks 4214 */ 4215 for (j=sh->disks; j--;) { 4216 sector_t s; 4217 if (j == sh->pd_idx) 4218 continue; 4219 if (conf->level == 6 && 4220 j == sh->qd_idx) 4221 continue; 4222 s = compute_blocknr(sh, j, 0); 4223 if (s < raid5_size(mddev, 0, 0)) { 4224 skipped_disk = 1; 4225 continue; 4226 } 4227 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4228 set_bit(R5_Expanded, &sh->dev[j].flags); 4229 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4230 } 4231 if (!skipped_disk) { 4232 set_bit(STRIPE_EXPAND_READY, &sh->state); 4233 set_bit(STRIPE_HANDLE, &sh->state); 4234 } 4235 list_add(&sh->lru, &stripes); 4236 } 4237 spin_lock_irq(&conf->device_lock); 4238 if (mddev->delta_disks < 0) 4239 conf->reshape_progress -= reshape_sectors * new_data_disks; 4240 else 4241 conf->reshape_progress += reshape_sectors * new_data_disks; 4242 spin_unlock_irq(&conf->device_lock); 4243 /* Ok, those stripe are ready. We can start scheduling 4244 * reads on the source stripes. 4245 * The source stripes are determined by mapping the first and last 4246 * block on the destination stripes. 
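 * (raid5_compute_sector() is called below with previous == 1, so the
 * array addresses spanned by the new destination stripes are mapped
 * through the old geometry to find the source stripes to read.)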
4247 */ 4248 first_sector = 4249 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4250 1, &dd_idx, NULL); 4251 last_sector = 4252 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4253 * new_data_disks - 1), 4254 1, &dd_idx, NULL); 4255 if (last_sector >= mddev->dev_sectors) 4256 last_sector = mddev->dev_sectors - 1; 4257 while (first_sector <= last_sector) { 4258 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4259 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4260 set_bit(STRIPE_HANDLE, &sh->state); 4261 release_stripe(sh); 4262 first_sector += STRIPE_SECTORS; 4263 } 4264 /* Now that the sources are clearly marked, we can release 4265 * the destination stripes 4266 */ 4267 while (!list_empty(&stripes)) { 4268 sh = list_entry(stripes.next, struct stripe_head, lru); 4269 list_del_init(&sh->lru); 4270 release_stripe(sh); 4271 } 4272 /* If this takes us to the resync_max point where we have to pause, 4273 * then we need to write out the superblock. 4274 */ 4275 sector_nr += reshape_sectors; 4276 if ((sector_nr - mddev->curr_resync_completed) * 2 4277 >= mddev->resync_max - mddev->curr_resync_completed) { 4278 /* Cannot proceed until we've updated the superblock... */ 4279 wait_event(conf->wait_for_overlap, 4280 atomic_read(&conf->reshape_stripes) == 0); 4281 mddev->reshape_position = conf->reshape_progress; 4282 mddev->curr_resync_completed = sector_nr; 4283 conf->reshape_checkpoint = jiffies; 4284 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4285 md_wakeup_thread(mddev->thread); 4286 wait_event(mddev->sb_wait, 4287 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4288 || kthread_should_stop()); 4289 spin_lock_irq(&conf->device_lock); 4290 conf->reshape_safe = mddev->reshape_position; 4291 spin_unlock_irq(&conf->device_lock); 4292 wake_up(&conf->wait_for_overlap); 4293 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4294 } 4295 return reshape_sectors; 4296 } 4297 4298 /* FIXME go_faster isn't used */ 4299 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4300 { 4301 raid5_conf_t *conf = mddev->private; 4302 struct stripe_head *sh; 4303 sector_t max_sector = mddev->dev_sectors; 4304 sector_t sync_blocks; 4305 int still_degraded = 0; 4306 int i; 4307 4308 if (sector_nr >= max_sector) { 4309 /* just being told to finish up .. nothing much to do */ 4310 4311 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4312 end_reshape(conf); 4313 return 0; 4314 } 4315 4316 if (mddev->curr_resync < max_sector) /* aborted */ 4317 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4318 &sync_blocks, 1); 4319 else /* completed sync */ 4320 conf->fullsync = 0; 4321 bitmap_close_sync(mddev->bitmap); 4322 4323 return 0; 4324 } 4325 4326 /* Allow raid5_quiesce to complete */ 4327 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4328 4329 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4330 return reshape_request(mddev, sector_nr, skipped); 4331 4332 /* No need to check resync_max as we never do more than one 4333 * stripe, and as resync_max will always be on a chunk boundary, 4334 * if the check in md_do_sync didn't fire, there is no chance 4335 * of overstepping resync_max here 4336 */ 4337 4338 /* if there is too many failed drives and we are trying 4339 * to resync, then assert that we are finished, because there is 4340 * nothing we can do. 
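 * (we report the whole remaining range as done by setting *skipped,
 * which lets md_do_sync wind the sync position forward and finish)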
4341 */ 4342 if (mddev->degraded >= conf->max_degraded && 4343 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4344 sector_t rv = mddev->dev_sectors - sector_nr; 4345 *skipped = 1; 4346 return rv; 4347 } 4348 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4349 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4350 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4351 /* we can skip this block, and probably more */ 4352 sync_blocks /= STRIPE_SECTORS; 4353 *skipped = 1; 4354 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4355 } 4356 4357 4358 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4359 4360 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4361 if (sh == NULL) { 4362 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4363 /* make sure we don't swamp the stripe cache if someone else 4364 * is trying to get access 4365 */ 4366 schedule_timeout_uninterruptible(1); 4367 } 4368 /* Need to check if array will still be degraded after recovery/resync 4369 * We don't need to check the 'failed' flag as when that gets set, 4370 * recovery aborts. 4371 */ 4372 for (i = 0; i < conf->raid_disks; i++) 4373 if (conf->disks[i].rdev == NULL) 4374 still_degraded = 1; 4375 4376 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4377 4378 spin_lock(&sh->lock); 4379 set_bit(STRIPE_SYNCING, &sh->state); 4380 clear_bit(STRIPE_INSYNC, &sh->state); 4381 spin_unlock(&sh->lock); 4382 4383 handle_stripe(sh); 4384 release_stripe(sh); 4385 4386 return STRIPE_SECTORS; 4387 } 4388 4389 static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) 4390 { 4391 /* We may not be able to submit a whole bio at once as there 4392 * may not be enough stripe_heads available. 4393 * We cannot pre-allocate enough stripe_heads as we may need 4394 * more than exist in the cache (if we allow ever large chunks). 4395 * So we do one stripe head at a time and record in 4396 * ->bi_hw_segments how many have been done. 4397 * 4398 * We *know* that this entire raid_bio is in one chunk, so 4399 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
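 *
 * A sketch of the bookkeeping with made-up numbers (64k chunk, 4k
 * pages, so up to 16 stripe_heads per chunk-aligned read): if only the
 * first 10 stripe_heads can be obtained, raid5_set_bi_hw_segments() is
 * called with scnt == 10 and the bio is parked in ->retry_read_aligned,
 * so the next pass skips those 10 and carries on from the 11th.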
4400 */ 4401 struct stripe_head *sh; 4402 int dd_idx; 4403 sector_t sector, logical_sector, last_sector; 4404 int scnt = 0; 4405 int remaining; 4406 int handled = 0; 4407 4408 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4409 sector = raid5_compute_sector(conf, logical_sector, 4410 0, &dd_idx, NULL); 4411 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4412 4413 for (; logical_sector < last_sector; 4414 logical_sector += STRIPE_SECTORS, 4415 sector += STRIPE_SECTORS, 4416 scnt++) { 4417 4418 if (scnt < raid5_bi_hw_segments(raid_bio)) 4419 /* already done this stripe */ 4420 continue; 4421 4422 sh = get_active_stripe(conf, sector, 0, 1, 0); 4423 4424 if (!sh) { 4425 /* failed to get a stripe - must wait */ 4426 raid5_set_bi_hw_segments(raid_bio, scnt); 4427 conf->retry_read_aligned = raid_bio; 4428 return handled; 4429 } 4430 4431 set_bit(R5_ReadError, &sh->dev[dd_idx].flags); 4432 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4433 release_stripe(sh); 4434 raid5_set_bi_hw_segments(raid_bio, scnt); 4435 conf->retry_read_aligned = raid_bio; 4436 return handled; 4437 } 4438 4439 handle_stripe(sh); 4440 release_stripe(sh); 4441 handled++; 4442 } 4443 spin_lock_irq(&conf->device_lock); 4444 remaining = raid5_dec_bi_phys_segments(raid_bio); 4445 spin_unlock_irq(&conf->device_lock); 4446 if (remaining == 0) 4447 bio_endio(raid_bio, 0); 4448 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4449 wake_up(&conf->wait_for_stripe); 4450 return handled; 4451 } 4452 4453 4454 /* 4455 * This is our raid5 kernel thread. 4456 * 4457 * We scan the hash table for stripes which can be handled now. 4458 * During the scan, completed stripes are saved for us by the interrupt 4459 * handler, so that they will not have to wait for our next wakeup. 
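 *
 * Each pass of the loop below flushes any batched bitmap updates once
 * the array is no longer plugged, re-activates delayed stripes, retries
 * queued aligned reads, and then handles one priority stripe at a time,
 * dropping device_lock around handle_stripe().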
4460 */ 4461 static void raid5d(mddev_t *mddev) 4462 { 4463 struct stripe_head *sh; 4464 raid5_conf_t *conf = mddev->private; 4465 int handled; 4466 struct blk_plug plug; 4467 4468 pr_debug("+++ raid5d active\n"); 4469 4470 md_check_recovery(mddev); 4471 4472 blk_start_plug(&plug); 4473 handled = 0; 4474 spin_lock_irq(&conf->device_lock); 4475 while (1) { 4476 struct bio *bio; 4477 4478 if (atomic_read(&mddev->plug_cnt) == 0 && 4479 !list_empty(&conf->bitmap_list)) { 4480 /* Now is a good time to flush some bitmap updates */ 4481 conf->seq_flush++; 4482 spin_unlock_irq(&conf->device_lock); 4483 bitmap_unplug(mddev->bitmap); 4484 spin_lock_irq(&conf->device_lock); 4485 conf->seq_write = conf->seq_flush; 4486 activate_bit_delay(conf); 4487 } 4488 if (atomic_read(&mddev->plug_cnt) == 0) 4489 raid5_activate_delayed(conf); 4490 4491 while ((bio = remove_bio_from_retry(conf))) { 4492 int ok; 4493 spin_unlock_irq(&conf->device_lock); 4494 ok = retry_aligned_read(conf, bio); 4495 spin_lock_irq(&conf->device_lock); 4496 if (!ok) 4497 break; 4498 handled++; 4499 } 4500 4501 sh = __get_priority_stripe(conf); 4502 4503 if (!sh) 4504 break; 4505 spin_unlock_irq(&conf->device_lock); 4506 4507 handled++; 4508 handle_stripe(sh); 4509 release_stripe(sh); 4510 cond_resched(); 4511 4512 spin_lock_irq(&conf->device_lock); 4513 } 4514 pr_debug("%d stripes handled\n", handled); 4515 4516 spin_unlock_irq(&conf->device_lock); 4517 4518 async_tx_issue_pending_all(); 4519 blk_finish_plug(&plug); 4520 4521 pr_debug("--- raid5d inactive\n"); 4522 } 4523 4524 static ssize_t 4525 raid5_show_stripe_cache_size(mddev_t *mddev, char *page) 4526 { 4527 raid5_conf_t *conf = mddev->private; 4528 if (conf) 4529 return sprintf(page, "%d\n", conf->max_nr_stripes); 4530 else 4531 return 0; 4532 } 4533 4534 int 4535 raid5_set_cache_size(mddev_t *mddev, int size) 4536 { 4537 raid5_conf_t *conf = mddev->private; 4538 int err; 4539 4540 if (size <= 16 || size > 32768) 4541 return -EINVAL; 4542 while (size < conf->max_nr_stripes) { 4543 if (drop_one_stripe(conf)) 4544 conf->max_nr_stripes--; 4545 else 4546 break; 4547 } 4548 err = md_allow_write(mddev); 4549 if (err) 4550 return err; 4551 while (size > conf->max_nr_stripes) { 4552 if (grow_one_stripe(conf)) 4553 conf->max_nr_stripes++; 4554 else break; 4555 } 4556 return 0; 4557 } 4558 EXPORT_SYMBOL(raid5_set_cache_size); 4559 4560 static ssize_t 4561 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4562 { 4563 raid5_conf_t *conf = mddev->private; 4564 unsigned long new; 4565 int err; 4566 4567 if (len >= PAGE_SIZE) 4568 return -EINVAL; 4569 if (!conf) 4570 return -ENODEV; 4571 4572 if (strict_strtoul(page, 10, &new)) 4573 return -EINVAL; 4574 err = raid5_set_cache_size(mddev, new); 4575 if (err) 4576 return err; 4577 return len; 4578 } 4579 4580 static struct md_sysfs_entry 4581 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4582 raid5_show_stripe_cache_size, 4583 raid5_store_stripe_cache_size); 4584 4585 static ssize_t 4586 raid5_show_preread_threshold(mddev_t *mddev, char *page) 4587 { 4588 raid5_conf_t *conf = mddev->private; 4589 if (conf) 4590 return sprintf(page, "%d\n", conf->bypass_threshold); 4591 else 4592 return 0; 4593 } 4594 4595 static ssize_t 4596 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) 4597 { 4598 raid5_conf_t *conf = mddev->private; 4599 unsigned long new; 4600 if (len >= PAGE_SIZE) 4601 return -EINVAL; 4602 if (!conf) 4603 return -ENODEV; 4604 4605 if (strict_strtoul(page, 10, 
&new)) 4606 return -EINVAL; 4607 if (new > conf->max_nr_stripes) 4608 return -EINVAL; 4609 conf->bypass_threshold = new; 4610 return len; 4611 } 4612 4613 static struct md_sysfs_entry 4614 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4615 S_IRUGO | S_IWUSR, 4616 raid5_show_preread_threshold, 4617 raid5_store_preread_threshold); 4618 4619 static ssize_t 4620 stripe_cache_active_show(mddev_t *mddev, char *page) 4621 { 4622 raid5_conf_t *conf = mddev->private; 4623 if (conf) 4624 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4625 else 4626 return 0; 4627 } 4628 4629 static struct md_sysfs_entry 4630 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4631 4632 static struct attribute *raid5_attrs[] = { 4633 &raid5_stripecache_size.attr, 4634 &raid5_stripecache_active.attr, 4635 &raid5_preread_bypass_threshold.attr, 4636 NULL, 4637 }; 4638 static struct attribute_group raid5_attrs_group = { 4639 .name = NULL, 4640 .attrs = raid5_attrs, 4641 }; 4642 4643 static sector_t 4644 raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) 4645 { 4646 raid5_conf_t *conf = mddev->private; 4647 4648 if (!sectors) 4649 sectors = mddev->dev_sectors; 4650 if (!raid_disks) 4651 /* size is defined by the smallest of previous and new size */ 4652 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4653 4654 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4655 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4656 return sectors * (raid_disks - conf->max_degraded); 4657 } 4658 4659 static void raid5_free_percpu(raid5_conf_t *conf) 4660 { 4661 struct raid5_percpu *percpu; 4662 unsigned long cpu; 4663 4664 if (!conf->percpu) 4665 return; 4666 4667 get_online_cpus(); 4668 for_each_possible_cpu(cpu) { 4669 percpu = per_cpu_ptr(conf->percpu, cpu); 4670 safe_put_page(percpu->spare_page); 4671 kfree(percpu->scribble); 4672 } 4673 #ifdef CONFIG_HOTPLUG_CPU 4674 unregister_cpu_notifier(&conf->cpu_notify); 4675 #endif 4676 put_online_cpus(); 4677 4678 free_percpu(conf->percpu); 4679 } 4680 4681 static void free_conf(raid5_conf_t *conf) 4682 { 4683 shrink_stripes(conf); 4684 raid5_free_percpu(conf); 4685 kfree(conf->disks); 4686 kfree(conf->stripe_hashtbl); 4687 kfree(conf); 4688 } 4689 4690 #ifdef CONFIG_HOTPLUG_CPU 4691 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4692 void *hcpu) 4693 { 4694 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); 4695 long cpu = (long)hcpu; 4696 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4697 4698 switch (action) { 4699 case CPU_UP_PREPARE: 4700 case CPU_UP_PREPARE_FROZEN: 4701 if (conf->level == 6 && !percpu->spare_page) 4702 percpu->spare_page = alloc_page(GFP_KERNEL); 4703 if (!percpu->scribble) 4704 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4705 4706 if (!percpu->scribble || 4707 (conf->level == 6 && !percpu->spare_page)) { 4708 safe_put_page(percpu->spare_page); 4709 kfree(percpu->scribble); 4710 pr_err("%s: failed memory allocation for cpu%ld\n", 4711 __func__, cpu); 4712 return notifier_from_errno(-ENOMEM); 4713 } 4714 break; 4715 case CPU_DEAD: 4716 case CPU_DEAD_FROZEN: 4717 safe_put_page(percpu->spare_page); 4718 kfree(percpu->scribble); 4719 percpu->spare_page = NULL; 4720 percpu->scribble = NULL; 4721 break; 4722 default: 4723 break; 4724 } 4725 return NOTIFY_OK; 4726 } 4727 #endif 4728 4729 static int raid5_alloc_percpu(raid5_conf_t *conf) 4730 { 4731 unsigned long cpu; 4732 struct page *spare_page; 4733 struct raid5_percpu 
__percpu *allcpus; 4734 void *scribble; 4735 int err; 4736 4737 allcpus = alloc_percpu(struct raid5_percpu); 4738 if (!allcpus) 4739 return -ENOMEM; 4740 conf->percpu = allcpus; 4741 4742 get_online_cpus(); 4743 err = 0; 4744 for_each_present_cpu(cpu) { 4745 if (conf->level == 6) { 4746 spare_page = alloc_page(GFP_KERNEL); 4747 if (!spare_page) { 4748 err = -ENOMEM; 4749 break; 4750 } 4751 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4752 } 4753 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4754 if (!scribble) { 4755 err = -ENOMEM; 4756 break; 4757 } 4758 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4759 } 4760 #ifdef CONFIG_HOTPLUG_CPU 4761 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4762 conf->cpu_notify.priority = 0; 4763 if (err == 0) 4764 err = register_cpu_notifier(&conf->cpu_notify); 4765 #endif 4766 put_online_cpus(); 4767 4768 return err; 4769 } 4770 4771 static raid5_conf_t *setup_conf(mddev_t *mddev) 4772 { 4773 raid5_conf_t *conf; 4774 int raid_disk, memory, max_disks; 4775 mdk_rdev_t *rdev; 4776 struct disk_info *disk; 4777 4778 if (mddev->new_level != 5 4779 && mddev->new_level != 4 4780 && mddev->new_level != 6) { 4781 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4782 mdname(mddev), mddev->new_level); 4783 return ERR_PTR(-EIO); 4784 } 4785 if ((mddev->new_level == 5 4786 && !algorithm_valid_raid5(mddev->new_layout)) || 4787 (mddev->new_level == 6 4788 && !algorithm_valid_raid6(mddev->new_layout))) { 4789 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4790 mdname(mddev), mddev->new_layout); 4791 return ERR_PTR(-EIO); 4792 } 4793 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4794 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4795 mdname(mddev), mddev->raid_disks); 4796 return ERR_PTR(-EINVAL); 4797 } 4798 4799 if (!mddev->new_chunk_sectors || 4800 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4801 !is_power_of_2(mddev->new_chunk_sectors)) { 4802 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4803 mdname(mddev), mddev->new_chunk_sectors << 9); 4804 return ERR_PTR(-EINVAL); 4805 } 4806 4807 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); 4808 if (conf == NULL) 4809 goto abort; 4810 spin_lock_init(&conf->device_lock); 4811 init_waitqueue_head(&conf->wait_for_stripe); 4812 init_waitqueue_head(&conf->wait_for_overlap); 4813 INIT_LIST_HEAD(&conf->handle_list); 4814 INIT_LIST_HEAD(&conf->hold_list); 4815 INIT_LIST_HEAD(&conf->delayed_list); 4816 INIT_LIST_HEAD(&conf->bitmap_list); 4817 INIT_LIST_HEAD(&conf->inactive_list); 4818 atomic_set(&conf->active_stripes, 0); 4819 atomic_set(&conf->preread_active_stripes, 0); 4820 atomic_set(&conf->active_aligned_reads, 0); 4821 conf->bypass_threshold = BYPASS_THRESHOLD; 4822 4823 conf->raid_disks = mddev->raid_disks; 4824 if (mddev->reshape_position == MaxSector) 4825 conf->previous_raid_disks = mddev->raid_disks; 4826 else 4827 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4828 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 4829 conf->scribble_len = scribble_len(max_disks); 4830 4831 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 4832 GFP_KERNEL); 4833 if (!conf->disks) 4834 goto abort; 4835 4836 conf->mddev = mddev; 4837 4838 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4839 goto abort; 4840 4841 conf->level = mddev->new_level; 4842 if (raid5_alloc_percpu(conf) != 0) 4843 goto abort; 4844 4845 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4846 
4847 list_for_each_entry(rdev, &mddev->disks, same_set) { 4848 raid_disk = rdev->raid_disk; 4849 if (raid_disk >= max_disks 4850 || raid_disk < 0) 4851 continue; 4852 disk = conf->disks + raid_disk; 4853 4854 disk->rdev = rdev; 4855 4856 if (test_bit(In_sync, &rdev->flags)) { 4857 char b[BDEVNAME_SIZE]; 4858 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4859 " disk %d\n", 4860 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4861 } else if (rdev->saved_raid_disk != raid_disk) 4862 /* Cannot rely on bitmap to complete recovery */ 4863 conf->fullsync = 1; 4864 } 4865 4866 conf->chunk_sectors = mddev->new_chunk_sectors; 4867 conf->level = mddev->new_level; 4868 if (conf->level == 6) 4869 conf->max_degraded = 2; 4870 else 4871 conf->max_degraded = 1; 4872 conf->algorithm = mddev->new_layout; 4873 conf->max_nr_stripes = NR_STRIPES; 4874 conf->reshape_progress = mddev->reshape_position; 4875 if (conf->reshape_progress != MaxSector) { 4876 conf->prev_chunk_sectors = mddev->chunk_sectors; 4877 conf->prev_algo = mddev->layout; 4878 } 4879 4880 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4881 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4882 if (grow_stripes(conf, conf->max_nr_stripes)) { 4883 printk(KERN_ERR 4884 "md/raid:%s: couldn't allocate %dkB for buffers\n", 4885 mdname(mddev), memory); 4886 goto abort; 4887 } else 4888 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4889 mdname(mddev), memory); 4890 4891 conf->thread = md_register_thread(raid5d, mddev, NULL); 4892 if (!conf->thread) { 4893 printk(KERN_ERR 4894 "md/raid:%s: couldn't allocate thread.\n", 4895 mdname(mddev)); 4896 goto abort; 4897 } 4898 4899 return conf; 4900 4901 abort: 4902 if (conf) { 4903 free_conf(conf); 4904 return ERR_PTR(-EIO); 4905 } else 4906 return ERR_PTR(-ENOMEM); 4907 } 4908 4909 4910 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 4911 { 4912 switch (algo) { 4913 case ALGORITHM_PARITY_0: 4914 if (raid_disk < max_degraded) 4915 return 1; 4916 break; 4917 case ALGORITHM_PARITY_N: 4918 if (raid_disk >= raid_disks - max_degraded) 4919 return 1; 4920 break; 4921 case ALGORITHM_PARITY_0_6: 4922 if (raid_disk == 0 || 4923 raid_disk == raid_disks - 1) 4924 return 1; 4925 break; 4926 case ALGORITHM_LEFT_ASYMMETRIC_6: 4927 case ALGORITHM_RIGHT_ASYMMETRIC_6: 4928 case ALGORITHM_LEFT_SYMMETRIC_6: 4929 case ALGORITHM_RIGHT_SYMMETRIC_6: 4930 if (raid_disk == raid_disks - 1) 4931 return 1; 4932 } 4933 return 0; 4934 } 4935 4936 static int run(mddev_t *mddev) 4937 { 4938 raid5_conf_t *conf; 4939 int working_disks = 0; 4940 int dirty_parity_disks = 0; 4941 mdk_rdev_t *rdev; 4942 sector_t reshape_offset = 0; 4943 4944 if (mddev->recovery_cp != MaxSector) 4945 printk(KERN_NOTICE "md/raid:%s: not clean" 4946 " -- starting background reconstruction\n", 4947 mdname(mddev)); 4948 if (mddev->reshape_position != MaxSector) { 4949 /* Check that we can continue the reshape. 4950 * Currently only disks can change, it must 4951 * increase, and we must be past the point where 4952 * a stripe over-writes itself 4953 */ 4954 sector_t here_new, here_old; 4955 int old_disks; 4956 int max_degraded = (mddev->level == 6 ? 
2 : 1); 4957 4958 if (mddev->new_level != mddev->level) { 4959 printk(KERN_ERR "md/raid:%s: unsupported reshape " 4960 "required - aborting.\n", 4961 mdname(mddev)); 4962 return -EINVAL; 4963 } 4964 old_disks = mddev->raid_disks - mddev->delta_disks; 4965 /* reshape_position must be on a new-stripe boundary, and one 4966 * further up in new geometry must map after here in old 4967 * geometry. 4968 */ 4969 here_new = mddev->reshape_position; 4970 if (sector_div(here_new, mddev->new_chunk_sectors * 4971 (mddev->raid_disks - max_degraded))) { 4972 printk(KERN_ERR "md/raid:%s: reshape_position not " 4973 "on a stripe boundary\n", mdname(mddev)); 4974 return -EINVAL; 4975 } 4976 reshape_offset = here_new * mddev->new_chunk_sectors; 4977 /* here_new is the stripe we will write to */ 4978 here_old = mddev->reshape_position; 4979 sector_div(here_old, mddev->chunk_sectors * 4980 (old_disks-max_degraded)); 4981 /* here_old is the first stripe that we might need to read 4982 * from */ 4983 if (mddev->delta_disks == 0) { 4984 /* We cannot be sure it is safe to start an in-place 4985 * reshape. It is only safe if user-space if monitoring 4986 * and taking constant backups. 4987 * mdadm always starts a situation like this in 4988 * readonly mode so it can take control before 4989 * allowing any writes. So just check for that. 4990 */ 4991 if ((here_new * mddev->new_chunk_sectors != 4992 here_old * mddev->chunk_sectors) || 4993 mddev->ro == 0) { 4994 printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 4995 " in read-only mode - aborting\n", 4996 mdname(mddev)); 4997 return -EINVAL; 4998 } 4999 } else if (mddev->delta_disks < 0 5000 ? (here_new * mddev->new_chunk_sectors <= 5001 here_old * mddev->chunk_sectors) 5002 : (here_new * mddev->new_chunk_sectors >= 5003 here_old * mddev->chunk_sectors)) { 5004 /* Reading from the same stripe as writing to - bad */ 5005 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5006 "auto-recovery - aborting.\n", 5007 mdname(mddev)); 5008 return -EINVAL; 5009 } 5010 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5011 mdname(mddev)); 5012 /* OK, we should be able to continue; */ 5013 } else { 5014 BUG_ON(mddev->level != mddev->new_level); 5015 BUG_ON(mddev->layout != mddev->new_layout); 5016 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5017 BUG_ON(mddev->delta_disks != 0); 5018 } 5019 5020 if (mddev->private == NULL) 5021 conf = setup_conf(mddev); 5022 else 5023 conf = mddev->private; 5024 5025 if (IS_ERR(conf)) 5026 return PTR_ERR(conf); 5027 5028 mddev->thread = conf->thread; 5029 conf->thread = NULL; 5030 mddev->private = conf; 5031 5032 /* 5033 * 0 for a fully functional array, 1 or 2 for a degraded array. 5034 */ 5035 list_for_each_entry(rdev, &mddev->disks, same_set) { 5036 if (rdev->raid_disk < 0) 5037 continue; 5038 if (test_bit(In_sync, &rdev->flags)) { 5039 working_disks++; 5040 continue; 5041 } 5042 /* This disc is not fully in-sync. However if it 5043 * just stored parity (beyond the recovery_offset), 5044 * when we don't need to be concerned about the 5045 * array being dirty. 5046 * When reshape goes 'backwards', we never have 5047 * partially completed devices, so we only need 5048 * to worry about reshape going forwards. 5049 */ 5050 /* Hack because v0.91 doesn't store recovery_offset properly. 
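 * For such superblocks we just assume the device has been recovered up
 * to the current reshape position (the assignment just below).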
*/ 5051 if (mddev->major_version == 0 && 5052 mddev->minor_version > 90) 5053 rdev->recovery_offset = reshape_offset; 5054 5055 if (rdev->recovery_offset < reshape_offset) { 5056 /* We need to check old and new layout */ 5057 if (!only_parity(rdev->raid_disk, 5058 conf->algorithm, 5059 conf->raid_disks, 5060 conf->max_degraded)) 5061 continue; 5062 } 5063 if (!only_parity(rdev->raid_disk, 5064 conf->prev_algo, 5065 conf->previous_raid_disks, 5066 conf->max_degraded)) 5067 continue; 5068 dirty_parity_disks++; 5069 } 5070 5071 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) 5072 - working_disks); 5073 5074 if (has_failed(conf)) { 5075 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5076 " (%d/%d failed)\n", 5077 mdname(mddev), mddev->degraded, conf->raid_disks); 5078 goto abort; 5079 } 5080 5081 /* device size must be a multiple of chunk size */ 5082 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5083 mddev->resync_max_sectors = mddev->dev_sectors; 5084 5085 if (mddev->degraded > dirty_parity_disks && 5086 mddev->recovery_cp != MaxSector) { 5087 if (mddev->ok_start_degraded) 5088 printk(KERN_WARNING 5089 "md/raid:%s: starting dirty degraded array" 5090 " - data corruption possible.\n", 5091 mdname(mddev)); 5092 else { 5093 printk(KERN_ERR 5094 "md/raid:%s: cannot start dirty degraded array.\n", 5095 mdname(mddev)); 5096 goto abort; 5097 } 5098 } 5099 5100 if (mddev->degraded == 0) 5101 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5102 " devices, algorithm %d\n", mdname(mddev), conf->level, 5103 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5104 mddev->new_layout); 5105 else 5106 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5107 " out of %d devices, algorithm %d\n", 5108 mdname(mddev), conf->level, 5109 mddev->raid_disks - mddev->degraded, 5110 mddev->raid_disks, mddev->new_layout); 5111 5112 print_raid5_conf(conf); 5113 5114 if (conf->reshape_progress != MaxSector) { 5115 conf->reshape_safe = conf->reshape_progress; 5116 atomic_set(&conf->reshape_stripes, 0); 5117 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5118 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5119 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5120 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5121 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5122 "reshape"); 5123 } 5124 5125 5126 /* Ok, everything is just fine now */ 5127 if (mddev->to_remove == &raid5_attrs_group) 5128 mddev->to_remove = NULL; 5129 else if (mddev->kobj.sd && 5130 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5131 printk(KERN_WARNING 5132 "raid5: failed to create sysfs attributes for %s\n", 5133 mdname(mddev)); 5134 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5135 5136 if (mddev->queue) { 5137 int chunk_size; 5138 /* read-ahead size must cover two whole stripes, which 5139 * is 2 * (datadisks) * chunksize where 'n' is the 5140 * number of raid devices 5141 */ 5142 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5143 int stripe = data_disks * 5144 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5145 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5146 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5147 5148 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5149 5150 mddev->queue->backing_dev_info.congested_data = mddev; 5151 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5152 5153 chunk_size = mddev->chunk_sectors << 9; 5154 blk_queue_io_min(mddev->queue, chunk_size); 5155 
blk_queue_io_opt(mddev->queue, chunk_size * 5156 (conf->raid_disks - conf->max_degraded)); 5157 5158 list_for_each_entry(rdev, &mddev->disks, same_set) 5159 disk_stack_limits(mddev->gendisk, rdev->bdev, 5160 rdev->data_offset << 9); 5161 } 5162 5163 return 0; 5164 abort: 5165 md_unregister_thread(mddev->thread); 5166 mddev->thread = NULL; 5167 if (conf) { 5168 print_raid5_conf(conf); 5169 free_conf(conf); 5170 } 5171 mddev->private = NULL; 5172 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5173 return -EIO; 5174 } 5175 5176 static int stop(mddev_t *mddev) 5177 { 5178 raid5_conf_t *conf = mddev->private; 5179 5180 md_unregister_thread(mddev->thread); 5181 mddev->thread = NULL; 5182 if (mddev->queue) 5183 mddev->queue->backing_dev_info.congested_fn = NULL; 5184 free_conf(conf); 5185 mddev->private = NULL; 5186 mddev->to_remove = &raid5_attrs_group; 5187 return 0; 5188 } 5189 5190 #ifdef DEBUG 5191 static void print_sh(struct seq_file *seq, struct stripe_head *sh) 5192 { 5193 int i; 5194 5195 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", 5196 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 5197 seq_printf(seq, "sh %llu, count %d.\n", 5198 (unsigned long long)sh->sector, atomic_read(&sh->count)); 5199 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); 5200 for (i = 0; i < sh->disks; i++) { 5201 seq_printf(seq, "(cache%d: %p %ld) ", 5202 i, sh->dev[i].page, sh->dev[i].flags); 5203 } 5204 seq_printf(seq, "\n"); 5205 } 5206 5207 static void printall(struct seq_file *seq, raid5_conf_t *conf) 5208 { 5209 struct stripe_head *sh; 5210 struct hlist_node *hn; 5211 int i; 5212 5213 spin_lock_irq(&conf->device_lock); 5214 for (i = 0; i < NR_HASH; i++) { 5215 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 5216 if (sh->raid_conf != conf) 5217 continue; 5218 print_sh(seq, sh); 5219 } 5220 } 5221 spin_unlock_irq(&conf->device_lock); 5222 } 5223 #endif 5224 5225 static void status(struct seq_file *seq, mddev_t *mddev) 5226 { 5227 raid5_conf_t *conf = mddev->private; 5228 int i; 5229 5230 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5231 mddev->chunk_sectors / 2, mddev->layout); 5232 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5233 for (i = 0; i < conf->raid_disks; i++) 5234 seq_printf (seq, "%s", 5235 conf->disks[i].rdev && 5236 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 5237 seq_printf (seq, "]"); 5238 #ifdef DEBUG 5239 seq_printf (seq, "\n"); 5240 printall(seq, conf); 5241 #endif 5242 } 5243 5244 static void print_raid5_conf (raid5_conf_t *conf) 5245 { 5246 int i; 5247 struct disk_info *tmp; 5248 5249 printk(KERN_DEBUG "RAID conf printout:\n"); 5250 if (!conf) { 5251 printk("(conf==NULL)\n"); 5252 return; 5253 } 5254 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5255 conf->raid_disks, 5256 conf->raid_disks - conf->mddev->degraded); 5257 5258 for (i = 0; i < conf->raid_disks; i++) { 5259 char b[BDEVNAME_SIZE]; 5260 tmp = conf->disks + i; 5261 if (tmp->rdev) 5262 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5263 i, !test_bit(Faulty, &tmp->rdev->flags), 5264 bdevname(tmp->rdev->bdev, b)); 5265 } 5266 } 5267 5268 static int raid5_spare_active(mddev_t *mddev) 5269 { 5270 int i; 5271 raid5_conf_t *conf = mddev->private; 5272 struct disk_info *tmp; 5273 int count = 0; 5274 unsigned long flags; 5275 5276 for (i = 0; i < conf->raid_disks; i++) { 5277 tmp = conf->disks + i; 5278 if (tmp->rdev 5279 && tmp->rdev->recovery_offset == MaxSector 5280 && !test_bit(Faulty, &tmp->rdev->flags) 5281 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5282 count++; 5283 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5284 } 5285 } 5286 spin_lock_irqsave(&conf->device_lock, flags); 5287 mddev->degraded -= count; 5288 spin_unlock_irqrestore(&conf->device_lock, flags); 5289 print_raid5_conf(conf); 5290 return count; 5291 } 5292 5293 static int raid5_remove_disk(mddev_t *mddev, int number) 5294 { 5295 raid5_conf_t *conf = mddev->private; 5296 int err = 0; 5297 mdk_rdev_t *rdev; 5298 struct disk_info *p = conf->disks + number; 5299 5300 print_raid5_conf(conf); 5301 rdev = p->rdev; 5302 if (rdev) { 5303 if (number >= conf->raid_disks && 5304 conf->reshape_progress == MaxSector) 5305 clear_bit(In_sync, &rdev->flags); 5306 5307 if (test_bit(In_sync, &rdev->flags) || 5308 atomic_read(&rdev->nr_pending)) { 5309 err = -EBUSY; 5310 goto abort; 5311 } 5312 /* Only remove non-faulty devices if recovery 5313 * isn't possible. 5314 */ 5315 if (!test_bit(Faulty, &rdev->flags) && 5316 !has_failed(conf) && 5317 number < conf->raid_disks) { 5318 err = -EBUSY; 5319 goto abort; 5320 } 5321 p->rdev = NULL; 5322 synchronize_rcu(); 5323 if (atomic_read(&rdev->nr_pending)) { 5324 /* lost the race, try later */ 5325 err = -EBUSY; 5326 p->rdev = rdev; 5327 } 5328 } 5329 abort: 5330 5331 print_raid5_conf(conf); 5332 return err; 5333 } 5334 5335 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 5336 { 5337 raid5_conf_t *conf = mddev->private; 5338 int err = -EEXIST; 5339 int disk; 5340 struct disk_info *p; 5341 int first = 0; 5342 int last = conf->raid_disks - 1; 5343 5344 if (has_failed(conf)) 5345 /* no point adding a device */ 5346 return -EINVAL; 5347 5348 if (rdev->raid_disk >= 0) 5349 first = last = rdev->raid_disk; 5350 5351 /* 5352 * find the disk ... but prefer rdev->saved_raid_disk 5353 * if possible. 
5354 */ 5355 if (rdev->saved_raid_disk >= 0 && 5356 rdev->saved_raid_disk >= first && 5357 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5358 disk = rdev->saved_raid_disk; 5359 else 5360 disk = first; 5361 for ( ; disk <= last ; disk++) 5362 if ((p=conf->disks + disk)->rdev == NULL) { 5363 clear_bit(In_sync, &rdev->flags); 5364 rdev->raid_disk = disk; 5365 err = 0; 5366 if (rdev->saved_raid_disk != disk) 5367 conf->fullsync = 1; 5368 rcu_assign_pointer(p->rdev, rdev); 5369 break; 5370 } 5371 print_raid5_conf(conf); 5372 return err; 5373 } 5374 5375 static int raid5_resize(mddev_t *mddev, sector_t sectors) 5376 { 5377 /* no resync is happening, and there is enough space 5378 * on all devices, so we can resize. 5379 * We need to make sure resync covers any new space. 5380 * If the array is shrinking we should possibly wait until 5381 * any io in the removed space completes, but it hardly seems 5382 * worth it. 5383 */ 5384 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5385 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5386 mddev->raid_disks)); 5387 if (mddev->array_sectors > 5388 raid5_size(mddev, sectors, mddev->raid_disks)) 5389 return -EINVAL; 5390 set_capacity(mddev->gendisk, mddev->array_sectors); 5391 revalidate_disk(mddev->gendisk); 5392 if (sectors > mddev->dev_sectors && 5393 mddev->recovery_cp > mddev->dev_sectors) { 5394 mddev->recovery_cp = mddev->dev_sectors; 5395 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5396 } 5397 mddev->dev_sectors = sectors; 5398 mddev->resync_max_sectors = sectors; 5399 return 0; 5400 } 5401 5402 static int check_stripe_cache(mddev_t *mddev) 5403 { 5404 /* Can only proceed if there are plenty of stripe_heads. 5405 * We need a minimum of one full stripe,, and for sensible progress 5406 * it is best to have about 4 times that. 5407 * If we require 4 times, then the default 256 4K stripe_heads will 5408 * allow for chunk sizes up to 256K, which is probably OK. 5409 * If the chunk size is greater, user-space should request more 5410 * stripe_heads first. 5411 */ 5412 raid5_conf_t *conf = mddev->private; 5413 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5414 > conf->max_nr_stripes || 5415 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5416 > conf->max_nr_stripes) { 5417 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5418 mdname(mddev), 5419 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5420 / STRIPE_SIZE)*4); 5421 return 0; 5422 } 5423 return 1; 5424 } 5425 5426 static int check_reshape(mddev_t *mddev) 5427 { 5428 raid5_conf_t *conf = mddev->private; 5429 5430 if (mddev->delta_disks == 0 && 5431 mddev->new_layout == mddev->layout && 5432 mddev->new_chunk_sectors == mddev->chunk_sectors) 5433 return 0; /* nothing to do */ 5434 if (mddev->bitmap) 5435 /* Cannot grow a bitmap yet */ 5436 return -EBUSY; 5437 if (has_failed(conf)) 5438 return -EINVAL; 5439 if (mddev->delta_disks < 0) { 5440 /* We might be able to shrink, but the devices must 5441 * be made bigger first. 5442 * For raid6, 4 is the minimum size. 
5443 * Otherwise 2 is the minimum 5444 */ 5445 int min = 2; 5446 if (mddev->level == 6) 5447 min = 4; 5448 if (mddev->raid_disks + mddev->delta_disks < min) 5449 return -EINVAL; 5450 } 5451 5452 if (!check_stripe_cache(mddev)) 5453 return -ENOSPC; 5454 5455 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5456 } 5457 5458 static int raid5_start_reshape(mddev_t *mddev) 5459 { 5460 raid5_conf_t *conf = mddev->private; 5461 mdk_rdev_t *rdev; 5462 int spares = 0; 5463 unsigned long flags; 5464 5465 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5466 return -EBUSY; 5467 5468 if (!check_stripe_cache(mddev)) 5469 return -ENOSPC; 5470 5471 list_for_each_entry(rdev, &mddev->disks, same_set) 5472 if (!test_bit(In_sync, &rdev->flags) 5473 && !test_bit(Faulty, &rdev->flags)) 5474 spares++; 5475 5476 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5477 /* Not enough devices even to make a degraded array 5478 * of that size 5479 */ 5480 return -EINVAL; 5481 5482 /* Refuse to reduce size of the array. Any reductions in 5483 * array size must be through explicit setting of array_size 5484 * attribute. 5485 */ 5486 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5487 < mddev->array_sectors) { 5488 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5489 "before number of disks\n", mdname(mddev)); 5490 return -EINVAL; 5491 } 5492 5493 atomic_set(&conf->reshape_stripes, 0); 5494 spin_lock_irq(&conf->device_lock); 5495 conf->previous_raid_disks = conf->raid_disks; 5496 conf->raid_disks += mddev->delta_disks; 5497 conf->prev_chunk_sectors = conf->chunk_sectors; 5498 conf->chunk_sectors = mddev->new_chunk_sectors; 5499 conf->prev_algo = conf->algorithm; 5500 conf->algorithm = mddev->new_layout; 5501 if (mddev->delta_disks < 0) 5502 conf->reshape_progress = raid5_size(mddev, 0, 0); 5503 else 5504 conf->reshape_progress = 0; 5505 conf->reshape_safe = conf->reshape_progress; 5506 conf->generation++; 5507 spin_unlock_irq(&conf->device_lock); 5508 5509 /* Add some new drives, as many as will fit. 5510 * We know there are enough to make the newly sized array work. 5511 * Don't add devices if we are reducing the number of 5512 * devices in the array. This is because it is not possible 5513 * to correctly record the "partially reconstructed" state of 5514 * such devices during the reshape and confusion could result. 5515 */ 5516 if (mddev->delta_disks >= 0) { 5517 int added_devices = 0; 5518 list_for_each_entry(rdev, &mddev->disks, same_set) 5519 if (rdev->raid_disk < 0 && 5520 !test_bit(Faulty, &rdev->flags)) { 5521 if (raid5_add_disk(mddev, rdev) == 0) { 5522 char nm[20]; 5523 if (rdev->raid_disk 5524 >= conf->previous_raid_disks) { 5525 set_bit(In_sync, &rdev->flags); 5526 added_devices++; 5527 } else 5528 rdev->recovery_offset = 0; 5529 sprintf(nm, "rd%d", rdev->raid_disk); 5530 if (sysfs_create_link(&mddev->kobj, 5531 &rdev->kobj, nm)) 5532 /* Failure here is OK */; 5533 } 5534 } else if (rdev->raid_disk >= conf->previous_raid_disks 5535 && !test_bit(Faulty, &rdev->flags)) { 5536 /* This is a spare that was manually added */ 5537 set_bit(In_sync, &rdev->flags); 5538 added_devices++; 5539 } 5540 5541 /* When a reshape changes the number of devices, 5542 * ->degraded is measured against the larger of the 5543 * pre and post number of devices. 
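 *
 * A made-up example: growing a fully working 4-device array to 6 with
 * only one spare present gives added_devices == 1, so degraded below
 * becomes (6 - 4) - 1 == 1 until another device is added and recovered.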
5544 */ 5545 spin_lock_irqsave(&conf->device_lock, flags); 5546 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) 5547 - added_devices; 5548 spin_unlock_irqrestore(&conf->device_lock, flags); 5549 } 5550 mddev->raid_disks = conf->raid_disks; 5551 mddev->reshape_position = conf->reshape_progress; 5552 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5553 5554 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5555 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5556 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5557 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5558 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5559 "reshape"); 5560 if (!mddev->sync_thread) { 5561 mddev->recovery = 0; 5562 spin_lock_irq(&conf->device_lock); 5563 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5564 conf->reshape_progress = MaxSector; 5565 spin_unlock_irq(&conf->device_lock); 5566 return -EAGAIN; 5567 } 5568 conf->reshape_checkpoint = jiffies; 5569 md_wakeup_thread(mddev->sync_thread); 5570 md_new_event(mddev); 5571 return 0; 5572 } 5573 5574 /* This is called from the reshape thread and should make any 5575 * changes needed in 'conf' 5576 */ 5577 static void end_reshape(raid5_conf_t *conf) 5578 { 5579 5580 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5581 5582 spin_lock_irq(&conf->device_lock); 5583 conf->previous_raid_disks = conf->raid_disks; 5584 conf->reshape_progress = MaxSector; 5585 spin_unlock_irq(&conf->device_lock); 5586 wake_up(&conf->wait_for_overlap); 5587 5588 /* read-ahead size must cover two whole stripes, which is 5589 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5590 */ 5591 if (conf->mddev->queue) { 5592 int data_disks = conf->raid_disks - conf->max_degraded; 5593 int stripe = data_disks * ((conf->chunk_sectors << 9) 5594 / PAGE_SIZE); 5595 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5596 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5597 } 5598 } 5599 } 5600 5601 /* This is called from the raid5d thread with mddev_lock held. 5602 * It makes config changes to the device. 
		if (conf->mddev->queue) {
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
		}
	}
}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			mddev->degraded = conf->raid_disks;
			for (d = 0; d < conf->raid_disks; d++)
				if (conf->disks[d].rdev &&
				    test_bit(In_sync,
					     &conf->disks[d].rdev->flags))
					mddev->degraded--;
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				mdk_rdev_t *rdev = conf->disks[d].rdev;
				if (rdev && raid5_remove_disk(mddev, d) == 0) {
					char nm[20];
					sprintf(nm, "rd%d", rdev->raid_disk);
					sysfs_remove_link(&mddev->kobj, nm);
					rdev->raid_disk = -1;
				}
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
	}
}

static void raid5_quiesce(mddev_t *mddev, int state)
{
	raid5_conf_t *conf = mddev->private;

	switch (state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		spin_lock_irq(&conf->device_lock);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_lock_irq(conf->wait_for_stripe,
				    atomic_read(&conf->active_stripes) == 0 &&
				    atomic_read(&conf->active_aligned_reads) == 0,
				    conf->device_lock, /* nothing */);
		conf->quiesce = 1;
		spin_unlock_irq(&conf->device_lock);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		spin_lock_irq(&conf->device_lock);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		spin_unlock_irq(&conf->device_lock);
		break;
	}
}


static void *raid45_takeover_raid0(mddev_t *mddev, int level)
{
	struct raid0_private_data *raid0_priv = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_priv->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_priv->strip_zone[0].zone_end;
	sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}

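/* raid5_takeover_raid1() below picks the largest power-of-two chunk size,
 * starting from 64 KiB, that divides the array size exactly.  For
 * illustration: a 1,000,000-sector raid1 is not a multiple of 128 sectors
 * (64 KiB) but is a multiple of 64 sectors, so chunksect ends up as 64
 * (32 KiB), which is still >= STRIPE_SIZE on 4 KiB-page systems.
 */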
static void *raid5_takeover_raid1(mddev_t *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid6(mddev_t *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}


static int raid5_check_reshape(mddev_t *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new values - after validation -
	 * to be used by a later reshape pass.
	 */
	raid5_conf_t *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(mddev_t *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

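/* The ALGORITHM_*_6 layouts used by raid5_takeover_raid6() above and
 * raid6_takeover() below are RAID-6 layouts that keep the corresponding
 * RAID-5 data/parity rotation on the first raid_disks-1 devices and put Q
 * on the last device, so a level change between raid5 and raid6 only adds
 * or drops the Q disk and does not move existing data.
 */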
static void *raid5_takeover(mddev_t *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(mddev_t *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct mdk_personality raid5_personality;

static void *raid6_takeover(mddev_t *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}

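/* The three personalities below share the same implementation; they differ
 * only in .name, .level, .check_reshape and .takeover.
 */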
static struct mdk_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
};

static struct mdk_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
};

static struct mdk_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

static int __init raid5_init(void)
{
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");