1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/async.h> 51 #include <linux/seq_file.h> 52 #include <linux/cpu.h> 53 #include <linux/slab.h> 54 #include <linux/ratelimit.h> 55 #include "md.h" 56 #include "raid5.h" 57 #include "raid0.h" 58 #include "bitmap.h" 59 60 /* 61 * Stripe cache 62 */ 63 64 #define NR_STRIPES 256 65 #define STRIPE_SIZE PAGE_SIZE 66 #define STRIPE_SHIFT (PAGE_SHIFT - 9) 67 #define STRIPE_SECTORS (STRIPE_SIZE>>9) 68 #define IO_THRESHOLD 1 69 #define BYPASS_THRESHOLD 1 70 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 71 #define HASH_MASK (NR_HASH - 1) 72 73 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 74 { 75 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 76 return &conf->stripe_hashtbl[hash]; 77 } 78 79 /* bio's attached to a stripe+device for I/O are linked together in bi_sector 80 * order without overlap. There may be several bio's per stripe+device, and 81 * a bio could span several devices. 82 * When walking this list for a particular stripe+device, we must never proceed 83 * beyond a bio that extends past this device, as the next bio might no longer 84 * be valid. 85 * This function is used to determine the 'next' bio in the list, given the sector 86 * of the current stripe+device 87 */ 88 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 89 { 90 int sectors = bio->bi_size >> 9; 91 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 92 return bio->bi_next; 93 else 94 return NULL; 95 } 96 97 /* 98 * We maintain a biased count of active stripes in the bottom 16 bits of 99 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 100 */ 101 static inline int raid5_bi_phys_segments(struct bio *bio) 102 { 103 return bio->bi_phys_segments & 0xffff; 104 } 105 106 static inline int raid5_bi_hw_segments(struct bio *bio) 107 { 108 return (bio->bi_phys_segments >> 16) & 0xffff; 109 } 110 111 static inline int raid5_dec_bi_phys_segments(struct bio *bio) 112 { 113 --bio->bi_phys_segments; 114 return raid5_bi_phys_segments(bio); 115 } 116 117 static inline int raid5_dec_bi_hw_segments(struct bio *bio) 118 { 119 unsigned short val = raid5_bi_hw_segments(bio); 120 121 --val; 122 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); 123 return val; 124 } 125 126 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 127 { 128 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); 129 } 130 131 /* Find first data disk in a raid6 stripe */ 132 static inline int raid6_d0(struct stripe_head *sh) 133 { 134 if (sh->ddf_layout) 135 /* ddf always start from first device */ 136 return 0; 137 /* md starts just after Q block */ 138 if (sh->qd_idx == sh->disks - 1) 139 return 0; 140 else 141 return sh->qd_idx + 1; 142 } 143 static inline int raid6_next_disk(int disk, int raid_disks) 144 { 145 disk++; 146 return (disk < raid_disks) ? disk : 0; 147 } 148 149 /* When walking through the disks in a raid5, starting at raid6_d0, 150 * We need to map each disk to a 'slot', where the data disks are slot 151 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 152 * is raid_disks-1. This help does that mapping. 153 */ 154 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 155 int *count, int syndrome_disks) 156 { 157 int slot = *count; 158 159 if (sh->ddf_layout) 160 (*count)++; 161 if (idx == sh->pd_idx) 162 return syndrome_disks; 163 if (idx == sh->qd_idx) 164 return syndrome_disks + 1; 165 if (!sh->ddf_layout) 166 (*count)++; 167 return slot; 168 } 169 170 static void return_io(struct bio *return_bi) 171 { 172 struct bio *bi = return_bi; 173 while (bi) { 174 175 return_bi = bi->bi_next; 176 bi->bi_next = NULL; 177 bi->bi_size = 0; 178 bio_endio(bi, 0); 179 bi = return_bi; 180 } 181 } 182 183 static void print_raid5_conf (struct r5conf *conf); 184 185 static int stripe_operations_active(struct stripe_head *sh) 186 { 187 return sh->check_state || sh->reconstruct_state || 188 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 189 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 190 } 191 192 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 193 { 194 if (atomic_dec_and_test(&sh->count)) { 195 BUG_ON(!list_empty(&sh->lru)); 196 BUG_ON(atomic_read(&conf->active_stripes)==0); 197 if (test_bit(STRIPE_HANDLE, &sh->state)) { 198 if (test_bit(STRIPE_DELAYED, &sh->state)) 199 list_add_tail(&sh->lru, &conf->delayed_list); 200 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 201 sh->bm_seq - conf->seq_write > 0) 202 list_add_tail(&sh->lru, &conf->bitmap_list); 203 else { 204 clear_bit(STRIPE_BIT_DELAY, &sh->state); 205 list_add_tail(&sh->lru, &conf->handle_list); 206 } 207 md_wakeup_thread(conf->mddev->thread); 208 } else { 209 BUG_ON(stripe_operations_active(sh)); 210 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 211 atomic_dec(&conf->preread_active_stripes); 212 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 213 md_wakeup_thread(conf->mddev->thread); 214 } 215 atomic_dec(&conf->active_stripes); 216 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 217 list_add_tail(&sh->lru, &conf->inactive_list); 218 wake_up(&conf->wait_for_stripe); 219 if (conf->retry_read_aligned) 220 md_wakeup_thread(conf->mddev->thread); 221 } 222 } 223 } 224 } 225 226 static void release_stripe(struct stripe_head *sh) 227 { 228 struct r5conf *conf = sh->raid_conf; 229 unsigned long flags; 230 231 spin_lock_irqsave(&conf->device_lock, flags); 232 __release_stripe(conf, sh); 233 spin_unlock_irqrestore(&conf->device_lock, flags); 234 } 235 236 static inline void remove_hash(struct stripe_head *sh) 237 { 238 pr_debug("remove_hash(), stripe %llu\n", 239 (unsigned long long)sh->sector); 240 241 hlist_del_init(&sh->hash); 242 } 243 244 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 245 { 246 struct hlist_head *hp = stripe_hash(conf, sh->sector); 247 248 pr_debug("insert_hash(), stripe %llu\n", 249 (unsigned long long)sh->sector); 250 251 hlist_add_head(&sh->hash, hp); 252 } 253 254 255 /* find an idle stripe, make sure it is unhashed, and return it. */ 256 static struct stripe_head *get_free_stripe(struct r5conf *conf) 257 { 258 struct stripe_head *sh = NULL; 259 struct list_head *first; 260 261 if (list_empty(&conf->inactive_list)) 262 goto out; 263 first = conf->inactive_list.next; 264 sh = list_entry(first, struct stripe_head, lru); 265 list_del_init(first); 266 remove_hash(sh); 267 atomic_inc(&conf->active_stripes); 268 out: 269 return sh; 270 } 271 272 static void shrink_buffers(struct stripe_head *sh) 273 { 274 struct page *p; 275 int i; 276 int num = sh->raid_conf->pool_size; 277 278 for (i = 0; i < num ; i++) { 279 p = sh->dev[i].page; 280 if (!p) 281 continue; 282 sh->dev[i].page = NULL; 283 put_page(p); 284 } 285 } 286 287 static int grow_buffers(struct stripe_head *sh) 288 { 289 int i; 290 int num = sh->raid_conf->pool_size; 291 292 for (i = 0; i < num; i++) { 293 struct page *page; 294 295 if (!(page = alloc_page(GFP_KERNEL))) { 296 return 1; 297 } 298 sh->dev[i].page = page; 299 } 300 return 0; 301 } 302 303 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 304 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 305 struct stripe_head *sh); 306 307 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 308 { 309 struct r5conf *conf = sh->raid_conf; 310 int i; 311 312 BUG_ON(atomic_read(&sh->count) != 0); 313 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 314 BUG_ON(stripe_operations_active(sh)); 315 316 pr_debug("init_stripe called, stripe %llu\n", 317 (unsigned long long)sh->sector); 318 319 remove_hash(sh); 320 321 sh->generation = conf->generation - previous; 322 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 323 sh->sector = sector; 324 stripe_set_idx(sector, conf, previous, sh); 325 sh->state = 0; 326 327 328 for (i = sh->disks; i--; ) { 329 struct r5dev *dev = &sh->dev[i]; 330 331 if (dev->toread || dev->read || dev->towrite || dev->written || 332 test_bit(R5_LOCKED, &dev->flags)) { 333 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 334 (unsigned long long)sh->sector, i, dev->toread, 335 dev->read, dev->towrite, dev->written, 336 test_bit(R5_LOCKED, &dev->flags)); 337 WARN_ON(1); 338 } 339 dev->flags = 0; 340 raid5_build_block(sh, i, previous); 341 } 342 insert_hash(conf, sh); 343 } 344 345 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 346 short generation) 347 { 348 struct stripe_head *sh; 349 struct hlist_node *hn; 350 351 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 352 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 353 if (sh->sector == sector && sh->generation == generation) 354 return sh; 355 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 356 return NULL; 357 } 358 359 /* 360 * Need to check if array has failed when deciding whether to: 361 * - start an array 362 * - remove non-faulty devices 363 * - add a spare 364 * - allow a reshape 365 * This determination is simple when no reshape is happening. 366 * However if there is a reshape, we need to carefully check 367 * both the before and after sections. 368 * This is because some failed devices may only affect one 369 * of the two sections, and some non-in_sync devices may 370 * be insync in the section most affected by failed devices. 371 */ 372 static int has_failed(struct r5conf *conf) 373 { 374 int degraded; 375 int i; 376 if (conf->mddev->reshape_position == MaxSector) 377 return conf->mddev->degraded > conf->max_degraded; 378 379 rcu_read_lock(); 380 degraded = 0; 381 for (i = 0; i < conf->previous_raid_disks; i++) { 382 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 383 if (!rdev || test_bit(Faulty, &rdev->flags)) 384 degraded++; 385 else if (test_bit(In_sync, &rdev->flags)) 386 ; 387 else 388 /* not in-sync or faulty. 389 * If the reshape increases the number of devices, 390 * this is being recovered by the reshape, so 391 * this 'previous' section is not in_sync. 392 * If the number of devices is being reduced however, 393 * the device can only be part of the array if 394 * we are reverting a reshape, so this section will 395 * be in-sync. 396 */ 397 if (conf->raid_disks >= conf->previous_raid_disks) 398 degraded++; 399 } 400 rcu_read_unlock(); 401 if (degraded > conf->max_degraded) 402 return 1; 403 rcu_read_lock(); 404 degraded = 0; 405 for (i = 0; i < conf->raid_disks; i++) { 406 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 407 if (!rdev || test_bit(Faulty, &rdev->flags)) 408 degraded++; 409 else if (test_bit(In_sync, &rdev->flags)) 410 ; 411 else 412 /* not in-sync or faulty. 413 * If reshape increases the number of devices, this 414 * section has already been recovered, else it 415 * almost certainly hasn't. 416 */ 417 if (conf->raid_disks <= conf->previous_raid_disks) 418 degraded++; 419 } 420 rcu_read_unlock(); 421 if (degraded > conf->max_degraded) 422 return 1; 423 return 0; 424 } 425 426 static struct stripe_head * 427 get_active_stripe(struct r5conf *conf, sector_t sector, 428 int previous, int noblock, int noquiesce) 429 { 430 struct stripe_head *sh; 431 432 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 433 434 spin_lock_irq(&conf->device_lock); 435 436 do { 437 wait_event_lock_irq(conf->wait_for_stripe, 438 conf->quiesce == 0 || noquiesce, 439 conf->device_lock, /* nothing */); 440 sh = __find_stripe(conf, sector, conf->generation - previous); 441 if (!sh) { 442 if (!conf->inactive_blocked) 443 sh = get_free_stripe(conf); 444 if (noblock && sh == NULL) 445 break; 446 if (!sh) { 447 conf->inactive_blocked = 1; 448 wait_event_lock_irq(conf->wait_for_stripe, 449 !list_empty(&conf->inactive_list) && 450 (atomic_read(&conf->active_stripes) 451 < (conf->max_nr_stripes *3/4) 452 || !conf->inactive_blocked), 453 conf->device_lock, 454 ); 455 conf->inactive_blocked = 0; 456 } else 457 init_stripe(sh, sector, previous); 458 } else { 459 if (atomic_read(&sh->count)) { 460 BUG_ON(!list_empty(&sh->lru) 461 && !test_bit(STRIPE_EXPANDING, &sh->state)); 462 } else { 463 if (!test_bit(STRIPE_HANDLE, &sh->state)) 464 atomic_inc(&conf->active_stripes); 465 if (list_empty(&sh->lru) && 466 !test_bit(STRIPE_EXPANDING, &sh->state)) 467 BUG(); 468 list_del_init(&sh->lru); 469 } 470 } 471 } while (sh == NULL); 472 473 if (sh) 474 atomic_inc(&sh->count); 475 476 spin_unlock_irq(&conf->device_lock); 477 return sh; 478 } 479 480 static void 481 raid5_end_read_request(struct bio *bi, int error); 482 static void 483 raid5_end_write_request(struct bio *bi, int error); 484 485 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 486 { 487 struct r5conf *conf = sh->raid_conf; 488 int i, disks = sh->disks; 489 490 might_sleep(); 491 492 for (i = disks; i--; ) { 493 int rw; 494 struct bio *bi; 495 struct md_rdev *rdev; 496 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 497 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 498 rw = WRITE_FUA; 499 else 500 rw = WRITE; 501 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 502 rw = READ; 503 else 504 continue; 505 506 bi = &sh->dev[i].req; 507 508 bi->bi_rw = rw; 509 if (rw & WRITE) 510 bi->bi_end_io = raid5_end_write_request; 511 else 512 bi->bi_end_io = raid5_end_read_request; 513 514 rcu_read_lock(); 515 rdev = rcu_dereference(conf->disks[i].rdev); 516 if (rdev && test_bit(Faulty, &rdev->flags)) 517 rdev = NULL; 518 if (rdev) 519 atomic_inc(&rdev->nr_pending); 520 rcu_read_unlock(); 521 522 /* We have already checked bad blocks for reads. Now 523 * need to check for writes. 524 */ 525 while ((rw & WRITE) && rdev && 526 test_bit(WriteErrorSeen, &rdev->flags)) { 527 sector_t first_bad; 528 int bad_sectors; 529 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 530 &first_bad, &bad_sectors); 531 if (!bad) 532 break; 533 534 if (bad < 0) { 535 set_bit(BlockedBadBlocks, &rdev->flags); 536 if (!conf->mddev->external && 537 conf->mddev->flags) { 538 /* It is very unlikely, but we might 539 * still need to write out the 540 * bad block log - better give it 541 * a chance*/ 542 md_check_recovery(conf->mddev); 543 } 544 md_wait_for_blocked_rdev(rdev, conf->mddev); 545 } else { 546 /* Acknowledged bad block - skip the write */ 547 rdev_dec_pending(rdev, conf->mddev); 548 rdev = NULL; 549 } 550 } 551 552 if (rdev) { 553 if (s->syncing || s->expanding || s->expanded) 554 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 555 556 set_bit(STRIPE_IO_STARTED, &sh->state); 557 558 bi->bi_bdev = rdev->bdev; 559 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 560 __func__, (unsigned long long)sh->sector, 561 bi->bi_rw, i); 562 atomic_inc(&sh->count); 563 bi->bi_sector = sh->sector + rdev->data_offset; 564 bi->bi_flags = 1 << BIO_UPTODATE; 565 bi->bi_vcnt = 1; 566 bi->bi_max_vecs = 1; 567 bi->bi_idx = 0; 568 bi->bi_io_vec = &sh->dev[i].vec; 569 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 570 bi->bi_io_vec[0].bv_offset = 0; 571 bi->bi_size = STRIPE_SIZE; 572 bi->bi_next = NULL; 573 generic_make_request(bi); 574 } else { 575 if (rw & WRITE) 576 set_bit(STRIPE_DEGRADED, &sh->state); 577 pr_debug("skip op %ld on disc %d for sector %llu\n", 578 bi->bi_rw, i, (unsigned long long)sh->sector); 579 clear_bit(R5_LOCKED, &sh->dev[i].flags); 580 set_bit(STRIPE_HANDLE, &sh->state); 581 } 582 } 583 } 584 585 static struct dma_async_tx_descriptor * 586 async_copy_data(int frombio, struct bio *bio, struct page *page, 587 sector_t sector, struct dma_async_tx_descriptor *tx) 588 { 589 struct bio_vec *bvl; 590 struct page *bio_page; 591 int i; 592 int page_offset; 593 struct async_submit_ctl submit; 594 enum async_tx_flags flags = 0; 595 596 if (bio->bi_sector >= sector) 597 page_offset = (signed)(bio->bi_sector - sector) * 512; 598 else 599 page_offset = (signed)(sector - bio->bi_sector) * -512; 600 601 if (frombio) 602 flags |= ASYNC_TX_FENCE; 603 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 604 605 bio_for_each_segment(bvl, bio, i) { 606 int len = bvl->bv_len; 607 int clen; 608 int b_offset = 0; 609 610 if (page_offset < 0) { 611 b_offset = -page_offset; 612 page_offset += b_offset; 613 len -= b_offset; 614 } 615 616 if (len > 0 && page_offset + len > STRIPE_SIZE) 617 clen = STRIPE_SIZE - page_offset; 618 else 619 clen = len; 620 621 if (clen > 0) { 622 b_offset += bvl->bv_offset; 623 bio_page = bvl->bv_page; 624 if (frombio) 625 tx = async_memcpy(page, bio_page, page_offset, 626 b_offset, clen, &submit); 627 else 628 tx = async_memcpy(bio_page, page, b_offset, 629 page_offset, clen, &submit); 630 } 631 /* chain the operations */ 632 submit.depend_tx = tx; 633 634 if (clen < len) /* hit end of page */ 635 break; 636 page_offset += len; 637 } 638 639 return tx; 640 } 641 642 static void ops_complete_biofill(void *stripe_head_ref) 643 { 644 struct stripe_head *sh = stripe_head_ref; 645 struct bio *return_bi = NULL; 646 struct r5conf *conf = sh->raid_conf; 647 int i; 648 649 pr_debug("%s: stripe %llu\n", __func__, 650 (unsigned long long)sh->sector); 651 652 /* clear completed biofills */ 653 spin_lock_irq(&conf->device_lock); 654 for (i = sh->disks; i--; ) { 655 struct r5dev *dev = &sh->dev[i]; 656 657 /* acknowledge completion of a biofill operation */ 658 /* and check if we need to reply to a read request, 659 * new R5_Wantfill requests are held off until 660 * !STRIPE_BIOFILL_RUN 661 */ 662 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 663 struct bio *rbi, *rbi2; 664 665 BUG_ON(!dev->read); 666 rbi = dev->read; 667 dev->read = NULL; 668 while (rbi && rbi->bi_sector < 669 dev->sector + STRIPE_SECTORS) { 670 rbi2 = r5_next_bio(rbi, dev->sector); 671 if (!raid5_dec_bi_phys_segments(rbi)) { 672 rbi->bi_next = return_bi; 673 return_bi = rbi; 674 } 675 rbi = rbi2; 676 } 677 } 678 } 679 spin_unlock_irq(&conf->device_lock); 680 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 681 682 return_io(return_bi); 683 684 set_bit(STRIPE_HANDLE, &sh->state); 685 release_stripe(sh); 686 } 687 688 static void ops_run_biofill(struct stripe_head *sh) 689 { 690 struct dma_async_tx_descriptor *tx = NULL; 691 struct r5conf *conf = sh->raid_conf; 692 struct async_submit_ctl submit; 693 int i; 694 695 pr_debug("%s: stripe %llu\n", __func__, 696 (unsigned long long)sh->sector); 697 698 for (i = sh->disks; i--; ) { 699 struct r5dev *dev = &sh->dev[i]; 700 if (test_bit(R5_Wantfill, &dev->flags)) { 701 struct bio *rbi; 702 spin_lock_irq(&conf->device_lock); 703 dev->read = rbi = dev->toread; 704 dev->toread = NULL; 705 spin_unlock_irq(&conf->device_lock); 706 while (rbi && rbi->bi_sector < 707 dev->sector + STRIPE_SECTORS) { 708 tx = async_copy_data(0, rbi, dev->page, 709 dev->sector, tx); 710 rbi = r5_next_bio(rbi, dev->sector); 711 } 712 } 713 } 714 715 atomic_inc(&sh->count); 716 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 717 async_trigger_callback(&submit); 718 } 719 720 static void mark_target_uptodate(struct stripe_head *sh, int target) 721 { 722 struct r5dev *tgt; 723 724 if (target < 0) 725 return; 726 727 tgt = &sh->dev[target]; 728 set_bit(R5_UPTODATE, &tgt->flags); 729 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 730 clear_bit(R5_Wantcompute, &tgt->flags); 731 } 732 733 static void ops_complete_compute(void *stripe_head_ref) 734 { 735 struct stripe_head *sh = stripe_head_ref; 736 737 pr_debug("%s: stripe %llu\n", __func__, 738 (unsigned long long)sh->sector); 739 740 /* mark the computed target(s) as uptodate */ 741 mark_target_uptodate(sh, sh->ops.target); 742 mark_target_uptodate(sh, sh->ops.target2); 743 744 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 745 if (sh->check_state == check_state_compute_run) 746 sh->check_state = check_state_compute_result; 747 set_bit(STRIPE_HANDLE, &sh->state); 748 release_stripe(sh); 749 } 750 751 /* return a pointer to the address conversion region of the scribble buffer */ 752 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 753 struct raid5_percpu *percpu) 754 { 755 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 756 } 757 758 static struct dma_async_tx_descriptor * 759 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 760 { 761 int disks = sh->disks; 762 struct page **xor_srcs = percpu->scribble; 763 int target = sh->ops.target; 764 struct r5dev *tgt = &sh->dev[target]; 765 struct page *xor_dest = tgt->page; 766 int count = 0; 767 struct dma_async_tx_descriptor *tx; 768 struct async_submit_ctl submit; 769 int i; 770 771 pr_debug("%s: stripe %llu block: %d\n", 772 __func__, (unsigned long long)sh->sector, target); 773 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 774 775 for (i = disks; i--; ) 776 if (i != target) 777 xor_srcs[count++] = sh->dev[i].page; 778 779 atomic_inc(&sh->count); 780 781 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 782 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 783 if (unlikely(count == 1)) 784 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 785 else 786 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 787 788 return tx; 789 } 790 791 /* set_syndrome_sources - populate source buffers for gen_syndrome 792 * @srcs - (struct page *) array of size sh->disks 793 * @sh - stripe_head to parse 794 * 795 * Populates srcs in proper layout order for the stripe and returns the 796 * 'count' of sources to be used in a call to async_gen_syndrome. The P 797 * destination buffer is recorded in srcs[count] and the Q destination 798 * is recorded in srcs[count+1]]. 799 */ 800 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 801 { 802 int disks = sh->disks; 803 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 804 int d0_idx = raid6_d0(sh); 805 int count; 806 int i; 807 808 for (i = 0; i < disks; i++) 809 srcs[i] = NULL; 810 811 count = 0; 812 i = d0_idx; 813 do { 814 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 815 816 srcs[slot] = sh->dev[i].page; 817 i = raid6_next_disk(i, disks); 818 } while (i != d0_idx); 819 820 return syndrome_disks; 821 } 822 823 static struct dma_async_tx_descriptor * 824 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 825 { 826 int disks = sh->disks; 827 struct page **blocks = percpu->scribble; 828 int target; 829 int qd_idx = sh->qd_idx; 830 struct dma_async_tx_descriptor *tx; 831 struct async_submit_ctl submit; 832 struct r5dev *tgt; 833 struct page *dest; 834 int i; 835 int count; 836 837 if (sh->ops.target < 0) 838 target = sh->ops.target2; 839 else if (sh->ops.target2 < 0) 840 target = sh->ops.target; 841 else 842 /* we should only have one valid target */ 843 BUG(); 844 BUG_ON(target < 0); 845 pr_debug("%s: stripe %llu block: %d\n", 846 __func__, (unsigned long long)sh->sector, target); 847 848 tgt = &sh->dev[target]; 849 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 850 dest = tgt->page; 851 852 atomic_inc(&sh->count); 853 854 if (target == qd_idx) { 855 count = set_syndrome_sources(blocks, sh); 856 blocks[count] = NULL; /* regenerating p is not necessary */ 857 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 858 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 859 ops_complete_compute, sh, 860 to_addr_conv(sh, percpu)); 861 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 862 } else { 863 /* Compute any data- or p-drive using XOR */ 864 count = 0; 865 for (i = disks; i-- ; ) { 866 if (i == target || i == qd_idx) 867 continue; 868 blocks[count++] = sh->dev[i].page; 869 } 870 871 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 872 NULL, ops_complete_compute, sh, 873 to_addr_conv(sh, percpu)); 874 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 875 } 876 877 return tx; 878 } 879 880 static struct dma_async_tx_descriptor * 881 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 882 { 883 int i, count, disks = sh->disks; 884 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 885 int d0_idx = raid6_d0(sh); 886 int faila = -1, failb = -1; 887 int target = sh->ops.target; 888 int target2 = sh->ops.target2; 889 struct r5dev *tgt = &sh->dev[target]; 890 struct r5dev *tgt2 = &sh->dev[target2]; 891 struct dma_async_tx_descriptor *tx; 892 struct page **blocks = percpu->scribble; 893 struct async_submit_ctl submit; 894 895 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 896 __func__, (unsigned long long)sh->sector, target, target2); 897 BUG_ON(target < 0 || target2 < 0); 898 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 899 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 900 901 /* we need to open-code set_syndrome_sources to handle the 902 * slot number conversion for 'faila' and 'failb' 903 */ 904 for (i = 0; i < disks ; i++) 905 blocks[i] = NULL; 906 count = 0; 907 i = d0_idx; 908 do { 909 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 910 911 blocks[slot] = sh->dev[i].page; 912 913 if (i == target) 914 faila = slot; 915 if (i == target2) 916 failb = slot; 917 i = raid6_next_disk(i, disks); 918 } while (i != d0_idx); 919 920 BUG_ON(faila == failb); 921 if (failb < faila) 922 swap(faila, failb); 923 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 924 __func__, (unsigned long long)sh->sector, faila, failb); 925 926 atomic_inc(&sh->count); 927 928 if (failb == syndrome_disks+1) { 929 /* Q disk is one of the missing disks */ 930 if (faila == syndrome_disks) { 931 /* Missing P+Q, just recompute */ 932 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 933 ops_complete_compute, sh, 934 to_addr_conv(sh, percpu)); 935 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 936 STRIPE_SIZE, &submit); 937 } else { 938 struct page *dest; 939 int data_target; 940 int qd_idx = sh->qd_idx; 941 942 /* Missing D+Q: recompute D from P, then recompute Q */ 943 if (target == qd_idx) 944 data_target = target2; 945 else 946 data_target = target; 947 948 count = 0; 949 for (i = disks; i-- ; ) { 950 if (i == data_target || i == qd_idx) 951 continue; 952 blocks[count++] = sh->dev[i].page; 953 } 954 dest = sh->dev[data_target].page; 955 init_async_submit(&submit, 956 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 957 NULL, NULL, NULL, 958 to_addr_conv(sh, percpu)); 959 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 960 &submit); 961 962 count = set_syndrome_sources(blocks, sh); 963 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 964 ops_complete_compute, sh, 965 to_addr_conv(sh, percpu)); 966 return async_gen_syndrome(blocks, 0, count+2, 967 STRIPE_SIZE, &submit); 968 } 969 } else { 970 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 971 ops_complete_compute, sh, 972 to_addr_conv(sh, percpu)); 973 if (failb == syndrome_disks) { 974 /* We're missing D+P. */ 975 return async_raid6_datap_recov(syndrome_disks+2, 976 STRIPE_SIZE, faila, 977 blocks, &submit); 978 } else { 979 /* We're missing D+D. */ 980 return async_raid6_2data_recov(syndrome_disks+2, 981 STRIPE_SIZE, faila, failb, 982 blocks, &submit); 983 } 984 } 985 } 986 987 988 static void ops_complete_prexor(void *stripe_head_ref) 989 { 990 struct stripe_head *sh = stripe_head_ref; 991 992 pr_debug("%s: stripe %llu\n", __func__, 993 (unsigned long long)sh->sector); 994 } 995 996 static struct dma_async_tx_descriptor * 997 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 998 struct dma_async_tx_descriptor *tx) 999 { 1000 int disks = sh->disks; 1001 struct page **xor_srcs = percpu->scribble; 1002 int count = 0, pd_idx = sh->pd_idx, i; 1003 struct async_submit_ctl submit; 1004 1005 /* existing parity data subtracted */ 1006 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1007 1008 pr_debug("%s: stripe %llu\n", __func__, 1009 (unsigned long long)sh->sector); 1010 1011 for (i = disks; i--; ) { 1012 struct r5dev *dev = &sh->dev[i]; 1013 /* Only process blocks that are known to be uptodate */ 1014 if (test_bit(R5_Wantdrain, &dev->flags)) 1015 xor_srcs[count++] = dev->page; 1016 } 1017 1018 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1019 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1020 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1021 1022 return tx; 1023 } 1024 1025 static struct dma_async_tx_descriptor * 1026 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1027 { 1028 int disks = sh->disks; 1029 int i; 1030 1031 pr_debug("%s: stripe %llu\n", __func__, 1032 (unsigned long long)sh->sector); 1033 1034 for (i = disks; i--; ) { 1035 struct r5dev *dev = &sh->dev[i]; 1036 struct bio *chosen; 1037 1038 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1039 struct bio *wbi; 1040 1041 spin_lock_irq(&sh->raid_conf->device_lock); 1042 chosen = dev->towrite; 1043 dev->towrite = NULL; 1044 BUG_ON(dev->written); 1045 wbi = dev->written = chosen; 1046 spin_unlock_irq(&sh->raid_conf->device_lock); 1047 1048 while (wbi && wbi->bi_sector < 1049 dev->sector + STRIPE_SECTORS) { 1050 if (wbi->bi_rw & REQ_FUA) 1051 set_bit(R5_WantFUA, &dev->flags); 1052 tx = async_copy_data(1, wbi, dev->page, 1053 dev->sector, tx); 1054 wbi = r5_next_bio(wbi, dev->sector); 1055 } 1056 } 1057 } 1058 1059 return tx; 1060 } 1061 1062 static void ops_complete_reconstruct(void *stripe_head_ref) 1063 { 1064 struct stripe_head *sh = stripe_head_ref; 1065 int disks = sh->disks; 1066 int pd_idx = sh->pd_idx; 1067 int qd_idx = sh->qd_idx; 1068 int i; 1069 bool fua = false; 1070 1071 pr_debug("%s: stripe %llu\n", __func__, 1072 (unsigned long long)sh->sector); 1073 1074 for (i = disks; i--; ) 1075 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1076 1077 for (i = disks; i--; ) { 1078 struct r5dev *dev = &sh->dev[i]; 1079 1080 if (dev->written || i == pd_idx || i == qd_idx) { 1081 set_bit(R5_UPTODATE, &dev->flags); 1082 if (fua) 1083 set_bit(R5_WantFUA, &dev->flags); 1084 } 1085 } 1086 1087 if (sh->reconstruct_state == reconstruct_state_drain_run) 1088 sh->reconstruct_state = reconstruct_state_drain_result; 1089 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1090 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1091 else { 1092 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1093 sh->reconstruct_state = reconstruct_state_result; 1094 } 1095 1096 set_bit(STRIPE_HANDLE, &sh->state); 1097 release_stripe(sh); 1098 } 1099 1100 static void 1101 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1102 struct dma_async_tx_descriptor *tx) 1103 { 1104 int disks = sh->disks; 1105 struct page **xor_srcs = percpu->scribble; 1106 struct async_submit_ctl submit; 1107 int count = 0, pd_idx = sh->pd_idx, i; 1108 struct page *xor_dest; 1109 int prexor = 0; 1110 unsigned long flags; 1111 1112 pr_debug("%s: stripe %llu\n", __func__, 1113 (unsigned long long)sh->sector); 1114 1115 /* check if prexor is active which means only process blocks 1116 * that are part of a read-modify-write (written) 1117 */ 1118 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1119 prexor = 1; 1120 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1121 for (i = disks; i--; ) { 1122 struct r5dev *dev = &sh->dev[i]; 1123 if (dev->written) 1124 xor_srcs[count++] = dev->page; 1125 } 1126 } else { 1127 xor_dest = sh->dev[pd_idx].page; 1128 for (i = disks; i--; ) { 1129 struct r5dev *dev = &sh->dev[i]; 1130 if (i != pd_idx) 1131 xor_srcs[count++] = dev->page; 1132 } 1133 } 1134 1135 /* 1/ if we prexor'd then the dest is reused as a source 1136 * 2/ if we did not prexor then we are redoing the parity 1137 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1138 * for the synchronous xor case 1139 */ 1140 flags = ASYNC_TX_ACK | 1141 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1142 1143 atomic_inc(&sh->count); 1144 1145 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1146 to_addr_conv(sh, percpu)); 1147 if (unlikely(count == 1)) 1148 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1149 else 1150 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1151 } 1152 1153 static void 1154 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1155 struct dma_async_tx_descriptor *tx) 1156 { 1157 struct async_submit_ctl submit; 1158 struct page **blocks = percpu->scribble; 1159 int count; 1160 1161 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1162 1163 count = set_syndrome_sources(blocks, sh); 1164 1165 atomic_inc(&sh->count); 1166 1167 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1168 sh, to_addr_conv(sh, percpu)); 1169 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1170 } 1171 1172 static void ops_complete_check(void *stripe_head_ref) 1173 { 1174 struct stripe_head *sh = stripe_head_ref; 1175 1176 pr_debug("%s: stripe %llu\n", __func__, 1177 (unsigned long long)sh->sector); 1178 1179 sh->check_state = check_state_check_result; 1180 set_bit(STRIPE_HANDLE, &sh->state); 1181 release_stripe(sh); 1182 } 1183 1184 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1185 { 1186 int disks = sh->disks; 1187 int pd_idx = sh->pd_idx; 1188 int qd_idx = sh->qd_idx; 1189 struct page *xor_dest; 1190 struct page **xor_srcs = percpu->scribble; 1191 struct dma_async_tx_descriptor *tx; 1192 struct async_submit_ctl submit; 1193 int count; 1194 int i; 1195 1196 pr_debug("%s: stripe %llu\n", __func__, 1197 (unsigned long long)sh->sector); 1198 1199 count = 0; 1200 xor_dest = sh->dev[pd_idx].page; 1201 xor_srcs[count++] = xor_dest; 1202 for (i = disks; i--; ) { 1203 if (i == pd_idx || i == qd_idx) 1204 continue; 1205 xor_srcs[count++] = sh->dev[i].page; 1206 } 1207 1208 init_async_submit(&submit, 0, NULL, NULL, NULL, 1209 to_addr_conv(sh, percpu)); 1210 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1211 &sh->ops.zero_sum_result, &submit); 1212 1213 atomic_inc(&sh->count); 1214 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1215 tx = async_trigger_callback(&submit); 1216 } 1217 1218 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1219 { 1220 struct page **srcs = percpu->scribble; 1221 struct async_submit_ctl submit; 1222 int count; 1223 1224 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1225 (unsigned long long)sh->sector, checkp); 1226 1227 count = set_syndrome_sources(srcs, sh); 1228 if (!checkp) 1229 srcs[count] = NULL; 1230 1231 atomic_inc(&sh->count); 1232 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1233 sh, to_addr_conv(sh, percpu)); 1234 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1235 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1236 } 1237 1238 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1239 { 1240 int overlap_clear = 0, i, disks = sh->disks; 1241 struct dma_async_tx_descriptor *tx = NULL; 1242 struct r5conf *conf = sh->raid_conf; 1243 int level = conf->level; 1244 struct raid5_percpu *percpu; 1245 unsigned long cpu; 1246 1247 cpu = get_cpu(); 1248 percpu = per_cpu_ptr(conf->percpu, cpu); 1249 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1250 ops_run_biofill(sh); 1251 overlap_clear++; 1252 } 1253 1254 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1255 if (level < 6) 1256 tx = ops_run_compute5(sh, percpu); 1257 else { 1258 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1259 tx = ops_run_compute6_1(sh, percpu); 1260 else 1261 tx = ops_run_compute6_2(sh, percpu); 1262 } 1263 /* terminate the chain if reconstruct is not set to be run */ 1264 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1265 async_tx_ack(tx); 1266 } 1267 1268 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1269 tx = ops_run_prexor(sh, percpu, tx); 1270 1271 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1272 tx = ops_run_biodrain(sh, tx); 1273 overlap_clear++; 1274 } 1275 1276 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1277 if (level < 6) 1278 ops_run_reconstruct5(sh, percpu, tx); 1279 else 1280 ops_run_reconstruct6(sh, percpu, tx); 1281 } 1282 1283 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1284 if (sh->check_state == check_state_run) 1285 ops_run_check_p(sh, percpu); 1286 else if (sh->check_state == check_state_run_q) 1287 ops_run_check_pq(sh, percpu, 0); 1288 else if (sh->check_state == check_state_run_pq) 1289 ops_run_check_pq(sh, percpu, 1); 1290 else 1291 BUG(); 1292 } 1293 1294 if (overlap_clear) 1295 for (i = disks; i--; ) { 1296 struct r5dev *dev = &sh->dev[i]; 1297 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1298 wake_up(&sh->raid_conf->wait_for_overlap); 1299 } 1300 put_cpu(); 1301 } 1302 1303 #ifdef CONFIG_MULTICORE_RAID456 1304 static void async_run_ops(void *param, async_cookie_t cookie) 1305 { 1306 struct stripe_head *sh = param; 1307 unsigned long ops_request = sh->ops.request; 1308 1309 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); 1310 wake_up(&sh->ops.wait_for_ops); 1311 1312 __raid_run_ops(sh, ops_request); 1313 release_stripe(sh); 1314 } 1315 1316 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1317 { 1318 /* since handle_stripe can be called outside of raid5d context 1319 * we need to ensure sh->ops.request is de-staged before another 1320 * request arrives 1321 */ 1322 wait_event(sh->ops.wait_for_ops, 1323 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); 1324 sh->ops.request = ops_request; 1325 1326 atomic_inc(&sh->count); 1327 async_schedule(async_run_ops, sh); 1328 } 1329 #else 1330 #define raid_run_ops __raid_run_ops 1331 #endif 1332 1333 static int grow_one_stripe(struct r5conf *conf) 1334 { 1335 struct stripe_head *sh; 1336 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1337 if (!sh) 1338 return 0; 1339 1340 sh->raid_conf = conf; 1341 #ifdef CONFIG_MULTICORE_RAID456 1342 init_waitqueue_head(&sh->ops.wait_for_ops); 1343 #endif 1344 1345 if (grow_buffers(sh)) { 1346 shrink_buffers(sh); 1347 kmem_cache_free(conf->slab_cache, sh); 1348 return 0; 1349 } 1350 /* we just created an active stripe so... */ 1351 atomic_set(&sh->count, 1); 1352 atomic_inc(&conf->active_stripes); 1353 INIT_LIST_HEAD(&sh->lru); 1354 release_stripe(sh); 1355 return 1; 1356 } 1357 1358 static int grow_stripes(struct r5conf *conf, int num) 1359 { 1360 struct kmem_cache *sc; 1361 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1362 1363 if (conf->mddev->gendisk) 1364 sprintf(conf->cache_name[0], 1365 "raid%d-%s", conf->level, mdname(conf->mddev)); 1366 else 1367 sprintf(conf->cache_name[0], 1368 "raid%d-%p", conf->level, conf->mddev); 1369 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 1370 1371 conf->active_name = 0; 1372 sc = kmem_cache_create(conf->cache_name[conf->active_name], 1373 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 1374 0, 0, NULL); 1375 if (!sc) 1376 return 1; 1377 conf->slab_cache = sc; 1378 conf->pool_size = devs; 1379 while (num--) 1380 if (!grow_one_stripe(conf)) 1381 return 1; 1382 return 0; 1383 } 1384 1385 /** 1386 * scribble_len - return the required size of the scribble region 1387 * @num - total number of disks in the array 1388 * 1389 * The size must be enough to contain: 1390 * 1/ a struct page pointer for each device in the array +2 1391 * 2/ room to convert each entry in (1) to its corresponding dma 1392 * (dma_map_page()) or page (page_address()) address. 1393 * 1394 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 1395 * calculate over all devices (not just the data blocks), using zeros in place 1396 * of the P and Q blocks. 1397 */ 1398 static size_t scribble_len(int num) 1399 { 1400 size_t len; 1401 1402 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 1403 1404 return len; 1405 } 1406 1407 static int resize_stripes(struct r5conf *conf, int newsize) 1408 { 1409 /* Make all the stripes able to hold 'newsize' devices. 1410 * New slots in each stripe get 'page' set to a new page. 1411 * 1412 * This happens in stages: 1413 * 1/ create a new kmem_cache and allocate the required number of 1414 * stripe_heads. 1415 * 2/ gather all the old stripe_heads and tranfer the pages across 1416 * to the new stripe_heads. This will have the side effect of 1417 * freezing the array as once all stripe_heads have been collected, 1418 * no IO will be possible. Old stripe heads are freed once their 1419 * pages have been transferred over, and the old kmem_cache is 1420 * freed when all stripes are done. 1421 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 1422 * we simple return a failre status - no need to clean anything up. 1423 * 4/ allocate new pages for the new slots in the new stripe_heads. 1424 * If this fails, we don't bother trying the shrink the 1425 * stripe_heads down again, we just leave them as they are. 1426 * As each stripe_head is processed the new one is released into 1427 * active service. 1428 * 1429 * Once step2 is started, we cannot afford to wait for a write, 1430 * so we use GFP_NOIO allocations. 1431 */ 1432 struct stripe_head *osh, *nsh; 1433 LIST_HEAD(newstripes); 1434 struct disk_info *ndisks; 1435 unsigned long cpu; 1436 int err; 1437 struct kmem_cache *sc; 1438 int i; 1439 1440 if (newsize <= conf->pool_size) 1441 return 0; /* never bother to shrink */ 1442 1443 err = md_allow_write(conf->mddev); 1444 if (err) 1445 return err; 1446 1447 /* Step 1 */ 1448 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1449 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1450 0, 0, NULL); 1451 if (!sc) 1452 return -ENOMEM; 1453 1454 for (i = conf->max_nr_stripes; i; i--) { 1455 nsh = kmem_cache_zalloc(sc, GFP_KERNEL); 1456 if (!nsh) 1457 break; 1458 1459 nsh->raid_conf = conf; 1460 #ifdef CONFIG_MULTICORE_RAID456 1461 init_waitqueue_head(&nsh->ops.wait_for_ops); 1462 #endif 1463 1464 list_add(&nsh->lru, &newstripes); 1465 } 1466 if (i) { 1467 /* didn't get enough, give up */ 1468 while (!list_empty(&newstripes)) { 1469 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1470 list_del(&nsh->lru); 1471 kmem_cache_free(sc, nsh); 1472 } 1473 kmem_cache_destroy(sc); 1474 return -ENOMEM; 1475 } 1476 /* Step 2 - Must use GFP_NOIO now. 1477 * OK, we have enough stripes, start collecting inactive 1478 * stripes and copying them over 1479 */ 1480 list_for_each_entry(nsh, &newstripes, lru) { 1481 spin_lock_irq(&conf->device_lock); 1482 wait_event_lock_irq(conf->wait_for_stripe, 1483 !list_empty(&conf->inactive_list), 1484 conf->device_lock, 1485 ); 1486 osh = get_free_stripe(conf); 1487 spin_unlock_irq(&conf->device_lock); 1488 atomic_set(&nsh->count, 1); 1489 for(i=0; i<conf->pool_size; i++) 1490 nsh->dev[i].page = osh->dev[i].page; 1491 for( ; i<newsize; i++) 1492 nsh->dev[i].page = NULL; 1493 kmem_cache_free(conf->slab_cache, osh); 1494 } 1495 kmem_cache_destroy(conf->slab_cache); 1496 1497 /* Step 3. 1498 * At this point, we are holding all the stripes so the array 1499 * is completely stalled, so now is a good time to resize 1500 * conf->disks and the scribble region 1501 */ 1502 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1503 if (ndisks) { 1504 for (i=0; i<conf->raid_disks; i++) 1505 ndisks[i] = conf->disks[i]; 1506 kfree(conf->disks); 1507 conf->disks = ndisks; 1508 } else 1509 err = -ENOMEM; 1510 1511 get_online_cpus(); 1512 conf->scribble_len = scribble_len(newsize); 1513 for_each_present_cpu(cpu) { 1514 struct raid5_percpu *percpu; 1515 void *scribble; 1516 1517 percpu = per_cpu_ptr(conf->percpu, cpu); 1518 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1519 1520 if (scribble) { 1521 kfree(percpu->scribble); 1522 percpu->scribble = scribble; 1523 } else { 1524 err = -ENOMEM; 1525 break; 1526 } 1527 } 1528 put_online_cpus(); 1529 1530 /* Step 4, return new stripes to service */ 1531 while(!list_empty(&newstripes)) { 1532 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1533 list_del_init(&nsh->lru); 1534 1535 for (i=conf->raid_disks; i < newsize; i++) 1536 if (nsh->dev[i].page == NULL) { 1537 struct page *p = alloc_page(GFP_NOIO); 1538 nsh->dev[i].page = p; 1539 if (!p) 1540 err = -ENOMEM; 1541 } 1542 release_stripe(nsh); 1543 } 1544 /* critical section pass, GFP_NOIO no longer needed */ 1545 1546 conf->slab_cache = sc; 1547 conf->active_name = 1-conf->active_name; 1548 conf->pool_size = newsize; 1549 return err; 1550 } 1551 1552 static int drop_one_stripe(struct r5conf *conf) 1553 { 1554 struct stripe_head *sh; 1555 1556 spin_lock_irq(&conf->device_lock); 1557 sh = get_free_stripe(conf); 1558 spin_unlock_irq(&conf->device_lock); 1559 if (!sh) 1560 return 0; 1561 BUG_ON(atomic_read(&sh->count)); 1562 shrink_buffers(sh); 1563 kmem_cache_free(conf->slab_cache, sh); 1564 atomic_dec(&conf->active_stripes); 1565 return 1; 1566 } 1567 1568 static void shrink_stripes(struct r5conf *conf) 1569 { 1570 while (drop_one_stripe(conf)) 1571 ; 1572 1573 if (conf->slab_cache) 1574 kmem_cache_destroy(conf->slab_cache); 1575 conf->slab_cache = NULL; 1576 } 1577 1578 static void raid5_end_read_request(struct bio * bi, int error) 1579 { 1580 struct stripe_head *sh = bi->bi_private; 1581 struct r5conf *conf = sh->raid_conf; 1582 int disks = sh->disks, i; 1583 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1584 char b[BDEVNAME_SIZE]; 1585 struct md_rdev *rdev; 1586 1587 1588 for (i=0 ; i<disks; i++) 1589 if (bi == &sh->dev[i].req) 1590 break; 1591 1592 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1593 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1594 uptodate); 1595 if (i == disks) { 1596 BUG(); 1597 return; 1598 } 1599 1600 if (uptodate) { 1601 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1602 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1603 rdev = conf->disks[i].rdev; 1604 printk_ratelimited( 1605 KERN_INFO 1606 "md/raid:%s: read error corrected" 1607 " (%lu sectors at %llu on %s)\n", 1608 mdname(conf->mddev), STRIPE_SECTORS, 1609 (unsigned long long)(sh->sector 1610 + rdev->data_offset), 1611 bdevname(rdev->bdev, b)); 1612 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1613 clear_bit(R5_ReadError, &sh->dev[i].flags); 1614 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1615 } 1616 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1617 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1618 } else { 1619 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1620 int retry = 0; 1621 rdev = conf->disks[i].rdev; 1622 1623 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1624 atomic_inc(&rdev->read_errors); 1625 if (conf->mddev->degraded >= conf->max_degraded) 1626 printk_ratelimited( 1627 KERN_WARNING 1628 "md/raid:%s: read error not correctable " 1629 "(sector %llu on %s).\n", 1630 mdname(conf->mddev), 1631 (unsigned long long)(sh->sector 1632 + rdev->data_offset), 1633 bdn); 1634 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1635 /* Oh, no!!! */ 1636 printk_ratelimited( 1637 KERN_WARNING 1638 "md/raid:%s: read error NOT corrected!! " 1639 "(sector %llu on %s).\n", 1640 mdname(conf->mddev), 1641 (unsigned long long)(sh->sector 1642 + rdev->data_offset), 1643 bdn); 1644 else if (atomic_read(&rdev->read_errors) 1645 > conf->max_nr_stripes) 1646 printk(KERN_WARNING 1647 "md/raid:%s: Too many read errors, failing device %s.\n", 1648 mdname(conf->mddev), bdn); 1649 else 1650 retry = 1; 1651 if (retry) 1652 set_bit(R5_ReadError, &sh->dev[i].flags); 1653 else { 1654 clear_bit(R5_ReadError, &sh->dev[i].flags); 1655 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1656 md_error(conf->mddev, rdev); 1657 } 1658 } 1659 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1660 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1661 set_bit(STRIPE_HANDLE, &sh->state); 1662 release_stripe(sh); 1663 } 1664 1665 static void raid5_end_write_request(struct bio *bi, int error) 1666 { 1667 struct stripe_head *sh = bi->bi_private; 1668 struct r5conf *conf = sh->raid_conf; 1669 int disks = sh->disks, i; 1670 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1671 sector_t first_bad; 1672 int bad_sectors; 1673 1674 for (i=0 ; i<disks; i++) 1675 if (bi == &sh->dev[i].req) 1676 break; 1677 1678 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1679 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1680 uptodate); 1681 if (i == disks) { 1682 BUG(); 1683 return; 1684 } 1685 1686 if (!uptodate) { 1687 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); 1688 set_bit(R5_WriteError, &sh->dev[i].flags); 1689 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, 1690 &first_bad, &bad_sectors)) 1691 set_bit(R5_MadeGood, &sh->dev[i].flags); 1692 1693 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1694 1695 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1696 set_bit(STRIPE_HANDLE, &sh->state); 1697 release_stripe(sh); 1698 } 1699 1700 1701 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1702 1703 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1704 { 1705 struct r5dev *dev = &sh->dev[i]; 1706 1707 bio_init(&dev->req); 1708 dev->req.bi_io_vec = &dev->vec; 1709 dev->req.bi_vcnt++; 1710 dev->req.bi_max_vecs++; 1711 dev->vec.bv_page = dev->page; 1712 dev->vec.bv_len = STRIPE_SIZE; 1713 dev->vec.bv_offset = 0; 1714 1715 dev->req.bi_sector = sh->sector; 1716 dev->req.bi_private = sh; 1717 1718 dev->flags = 0; 1719 dev->sector = compute_blocknr(sh, i, previous); 1720 } 1721 1722 static void error(struct mddev *mddev, struct md_rdev *rdev) 1723 { 1724 char b[BDEVNAME_SIZE]; 1725 struct r5conf *conf = mddev->private; 1726 pr_debug("raid456: error called\n"); 1727 1728 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1729 unsigned long flags; 1730 spin_lock_irqsave(&conf->device_lock, flags); 1731 mddev->degraded++; 1732 spin_unlock_irqrestore(&conf->device_lock, flags); 1733 /* 1734 * if recovery was running, make sure it aborts. 1735 */ 1736 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1737 } 1738 set_bit(Blocked, &rdev->flags); 1739 set_bit(Faulty, &rdev->flags); 1740 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1741 printk(KERN_ALERT 1742 "md/raid:%s: Disk failure on %s, disabling device.\n" 1743 "md/raid:%s: Operation continuing on %d devices.\n", 1744 mdname(mddev), 1745 bdevname(rdev->bdev, b), 1746 mdname(mddev), 1747 conf->raid_disks - mddev->degraded); 1748 } 1749 1750 /* 1751 * Input: a 'big' sector number, 1752 * Output: index of the data and parity disk, and the sector # in them. 1753 */ 1754 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 1755 int previous, int *dd_idx, 1756 struct stripe_head *sh) 1757 { 1758 sector_t stripe, stripe2; 1759 sector_t chunk_number; 1760 unsigned int chunk_offset; 1761 int pd_idx, qd_idx; 1762 int ddf_layout = 0; 1763 sector_t new_sector; 1764 int algorithm = previous ? conf->prev_algo 1765 : conf->algorithm; 1766 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1767 : conf->chunk_sectors; 1768 int raid_disks = previous ? conf->previous_raid_disks 1769 : conf->raid_disks; 1770 int data_disks = raid_disks - conf->max_degraded; 1771 1772 /* First compute the information on this sector */ 1773 1774 /* 1775 * Compute the chunk number and the sector offset inside the chunk 1776 */ 1777 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1778 chunk_number = r_sector; 1779 1780 /* 1781 * Compute the stripe number 1782 */ 1783 stripe = chunk_number; 1784 *dd_idx = sector_div(stripe, data_disks); 1785 stripe2 = stripe; 1786 /* 1787 * Select the parity disk based on the user selected algorithm. 1788 */ 1789 pd_idx = qd_idx = -1; 1790 switch(conf->level) { 1791 case 4: 1792 pd_idx = data_disks; 1793 break; 1794 case 5: 1795 switch (algorithm) { 1796 case ALGORITHM_LEFT_ASYMMETRIC: 1797 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1798 if (*dd_idx >= pd_idx) 1799 (*dd_idx)++; 1800 break; 1801 case ALGORITHM_RIGHT_ASYMMETRIC: 1802 pd_idx = sector_div(stripe2, raid_disks); 1803 if (*dd_idx >= pd_idx) 1804 (*dd_idx)++; 1805 break; 1806 case ALGORITHM_LEFT_SYMMETRIC: 1807 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1808 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1809 break; 1810 case ALGORITHM_RIGHT_SYMMETRIC: 1811 pd_idx = sector_div(stripe2, raid_disks); 1812 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1813 break; 1814 case ALGORITHM_PARITY_0: 1815 pd_idx = 0; 1816 (*dd_idx)++; 1817 break; 1818 case ALGORITHM_PARITY_N: 1819 pd_idx = data_disks; 1820 break; 1821 default: 1822 BUG(); 1823 } 1824 break; 1825 case 6: 1826 1827 switch (algorithm) { 1828 case ALGORITHM_LEFT_ASYMMETRIC: 1829 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1830 qd_idx = pd_idx + 1; 1831 if (pd_idx == raid_disks-1) { 1832 (*dd_idx)++; /* Q D D D P */ 1833 qd_idx = 0; 1834 } else if (*dd_idx >= pd_idx) 1835 (*dd_idx) += 2; /* D D P Q D */ 1836 break; 1837 case ALGORITHM_RIGHT_ASYMMETRIC: 1838 pd_idx = sector_div(stripe2, raid_disks); 1839 qd_idx = pd_idx + 1; 1840 if (pd_idx == raid_disks-1) { 1841 (*dd_idx)++; /* Q D D D P */ 1842 qd_idx = 0; 1843 } else if (*dd_idx >= pd_idx) 1844 (*dd_idx) += 2; /* D D P Q D */ 1845 break; 1846 case ALGORITHM_LEFT_SYMMETRIC: 1847 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1848 qd_idx = (pd_idx + 1) % raid_disks; 1849 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1850 break; 1851 case ALGORITHM_RIGHT_SYMMETRIC: 1852 pd_idx = sector_div(stripe2, raid_disks); 1853 qd_idx = (pd_idx + 1) % raid_disks; 1854 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1855 break; 1856 1857 case ALGORITHM_PARITY_0: 1858 pd_idx = 0; 1859 qd_idx = 1; 1860 (*dd_idx) += 2; 1861 break; 1862 case ALGORITHM_PARITY_N: 1863 pd_idx = data_disks; 1864 qd_idx = data_disks + 1; 1865 break; 1866 1867 case ALGORITHM_ROTATING_ZERO_RESTART: 1868 /* Exactly the same as RIGHT_ASYMMETRIC, but or 1869 * of blocks for computing Q is different. 1870 */ 1871 pd_idx = sector_div(stripe2, raid_disks); 1872 qd_idx = pd_idx + 1; 1873 if (pd_idx == raid_disks-1) { 1874 (*dd_idx)++; /* Q D D D P */ 1875 qd_idx = 0; 1876 } else if (*dd_idx >= pd_idx) 1877 (*dd_idx) += 2; /* D D P Q D */ 1878 ddf_layout = 1; 1879 break; 1880 1881 case ALGORITHM_ROTATING_N_RESTART: 1882 /* Same a left_asymmetric, by first stripe is 1883 * D D D P Q rather than 1884 * Q D D D P 1885 */ 1886 stripe2 += 1; 1887 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1888 qd_idx = pd_idx + 1; 1889 if (pd_idx == raid_disks-1) { 1890 (*dd_idx)++; /* Q D D D P */ 1891 qd_idx = 0; 1892 } else if (*dd_idx >= pd_idx) 1893 (*dd_idx) += 2; /* D D P Q D */ 1894 ddf_layout = 1; 1895 break; 1896 1897 case ALGORITHM_ROTATING_N_CONTINUE: 1898 /* Same as left_symmetric but Q is before P */ 1899 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1900 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 1901 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1902 ddf_layout = 1; 1903 break; 1904 1905 case ALGORITHM_LEFT_ASYMMETRIC_6: 1906 /* RAID5 left_asymmetric, with Q on last device */ 1907 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 1908 if (*dd_idx >= pd_idx) 1909 (*dd_idx)++; 1910 qd_idx = raid_disks - 1; 1911 break; 1912 1913 case ALGORITHM_RIGHT_ASYMMETRIC_6: 1914 pd_idx = sector_div(stripe2, raid_disks-1); 1915 if (*dd_idx >= pd_idx) 1916 (*dd_idx)++; 1917 qd_idx = raid_disks - 1; 1918 break; 1919 1920 case ALGORITHM_LEFT_SYMMETRIC_6: 1921 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 1922 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1923 qd_idx = raid_disks - 1; 1924 break; 1925 1926 case ALGORITHM_RIGHT_SYMMETRIC_6: 1927 pd_idx = sector_div(stripe2, raid_disks-1); 1928 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1929 qd_idx = raid_disks - 1; 1930 break; 1931 1932 case ALGORITHM_PARITY_0_6: 1933 pd_idx = 0; 1934 (*dd_idx)++; 1935 qd_idx = raid_disks - 1; 1936 break; 1937 1938 default: 1939 BUG(); 1940 } 1941 break; 1942 } 1943 1944 if (sh) { 1945 sh->pd_idx = pd_idx; 1946 sh->qd_idx = qd_idx; 1947 sh->ddf_layout = ddf_layout; 1948 } 1949 /* 1950 * Finally, compute the new sector number 1951 */ 1952 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 1953 return new_sector; 1954 } 1955 1956 1957 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 1958 { 1959 struct r5conf *conf = sh->raid_conf; 1960 int raid_disks = sh->disks; 1961 int data_disks = raid_disks - conf->max_degraded; 1962 sector_t new_sector = sh->sector, check; 1963 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1964 : conf->chunk_sectors; 1965 int algorithm = previous ? conf->prev_algo 1966 : conf->algorithm; 1967 sector_t stripe; 1968 int chunk_offset; 1969 sector_t chunk_number; 1970 int dummy1, dd_idx = i; 1971 sector_t r_sector; 1972 struct stripe_head sh2; 1973 1974 1975 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1976 stripe = new_sector; 1977 1978 if (i == sh->pd_idx) 1979 return 0; 1980 switch(conf->level) { 1981 case 4: break; 1982 case 5: 1983 switch (algorithm) { 1984 case ALGORITHM_LEFT_ASYMMETRIC: 1985 case ALGORITHM_RIGHT_ASYMMETRIC: 1986 if (i > sh->pd_idx) 1987 i--; 1988 break; 1989 case ALGORITHM_LEFT_SYMMETRIC: 1990 case ALGORITHM_RIGHT_SYMMETRIC: 1991 if (i < sh->pd_idx) 1992 i += raid_disks; 1993 i -= (sh->pd_idx + 1); 1994 break; 1995 case ALGORITHM_PARITY_0: 1996 i -= 1; 1997 break; 1998 case ALGORITHM_PARITY_N: 1999 break; 2000 default: 2001 BUG(); 2002 } 2003 break; 2004 case 6: 2005 if (i == sh->qd_idx) 2006 return 0; /* It is the Q disk */ 2007 switch (algorithm) { 2008 case ALGORITHM_LEFT_ASYMMETRIC: 2009 case ALGORITHM_RIGHT_ASYMMETRIC: 2010 case ALGORITHM_ROTATING_ZERO_RESTART: 2011 case ALGORITHM_ROTATING_N_RESTART: 2012 if (sh->pd_idx == raid_disks-1) 2013 i--; /* Q D D D P */ 2014 else if (i > sh->pd_idx) 2015 i -= 2; /* D D P Q D */ 2016 break; 2017 case ALGORITHM_LEFT_SYMMETRIC: 2018 case ALGORITHM_RIGHT_SYMMETRIC: 2019 if (sh->pd_idx == raid_disks-1) 2020 i--; /* Q D D D P */ 2021 else { 2022 /* D D P Q D */ 2023 if (i < sh->pd_idx) 2024 i += raid_disks; 2025 i -= (sh->pd_idx + 2); 2026 } 2027 break; 2028 case ALGORITHM_PARITY_0: 2029 i -= 2; 2030 break; 2031 case ALGORITHM_PARITY_N: 2032 break; 2033 case ALGORITHM_ROTATING_N_CONTINUE: 2034 /* Like left_symmetric, but P is before Q */ 2035 if (sh->pd_idx == 0) 2036 i--; /* P D D D Q */ 2037 else { 2038 /* D D Q P D */ 2039 if (i < sh->pd_idx) 2040 i += raid_disks; 2041 i -= (sh->pd_idx + 1); 2042 } 2043 break; 2044 case ALGORITHM_LEFT_ASYMMETRIC_6: 2045 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2046 if (i > sh->pd_idx) 2047 i--; 2048 break; 2049 case ALGORITHM_LEFT_SYMMETRIC_6: 2050 case ALGORITHM_RIGHT_SYMMETRIC_6: 2051 if (i < sh->pd_idx) 2052 i += data_disks + 1; 2053 i -= (sh->pd_idx + 1); 2054 break; 2055 case ALGORITHM_PARITY_0_6: 2056 i -= 1; 2057 break; 2058 default: 2059 BUG(); 2060 } 2061 break; 2062 } 2063 2064 chunk_number = stripe * data_disks + i; 2065 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2066 2067 check = raid5_compute_sector(conf, r_sector, 2068 previous, &dummy1, &sh2); 2069 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2070 || sh2.qd_idx != sh->qd_idx) { 2071 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2072 mdname(conf->mddev)); 2073 return 0; 2074 } 2075 return r_sector; 2076 } 2077 2078 2079 static void 2080 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2081 int rcw, int expand) 2082 { 2083 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2084 struct r5conf *conf = sh->raid_conf; 2085 int level = conf->level; 2086 2087 if (rcw) { 2088 /* if we are not expanding this is a proper write request, and 2089 * there will be bios with new data to be drained into the 2090 * stripe cache 2091 */ 2092 if (!expand) { 2093 sh->reconstruct_state = reconstruct_state_drain_run; 2094 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2095 } else 2096 sh->reconstruct_state = reconstruct_state_run; 2097 2098 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2099 2100 for (i = disks; i--; ) { 2101 struct r5dev *dev = &sh->dev[i]; 2102 2103 if (dev->towrite) { 2104 set_bit(R5_LOCKED, &dev->flags); 2105 set_bit(R5_Wantdrain, &dev->flags); 2106 if (!expand) 2107 clear_bit(R5_UPTODATE, &dev->flags); 2108 s->locked++; 2109 } 2110 } 2111 if (s->locked + conf->max_degraded == disks) 2112 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2113 atomic_inc(&conf->pending_full_writes); 2114 } else { 2115 BUG_ON(level == 6); 2116 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2117 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2118 2119 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2120 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2121 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2122 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2123 2124 for (i = disks; i--; ) { 2125 struct r5dev *dev = &sh->dev[i]; 2126 if (i == pd_idx) 2127 continue; 2128 2129 if (dev->towrite && 2130 (test_bit(R5_UPTODATE, &dev->flags) || 2131 test_bit(R5_Wantcompute, &dev->flags))) { 2132 set_bit(R5_Wantdrain, &dev->flags); 2133 set_bit(R5_LOCKED, &dev->flags); 2134 clear_bit(R5_UPTODATE, &dev->flags); 2135 s->locked++; 2136 } 2137 } 2138 } 2139 2140 /* keep the parity disk(s) locked while asynchronous operations 2141 * are in flight 2142 */ 2143 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2144 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2145 s->locked++; 2146 2147 if (level == 6) { 2148 int qd_idx = sh->qd_idx; 2149 struct r5dev *dev = &sh->dev[qd_idx]; 2150 2151 set_bit(R5_LOCKED, &dev->flags); 2152 clear_bit(R5_UPTODATE, &dev->flags); 2153 s->locked++; 2154 } 2155 2156 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2157 __func__, (unsigned long long)sh->sector, 2158 s->locked, s->ops_request); 2159 } 2160 2161 /* 2162 * Each stripe/dev can have one or more bion attached. 2163 * toread/towrite point to the first in a chain. 2164 * The bi_next chain must be in order. 2165 */ 2166 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2167 { 2168 struct bio **bip; 2169 struct r5conf *conf = sh->raid_conf; 2170 int firstwrite=0; 2171 2172 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2173 (unsigned long long)bi->bi_sector, 2174 (unsigned long long)sh->sector); 2175 2176 2177 spin_lock_irq(&conf->device_lock); 2178 if (forwrite) { 2179 bip = &sh->dev[dd_idx].towrite; 2180 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2181 firstwrite = 1; 2182 } else 2183 bip = &sh->dev[dd_idx].toread; 2184 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2185 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2186 goto overlap; 2187 bip = & (*bip)->bi_next; 2188 } 2189 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2190 goto overlap; 2191 2192 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2193 if (*bip) 2194 bi->bi_next = *bip; 2195 *bip = bi; 2196 bi->bi_phys_segments++; 2197 2198 if (forwrite) { 2199 /* check if page is covered */ 2200 sector_t sector = sh->dev[dd_idx].sector; 2201 for (bi=sh->dev[dd_idx].towrite; 2202 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2203 bi && bi->bi_sector <= sector; 2204 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2205 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2206 sector = bi->bi_sector + (bi->bi_size>>9); 2207 } 2208 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2209 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2210 } 2211 spin_unlock_irq(&conf->device_lock); 2212 2213 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2214 (unsigned long long)(*bip)->bi_sector, 2215 (unsigned long long)sh->sector, dd_idx); 2216 2217 if (conf->mddev->bitmap && firstwrite) { 2218 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2219 STRIPE_SECTORS, 0); 2220 sh->bm_seq = conf->seq_flush+1; 2221 set_bit(STRIPE_BIT_DELAY, &sh->state); 2222 } 2223 return 1; 2224 2225 overlap: 2226 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2227 spin_unlock_irq(&conf->device_lock); 2228 return 0; 2229 } 2230 2231 static void end_reshape(struct r5conf *conf); 2232 2233 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2234 struct stripe_head *sh) 2235 { 2236 int sectors_per_chunk = 2237 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2238 int dd_idx; 2239 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2240 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 2241 2242 raid5_compute_sector(conf, 2243 stripe * (disks - conf->max_degraded) 2244 *sectors_per_chunk + chunk_offset, 2245 previous, 2246 &dd_idx, sh); 2247 } 2248 2249 static void 2250 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2251 struct stripe_head_state *s, int disks, 2252 struct bio **return_bi) 2253 { 2254 int i; 2255 for (i = disks; i--; ) { 2256 struct bio *bi; 2257 int bitmap_end = 0; 2258 2259 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2260 struct md_rdev *rdev; 2261 rcu_read_lock(); 2262 rdev = rcu_dereference(conf->disks[i].rdev); 2263 if (rdev && test_bit(In_sync, &rdev->flags)) 2264 atomic_inc(&rdev->nr_pending); 2265 else 2266 rdev = NULL; 2267 rcu_read_unlock(); 2268 if (rdev) { 2269 if (!rdev_set_badblocks( 2270 rdev, 2271 sh->sector, 2272 STRIPE_SECTORS, 0)) 2273 md_error(conf->mddev, rdev); 2274 rdev_dec_pending(rdev, conf->mddev); 2275 } 2276 } 2277 spin_lock_irq(&conf->device_lock); 2278 /* fail all writes first */ 2279 bi = sh->dev[i].towrite; 2280 sh->dev[i].towrite = NULL; 2281 if (bi) { 2282 s->to_write--; 2283 bitmap_end = 1; 2284 } 2285 2286 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2287 wake_up(&conf->wait_for_overlap); 2288 2289 while (bi && bi->bi_sector < 2290 sh->dev[i].sector + STRIPE_SECTORS) { 2291 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2292 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2293 if (!raid5_dec_bi_phys_segments(bi)) { 2294 md_write_end(conf->mddev); 2295 bi->bi_next = *return_bi; 2296 *return_bi = bi; 2297 } 2298 bi = nextbi; 2299 } 2300 /* and fail all 'written' */ 2301 bi = sh->dev[i].written; 2302 sh->dev[i].written = NULL; 2303 if (bi) bitmap_end = 1; 2304 while (bi && bi->bi_sector < 2305 sh->dev[i].sector + STRIPE_SECTORS) { 2306 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2307 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2308 if (!raid5_dec_bi_phys_segments(bi)) { 2309 md_write_end(conf->mddev); 2310 bi->bi_next = *return_bi; 2311 *return_bi = bi; 2312 } 2313 bi = bi2; 2314 } 2315 2316 /* fail any reads if this device is non-operational and 2317 * the data has not reached the cache yet. 2318 */ 2319 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2320 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2321 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2322 bi = sh->dev[i].toread; 2323 sh->dev[i].toread = NULL; 2324 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2325 wake_up(&conf->wait_for_overlap); 2326 if (bi) s->to_read--; 2327 while (bi && bi->bi_sector < 2328 sh->dev[i].sector + STRIPE_SECTORS) { 2329 struct bio *nextbi = 2330 r5_next_bio(bi, sh->dev[i].sector); 2331 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2332 if (!raid5_dec_bi_phys_segments(bi)) { 2333 bi->bi_next = *return_bi; 2334 *return_bi = bi; 2335 } 2336 bi = nextbi; 2337 } 2338 } 2339 spin_unlock_irq(&conf->device_lock); 2340 if (bitmap_end) 2341 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2342 STRIPE_SECTORS, 0, 0); 2343 /* If we were in the middle of a write the parity block might 2344 * still be locked - so just clear all R5_LOCKED flags 2345 */ 2346 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2347 } 2348 2349 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2350 if (atomic_dec_and_test(&conf->pending_full_writes)) 2351 md_wakeup_thread(conf->mddev->thread); 2352 } 2353 2354 static void 2355 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2356 struct stripe_head_state *s) 2357 { 2358 int abort = 0; 2359 int i; 2360 2361 md_done_sync(conf->mddev, STRIPE_SECTORS, 0); 2362 clear_bit(STRIPE_SYNCING, &sh->state); 2363 s->syncing = 0; 2364 /* There is nothing more to do for sync/check/repair. 2365 * For recover we need to record a bad block on all 2366 * non-sync devices, or abort the recovery 2367 */ 2368 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) 2369 return; 2370 /* During recovery devices cannot be removed, so locking and 2371 * refcounting of rdevs is not needed 2372 */ 2373 for (i = 0; i < conf->raid_disks; i++) { 2374 struct md_rdev *rdev = conf->disks[i].rdev; 2375 if (!rdev 2376 || test_bit(Faulty, &rdev->flags) 2377 || test_bit(In_sync, &rdev->flags)) 2378 continue; 2379 if (!rdev_set_badblocks(rdev, sh->sector, 2380 STRIPE_SECTORS, 0)) 2381 abort = 1; 2382 } 2383 if (abort) { 2384 conf->recovery_disabled = conf->mddev->recovery_disabled; 2385 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); 2386 } 2387 } 2388 2389 /* fetch_block - checks the given member device to see if its data needs 2390 * to be read or computed to satisfy a request. 2391 * 2392 * Returns 1 when no more member devices need to be checked, otherwise returns 2393 * 0 to tell the loop in handle_stripe_fill to continue 2394 */ 2395 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2396 int disk_idx, int disks) 2397 { 2398 struct r5dev *dev = &sh->dev[disk_idx]; 2399 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2400 &sh->dev[s->failed_num[1]] }; 2401 2402 /* is the data in this block needed, and can we get it? */ 2403 if (!test_bit(R5_LOCKED, &dev->flags) && 2404 !test_bit(R5_UPTODATE, &dev->flags) && 2405 (dev->toread || 2406 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2407 s->syncing || s->expanding || 2408 (s->failed >= 1 && fdev[0]->toread) || 2409 (s->failed >= 2 && fdev[1]->toread) || 2410 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2411 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2412 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2413 /* we would like to get this block, possibly by computing it, 2414 * otherwise read it if the backing disk is insync 2415 */ 2416 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2417 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2418 if ((s->uptodate == disks - 1) && 2419 (s->failed && (disk_idx == s->failed_num[0] || 2420 disk_idx == s->failed_num[1]))) { 2421 /* have disk failed, and we're requested to fetch it; 2422 * do compute it 2423 */ 2424 pr_debug("Computing stripe %llu block %d\n", 2425 (unsigned long long)sh->sector, disk_idx); 2426 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2427 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2428 set_bit(R5_Wantcompute, &dev->flags); 2429 sh->ops.target = disk_idx; 2430 sh->ops.target2 = -1; /* no 2nd target */ 2431 s->req_compute = 1; 2432 /* Careful: from this point on 'uptodate' is in the eye 2433 * of raid_run_ops which services 'compute' operations 2434 * before writes. R5_Wantcompute flags a block that will 2435 * be R5_UPTODATE by the time it is needed for a 2436 * subsequent operation. 2437 */ 2438 s->uptodate++; 2439 return 1; 2440 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2441 /* Computing 2-failure is *very* expensive; only 2442 * do it if failed >= 2 2443 */ 2444 int other; 2445 for (other = disks; other--; ) { 2446 if (other == disk_idx) 2447 continue; 2448 if (!test_bit(R5_UPTODATE, 2449 &sh->dev[other].flags)) 2450 break; 2451 } 2452 BUG_ON(other < 0); 2453 pr_debug("Computing stripe %llu blocks %d,%d\n", 2454 (unsigned long long)sh->sector, 2455 disk_idx, other); 2456 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2457 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2458 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2459 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2460 sh->ops.target = disk_idx; 2461 sh->ops.target2 = other; 2462 s->uptodate += 2; 2463 s->req_compute = 1; 2464 return 1; 2465 } else if (test_bit(R5_Insync, &dev->flags)) { 2466 set_bit(R5_LOCKED, &dev->flags); 2467 set_bit(R5_Wantread, &dev->flags); 2468 s->locked++; 2469 pr_debug("Reading block %d (sync=%d)\n", 2470 disk_idx, s->syncing); 2471 } 2472 } 2473 2474 return 0; 2475 } 2476 2477 /** 2478 * handle_stripe_fill - read or compute data to satisfy pending requests. 2479 */ 2480 static void handle_stripe_fill(struct stripe_head *sh, 2481 struct stripe_head_state *s, 2482 int disks) 2483 { 2484 int i; 2485 2486 /* look for blocks to read/compute, skip this if a compute 2487 * is already in flight, or if the stripe contents are in the 2488 * midst of changing due to a write 2489 */ 2490 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2491 !sh->reconstruct_state) 2492 for (i = disks; i--; ) 2493 if (fetch_block(sh, s, i, disks)) 2494 break; 2495 set_bit(STRIPE_HANDLE, &sh->state); 2496 } 2497 2498 2499 /* handle_stripe_clean_event 2500 * any written block on an uptodate or failed drive can be returned. 2501 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2502 * never LOCKED, so we don't need to test 'failed' directly. 2503 */ 2504 static void handle_stripe_clean_event(struct r5conf *conf, 2505 struct stripe_head *sh, int disks, struct bio **return_bi) 2506 { 2507 int i; 2508 struct r5dev *dev; 2509 2510 for (i = disks; i--; ) 2511 if (sh->dev[i].written) { 2512 dev = &sh->dev[i]; 2513 if (!test_bit(R5_LOCKED, &dev->flags) && 2514 test_bit(R5_UPTODATE, &dev->flags)) { 2515 /* We can return any write requests */ 2516 struct bio *wbi, *wbi2; 2517 int bitmap_end = 0; 2518 pr_debug("Return write for disc %d\n", i); 2519 spin_lock_irq(&conf->device_lock); 2520 wbi = dev->written; 2521 dev->written = NULL; 2522 while (wbi && wbi->bi_sector < 2523 dev->sector + STRIPE_SECTORS) { 2524 wbi2 = r5_next_bio(wbi, dev->sector); 2525 if (!raid5_dec_bi_phys_segments(wbi)) { 2526 md_write_end(conf->mddev); 2527 wbi->bi_next = *return_bi; 2528 *return_bi = wbi; 2529 } 2530 wbi = wbi2; 2531 } 2532 if (dev->towrite == NULL) 2533 bitmap_end = 1; 2534 spin_unlock_irq(&conf->device_lock); 2535 if (bitmap_end) 2536 bitmap_endwrite(conf->mddev->bitmap, 2537 sh->sector, 2538 STRIPE_SECTORS, 2539 !test_bit(STRIPE_DEGRADED, &sh->state), 2540 0); 2541 } 2542 } 2543 2544 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2545 if (atomic_dec_and_test(&conf->pending_full_writes)) 2546 md_wakeup_thread(conf->mddev->thread); 2547 } 2548 2549 static void handle_stripe_dirtying(struct r5conf *conf, 2550 struct stripe_head *sh, 2551 struct stripe_head_state *s, 2552 int disks) 2553 { 2554 int rmw = 0, rcw = 0, i; 2555 if (conf->max_degraded == 2) { 2556 /* RAID6 requires 'rcw' in current implementation 2557 * Calculate the real rcw later - for now fake it 2558 * look like rcw is cheaper 2559 */ 2560 rcw = 1; rmw = 2; 2561 } else for (i = disks; i--; ) { 2562 /* would I have to read this buffer for read_modify_write */ 2563 struct r5dev *dev = &sh->dev[i]; 2564 if ((dev->towrite || i == sh->pd_idx) && 2565 !test_bit(R5_LOCKED, &dev->flags) && 2566 !(test_bit(R5_UPTODATE, &dev->flags) || 2567 test_bit(R5_Wantcompute, &dev->flags))) { 2568 if (test_bit(R5_Insync, &dev->flags)) 2569 rmw++; 2570 else 2571 rmw += 2*disks; /* cannot read it */ 2572 } 2573 /* Would I have to read this buffer for reconstruct_write */ 2574 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2575 !test_bit(R5_LOCKED, &dev->flags) && 2576 !(test_bit(R5_UPTODATE, &dev->flags) || 2577 test_bit(R5_Wantcompute, &dev->flags))) { 2578 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2579 else 2580 rcw += 2*disks; 2581 } 2582 } 2583 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2584 (unsigned long long)sh->sector, rmw, rcw); 2585 set_bit(STRIPE_HANDLE, &sh->state); 2586 if (rmw < rcw && rmw > 0) 2587 /* prefer read-modify-write, but need to get some data */ 2588 for (i = disks; i--; ) { 2589 struct r5dev *dev = &sh->dev[i]; 2590 if ((dev->towrite || i == sh->pd_idx) && 2591 !test_bit(R5_LOCKED, &dev->flags) && 2592 !(test_bit(R5_UPTODATE, &dev->flags) || 2593 test_bit(R5_Wantcompute, &dev->flags)) && 2594 test_bit(R5_Insync, &dev->flags)) { 2595 if ( 2596 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2597 pr_debug("Read_old block " 2598 "%d for r-m-w\n", i); 2599 set_bit(R5_LOCKED, &dev->flags); 2600 set_bit(R5_Wantread, &dev->flags); 2601 s->locked++; 2602 } else { 2603 set_bit(STRIPE_DELAYED, &sh->state); 2604 set_bit(STRIPE_HANDLE, &sh->state); 2605 } 2606 } 2607 } 2608 if (rcw <= rmw && rcw > 0) { 2609 /* want reconstruct write, but need to get some data */ 2610 rcw = 0; 2611 for (i = disks; i--; ) { 2612 struct r5dev *dev = &sh->dev[i]; 2613 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2614 i != sh->pd_idx && i != sh->qd_idx && 2615 !test_bit(R5_LOCKED, &dev->flags) && 2616 !(test_bit(R5_UPTODATE, &dev->flags) || 2617 test_bit(R5_Wantcompute, &dev->flags))) { 2618 rcw++; 2619 if (!test_bit(R5_Insync, &dev->flags)) 2620 continue; /* it's a failed drive */ 2621 if ( 2622 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2623 pr_debug("Read_old block " 2624 "%d for Reconstruct\n", i); 2625 set_bit(R5_LOCKED, &dev->flags); 2626 set_bit(R5_Wantread, &dev->flags); 2627 s->locked++; 2628 } else { 2629 set_bit(STRIPE_DELAYED, &sh->state); 2630 set_bit(STRIPE_HANDLE, &sh->state); 2631 } 2632 } 2633 } 2634 } 2635 /* now if nothing is locked, and if we have enough data, 2636 * we can start a write request 2637 */ 2638 /* since handle_stripe can be called at any time we need to handle the 2639 * case where a compute block operation has been submitted and then a 2640 * subsequent call wants to start a write request. raid_run_ops only 2641 * handles the case where compute block and reconstruct are requested 2642 * simultaneously. If this is not the case then new writes need to be 2643 * held off until the compute completes. 2644 */ 2645 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2646 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2647 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2648 schedule_reconstruction(sh, s, rcw == 0, 0); 2649 } 2650 2651 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2652 struct stripe_head_state *s, int disks) 2653 { 2654 struct r5dev *dev = NULL; 2655 2656 set_bit(STRIPE_HANDLE, &sh->state); 2657 2658 switch (sh->check_state) { 2659 case check_state_idle: 2660 /* start a new check operation if there are no failures */ 2661 if (s->failed == 0) { 2662 BUG_ON(s->uptodate != disks); 2663 sh->check_state = check_state_run; 2664 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2665 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2666 s->uptodate--; 2667 break; 2668 } 2669 dev = &sh->dev[s->failed_num[0]]; 2670 /* fall through */ 2671 case check_state_compute_result: 2672 sh->check_state = check_state_idle; 2673 if (!dev) 2674 dev = &sh->dev[sh->pd_idx]; 2675 2676 /* check that a write has not made the stripe insync */ 2677 if (test_bit(STRIPE_INSYNC, &sh->state)) 2678 break; 2679 2680 /* either failed parity check, or recovery is happening */ 2681 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2682 BUG_ON(s->uptodate != disks); 2683 2684 set_bit(R5_LOCKED, &dev->flags); 2685 s->locked++; 2686 set_bit(R5_Wantwrite, &dev->flags); 2687 2688 clear_bit(STRIPE_DEGRADED, &sh->state); 2689 set_bit(STRIPE_INSYNC, &sh->state); 2690 break; 2691 case check_state_run: 2692 break; /* we will be called again upon completion */ 2693 case check_state_check_result: 2694 sh->check_state = check_state_idle; 2695 2696 /* if a failure occurred during the check operation, leave 2697 * STRIPE_INSYNC not set and let the stripe be handled again 2698 */ 2699 if (s->failed) 2700 break; 2701 2702 /* handle a successful check operation, if parity is correct 2703 * we are done. Otherwise update the mismatch count and repair 2704 * parity if !MD_RECOVERY_CHECK 2705 */ 2706 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2707 /* parity is correct (on disc, 2708 * not in buffer any more) 2709 */ 2710 set_bit(STRIPE_INSYNC, &sh->state); 2711 else { 2712 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2713 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2714 /* don't try to repair!! */ 2715 set_bit(STRIPE_INSYNC, &sh->state); 2716 else { 2717 sh->check_state = check_state_compute_run; 2718 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2719 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2720 set_bit(R5_Wantcompute, 2721 &sh->dev[sh->pd_idx].flags); 2722 sh->ops.target = sh->pd_idx; 2723 sh->ops.target2 = -1; 2724 s->uptodate++; 2725 } 2726 } 2727 break; 2728 case check_state_compute_run: 2729 break; 2730 default: 2731 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2732 __func__, sh->check_state, 2733 (unsigned long long) sh->sector); 2734 BUG(); 2735 } 2736 } 2737 2738 2739 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 2740 struct stripe_head_state *s, 2741 int disks) 2742 { 2743 int pd_idx = sh->pd_idx; 2744 int qd_idx = sh->qd_idx; 2745 struct r5dev *dev; 2746 2747 set_bit(STRIPE_HANDLE, &sh->state); 2748 2749 BUG_ON(s->failed > 2); 2750 2751 /* Want to check and possibly repair P and Q. 2752 * However there could be one 'failed' device, in which 2753 * case we can only check one of them, possibly using the 2754 * other to generate missing data 2755 */ 2756 2757 switch (sh->check_state) { 2758 case check_state_idle: 2759 /* start a new check operation if there are < 2 failures */ 2760 if (s->failed == s->q_failed) { 2761 /* The only possible failed device holds Q, so it 2762 * makes sense to check P (If anything else were failed, 2763 * we would have used P to recreate it). 2764 */ 2765 sh->check_state = check_state_run; 2766 } 2767 if (!s->q_failed && s->failed < 2) { 2768 /* Q is not failed, and we didn't use it to generate 2769 * anything, so it makes sense to check it 2770 */ 2771 if (sh->check_state == check_state_run) 2772 sh->check_state = check_state_run_pq; 2773 else 2774 sh->check_state = check_state_run_q; 2775 } 2776 2777 /* discard potentially stale zero_sum_result */ 2778 sh->ops.zero_sum_result = 0; 2779 2780 if (sh->check_state == check_state_run) { 2781 /* async_xor_zero_sum destroys the contents of P */ 2782 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2783 s->uptodate--; 2784 } 2785 if (sh->check_state >= check_state_run && 2786 sh->check_state <= check_state_run_pq) { 2787 /* async_syndrome_zero_sum preserves P and Q, so 2788 * no need to mark them !uptodate here 2789 */ 2790 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2791 break; 2792 } 2793 2794 /* we have 2-disk failure */ 2795 BUG_ON(s->failed != 2); 2796 /* fall through */ 2797 case check_state_compute_result: 2798 sh->check_state = check_state_idle; 2799 2800 /* check that a write has not made the stripe insync */ 2801 if (test_bit(STRIPE_INSYNC, &sh->state)) 2802 break; 2803 2804 /* now write out any block on a failed drive, 2805 * or P or Q if they were recomputed 2806 */ 2807 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2808 if (s->failed == 2) { 2809 dev = &sh->dev[s->failed_num[1]]; 2810 s->locked++; 2811 set_bit(R5_LOCKED, &dev->flags); 2812 set_bit(R5_Wantwrite, &dev->flags); 2813 } 2814 if (s->failed >= 1) { 2815 dev = &sh->dev[s->failed_num[0]]; 2816 s->locked++; 2817 set_bit(R5_LOCKED, &dev->flags); 2818 set_bit(R5_Wantwrite, &dev->flags); 2819 } 2820 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2821 dev = &sh->dev[pd_idx]; 2822 s->locked++; 2823 set_bit(R5_LOCKED, &dev->flags); 2824 set_bit(R5_Wantwrite, &dev->flags); 2825 } 2826 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2827 dev = &sh->dev[qd_idx]; 2828 s->locked++; 2829 set_bit(R5_LOCKED, &dev->flags); 2830 set_bit(R5_Wantwrite, &dev->flags); 2831 } 2832 clear_bit(STRIPE_DEGRADED, &sh->state); 2833 2834 set_bit(STRIPE_INSYNC, &sh->state); 2835 break; 2836 case check_state_run: 2837 case check_state_run_q: 2838 case check_state_run_pq: 2839 break; /* we will be called again upon completion */ 2840 case check_state_check_result: 2841 sh->check_state = check_state_idle; 2842 2843 /* handle a successful check operation, if parity is correct 2844 * we are done. Otherwise update the mismatch count and repair 2845 * parity if !MD_RECOVERY_CHECK 2846 */ 2847 if (sh->ops.zero_sum_result == 0) { 2848 /* both parities are correct */ 2849 if (!s->failed) 2850 set_bit(STRIPE_INSYNC, &sh->state); 2851 else { 2852 /* in contrast to the raid5 case we can validate 2853 * parity, but still have a failure to write 2854 * back 2855 */ 2856 sh->check_state = check_state_compute_result; 2857 /* Returning at this point means that we may go 2858 * off and bring p and/or q uptodate again so 2859 * we make sure to check zero_sum_result again 2860 * to verify if p or q need writeback 2861 */ 2862 } 2863 } else { 2864 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2865 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2866 /* don't try to repair!! */ 2867 set_bit(STRIPE_INSYNC, &sh->state); 2868 else { 2869 int *target = &sh->ops.target; 2870 2871 sh->ops.target = -1; 2872 sh->ops.target2 = -1; 2873 sh->check_state = check_state_compute_run; 2874 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2875 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2876 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2877 set_bit(R5_Wantcompute, 2878 &sh->dev[pd_idx].flags); 2879 *target = pd_idx; 2880 target = &sh->ops.target2; 2881 s->uptodate++; 2882 } 2883 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2884 set_bit(R5_Wantcompute, 2885 &sh->dev[qd_idx].flags); 2886 *target = qd_idx; 2887 s->uptodate++; 2888 } 2889 } 2890 } 2891 break; 2892 case check_state_compute_run: 2893 break; 2894 default: 2895 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2896 __func__, sh->check_state, 2897 (unsigned long long) sh->sector); 2898 BUG(); 2899 } 2900 } 2901 2902 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 2903 { 2904 int i; 2905 2906 /* We have read all the blocks in this stripe and now we need to 2907 * copy some of them into a target stripe for expand. 2908 */ 2909 struct dma_async_tx_descriptor *tx = NULL; 2910 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2911 for (i = 0; i < sh->disks; i++) 2912 if (i != sh->pd_idx && i != sh->qd_idx) { 2913 int dd_idx, j; 2914 struct stripe_head *sh2; 2915 struct async_submit_ctl submit; 2916 2917 sector_t bn = compute_blocknr(sh, i, 1); 2918 sector_t s = raid5_compute_sector(conf, bn, 0, 2919 &dd_idx, NULL); 2920 sh2 = get_active_stripe(conf, s, 0, 1, 1); 2921 if (sh2 == NULL) 2922 /* so far only the early blocks of this stripe 2923 * have been requested. When later blocks 2924 * get requested, we will try again 2925 */ 2926 continue; 2927 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 2928 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 2929 /* must have already done this block */ 2930 release_stripe(sh2); 2931 continue; 2932 } 2933 2934 /* place all the copies on one channel */ 2935 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 2936 tx = async_memcpy(sh2->dev[dd_idx].page, 2937 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2938 &submit); 2939 2940 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2941 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2942 for (j = 0; j < conf->raid_disks; j++) 2943 if (j != sh2->pd_idx && 2944 j != sh2->qd_idx && 2945 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2946 break; 2947 if (j == conf->raid_disks) { 2948 set_bit(STRIPE_EXPAND_READY, &sh2->state); 2949 set_bit(STRIPE_HANDLE, &sh2->state); 2950 } 2951 release_stripe(sh2); 2952 2953 } 2954 /* done submitting copies, wait for them to complete */ 2955 if (tx) { 2956 async_tx_ack(tx); 2957 dma_wait_for_async_tx(tx); 2958 } 2959 } 2960 2961 2962 /* 2963 * handle_stripe - do things to a stripe. 2964 * 2965 * We lock the stripe and then examine the state of various bits 2966 * to see what needs to be done. 2967 * Possible results: 2968 * return some read request which now have data 2969 * return some write requests which are safely on disc 2970 * schedule a read on some buffers 2971 * schedule a write of some buffers 2972 * return confirmation of parity correctness 2973 * 2974 * buffers are taken off read_list or write_list, and bh_cache buffers 2975 * get BH_Lock set before the stripe lock is released. 2976 * 2977 */ 2978 2979 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 2980 { 2981 struct r5conf *conf = sh->raid_conf; 2982 int disks = sh->disks; 2983 struct r5dev *dev; 2984 int i; 2985 2986 memset(s, 0, sizeof(*s)); 2987 2988 s->syncing = test_bit(STRIPE_SYNCING, &sh->state); 2989 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2990 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2991 s->failed_num[0] = -1; 2992 s->failed_num[1] = -1; 2993 2994 /* Now to look around and see what can be done */ 2995 rcu_read_lock(); 2996 spin_lock_irq(&conf->device_lock); 2997 for (i=disks; i--; ) { 2998 struct md_rdev *rdev; 2999 sector_t first_bad; 3000 int bad_sectors; 3001 int is_bad = 0; 3002 3003 dev = &sh->dev[i]; 3004 3005 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3006 i, dev->flags, dev->toread, dev->towrite, dev->written); 3007 /* maybe we can reply to a read 3008 * 3009 * new wantfill requests are only permitted while 3010 * ops_complete_biofill is guaranteed to be inactive 3011 */ 3012 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3013 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3014 set_bit(R5_Wantfill, &dev->flags); 3015 3016 /* now count some things */ 3017 if (test_bit(R5_LOCKED, &dev->flags)) 3018 s->locked++; 3019 if (test_bit(R5_UPTODATE, &dev->flags)) 3020 s->uptodate++; 3021 if (test_bit(R5_Wantcompute, &dev->flags)) { 3022 s->compute++; 3023 BUG_ON(s->compute > 2); 3024 } 3025 3026 if (test_bit(R5_Wantfill, &dev->flags)) 3027 s->to_fill++; 3028 else if (dev->toread) 3029 s->to_read++; 3030 if (dev->towrite) { 3031 s->to_write++; 3032 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3033 s->non_overwrite++; 3034 } 3035 if (dev->written) 3036 s->written++; 3037 rdev = rcu_dereference(conf->disks[i].rdev); 3038 if (rdev) { 3039 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3040 &first_bad, &bad_sectors); 3041 if (s->blocked_rdev == NULL 3042 && (test_bit(Blocked, &rdev->flags) 3043 || is_bad < 0)) { 3044 if (is_bad < 0) 3045 set_bit(BlockedBadBlocks, 3046 &rdev->flags); 3047 s->blocked_rdev = rdev; 3048 atomic_inc(&rdev->nr_pending); 3049 } 3050 } 3051 clear_bit(R5_Insync, &dev->flags); 3052 if (!rdev) 3053 /* Not in-sync */; 3054 else if (is_bad) { 3055 /* also not in-sync */ 3056 if (!test_bit(WriteErrorSeen, &rdev->flags)) { 3057 /* treat as in-sync, but with a read error 3058 * which we can now try to correct 3059 */ 3060 set_bit(R5_Insync, &dev->flags); 3061 set_bit(R5_ReadError, &dev->flags); 3062 } 3063 } else if (test_bit(In_sync, &rdev->flags)) 3064 set_bit(R5_Insync, &dev->flags); 3065 else if (!test_bit(Faulty, &rdev->flags)) { 3066 /* in sync if before recovery_offset */ 3067 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3068 set_bit(R5_Insync, &dev->flags); 3069 } 3070 if (test_bit(R5_WriteError, &dev->flags)) { 3071 clear_bit(R5_Insync, &dev->flags); 3072 if (!test_bit(Faulty, &rdev->flags)) { 3073 s->handle_bad_blocks = 1; 3074 atomic_inc(&rdev->nr_pending); 3075 } else 3076 clear_bit(R5_WriteError, &dev->flags); 3077 } 3078 if (test_bit(R5_MadeGood, &dev->flags)) { 3079 if (!test_bit(Faulty, &rdev->flags)) { 3080 s->handle_bad_blocks = 1; 3081 atomic_inc(&rdev->nr_pending); 3082 } else 3083 clear_bit(R5_MadeGood, &dev->flags); 3084 } 3085 if (!test_bit(R5_Insync, &dev->flags)) { 3086 /* The ReadError flag will just be confusing now */ 3087 clear_bit(R5_ReadError, &dev->flags); 3088 clear_bit(R5_ReWrite, &dev->flags); 3089 } 3090 if (test_bit(R5_ReadError, &dev->flags)) 3091 clear_bit(R5_Insync, &dev->flags); 3092 if (!test_bit(R5_Insync, &dev->flags)) { 3093 if (s->failed < 2) 3094 s->failed_num[s->failed] = i; 3095 s->failed++; 3096 } 3097 } 3098 spin_unlock_irq(&conf->device_lock); 3099 rcu_read_unlock(); 3100 } 3101 3102 static void handle_stripe(struct stripe_head *sh) 3103 { 3104 struct stripe_head_state s; 3105 struct r5conf *conf = sh->raid_conf; 3106 int i; 3107 int prexor; 3108 int disks = sh->disks; 3109 struct r5dev *pdev, *qdev; 3110 3111 clear_bit(STRIPE_HANDLE, &sh->state); 3112 if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) { 3113 /* already being handled, ensure it gets handled 3114 * again when current action finishes */ 3115 set_bit(STRIPE_HANDLE, &sh->state); 3116 return; 3117 } 3118 3119 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3120 set_bit(STRIPE_SYNCING, &sh->state); 3121 clear_bit(STRIPE_INSYNC, &sh->state); 3122 } 3123 clear_bit(STRIPE_DELAYED, &sh->state); 3124 3125 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3126 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3127 (unsigned long long)sh->sector, sh->state, 3128 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3129 sh->check_state, sh->reconstruct_state); 3130 3131 analyse_stripe(sh, &s); 3132 3133 if (s.handle_bad_blocks) { 3134 set_bit(STRIPE_HANDLE, &sh->state); 3135 goto finish; 3136 } 3137 3138 if (unlikely(s.blocked_rdev)) { 3139 if (s.syncing || s.expanding || s.expanded || 3140 s.to_write || s.written) { 3141 set_bit(STRIPE_HANDLE, &sh->state); 3142 goto finish; 3143 } 3144 /* There is nothing for the blocked_rdev to block */ 3145 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3146 s.blocked_rdev = NULL; 3147 } 3148 3149 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3150 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3151 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3152 } 3153 3154 pr_debug("locked=%d uptodate=%d to_read=%d" 3155 " to_write=%d failed=%d failed_num=%d,%d\n", 3156 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3157 s.failed_num[0], s.failed_num[1]); 3158 /* check if the array has lost more than max_degraded devices and, 3159 * if so, some requests might need to be failed. 3160 */ 3161 if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written) 3162 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3163 if (s.failed > conf->max_degraded && s.syncing) 3164 handle_failed_sync(conf, sh, &s); 3165 3166 /* 3167 * might be able to return some write requests if the parity blocks 3168 * are safe, or on a failed drive 3169 */ 3170 pdev = &sh->dev[sh->pd_idx]; 3171 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3172 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3173 qdev = &sh->dev[sh->qd_idx]; 3174 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3175 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3176 || conf->level < 6; 3177 3178 if (s.written && 3179 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3180 && !test_bit(R5_LOCKED, &pdev->flags) 3181 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3182 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3183 && !test_bit(R5_LOCKED, &qdev->flags) 3184 && test_bit(R5_UPTODATE, &qdev->flags))))) 3185 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3186 3187 /* Now we might consider reading some blocks, either to check/generate 3188 * parity, or to satisfy requests 3189 * or to load a block that is being partially written. 3190 */ 3191 if (s.to_read || s.non_overwrite 3192 || (conf->level == 6 && s.to_write && s.failed) 3193 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3194 handle_stripe_fill(sh, &s, disks); 3195 3196 /* Now we check to see if any write operations have recently 3197 * completed 3198 */ 3199 prexor = 0; 3200 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3201 prexor = 1; 3202 if (sh->reconstruct_state == reconstruct_state_drain_result || 3203 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3204 sh->reconstruct_state = reconstruct_state_idle; 3205 3206 /* All the 'written' buffers and the parity block are ready to 3207 * be written back to disk 3208 */ 3209 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3210 BUG_ON(sh->qd_idx >= 0 && 3211 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); 3212 for (i = disks; i--; ) { 3213 struct r5dev *dev = &sh->dev[i]; 3214 if (test_bit(R5_LOCKED, &dev->flags) && 3215 (i == sh->pd_idx || i == sh->qd_idx || 3216 dev->written)) { 3217 pr_debug("Writing block %d\n", i); 3218 set_bit(R5_Wantwrite, &dev->flags); 3219 if (prexor) 3220 continue; 3221 if (!test_bit(R5_Insync, &dev->flags) || 3222 ((i == sh->pd_idx || i == sh->qd_idx) && 3223 s.failed == 0)) 3224 set_bit(STRIPE_INSYNC, &sh->state); 3225 } 3226 } 3227 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3228 s.dec_preread_active = 1; 3229 } 3230 3231 /* Now to consider new write requests and what else, if anything 3232 * should be read. We do not handle new writes when: 3233 * 1/ A 'write' operation (copy+xor) is already in flight. 3234 * 2/ A 'check' operation is in flight, as it may clobber the parity 3235 * block. 3236 */ 3237 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3238 handle_stripe_dirtying(conf, sh, &s, disks); 3239 3240 /* maybe we need to check and possibly fix the parity for this stripe 3241 * Any reads will already have been scheduled, so we just see if enough 3242 * data is available. The parity check is held off while parity 3243 * dependent operations are in flight. 3244 */ 3245 if (sh->check_state || 3246 (s.syncing && s.locked == 0 && 3247 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3248 !test_bit(STRIPE_INSYNC, &sh->state))) { 3249 if (conf->level == 6) 3250 handle_parity_checks6(conf, sh, &s, disks); 3251 else 3252 handle_parity_checks5(conf, sh, &s, disks); 3253 } 3254 3255 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3256 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3257 clear_bit(STRIPE_SYNCING, &sh->state); 3258 } 3259 3260 /* If the failed drives are just a ReadError, then we might need 3261 * to progress the repair/check process 3262 */ 3263 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3264 for (i = 0; i < s.failed; i++) { 3265 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3266 if (test_bit(R5_ReadError, &dev->flags) 3267 && !test_bit(R5_LOCKED, &dev->flags) 3268 && test_bit(R5_UPTODATE, &dev->flags) 3269 ) { 3270 if (!test_bit(R5_ReWrite, &dev->flags)) { 3271 set_bit(R5_Wantwrite, &dev->flags); 3272 set_bit(R5_ReWrite, &dev->flags); 3273 set_bit(R5_LOCKED, &dev->flags); 3274 s.locked++; 3275 } else { 3276 /* let's read it back */ 3277 set_bit(R5_Wantread, &dev->flags); 3278 set_bit(R5_LOCKED, &dev->flags); 3279 s.locked++; 3280 } 3281 } 3282 } 3283 3284 3285 /* Finish reconstruct operations initiated by the expansion process */ 3286 if (sh->reconstruct_state == reconstruct_state_result) { 3287 struct stripe_head *sh_src 3288 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3289 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3290 /* sh cannot be written until sh_src has been read. 3291 * so arrange for sh to be delayed a little 3292 */ 3293 set_bit(STRIPE_DELAYED, &sh->state); 3294 set_bit(STRIPE_HANDLE, &sh->state); 3295 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3296 &sh_src->state)) 3297 atomic_inc(&conf->preread_active_stripes); 3298 release_stripe(sh_src); 3299 goto finish; 3300 } 3301 if (sh_src) 3302 release_stripe(sh_src); 3303 3304 sh->reconstruct_state = reconstruct_state_idle; 3305 clear_bit(STRIPE_EXPANDING, &sh->state); 3306 for (i = conf->raid_disks; i--; ) { 3307 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3308 set_bit(R5_LOCKED, &sh->dev[i].flags); 3309 s.locked++; 3310 } 3311 } 3312 3313 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3314 !sh->reconstruct_state) { 3315 /* Need to write out all blocks after computing parity */ 3316 sh->disks = conf->raid_disks; 3317 stripe_set_idx(sh->sector, conf, 0, sh); 3318 schedule_reconstruction(sh, &s, 1, 1); 3319 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3320 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3321 atomic_dec(&conf->reshape_stripes); 3322 wake_up(&conf->wait_for_overlap); 3323 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3324 } 3325 3326 if (s.expanding && s.locked == 0 && 3327 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3328 handle_stripe_expansion(conf, sh); 3329 3330 finish: 3331 /* wait for this device to become unblocked */ 3332 if (conf->mddev->external && unlikely(s.blocked_rdev)) 3333 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); 3334 3335 if (s.handle_bad_blocks) 3336 for (i = disks; i--; ) { 3337 struct md_rdev *rdev; 3338 struct r5dev *dev = &sh->dev[i]; 3339 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3340 /* We own a safe reference to the rdev */ 3341 rdev = conf->disks[i].rdev; 3342 if (!rdev_set_badblocks(rdev, sh->sector, 3343 STRIPE_SECTORS, 0)) 3344 md_error(conf->mddev, rdev); 3345 rdev_dec_pending(rdev, conf->mddev); 3346 } 3347 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3348 rdev = conf->disks[i].rdev; 3349 rdev_clear_badblocks(rdev, sh->sector, 3350 STRIPE_SECTORS); 3351 rdev_dec_pending(rdev, conf->mddev); 3352 } 3353 } 3354 3355 if (s.ops_request) 3356 raid_run_ops(sh, s.ops_request); 3357 3358 ops_run_io(sh, &s); 3359 3360 if (s.dec_preread_active) { 3361 /* We delay this until after ops_run_io so that if make_request 3362 * is waiting on a flush, it won't continue until the writes 3363 * have actually been submitted. 3364 */ 3365 atomic_dec(&conf->preread_active_stripes); 3366 if (atomic_read(&conf->preread_active_stripes) < 3367 IO_THRESHOLD) 3368 md_wakeup_thread(conf->mddev->thread); 3369 } 3370 3371 return_io(s.return_bi); 3372 3373 clear_bit(STRIPE_ACTIVE, &sh->state); 3374 } 3375 3376 static void raid5_activate_delayed(struct r5conf *conf) 3377 { 3378 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3379 while (!list_empty(&conf->delayed_list)) { 3380 struct list_head *l = conf->delayed_list.next; 3381 struct stripe_head *sh; 3382 sh = list_entry(l, struct stripe_head, lru); 3383 list_del_init(l); 3384 clear_bit(STRIPE_DELAYED, &sh->state); 3385 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3386 atomic_inc(&conf->preread_active_stripes); 3387 list_add_tail(&sh->lru, &conf->hold_list); 3388 } 3389 } 3390 } 3391 3392 static void activate_bit_delay(struct r5conf *conf) 3393 { 3394 /* device_lock is held */ 3395 struct list_head head; 3396 list_add(&head, &conf->bitmap_list); 3397 list_del_init(&conf->bitmap_list); 3398 while (!list_empty(&head)) { 3399 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3400 list_del_init(&sh->lru); 3401 atomic_inc(&sh->count); 3402 __release_stripe(conf, sh); 3403 } 3404 } 3405 3406 int md_raid5_congested(struct mddev *mddev, int bits) 3407 { 3408 struct r5conf *conf = mddev->private; 3409 3410 /* No difference between reads and writes. Just check 3411 * how busy the stripe_cache is 3412 */ 3413 3414 if (conf->inactive_blocked) 3415 return 1; 3416 if (conf->quiesce) 3417 return 1; 3418 if (list_empty_careful(&conf->inactive_list)) 3419 return 1; 3420 3421 return 0; 3422 } 3423 EXPORT_SYMBOL_GPL(md_raid5_congested); 3424 3425 static int raid5_congested(void *data, int bits) 3426 { 3427 struct mddev *mddev = data; 3428 3429 return mddev_congested(mddev, bits) || 3430 md_raid5_congested(mddev, bits); 3431 } 3432 3433 /* We want read requests to align with chunks where possible, 3434 * but write requests don't need to. 3435 */ 3436 static int raid5_mergeable_bvec(struct request_queue *q, 3437 struct bvec_merge_data *bvm, 3438 struct bio_vec *biovec) 3439 { 3440 struct mddev *mddev = q->queuedata; 3441 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3442 int max; 3443 unsigned int chunk_sectors = mddev->chunk_sectors; 3444 unsigned int bio_sectors = bvm->bi_size >> 9; 3445 3446 if ((bvm->bi_rw & 1) == WRITE) 3447 return biovec->bv_len; /* always allow writes to be mergeable */ 3448 3449 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3450 chunk_sectors = mddev->new_chunk_sectors; 3451 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3452 if (max < 0) max = 0; 3453 if (max <= biovec->bv_len && bio_sectors == 0) 3454 return biovec->bv_len; 3455 else 3456 return max; 3457 } 3458 3459 3460 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3461 { 3462 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3463 unsigned int chunk_sectors = mddev->chunk_sectors; 3464 unsigned int bio_sectors = bio->bi_size >> 9; 3465 3466 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3467 chunk_sectors = mddev->new_chunk_sectors; 3468 return chunk_sectors >= 3469 ((sector & (chunk_sectors - 1)) + bio_sectors); 3470 } 3471 3472 /* 3473 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3474 * later sampled by raid5d. 3475 */ 3476 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3477 { 3478 unsigned long flags; 3479 3480 spin_lock_irqsave(&conf->device_lock, flags); 3481 3482 bi->bi_next = conf->retry_read_aligned_list; 3483 conf->retry_read_aligned_list = bi; 3484 3485 spin_unlock_irqrestore(&conf->device_lock, flags); 3486 md_wakeup_thread(conf->mddev->thread); 3487 } 3488 3489 3490 static struct bio *remove_bio_from_retry(struct r5conf *conf) 3491 { 3492 struct bio *bi; 3493 3494 bi = conf->retry_read_aligned; 3495 if (bi) { 3496 conf->retry_read_aligned = NULL; 3497 return bi; 3498 } 3499 bi = conf->retry_read_aligned_list; 3500 if(bi) { 3501 conf->retry_read_aligned_list = bi->bi_next; 3502 bi->bi_next = NULL; 3503 /* 3504 * this sets the active strip count to 1 and the processed 3505 * strip count to zero (upper 8 bits) 3506 */ 3507 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3508 } 3509 3510 return bi; 3511 } 3512 3513 3514 /* 3515 * The "raid5_align_endio" should check if the read succeeded and if it 3516 * did, call bio_endio on the original bio (having bio_put the new bio 3517 * first). 3518 * If the read failed.. 3519 */ 3520 static void raid5_align_endio(struct bio *bi, int error) 3521 { 3522 struct bio* raid_bi = bi->bi_private; 3523 struct mddev *mddev; 3524 struct r5conf *conf; 3525 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3526 struct md_rdev *rdev; 3527 3528 bio_put(bi); 3529 3530 rdev = (void*)raid_bi->bi_next; 3531 raid_bi->bi_next = NULL; 3532 mddev = rdev->mddev; 3533 conf = mddev->private; 3534 3535 rdev_dec_pending(rdev, conf->mddev); 3536 3537 if (!error && uptodate) { 3538 bio_endio(raid_bi, 0); 3539 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3540 wake_up(&conf->wait_for_stripe); 3541 return; 3542 } 3543 3544 3545 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3546 3547 add_bio_to_retry(raid_bi, conf); 3548 } 3549 3550 static int bio_fits_rdev(struct bio *bi) 3551 { 3552 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3553 3554 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3555 return 0; 3556 blk_recount_segments(q, bi); 3557 if (bi->bi_phys_segments > queue_max_segments(q)) 3558 return 0; 3559 3560 if (q->merge_bvec_fn) 3561 /* it's too hard to apply the merge_bvec_fn at this stage, 3562 * just just give up 3563 */ 3564 return 0; 3565 3566 return 1; 3567 } 3568 3569 3570 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3571 { 3572 struct r5conf *conf = mddev->private; 3573 int dd_idx; 3574 struct bio* align_bi; 3575 struct md_rdev *rdev; 3576 3577 if (!in_chunk_boundary(mddev, raid_bio)) { 3578 pr_debug("chunk_aligned_read : non aligned\n"); 3579 return 0; 3580 } 3581 /* 3582 * use bio_clone_mddev to make a copy of the bio 3583 */ 3584 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3585 if (!align_bi) 3586 return 0; 3587 /* 3588 * set bi_end_io to a new function, and set bi_private to the 3589 * original bio. 3590 */ 3591 align_bi->bi_end_io = raid5_align_endio; 3592 align_bi->bi_private = raid_bio; 3593 /* 3594 * compute position 3595 */ 3596 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3597 0, 3598 &dd_idx, NULL); 3599 3600 rcu_read_lock(); 3601 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3602 if (rdev && test_bit(In_sync, &rdev->flags)) { 3603 sector_t first_bad; 3604 int bad_sectors; 3605 3606 atomic_inc(&rdev->nr_pending); 3607 rcu_read_unlock(); 3608 raid_bio->bi_next = (void*)rdev; 3609 align_bi->bi_bdev = rdev->bdev; 3610 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3611 align_bi->bi_sector += rdev->data_offset; 3612 3613 if (!bio_fits_rdev(align_bi) || 3614 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3615 &first_bad, &bad_sectors)) { 3616 /* too big in some way, or has a known bad block */ 3617 bio_put(align_bi); 3618 rdev_dec_pending(rdev, mddev); 3619 return 0; 3620 } 3621 3622 spin_lock_irq(&conf->device_lock); 3623 wait_event_lock_irq(conf->wait_for_stripe, 3624 conf->quiesce == 0, 3625 conf->device_lock, /* nothing */); 3626 atomic_inc(&conf->active_aligned_reads); 3627 spin_unlock_irq(&conf->device_lock); 3628 3629 generic_make_request(align_bi); 3630 return 1; 3631 } else { 3632 rcu_read_unlock(); 3633 bio_put(align_bi); 3634 return 0; 3635 } 3636 } 3637 3638 /* __get_priority_stripe - get the next stripe to process 3639 * 3640 * Full stripe writes are allowed to pass preread active stripes up until 3641 * the bypass_threshold is exceeded. In general the bypass_count 3642 * increments when the handle_list is handled before the hold_list; however, it 3643 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3644 * stripe with in flight i/o. The bypass_count will be reset when the 3645 * head of the hold_list has changed, i.e. the head was promoted to the 3646 * handle_list. 3647 */ 3648 static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 3649 { 3650 struct stripe_head *sh; 3651 3652 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3653 __func__, 3654 list_empty(&conf->handle_list) ? "empty" : "busy", 3655 list_empty(&conf->hold_list) ? "empty" : "busy", 3656 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3657 3658 if (!list_empty(&conf->handle_list)) { 3659 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3660 3661 if (list_empty(&conf->hold_list)) 3662 conf->bypass_count = 0; 3663 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3664 if (conf->hold_list.next == conf->last_hold) 3665 conf->bypass_count++; 3666 else { 3667 conf->last_hold = conf->hold_list.next; 3668 conf->bypass_count -= conf->bypass_threshold; 3669 if (conf->bypass_count < 0) 3670 conf->bypass_count = 0; 3671 } 3672 } 3673 } else if (!list_empty(&conf->hold_list) && 3674 ((conf->bypass_threshold && 3675 conf->bypass_count > conf->bypass_threshold) || 3676 atomic_read(&conf->pending_full_writes) == 0)) { 3677 sh = list_entry(conf->hold_list.next, 3678 typeof(*sh), lru); 3679 conf->bypass_count -= conf->bypass_threshold; 3680 if (conf->bypass_count < 0) 3681 conf->bypass_count = 0; 3682 } else 3683 return NULL; 3684 3685 list_del_init(&sh->lru); 3686 atomic_inc(&sh->count); 3687 BUG_ON(atomic_read(&sh->count) != 1); 3688 return sh; 3689 } 3690 3691 static int make_request(struct mddev *mddev, struct bio * bi) 3692 { 3693 struct r5conf *conf = mddev->private; 3694 int dd_idx; 3695 sector_t new_sector; 3696 sector_t logical_sector, last_sector; 3697 struct stripe_head *sh; 3698 const int rw = bio_data_dir(bi); 3699 int remaining; 3700 int plugged; 3701 3702 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 3703 md_flush_request(mddev, bi); 3704 return 0; 3705 } 3706 3707 md_write_start(mddev, bi); 3708 3709 if (rw == READ && 3710 mddev->reshape_position == MaxSector && 3711 chunk_aligned_read(mddev,bi)) 3712 return 0; 3713 3714 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3715 last_sector = bi->bi_sector + (bi->bi_size>>9); 3716 bi->bi_next = NULL; 3717 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3718 3719 plugged = mddev_check_plugged(mddev); 3720 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3721 DEFINE_WAIT(w); 3722 int disks, data_disks; 3723 int previous; 3724 3725 retry: 3726 previous = 0; 3727 disks = conf->raid_disks; 3728 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3729 if (unlikely(conf->reshape_progress != MaxSector)) { 3730 /* spinlock is needed as reshape_progress may be 3731 * 64bit on a 32bit platform, and so it might be 3732 * possible to see a half-updated value 3733 * Of course reshape_progress could change after 3734 * the lock is dropped, so once we get a reference 3735 * to the stripe that we think it is, we will have 3736 * to check again. 3737 */ 3738 spin_lock_irq(&conf->device_lock); 3739 if (mddev->delta_disks < 0 3740 ? logical_sector < conf->reshape_progress 3741 : logical_sector >= conf->reshape_progress) { 3742 disks = conf->previous_raid_disks; 3743 previous = 1; 3744 } else { 3745 if (mddev->delta_disks < 0 3746 ? logical_sector < conf->reshape_safe 3747 : logical_sector >= conf->reshape_safe) { 3748 spin_unlock_irq(&conf->device_lock); 3749 schedule(); 3750 goto retry; 3751 } 3752 } 3753 spin_unlock_irq(&conf->device_lock); 3754 } 3755 data_disks = disks - conf->max_degraded; 3756 3757 new_sector = raid5_compute_sector(conf, logical_sector, 3758 previous, 3759 &dd_idx, NULL); 3760 pr_debug("raid456: make_request, sector %llu logical %llu\n", 3761 (unsigned long long)new_sector, 3762 (unsigned long long)logical_sector); 3763 3764 sh = get_active_stripe(conf, new_sector, previous, 3765 (bi->bi_rw&RWA_MASK), 0); 3766 if (sh) { 3767 if (unlikely(previous)) { 3768 /* expansion might have moved on while waiting for a 3769 * stripe, so we must do the range check again. 3770 * Expansion could still move past after this 3771 * test, but as we are holding a reference to 3772 * 'sh', we know that if that happens, 3773 * STRIPE_EXPANDING will get set and the expansion 3774 * won't proceed until we finish with the stripe. 3775 */ 3776 int must_retry = 0; 3777 spin_lock_irq(&conf->device_lock); 3778 if (mddev->delta_disks < 0 3779 ? logical_sector >= conf->reshape_progress 3780 : logical_sector < conf->reshape_progress) 3781 /* mismatch, need to try again */ 3782 must_retry = 1; 3783 spin_unlock_irq(&conf->device_lock); 3784 if (must_retry) { 3785 release_stripe(sh); 3786 schedule(); 3787 goto retry; 3788 } 3789 } 3790 3791 if (rw == WRITE && 3792 logical_sector >= mddev->suspend_lo && 3793 logical_sector < mddev->suspend_hi) { 3794 release_stripe(sh); 3795 /* As the suspend_* range is controlled by 3796 * userspace, we want an interruptible 3797 * wait. 3798 */ 3799 flush_signals(current); 3800 prepare_to_wait(&conf->wait_for_overlap, 3801 &w, TASK_INTERRUPTIBLE); 3802 if (logical_sector >= mddev->suspend_lo && 3803 logical_sector < mddev->suspend_hi) 3804 schedule(); 3805 goto retry; 3806 } 3807 3808 if (test_bit(STRIPE_EXPANDING, &sh->state) || 3809 !add_stripe_bio(sh, bi, dd_idx, rw)) { 3810 /* Stripe is busy expanding or 3811 * add failed due to overlap. Flush everything 3812 * and wait a while 3813 */ 3814 md_wakeup_thread(mddev->thread); 3815 release_stripe(sh); 3816 schedule(); 3817 goto retry; 3818 } 3819 finish_wait(&conf->wait_for_overlap, &w); 3820 set_bit(STRIPE_HANDLE, &sh->state); 3821 clear_bit(STRIPE_DELAYED, &sh->state); 3822 if ((bi->bi_rw & REQ_SYNC) && 3823 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3824 atomic_inc(&conf->preread_active_stripes); 3825 release_stripe(sh); 3826 } else { 3827 /* cannot get stripe for read-ahead, just give-up */ 3828 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3829 finish_wait(&conf->wait_for_overlap, &w); 3830 break; 3831 } 3832 3833 } 3834 if (!plugged) 3835 md_wakeup_thread(mddev->thread); 3836 3837 spin_lock_irq(&conf->device_lock); 3838 remaining = raid5_dec_bi_phys_segments(bi); 3839 spin_unlock_irq(&conf->device_lock); 3840 if (remaining == 0) { 3841 3842 if ( rw == WRITE ) 3843 md_write_end(mddev); 3844 3845 bio_endio(bi, 0); 3846 } 3847 3848 return 0; 3849 } 3850 3851 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 3852 3853 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 3854 { 3855 /* reshaping is quite different to recovery/resync so it is 3856 * handled quite separately ... here. 3857 * 3858 * On each call to sync_request, we gather one chunk worth of 3859 * destination stripes and flag them as expanding. 3860 * Then we find all the source stripes and request reads. 3861 * As the reads complete, handle_stripe will copy the data 3862 * into the destination stripe and release that stripe. 3863 */ 3864 struct r5conf *conf = mddev->private; 3865 struct stripe_head *sh; 3866 sector_t first_sector, last_sector; 3867 int raid_disks = conf->previous_raid_disks; 3868 int data_disks = raid_disks - conf->max_degraded; 3869 int new_data_disks = conf->raid_disks - conf->max_degraded; 3870 int i; 3871 int dd_idx; 3872 sector_t writepos, readpos, safepos; 3873 sector_t stripe_addr; 3874 int reshape_sectors; 3875 struct list_head stripes; 3876 3877 if (sector_nr == 0) { 3878 /* If restarting in the middle, skip the initial sectors */ 3879 if (mddev->delta_disks < 0 && 3880 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 3881 sector_nr = raid5_size(mddev, 0, 0) 3882 - conf->reshape_progress; 3883 } else if (mddev->delta_disks >= 0 && 3884 conf->reshape_progress > 0) 3885 sector_nr = conf->reshape_progress; 3886 sector_div(sector_nr, new_data_disks); 3887 if (sector_nr) { 3888 mddev->curr_resync_completed = sector_nr; 3889 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 3890 *skipped = 1; 3891 return sector_nr; 3892 } 3893 } 3894 3895 /* We need to process a full chunk at a time. 3896 * If old and new chunk sizes differ, we need to process the 3897 * largest of these 3898 */ 3899 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 3900 reshape_sectors = mddev->new_chunk_sectors; 3901 else 3902 reshape_sectors = mddev->chunk_sectors; 3903 3904 /* we update the metadata when there is more than 3Meg 3905 * in the block range (that is rather arbitrary, should 3906 * probably be time based) or when the data about to be 3907 * copied would over-write the source of the data at 3908 * the front of the range. 3909 * i.e. one new_stripe along from reshape_progress new_maps 3910 * to after where reshape_safe old_maps to 3911 */ 3912 writepos = conf->reshape_progress; 3913 sector_div(writepos, new_data_disks); 3914 readpos = conf->reshape_progress; 3915 sector_div(readpos, data_disks); 3916 safepos = conf->reshape_safe; 3917 sector_div(safepos, data_disks); 3918 if (mddev->delta_disks < 0) { 3919 writepos -= min_t(sector_t, reshape_sectors, writepos); 3920 readpos += reshape_sectors; 3921 safepos += reshape_sectors; 3922 } else { 3923 writepos += reshape_sectors; 3924 readpos -= min_t(sector_t, reshape_sectors, readpos); 3925 safepos -= min_t(sector_t, reshape_sectors, safepos); 3926 } 3927 3928 /* 'writepos' is the most advanced device address we might write. 3929 * 'readpos' is the least advanced device address we might read. 3930 * 'safepos' is the least address recorded in the metadata as having 3931 * been reshaped. 3932 * If 'readpos' is behind 'writepos', then there is no way that we can 3933 * ensure safety in the face of a crash - that must be done by userspace 3934 * making a backup of the data. So in that case there is no particular 3935 * rush to update metadata. 3936 * Otherwise if 'safepos' is behind 'writepos', then we really need to 3937 * update the metadata to advance 'safepos' to match 'readpos' so that 3938 * we can be safe in the event of a crash. 3939 * So we insist on updating metadata if safepos is behind writepos and 3940 * readpos is beyond writepos. 3941 * In any case, update the metadata every 10 seconds. 3942 * Maybe that number should be configurable, but I'm not sure it is 3943 * worth it.... maybe it could be a multiple of safemode_delay??? 3944 */ 3945 if ((mddev->delta_disks < 0 3946 ? (safepos > writepos && readpos < writepos) 3947 : (safepos < writepos && readpos > writepos)) || 3948 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 3949 /* Cannot proceed until we've updated the superblock... */ 3950 wait_event(conf->wait_for_overlap, 3951 atomic_read(&conf->reshape_stripes)==0); 3952 mddev->reshape_position = conf->reshape_progress; 3953 mddev->curr_resync_completed = sector_nr; 3954 conf->reshape_checkpoint = jiffies; 3955 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3956 md_wakeup_thread(mddev->thread); 3957 wait_event(mddev->sb_wait, mddev->flags == 0 || 3958 kthread_should_stop()); 3959 spin_lock_irq(&conf->device_lock); 3960 conf->reshape_safe = mddev->reshape_position; 3961 spin_unlock_irq(&conf->device_lock); 3962 wake_up(&conf->wait_for_overlap); 3963 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 3964 } 3965 3966 if (mddev->delta_disks < 0) { 3967 BUG_ON(conf->reshape_progress == 0); 3968 stripe_addr = writepos; 3969 BUG_ON((mddev->dev_sectors & 3970 ~((sector_t)reshape_sectors - 1)) 3971 - reshape_sectors - stripe_addr 3972 != sector_nr); 3973 } else { 3974 BUG_ON(writepos != sector_nr + reshape_sectors); 3975 stripe_addr = sector_nr; 3976 } 3977 INIT_LIST_HEAD(&stripes); 3978 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 3979 int j; 3980 int skipped_disk = 0; 3981 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 3982 set_bit(STRIPE_EXPANDING, &sh->state); 3983 atomic_inc(&conf->reshape_stripes); 3984 /* If any of this stripe is beyond the end of the old 3985 * array, then we need to zero those blocks 3986 */ 3987 for (j=sh->disks; j--;) { 3988 sector_t s; 3989 if (j == sh->pd_idx) 3990 continue; 3991 if (conf->level == 6 && 3992 j == sh->qd_idx) 3993 continue; 3994 s = compute_blocknr(sh, j, 0); 3995 if (s < raid5_size(mddev, 0, 0)) { 3996 skipped_disk = 1; 3997 continue; 3998 } 3999 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4000 set_bit(R5_Expanded, &sh->dev[j].flags); 4001 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4002 } 4003 if (!skipped_disk) { 4004 set_bit(STRIPE_EXPAND_READY, &sh->state); 4005 set_bit(STRIPE_HANDLE, &sh->state); 4006 } 4007 list_add(&sh->lru, &stripes); 4008 } 4009 spin_lock_irq(&conf->device_lock); 4010 if (mddev->delta_disks < 0) 4011 conf->reshape_progress -= reshape_sectors * new_data_disks; 4012 else 4013 conf->reshape_progress += reshape_sectors * new_data_disks; 4014 spin_unlock_irq(&conf->device_lock); 4015 /* Ok, those stripe are ready. We can start scheduling 4016 * reads on the source stripes. 4017 * The source stripes are determined by mapping the first and last 4018 * block on the destination stripes. 4019 */ 4020 first_sector = 4021 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4022 1, &dd_idx, NULL); 4023 last_sector = 4024 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4025 * new_data_disks - 1), 4026 1, &dd_idx, NULL); 4027 if (last_sector >= mddev->dev_sectors) 4028 last_sector = mddev->dev_sectors - 1; 4029 while (first_sector <= last_sector) { 4030 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4031 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4032 set_bit(STRIPE_HANDLE, &sh->state); 4033 release_stripe(sh); 4034 first_sector += STRIPE_SECTORS; 4035 } 4036 /* Now that the sources are clearly marked, we can release 4037 * the destination stripes 4038 */ 4039 while (!list_empty(&stripes)) { 4040 sh = list_entry(stripes.next, struct stripe_head, lru); 4041 list_del_init(&sh->lru); 4042 release_stripe(sh); 4043 } 4044 /* If this takes us to the resync_max point where we have to pause, 4045 * then we need to write out the superblock. 4046 */ 4047 sector_nr += reshape_sectors; 4048 if ((sector_nr - mddev->curr_resync_completed) * 2 4049 >= mddev->resync_max - mddev->curr_resync_completed) { 4050 /* Cannot proceed until we've updated the superblock... */ 4051 wait_event(conf->wait_for_overlap, 4052 atomic_read(&conf->reshape_stripes) == 0); 4053 mddev->reshape_position = conf->reshape_progress; 4054 mddev->curr_resync_completed = sector_nr; 4055 conf->reshape_checkpoint = jiffies; 4056 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4057 md_wakeup_thread(mddev->thread); 4058 wait_event(mddev->sb_wait, 4059 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4060 || kthread_should_stop()); 4061 spin_lock_irq(&conf->device_lock); 4062 conf->reshape_safe = mddev->reshape_position; 4063 spin_unlock_irq(&conf->device_lock); 4064 wake_up(&conf->wait_for_overlap); 4065 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4066 } 4067 return reshape_sectors; 4068 } 4069 4070 /* FIXME go_faster isn't used */ 4071 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4072 { 4073 struct r5conf *conf = mddev->private; 4074 struct stripe_head *sh; 4075 sector_t max_sector = mddev->dev_sectors; 4076 sector_t sync_blocks; 4077 int still_degraded = 0; 4078 int i; 4079 4080 if (sector_nr >= max_sector) { 4081 /* just being told to finish up .. nothing much to do */ 4082 4083 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4084 end_reshape(conf); 4085 return 0; 4086 } 4087 4088 if (mddev->curr_resync < max_sector) /* aborted */ 4089 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4090 &sync_blocks, 1); 4091 else /* completed sync */ 4092 conf->fullsync = 0; 4093 bitmap_close_sync(mddev->bitmap); 4094 4095 return 0; 4096 } 4097 4098 /* Allow raid5_quiesce to complete */ 4099 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4100 4101 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4102 return reshape_request(mddev, sector_nr, skipped); 4103 4104 /* No need to check resync_max as we never do more than one 4105 * stripe, and as resync_max will always be on a chunk boundary, 4106 * if the check in md_do_sync didn't fire, there is no chance 4107 * of overstepping resync_max here 4108 */ 4109 4110 /* if there is too many failed drives and we are trying 4111 * to resync, then assert that we are finished, because there is 4112 * nothing we can do. 4113 */ 4114 if (mddev->degraded >= conf->max_degraded && 4115 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4116 sector_t rv = mddev->dev_sectors - sector_nr; 4117 *skipped = 1; 4118 return rv; 4119 } 4120 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4121 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4122 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4123 /* we can skip this block, and probably more */ 4124 sync_blocks /= STRIPE_SECTORS; 4125 *skipped = 1; 4126 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4127 } 4128 4129 4130 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4131 4132 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4133 if (sh == NULL) { 4134 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4135 /* make sure we don't swamp the stripe cache if someone else 4136 * is trying to get access 4137 */ 4138 schedule_timeout_uninterruptible(1); 4139 } 4140 /* Need to check if array will still be degraded after recovery/resync 4141 * We don't need to check the 'failed' flag as when that gets set, 4142 * recovery aborts. 4143 */ 4144 for (i = 0; i < conf->raid_disks; i++) 4145 if (conf->disks[i].rdev == NULL) 4146 still_degraded = 1; 4147 4148 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4149 4150 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4151 4152 handle_stripe(sh); 4153 release_stripe(sh); 4154 4155 return STRIPE_SECTORS; 4156 } 4157 4158 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4159 { 4160 /* We may not be able to submit a whole bio at once as there 4161 * may not be enough stripe_heads available. 4162 * We cannot pre-allocate enough stripe_heads as we may need 4163 * more than exist in the cache (if we allow ever large chunks). 4164 * So we do one stripe head at a time and record in 4165 * ->bi_hw_segments how many have been done. 4166 * 4167 * We *know* that this entire raid_bio is in one chunk, so 4168 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 4169 */ 4170 struct stripe_head *sh; 4171 int dd_idx; 4172 sector_t sector, logical_sector, last_sector; 4173 int scnt = 0; 4174 int remaining; 4175 int handled = 0; 4176 4177 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4178 sector = raid5_compute_sector(conf, logical_sector, 4179 0, &dd_idx, NULL); 4180 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4181 4182 for (; logical_sector < last_sector; 4183 logical_sector += STRIPE_SECTORS, 4184 sector += STRIPE_SECTORS, 4185 scnt++) { 4186 4187 if (scnt < raid5_bi_hw_segments(raid_bio)) 4188 /* already done this stripe */ 4189 continue; 4190 4191 sh = get_active_stripe(conf, sector, 0, 1, 0); 4192 4193 if (!sh) { 4194 /* failed to get a stripe - must wait */ 4195 raid5_set_bi_hw_segments(raid_bio, scnt); 4196 conf->retry_read_aligned = raid_bio; 4197 return handled; 4198 } 4199 4200 set_bit(R5_ReadError, &sh->dev[dd_idx].flags); 4201 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4202 release_stripe(sh); 4203 raid5_set_bi_hw_segments(raid_bio, scnt); 4204 conf->retry_read_aligned = raid_bio; 4205 return handled; 4206 } 4207 4208 handle_stripe(sh); 4209 release_stripe(sh); 4210 handled++; 4211 } 4212 spin_lock_irq(&conf->device_lock); 4213 remaining = raid5_dec_bi_phys_segments(raid_bio); 4214 spin_unlock_irq(&conf->device_lock); 4215 if (remaining == 0) 4216 bio_endio(raid_bio, 0); 4217 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4218 wake_up(&conf->wait_for_stripe); 4219 return handled; 4220 } 4221 4222 4223 /* 4224 * This is our raid5 kernel thread. 4225 * 4226 * We scan the hash table for stripes which can be handled now. 4227 * During the scan, completed stripes are saved for us by the interrupt 4228 * handler, so that they will not have to wait for our next wakeup. 4229 */ 4230 static void raid5d(struct mddev *mddev) 4231 { 4232 struct stripe_head *sh; 4233 struct r5conf *conf = mddev->private; 4234 int handled; 4235 struct blk_plug plug; 4236 4237 pr_debug("+++ raid5d active\n"); 4238 4239 md_check_recovery(mddev); 4240 4241 blk_start_plug(&plug); 4242 handled = 0; 4243 spin_lock_irq(&conf->device_lock); 4244 while (1) { 4245 struct bio *bio; 4246 4247 if (atomic_read(&mddev->plug_cnt) == 0 && 4248 !list_empty(&conf->bitmap_list)) { 4249 /* Now is a good time to flush some bitmap updates */ 4250 conf->seq_flush++; 4251 spin_unlock_irq(&conf->device_lock); 4252 bitmap_unplug(mddev->bitmap); 4253 spin_lock_irq(&conf->device_lock); 4254 conf->seq_write = conf->seq_flush; 4255 activate_bit_delay(conf); 4256 } 4257 if (atomic_read(&mddev->plug_cnt) == 0) 4258 raid5_activate_delayed(conf); 4259 4260 while ((bio = remove_bio_from_retry(conf))) { 4261 int ok; 4262 spin_unlock_irq(&conf->device_lock); 4263 ok = retry_aligned_read(conf, bio); 4264 spin_lock_irq(&conf->device_lock); 4265 if (!ok) 4266 break; 4267 handled++; 4268 } 4269 4270 sh = __get_priority_stripe(conf); 4271 4272 if (!sh) 4273 break; 4274 spin_unlock_irq(&conf->device_lock); 4275 4276 handled++; 4277 handle_stripe(sh); 4278 release_stripe(sh); 4279 cond_resched(); 4280 4281 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 4282 md_check_recovery(mddev); 4283 4284 spin_lock_irq(&conf->device_lock); 4285 } 4286 pr_debug("%d stripes handled\n", handled); 4287 4288 spin_unlock_irq(&conf->device_lock); 4289 4290 async_tx_issue_pending_all(); 4291 blk_finish_plug(&plug); 4292 4293 pr_debug("--- raid5d inactive\n"); 4294 } 4295 4296 static ssize_t 4297 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4298 { 4299 struct r5conf *conf = mddev->private; 4300 if (conf) 4301 return sprintf(page, "%d\n", conf->max_nr_stripes); 4302 else 4303 return 0; 4304 } 4305 4306 int 4307 raid5_set_cache_size(struct mddev *mddev, int size) 4308 { 4309 struct r5conf *conf = mddev->private; 4310 int err; 4311 4312 if (size <= 16 || size > 32768) 4313 return -EINVAL; 4314 while (size < conf->max_nr_stripes) { 4315 if (drop_one_stripe(conf)) 4316 conf->max_nr_stripes--; 4317 else 4318 break; 4319 } 4320 err = md_allow_write(mddev); 4321 if (err) 4322 return err; 4323 while (size > conf->max_nr_stripes) { 4324 if (grow_one_stripe(conf)) 4325 conf->max_nr_stripes++; 4326 else break; 4327 } 4328 return 0; 4329 } 4330 EXPORT_SYMBOL(raid5_set_cache_size); 4331 4332 static ssize_t 4333 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4334 { 4335 struct r5conf *conf = mddev->private; 4336 unsigned long new; 4337 int err; 4338 4339 if (len >= PAGE_SIZE) 4340 return -EINVAL; 4341 if (!conf) 4342 return -ENODEV; 4343 4344 if (strict_strtoul(page, 10, &new)) 4345 return -EINVAL; 4346 err = raid5_set_cache_size(mddev, new); 4347 if (err) 4348 return err; 4349 return len; 4350 } 4351 4352 static struct md_sysfs_entry 4353 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4354 raid5_show_stripe_cache_size, 4355 raid5_store_stripe_cache_size); 4356 4357 static ssize_t 4358 raid5_show_preread_threshold(struct mddev *mddev, char *page) 4359 { 4360 struct r5conf *conf = mddev->private; 4361 if (conf) 4362 return sprintf(page, "%d\n", conf->bypass_threshold); 4363 else 4364 return 0; 4365 } 4366 4367 static ssize_t 4368 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4369 { 4370 struct r5conf *conf = mddev->private; 4371 unsigned long new; 4372 if (len >= PAGE_SIZE) 4373 return -EINVAL; 4374 if (!conf) 4375 return -ENODEV; 4376 4377 if (strict_strtoul(page, 10, &new)) 4378 return -EINVAL; 4379 if (new > conf->max_nr_stripes) 4380 return -EINVAL; 4381 conf->bypass_threshold = new; 4382 return len; 4383 } 4384 4385 static struct md_sysfs_entry 4386 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4387 S_IRUGO | S_IWUSR, 4388 raid5_show_preread_threshold, 4389 raid5_store_preread_threshold); 4390 4391 static ssize_t 4392 stripe_cache_active_show(struct mddev *mddev, char *page) 4393 { 4394 struct r5conf *conf = mddev->private; 4395 if (conf) 4396 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4397 else 4398 return 0; 4399 } 4400 4401 static struct md_sysfs_entry 4402 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4403 4404 static struct attribute *raid5_attrs[] = { 4405 &raid5_stripecache_size.attr, 4406 &raid5_stripecache_active.attr, 4407 &raid5_preread_bypass_threshold.attr, 4408 NULL, 4409 }; 4410 static struct attribute_group raid5_attrs_group = { 4411 .name = NULL, 4412 .attrs = raid5_attrs, 4413 }; 4414 4415 static sector_t 4416 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4417 { 4418 struct r5conf *conf = mddev->private; 4419 4420 if (!sectors) 4421 sectors = mddev->dev_sectors; 4422 if (!raid_disks) 4423 /* size is defined by the smallest of previous and new size */ 4424 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4425 4426 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4427 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4428 return sectors * (raid_disks - conf->max_degraded); 4429 } 4430 4431 static void raid5_free_percpu(struct r5conf *conf) 4432 { 4433 struct raid5_percpu *percpu; 4434 unsigned long cpu; 4435 4436 if (!conf->percpu) 4437 return; 4438 4439 get_online_cpus(); 4440 for_each_possible_cpu(cpu) { 4441 percpu = per_cpu_ptr(conf->percpu, cpu); 4442 safe_put_page(percpu->spare_page); 4443 kfree(percpu->scribble); 4444 } 4445 #ifdef CONFIG_HOTPLUG_CPU 4446 unregister_cpu_notifier(&conf->cpu_notify); 4447 #endif 4448 put_online_cpus(); 4449 4450 free_percpu(conf->percpu); 4451 } 4452 4453 static void free_conf(struct r5conf *conf) 4454 { 4455 shrink_stripes(conf); 4456 raid5_free_percpu(conf); 4457 kfree(conf->disks); 4458 kfree(conf->stripe_hashtbl); 4459 kfree(conf); 4460 } 4461 4462 #ifdef CONFIG_HOTPLUG_CPU 4463 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4464 void *hcpu) 4465 { 4466 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 4467 long cpu = (long)hcpu; 4468 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4469 4470 switch (action) { 4471 case CPU_UP_PREPARE: 4472 case CPU_UP_PREPARE_FROZEN: 4473 if (conf->level == 6 && !percpu->spare_page) 4474 percpu->spare_page = alloc_page(GFP_KERNEL); 4475 if (!percpu->scribble) 4476 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4477 4478 if (!percpu->scribble || 4479 (conf->level == 6 && !percpu->spare_page)) { 4480 safe_put_page(percpu->spare_page); 4481 kfree(percpu->scribble); 4482 pr_err("%s: failed memory allocation for cpu%ld\n", 4483 __func__, cpu); 4484 return notifier_from_errno(-ENOMEM); 4485 } 4486 break; 4487 case CPU_DEAD: 4488 case CPU_DEAD_FROZEN: 4489 safe_put_page(percpu->spare_page); 4490 kfree(percpu->scribble); 4491 percpu->spare_page = NULL; 4492 percpu->scribble = NULL; 4493 break; 4494 default: 4495 break; 4496 } 4497 return NOTIFY_OK; 4498 } 4499 #endif 4500 4501 static int raid5_alloc_percpu(struct r5conf *conf) 4502 { 4503 unsigned long cpu; 4504 struct page *spare_page; 4505 struct raid5_percpu __percpu *allcpus; 4506 void *scribble; 4507 int err; 4508 4509 allcpus = alloc_percpu(struct raid5_percpu); 4510 if (!allcpus) 4511 return -ENOMEM; 4512 conf->percpu = allcpus; 4513 4514 get_online_cpus(); 4515 err = 0; 4516 for_each_present_cpu(cpu) { 4517 if (conf->level == 6) { 4518 spare_page = alloc_page(GFP_KERNEL); 4519 if (!spare_page) { 4520 err = -ENOMEM; 4521 break; 4522 } 4523 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4524 } 4525 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4526 if (!scribble) { 4527 err = -ENOMEM; 4528 break; 4529 } 4530 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4531 } 4532 #ifdef CONFIG_HOTPLUG_CPU 4533 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4534 conf->cpu_notify.priority = 0; 4535 if (err == 0) 4536 err = register_cpu_notifier(&conf->cpu_notify); 4537 #endif 4538 put_online_cpus(); 4539 4540 return err; 4541 } 4542 4543 static struct r5conf *setup_conf(struct mddev *mddev) 4544 { 4545 struct r5conf *conf; 4546 int raid_disk, memory, max_disks; 4547 struct md_rdev *rdev; 4548 struct disk_info *disk; 4549 4550 if (mddev->new_level != 5 4551 && mddev->new_level != 4 4552 && mddev->new_level != 6) { 4553 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4554 mdname(mddev), mddev->new_level); 4555 return ERR_PTR(-EIO); 4556 } 4557 if ((mddev->new_level == 5 4558 && !algorithm_valid_raid5(mddev->new_layout)) || 4559 (mddev->new_level == 6 4560 && !algorithm_valid_raid6(mddev->new_layout))) { 4561 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4562 mdname(mddev), mddev->new_layout); 4563 return ERR_PTR(-EIO); 4564 } 4565 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4566 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4567 mdname(mddev), mddev->raid_disks); 4568 return ERR_PTR(-EINVAL); 4569 } 4570 4571 if (!mddev->new_chunk_sectors || 4572 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4573 !is_power_of_2(mddev->new_chunk_sectors)) { 4574 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4575 mdname(mddev), mddev->new_chunk_sectors << 9); 4576 return ERR_PTR(-EINVAL); 4577 } 4578 4579 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 4580 if (conf == NULL) 4581 goto abort; 4582 spin_lock_init(&conf->device_lock); 4583 init_waitqueue_head(&conf->wait_for_stripe); 4584 init_waitqueue_head(&conf->wait_for_overlap); 4585 INIT_LIST_HEAD(&conf->handle_list); 4586 INIT_LIST_HEAD(&conf->hold_list); 4587 INIT_LIST_HEAD(&conf->delayed_list); 4588 INIT_LIST_HEAD(&conf->bitmap_list); 4589 INIT_LIST_HEAD(&conf->inactive_list); 4590 atomic_set(&conf->active_stripes, 0); 4591 atomic_set(&conf->preread_active_stripes, 0); 4592 atomic_set(&conf->active_aligned_reads, 0); 4593 conf->bypass_threshold = BYPASS_THRESHOLD; 4594 conf->recovery_disabled = mddev->recovery_disabled - 1; 4595 4596 conf->raid_disks = mddev->raid_disks; 4597 if (mddev->reshape_position == MaxSector) 4598 conf->previous_raid_disks = mddev->raid_disks; 4599 else 4600 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4601 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 4602 conf->scribble_len = scribble_len(max_disks); 4603 4604 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 4605 GFP_KERNEL); 4606 if (!conf->disks) 4607 goto abort; 4608 4609 conf->mddev = mddev; 4610 4611 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4612 goto abort; 4613 4614 conf->level = mddev->new_level; 4615 if (raid5_alloc_percpu(conf) != 0) 4616 goto abort; 4617 4618 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4619 4620 list_for_each_entry(rdev, &mddev->disks, same_set) { 4621 raid_disk = rdev->raid_disk; 4622 if (raid_disk >= max_disks 4623 || raid_disk < 0) 4624 continue; 4625 disk = conf->disks + raid_disk; 4626 4627 disk->rdev = rdev; 4628 4629 if (test_bit(In_sync, &rdev->flags)) { 4630 char b[BDEVNAME_SIZE]; 4631 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4632 " disk %d\n", 4633 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4634 } else if (rdev->saved_raid_disk != raid_disk) 4635 /* Cannot rely on bitmap to complete recovery */ 4636 conf->fullsync = 1; 4637 } 4638 4639 conf->chunk_sectors = mddev->new_chunk_sectors; 4640 conf->level = mddev->new_level; 4641 if (conf->level == 6) 4642 conf->max_degraded = 2; 4643 else 4644 conf->max_degraded = 1; 4645 conf->algorithm = mddev->new_layout; 4646 conf->max_nr_stripes = NR_STRIPES; 4647 conf->reshape_progress = mddev->reshape_position; 4648 if (conf->reshape_progress != MaxSector) { 4649 conf->prev_chunk_sectors = mddev->chunk_sectors; 4650 conf->prev_algo = mddev->layout; 4651 } 4652 4653 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4654 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4655 if (grow_stripes(conf, conf->max_nr_stripes)) { 4656 printk(KERN_ERR 4657 "md/raid:%s: couldn't allocate %dkB for buffers\n", 4658 mdname(mddev), memory); 4659 goto abort; 4660 } else 4661 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4662 mdname(mddev), memory); 4663 4664 conf->thread = md_register_thread(raid5d, mddev, NULL); 4665 if (!conf->thread) { 4666 printk(KERN_ERR 4667 "md/raid:%s: couldn't allocate thread.\n", 4668 mdname(mddev)); 4669 goto abort; 4670 } 4671 4672 return conf; 4673 4674 abort: 4675 if (conf) { 4676 free_conf(conf); 4677 return ERR_PTR(-EIO); 4678 } else 4679 return ERR_PTR(-ENOMEM); 4680 } 4681 4682 4683 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 4684 { 4685 switch (algo) { 4686 case ALGORITHM_PARITY_0: 4687 if (raid_disk < max_degraded) 4688 return 1; 4689 break; 4690 case ALGORITHM_PARITY_N: 4691 if (raid_disk >= raid_disks - max_degraded) 4692 return 1; 4693 break; 4694 case ALGORITHM_PARITY_0_6: 4695 if (raid_disk == 0 || 4696 raid_disk == raid_disks - 1) 4697 return 1; 4698 break; 4699 case ALGORITHM_LEFT_ASYMMETRIC_6: 4700 case ALGORITHM_RIGHT_ASYMMETRIC_6: 4701 case ALGORITHM_LEFT_SYMMETRIC_6: 4702 case ALGORITHM_RIGHT_SYMMETRIC_6: 4703 if (raid_disk == raid_disks - 1) 4704 return 1; 4705 } 4706 return 0; 4707 } 4708 4709 static int run(struct mddev *mddev) 4710 { 4711 struct r5conf *conf; 4712 int working_disks = 0; 4713 int dirty_parity_disks = 0; 4714 struct md_rdev *rdev; 4715 sector_t reshape_offset = 0; 4716 4717 if (mddev->recovery_cp != MaxSector) 4718 printk(KERN_NOTICE "md/raid:%s: not clean" 4719 " -- starting background reconstruction\n", 4720 mdname(mddev)); 4721 if (mddev->reshape_position != MaxSector) { 4722 /* Check that we can continue the reshape. 4723 * Currently only disks can change, it must 4724 * increase, and we must be past the point where 4725 * a stripe over-writes itself 4726 */ 4727 sector_t here_new, here_old; 4728 int old_disks; 4729 int max_degraded = (mddev->level == 6 ? 2 : 1); 4730 4731 if (mddev->new_level != mddev->level) { 4732 printk(KERN_ERR "md/raid:%s: unsupported reshape " 4733 "required - aborting.\n", 4734 mdname(mddev)); 4735 return -EINVAL; 4736 } 4737 old_disks = mddev->raid_disks - mddev->delta_disks; 4738 /* reshape_position must be on a new-stripe boundary, and one 4739 * further up in new geometry must map after here in old 4740 * geometry. 4741 */ 4742 here_new = mddev->reshape_position; 4743 if (sector_div(here_new, mddev->new_chunk_sectors * 4744 (mddev->raid_disks - max_degraded))) { 4745 printk(KERN_ERR "md/raid:%s: reshape_position not " 4746 "on a stripe boundary\n", mdname(mddev)); 4747 return -EINVAL; 4748 } 4749 reshape_offset = here_new * mddev->new_chunk_sectors; 4750 /* here_new is the stripe we will write to */ 4751 here_old = mddev->reshape_position; 4752 sector_div(here_old, mddev->chunk_sectors * 4753 (old_disks-max_degraded)); 4754 /* here_old is the first stripe that we might need to read 4755 * from */ 4756 if (mddev->delta_disks == 0) { 4757 /* We cannot be sure it is safe to start an in-place 4758 * reshape. It is only safe if user-space if monitoring 4759 * and taking constant backups. 4760 * mdadm always starts a situation like this in 4761 * readonly mode so it can take control before 4762 * allowing any writes. So just check for that. 4763 */ 4764 if ((here_new * mddev->new_chunk_sectors != 4765 here_old * mddev->chunk_sectors) || 4766 mddev->ro == 0) { 4767 printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 4768 " in read-only mode - aborting\n", 4769 mdname(mddev)); 4770 return -EINVAL; 4771 } 4772 } else if (mddev->delta_disks < 0 4773 ? (here_new * mddev->new_chunk_sectors <= 4774 here_old * mddev->chunk_sectors) 4775 : (here_new * mddev->new_chunk_sectors >= 4776 here_old * mddev->chunk_sectors)) { 4777 /* Reading from the same stripe as writing to - bad */ 4778 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 4779 "auto-recovery - aborting.\n", 4780 mdname(mddev)); 4781 return -EINVAL; 4782 } 4783 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 4784 mdname(mddev)); 4785 /* OK, we should be able to continue; */ 4786 } else { 4787 BUG_ON(mddev->level != mddev->new_level); 4788 BUG_ON(mddev->layout != mddev->new_layout); 4789 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 4790 BUG_ON(mddev->delta_disks != 0); 4791 } 4792 4793 if (mddev->private == NULL) 4794 conf = setup_conf(mddev); 4795 else 4796 conf = mddev->private; 4797 4798 if (IS_ERR(conf)) 4799 return PTR_ERR(conf); 4800 4801 mddev->thread = conf->thread; 4802 conf->thread = NULL; 4803 mddev->private = conf; 4804 4805 /* 4806 * 0 for a fully functional array, 1 or 2 for a degraded array. 4807 */ 4808 list_for_each_entry(rdev, &mddev->disks, same_set) { 4809 if (rdev->raid_disk < 0) 4810 continue; 4811 if (test_bit(In_sync, &rdev->flags)) { 4812 working_disks++; 4813 continue; 4814 } 4815 /* This disc is not fully in-sync. However if it 4816 * just stored parity (beyond the recovery_offset), 4817 * when we don't need to be concerned about the 4818 * array being dirty. 4819 * When reshape goes 'backwards', we never have 4820 * partially completed devices, so we only need 4821 * to worry about reshape going forwards. 4822 */ 4823 /* Hack because v0.91 doesn't store recovery_offset properly. */ 4824 if (mddev->major_version == 0 && 4825 mddev->minor_version > 90) 4826 rdev->recovery_offset = reshape_offset; 4827 4828 if (rdev->recovery_offset < reshape_offset) { 4829 /* We need to check old and new layout */ 4830 if (!only_parity(rdev->raid_disk, 4831 conf->algorithm, 4832 conf->raid_disks, 4833 conf->max_degraded)) 4834 continue; 4835 } 4836 if (!only_parity(rdev->raid_disk, 4837 conf->prev_algo, 4838 conf->previous_raid_disks, 4839 conf->max_degraded)) 4840 continue; 4841 dirty_parity_disks++; 4842 } 4843 4844 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) 4845 - working_disks); 4846 4847 if (has_failed(conf)) { 4848 printk(KERN_ERR "md/raid:%s: not enough operational devices" 4849 " (%d/%d failed)\n", 4850 mdname(mddev), mddev->degraded, conf->raid_disks); 4851 goto abort; 4852 } 4853 4854 /* device size must be a multiple of chunk size */ 4855 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 4856 mddev->resync_max_sectors = mddev->dev_sectors; 4857 4858 if (mddev->degraded > dirty_parity_disks && 4859 mddev->recovery_cp != MaxSector) { 4860 if (mddev->ok_start_degraded) 4861 printk(KERN_WARNING 4862 "md/raid:%s: starting dirty degraded array" 4863 " - data corruption possible.\n", 4864 mdname(mddev)); 4865 else { 4866 printk(KERN_ERR 4867 "md/raid:%s: cannot start dirty degraded array.\n", 4868 mdname(mddev)); 4869 goto abort; 4870 } 4871 } 4872 4873 if (mddev->degraded == 0) 4874 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 4875 " devices, algorithm %d\n", mdname(mddev), conf->level, 4876 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 4877 mddev->new_layout); 4878 else 4879 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 4880 " out of %d devices, algorithm %d\n", 4881 mdname(mddev), conf->level, 4882 mddev->raid_disks - mddev->degraded, 4883 mddev->raid_disks, mddev->new_layout); 4884 4885 print_raid5_conf(conf); 4886 4887 if (conf->reshape_progress != MaxSector) { 4888 conf->reshape_safe = conf->reshape_progress; 4889 atomic_set(&conf->reshape_stripes, 0); 4890 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4891 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4892 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4893 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4894 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4895 "reshape"); 4896 } 4897 4898 4899 /* Ok, everything is just fine now */ 4900 if (mddev->to_remove == &raid5_attrs_group) 4901 mddev->to_remove = NULL; 4902 else if (mddev->kobj.sd && 4903 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 4904 printk(KERN_WARNING 4905 "raid5: failed to create sysfs attributes for %s\n", 4906 mdname(mddev)); 4907 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 4908 4909 if (mddev->queue) { 4910 int chunk_size; 4911 /* read-ahead size must cover two whole stripes, which 4912 * is 2 * (datadisks) * chunksize where 'n' is the 4913 * number of raid devices 4914 */ 4915 int data_disks = conf->previous_raid_disks - conf->max_degraded; 4916 int stripe = data_disks * 4917 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 4918 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4919 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4920 4921 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4922 4923 mddev->queue->backing_dev_info.congested_data = mddev; 4924 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4925 4926 chunk_size = mddev->chunk_sectors << 9; 4927 blk_queue_io_min(mddev->queue, chunk_size); 4928 blk_queue_io_opt(mddev->queue, chunk_size * 4929 (conf->raid_disks - conf->max_degraded)); 4930 4931 list_for_each_entry(rdev, &mddev->disks, same_set) 4932 disk_stack_limits(mddev->gendisk, rdev->bdev, 4933 rdev->data_offset << 9); 4934 } 4935 4936 return 0; 4937 abort: 4938 md_unregister_thread(&mddev->thread); 4939 print_raid5_conf(conf); 4940 free_conf(conf); 4941 mddev->private = NULL; 4942 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 4943 return -EIO; 4944 } 4945 4946 static int stop(struct mddev *mddev) 4947 { 4948 struct r5conf *conf = mddev->private; 4949 4950 md_unregister_thread(&mddev->thread); 4951 if (mddev->queue) 4952 mddev->queue->backing_dev_info.congested_fn = NULL; 4953 free_conf(conf); 4954 mddev->private = NULL; 4955 mddev->to_remove = &raid5_attrs_group; 4956 return 0; 4957 } 4958 4959 static void status(struct seq_file *seq, struct mddev *mddev) 4960 { 4961 struct r5conf *conf = mddev->private; 4962 int i; 4963 4964 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 4965 mddev->chunk_sectors / 2, mddev->layout); 4966 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 4967 for (i = 0; i < conf->raid_disks; i++) 4968 seq_printf (seq, "%s", 4969 conf->disks[i].rdev && 4970 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 4971 seq_printf (seq, "]"); 4972 } 4973 4974 static void print_raid5_conf (struct r5conf *conf) 4975 { 4976 int i; 4977 struct disk_info *tmp; 4978 4979 printk(KERN_DEBUG "RAID conf printout:\n"); 4980 if (!conf) { 4981 printk("(conf==NULL)\n"); 4982 return; 4983 } 4984 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 4985 conf->raid_disks, 4986 conf->raid_disks - conf->mddev->degraded); 4987 4988 for (i = 0; i < conf->raid_disks; i++) { 4989 char b[BDEVNAME_SIZE]; 4990 tmp = conf->disks + i; 4991 if (tmp->rdev) 4992 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 4993 i, !test_bit(Faulty, &tmp->rdev->flags), 4994 bdevname(tmp->rdev->bdev, b)); 4995 } 4996 } 4997 4998 static int raid5_spare_active(struct mddev *mddev) 4999 { 5000 int i; 5001 struct r5conf *conf = mddev->private; 5002 struct disk_info *tmp; 5003 int count = 0; 5004 unsigned long flags; 5005 5006 for (i = 0; i < conf->raid_disks; i++) { 5007 tmp = conf->disks + i; 5008 if (tmp->rdev 5009 && tmp->rdev->recovery_offset == MaxSector 5010 && !test_bit(Faulty, &tmp->rdev->flags) 5011 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5012 count++; 5013 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5014 } 5015 } 5016 spin_lock_irqsave(&conf->device_lock, flags); 5017 mddev->degraded -= count; 5018 spin_unlock_irqrestore(&conf->device_lock, flags); 5019 print_raid5_conf(conf); 5020 return count; 5021 } 5022 5023 static int raid5_remove_disk(struct mddev *mddev, int number) 5024 { 5025 struct r5conf *conf = mddev->private; 5026 int err = 0; 5027 struct md_rdev *rdev; 5028 struct disk_info *p = conf->disks + number; 5029 5030 print_raid5_conf(conf); 5031 rdev = p->rdev; 5032 if (rdev) { 5033 if (number >= conf->raid_disks && 5034 conf->reshape_progress == MaxSector) 5035 clear_bit(In_sync, &rdev->flags); 5036 5037 if (test_bit(In_sync, &rdev->flags) || 5038 atomic_read(&rdev->nr_pending)) { 5039 err = -EBUSY; 5040 goto abort; 5041 } 5042 /* Only remove non-faulty devices if recovery 5043 * isn't possible. 5044 */ 5045 if (!test_bit(Faulty, &rdev->flags) && 5046 mddev->recovery_disabled != conf->recovery_disabled && 5047 !has_failed(conf) && 5048 number < conf->raid_disks) { 5049 err = -EBUSY; 5050 goto abort; 5051 } 5052 p->rdev = NULL; 5053 synchronize_rcu(); 5054 if (atomic_read(&rdev->nr_pending)) { 5055 /* lost the race, try later */ 5056 err = -EBUSY; 5057 p->rdev = rdev; 5058 } 5059 } 5060 abort: 5061 5062 print_raid5_conf(conf); 5063 return err; 5064 } 5065 5066 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 5067 { 5068 struct r5conf *conf = mddev->private; 5069 int err = -EEXIST; 5070 int disk; 5071 struct disk_info *p; 5072 int first = 0; 5073 int last = conf->raid_disks - 1; 5074 5075 if (mddev->recovery_disabled == conf->recovery_disabled) 5076 return -EBUSY; 5077 5078 if (has_failed(conf)) 5079 /* no point adding a device */ 5080 return -EINVAL; 5081 5082 if (rdev->raid_disk >= 0) 5083 first = last = rdev->raid_disk; 5084 5085 /* 5086 * find the disk ... but prefer rdev->saved_raid_disk 5087 * if possible. 5088 */ 5089 if (rdev->saved_raid_disk >= 0 && 5090 rdev->saved_raid_disk >= first && 5091 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5092 disk = rdev->saved_raid_disk; 5093 else 5094 disk = first; 5095 for ( ; disk <= last ; disk++) 5096 if ((p=conf->disks + disk)->rdev == NULL) { 5097 clear_bit(In_sync, &rdev->flags); 5098 rdev->raid_disk = disk; 5099 err = 0; 5100 if (rdev->saved_raid_disk != disk) 5101 conf->fullsync = 1; 5102 rcu_assign_pointer(p->rdev, rdev); 5103 break; 5104 } 5105 print_raid5_conf(conf); 5106 return err; 5107 } 5108 5109 static int raid5_resize(struct mddev *mddev, sector_t sectors) 5110 { 5111 /* no resync is happening, and there is enough space 5112 * on all devices, so we can resize. 5113 * We need to make sure resync covers any new space. 5114 * If the array is shrinking we should possibly wait until 5115 * any io in the removed space completes, but it hardly seems 5116 * worth it. 5117 */ 5118 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5119 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5120 mddev->raid_disks)); 5121 if (mddev->array_sectors > 5122 raid5_size(mddev, sectors, mddev->raid_disks)) 5123 return -EINVAL; 5124 set_capacity(mddev->gendisk, mddev->array_sectors); 5125 revalidate_disk(mddev->gendisk); 5126 if (sectors > mddev->dev_sectors && 5127 mddev->recovery_cp > mddev->dev_sectors) { 5128 mddev->recovery_cp = mddev->dev_sectors; 5129 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5130 } 5131 mddev->dev_sectors = sectors; 5132 mddev->resync_max_sectors = sectors; 5133 return 0; 5134 } 5135 5136 static int check_stripe_cache(struct mddev *mddev) 5137 { 5138 /* Can only proceed if there are plenty of stripe_heads. 5139 * We need a minimum of one full stripe,, and for sensible progress 5140 * it is best to have about 4 times that. 5141 * If we require 4 times, then the default 256 4K stripe_heads will 5142 * allow for chunk sizes up to 256K, which is probably OK. 5143 * If the chunk size is greater, user-space should request more 5144 * stripe_heads first. 5145 */ 5146 struct r5conf *conf = mddev->private; 5147 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5148 > conf->max_nr_stripes || 5149 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5150 > conf->max_nr_stripes) { 5151 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5152 mdname(mddev), 5153 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5154 / STRIPE_SIZE)*4); 5155 return 0; 5156 } 5157 return 1; 5158 } 5159 5160 static int check_reshape(struct mddev *mddev) 5161 { 5162 struct r5conf *conf = mddev->private; 5163 5164 if (mddev->delta_disks == 0 && 5165 mddev->new_layout == mddev->layout && 5166 mddev->new_chunk_sectors == mddev->chunk_sectors) 5167 return 0; /* nothing to do */ 5168 if (mddev->bitmap) 5169 /* Cannot grow a bitmap yet */ 5170 return -EBUSY; 5171 if (has_failed(conf)) 5172 return -EINVAL; 5173 if (mddev->delta_disks < 0) { 5174 /* We might be able to shrink, but the devices must 5175 * be made bigger first. 5176 * For raid6, 4 is the minimum size. 5177 * Otherwise 2 is the minimum 5178 */ 5179 int min = 2; 5180 if (mddev->level == 6) 5181 min = 4; 5182 if (mddev->raid_disks + mddev->delta_disks < min) 5183 return -EINVAL; 5184 } 5185 5186 if (!check_stripe_cache(mddev)) 5187 return -ENOSPC; 5188 5189 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5190 } 5191 5192 static int raid5_start_reshape(struct mddev *mddev) 5193 { 5194 struct r5conf *conf = mddev->private; 5195 struct md_rdev *rdev; 5196 int spares = 0; 5197 unsigned long flags; 5198 5199 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5200 return -EBUSY; 5201 5202 if (!check_stripe_cache(mddev)) 5203 return -ENOSPC; 5204 5205 list_for_each_entry(rdev, &mddev->disks, same_set) 5206 if (!test_bit(In_sync, &rdev->flags) 5207 && !test_bit(Faulty, &rdev->flags)) 5208 spares++; 5209 5210 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5211 /* Not enough devices even to make a degraded array 5212 * of that size 5213 */ 5214 return -EINVAL; 5215 5216 /* Refuse to reduce size of the array. Any reductions in 5217 * array size must be through explicit setting of array_size 5218 * attribute. 5219 */ 5220 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5221 < mddev->array_sectors) { 5222 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5223 "before number of disks\n", mdname(mddev)); 5224 return -EINVAL; 5225 } 5226 5227 atomic_set(&conf->reshape_stripes, 0); 5228 spin_lock_irq(&conf->device_lock); 5229 conf->previous_raid_disks = conf->raid_disks; 5230 conf->raid_disks += mddev->delta_disks; 5231 conf->prev_chunk_sectors = conf->chunk_sectors; 5232 conf->chunk_sectors = mddev->new_chunk_sectors; 5233 conf->prev_algo = conf->algorithm; 5234 conf->algorithm = mddev->new_layout; 5235 if (mddev->delta_disks < 0) 5236 conf->reshape_progress = raid5_size(mddev, 0, 0); 5237 else 5238 conf->reshape_progress = 0; 5239 conf->reshape_safe = conf->reshape_progress; 5240 conf->generation++; 5241 spin_unlock_irq(&conf->device_lock); 5242 5243 /* Add some new drives, as many as will fit. 5244 * We know there are enough to make the newly sized array work. 5245 * Don't add devices if we are reducing the number of 5246 * devices in the array. This is because it is not possible 5247 * to correctly record the "partially reconstructed" state of 5248 * such devices during the reshape and confusion could result. 5249 */ 5250 if (mddev->delta_disks >= 0) { 5251 int added_devices = 0; 5252 list_for_each_entry(rdev, &mddev->disks, same_set) 5253 if (rdev->raid_disk < 0 && 5254 !test_bit(Faulty, &rdev->flags)) { 5255 if (raid5_add_disk(mddev, rdev) == 0) { 5256 if (rdev->raid_disk 5257 >= conf->previous_raid_disks) { 5258 set_bit(In_sync, &rdev->flags); 5259 added_devices++; 5260 } else 5261 rdev->recovery_offset = 0; 5262 5263 if (sysfs_link_rdev(mddev, rdev)) 5264 /* Failure here is OK */; 5265 } 5266 } else if (rdev->raid_disk >= conf->previous_raid_disks 5267 && !test_bit(Faulty, &rdev->flags)) { 5268 /* This is a spare that was manually added */ 5269 set_bit(In_sync, &rdev->flags); 5270 added_devices++; 5271 } 5272 5273 /* When a reshape changes the number of devices, 5274 * ->degraded is measured against the larger of the 5275 * pre and post number of devices. 5276 */ 5277 spin_lock_irqsave(&conf->device_lock, flags); 5278 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) 5279 - added_devices; 5280 spin_unlock_irqrestore(&conf->device_lock, flags); 5281 } 5282 mddev->raid_disks = conf->raid_disks; 5283 mddev->reshape_position = conf->reshape_progress; 5284 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5285 5286 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5287 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5288 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5289 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5290 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5291 "reshape"); 5292 if (!mddev->sync_thread) { 5293 mddev->recovery = 0; 5294 spin_lock_irq(&conf->device_lock); 5295 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5296 conf->reshape_progress = MaxSector; 5297 spin_unlock_irq(&conf->device_lock); 5298 return -EAGAIN; 5299 } 5300 conf->reshape_checkpoint = jiffies; 5301 md_wakeup_thread(mddev->sync_thread); 5302 md_new_event(mddev); 5303 return 0; 5304 } 5305 5306 /* This is called from the reshape thread and should make any 5307 * changes needed in 'conf' 5308 */ 5309 static void end_reshape(struct r5conf *conf) 5310 { 5311 5312 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5313 5314 spin_lock_irq(&conf->device_lock); 5315 conf->previous_raid_disks = conf->raid_disks; 5316 conf->reshape_progress = MaxSector; 5317 spin_unlock_irq(&conf->device_lock); 5318 wake_up(&conf->wait_for_overlap); 5319 5320 /* read-ahead size must cover two whole stripes, which is 5321 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5322 */ 5323 if (conf->mddev->queue) { 5324 int data_disks = conf->raid_disks - conf->max_degraded; 5325 int stripe = data_disks * ((conf->chunk_sectors << 9) 5326 / PAGE_SIZE); 5327 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5328 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5329 } 5330 } 5331 } 5332 5333 /* This is called from the raid5d thread with mddev_lock held. 5334 * It makes config changes to the device. 5335 */ 5336 static void raid5_finish_reshape(struct mddev *mddev) 5337 { 5338 struct r5conf *conf = mddev->private; 5339 5340 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5341 5342 if (mddev->delta_disks > 0) { 5343 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5344 set_capacity(mddev->gendisk, mddev->array_sectors); 5345 revalidate_disk(mddev->gendisk); 5346 } else { 5347 int d; 5348 mddev->degraded = conf->raid_disks; 5349 for (d = 0; d < conf->raid_disks ; d++) 5350 if (conf->disks[d].rdev && 5351 test_bit(In_sync, 5352 &conf->disks[d].rdev->flags)) 5353 mddev->degraded--; 5354 for (d = conf->raid_disks ; 5355 d < conf->raid_disks - mddev->delta_disks; 5356 d++) { 5357 struct md_rdev *rdev = conf->disks[d].rdev; 5358 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5359 sysfs_unlink_rdev(mddev, rdev); 5360 rdev->raid_disk = -1; 5361 } 5362 } 5363 } 5364 mddev->layout = conf->algorithm; 5365 mddev->chunk_sectors = conf->chunk_sectors; 5366 mddev->reshape_position = MaxSector; 5367 mddev->delta_disks = 0; 5368 } 5369 } 5370 5371 static void raid5_quiesce(struct mddev *mddev, int state) 5372 { 5373 struct r5conf *conf = mddev->private; 5374 5375 switch(state) { 5376 case 2: /* resume for a suspend */ 5377 wake_up(&conf->wait_for_overlap); 5378 break; 5379 5380 case 1: /* stop all writes */ 5381 spin_lock_irq(&conf->device_lock); 5382 /* '2' tells resync/reshape to pause so that all 5383 * active stripes can drain 5384 */ 5385 conf->quiesce = 2; 5386 wait_event_lock_irq(conf->wait_for_stripe, 5387 atomic_read(&conf->active_stripes) == 0 && 5388 atomic_read(&conf->active_aligned_reads) == 0, 5389 conf->device_lock, /* nothing */); 5390 conf->quiesce = 1; 5391 spin_unlock_irq(&conf->device_lock); 5392 /* allow reshape to continue */ 5393 wake_up(&conf->wait_for_overlap); 5394 break; 5395 5396 case 0: /* re-enable writes */ 5397 spin_lock_irq(&conf->device_lock); 5398 conf->quiesce = 0; 5399 wake_up(&conf->wait_for_stripe); 5400 wake_up(&conf->wait_for_overlap); 5401 spin_unlock_irq(&conf->device_lock); 5402 break; 5403 } 5404 } 5405 5406 5407 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 5408 { 5409 struct r0conf *raid0_conf = mddev->private; 5410 sector_t sectors; 5411 5412 /* for raid0 takeover only one zone is supported */ 5413 if (raid0_conf->nr_strip_zones > 1) { 5414 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 5415 mdname(mddev)); 5416 return ERR_PTR(-EINVAL); 5417 } 5418 5419 sectors = raid0_conf->strip_zone[0].zone_end; 5420 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 5421 mddev->dev_sectors = sectors; 5422 mddev->new_level = level; 5423 mddev->new_layout = ALGORITHM_PARITY_N; 5424 mddev->new_chunk_sectors = mddev->chunk_sectors; 5425 mddev->raid_disks += 1; 5426 mddev->delta_disks = 1; 5427 /* make sure it will be not marked as dirty */ 5428 mddev->recovery_cp = MaxSector; 5429 5430 return setup_conf(mddev); 5431 } 5432 5433 5434 static void *raid5_takeover_raid1(struct mddev *mddev) 5435 { 5436 int chunksect; 5437 5438 if (mddev->raid_disks != 2 || 5439 mddev->degraded > 1) 5440 return ERR_PTR(-EINVAL); 5441 5442 /* Should check if there are write-behind devices? */ 5443 5444 chunksect = 64*2; /* 64K by default */ 5445 5446 /* The array must be an exact multiple of chunksize */ 5447 while (chunksect && (mddev->array_sectors & (chunksect-1))) 5448 chunksect >>= 1; 5449 5450 if ((chunksect<<9) < STRIPE_SIZE) 5451 /* array size does not allow a suitable chunk size */ 5452 return ERR_PTR(-EINVAL); 5453 5454 mddev->new_level = 5; 5455 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 5456 mddev->new_chunk_sectors = chunksect; 5457 5458 return setup_conf(mddev); 5459 } 5460 5461 static void *raid5_takeover_raid6(struct mddev *mddev) 5462 { 5463 int new_layout; 5464 5465 switch (mddev->layout) { 5466 case ALGORITHM_LEFT_ASYMMETRIC_6: 5467 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 5468 break; 5469 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5470 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 5471 break; 5472 case ALGORITHM_LEFT_SYMMETRIC_6: 5473 new_layout = ALGORITHM_LEFT_SYMMETRIC; 5474 break; 5475 case ALGORITHM_RIGHT_SYMMETRIC_6: 5476 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 5477 break; 5478 case ALGORITHM_PARITY_0_6: 5479 new_layout = ALGORITHM_PARITY_0; 5480 break; 5481 case ALGORITHM_PARITY_N: 5482 new_layout = ALGORITHM_PARITY_N; 5483 break; 5484 default: 5485 return ERR_PTR(-EINVAL); 5486 } 5487 mddev->new_level = 5; 5488 mddev->new_layout = new_layout; 5489 mddev->delta_disks = -1; 5490 mddev->raid_disks -= 1; 5491 return setup_conf(mddev); 5492 } 5493 5494 5495 static int raid5_check_reshape(struct mddev *mddev) 5496 { 5497 /* For a 2-drive array, the layout and chunk size can be changed 5498 * immediately as not restriping is needed. 5499 * For larger arrays we record the new value - after validation 5500 * to be used by a reshape pass. 5501 */ 5502 struct r5conf *conf = mddev->private; 5503 int new_chunk = mddev->new_chunk_sectors; 5504 5505 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 5506 return -EINVAL; 5507 if (new_chunk > 0) { 5508 if (!is_power_of_2(new_chunk)) 5509 return -EINVAL; 5510 if (new_chunk < (PAGE_SIZE>>9)) 5511 return -EINVAL; 5512 if (mddev->array_sectors & (new_chunk-1)) 5513 /* not factor of array size */ 5514 return -EINVAL; 5515 } 5516 5517 /* They look valid */ 5518 5519 if (mddev->raid_disks == 2) { 5520 /* can make the change immediately */ 5521 if (mddev->new_layout >= 0) { 5522 conf->algorithm = mddev->new_layout; 5523 mddev->layout = mddev->new_layout; 5524 } 5525 if (new_chunk > 0) { 5526 conf->chunk_sectors = new_chunk ; 5527 mddev->chunk_sectors = new_chunk; 5528 } 5529 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5530 md_wakeup_thread(mddev->thread); 5531 } 5532 return check_reshape(mddev); 5533 } 5534 5535 static int raid6_check_reshape(struct mddev *mddev) 5536 { 5537 int new_chunk = mddev->new_chunk_sectors; 5538 5539 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 5540 return -EINVAL; 5541 if (new_chunk > 0) { 5542 if (!is_power_of_2(new_chunk)) 5543 return -EINVAL; 5544 if (new_chunk < (PAGE_SIZE >> 9)) 5545 return -EINVAL; 5546 if (mddev->array_sectors & (new_chunk-1)) 5547 /* not factor of array size */ 5548 return -EINVAL; 5549 } 5550 5551 /* They look valid */ 5552 return check_reshape(mddev); 5553 } 5554 5555 static void *raid5_takeover(struct mddev *mddev) 5556 { 5557 /* raid5 can take over: 5558 * raid0 - if there is only one strip zone - make it a raid4 layout 5559 * raid1 - if there are two drives. We need to know the chunk size 5560 * raid4 - trivial - just use a raid4 layout. 5561 * raid6 - Providing it is a *_6 layout 5562 */ 5563 if (mddev->level == 0) 5564 return raid45_takeover_raid0(mddev, 5); 5565 if (mddev->level == 1) 5566 return raid5_takeover_raid1(mddev); 5567 if (mddev->level == 4) { 5568 mddev->new_layout = ALGORITHM_PARITY_N; 5569 mddev->new_level = 5; 5570 return setup_conf(mddev); 5571 } 5572 if (mddev->level == 6) 5573 return raid5_takeover_raid6(mddev); 5574 5575 return ERR_PTR(-EINVAL); 5576 } 5577 5578 static void *raid4_takeover(struct mddev *mddev) 5579 { 5580 /* raid4 can take over: 5581 * raid0 - if there is only one strip zone 5582 * raid5 - if layout is right 5583 */ 5584 if (mddev->level == 0) 5585 return raid45_takeover_raid0(mddev, 4); 5586 if (mddev->level == 5 && 5587 mddev->layout == ALGORITHM_PARITY_N) { 5588 mddev->new_layout = 0; 5589 mddev->new_level = 4; 5590 return setup_conf(mddev); 5591 } 5592 return ERR_PTR(-EINVAL); 5593 } 5594 5595 static struct md_personality raid5_personality; 5596 5597 static void *raid6_takeover(struct mddev *mddev) 5598 { 5599 /* Currently can only take over a raid5. We map the 5600 * personality to an equivalent raid6 personality 5601 * with the Q block at the end. 5602 */ 5603 int new_layout; 5604 5605 if (mddev->pers != &raid5_personality) 5606 return ERR_PTR(-EINVAL); 5607 if (mddev->degraded > 1) 5608 return ERR_PTR(-EINVAL); 5609 if (mddev->raid_disks > 253) 5610 return ERR_PTR(-EINVAL); 5611 if (mddev->raid_disks < 3) 5612 return ERR_PTR(-EINVAL); 5613 5614 switch (mddev->layout) { 5615 case ALGORITHM_LEFT_ASYMMETRIC: 5616 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 5617 break; 5618 case ALGORITHM_RIGHT_ASYMMETRIC: 5619 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 5620 break; 5621 case ALGORITHM_LEFT_SYMMETRIC: 5622 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 5623 break; 5624 case ALGORITHM_RIGHT_SYMMETRIC: 5625 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 5626 break; 5627 case ALGORITHM_PARITY_0: 5628 new_layout = ALGORITHM_PARITY_0_6; 5629 break; 5630 case ALGORITHM_PARITY_N: 5631 new_layout = ALGORITHM_PARITY_N; 5632 break; 5633 default: 5634 return ERR_PTR(-EINVAL); 5635 } 5636 mddev->new_level = 6; 5637 mddev->new_layout = new_layout; 5638 mddev->delta_disks = 1; 5639 mddev->raid_disks += 1; 5640 return setup_conf(mddev); 5641 } 5642 5643 5644 static struct md_personality raid6_personality = 5645 { 5646 .name = "raid6", 5647 .level = 6, 5648 .owner = THIS_MODULE, 5649 .make_request = make_request, 5650 .run = run, 5651 .stop = stop, 5652 .status = status, 5653 .error_handler = error, 5654 .hot_add_disk = raid5_add_disk, 5655 .hot_remove_disk= raid5_remove_disk, 5656 .spare_active = raid5_spare_active, 5657 .sync_request = sync_request, 5658 .resize = raid5_resize, 5659 .size = raid5_size, 5660 .check_reshape = raid6_check_reshape, 5661 .start_reshape = raid5_start_reshape, 5662 .finish_reshape = raid5_finish_reshape, 5663 .quiesce = raid5_quiesce, 5664 .takeover = raid6_takeover, 5665 }; 5666 static struct md_personality raid5_personality = 5667 { 5668 .name = "raid5", 5669 .level = 5, 5670 .owner = THIS_MODULE, 5671 .make_request = make_request, 5672 .run = run, 5673 .stop = stop, 5674 .status = status, 5675 .error_handler = error, 5676 .hot_add_disk = raid5_add_disk, 5677 .hot_remove_disk= raid5_remove_disk, 5678 .spare_active = raid5_spare_active, 5679 .sync_request = sync_request, 5680 .resize = raid5_resize, 5681 .size = raid5_size, 5682 .check_reshape = raid5_check_reshape, 5683 .start_reshape = raid5_start_reshape, 5684 .finish_reshape = raid5_finish_reshape, 5685 .quiesce = raid5_quiesce, 5686 .takeover = raid5_takeover, 5687 }; 5688 5689 static struct md_personality raid4_personality = 5690 { 5691 .name = "raid4", 5692 .level = 4, 5693 .owner = THIS_MODULE, 5694 .make_request = make_request, 5695 .run = run, 5696 .stop = stop, 5697 .status = status, 5698 .error_handler = error, 5699 .hot_add_disk = raid5_add_disk, 5700 .hot_remove_disk= raid5_remove_disk, 5701 .spare_active = raid5_spare_active, 5702 .sync_request = sync_request, 5703 .resize = raid5_resize, 5704 .size = raid5_size, 5705 .check_reshape = raid5_check_reshape, 5706 .start_reshape = raid5_start_reshape, 5707 .finish_reshape = raid5_finish_reshape, 5708 .quiesce = raid5_quiesce, 5709 .takeover = raid4_takeover, 5710 }; 5711 5712 static int __init raid5_init(void) 5713 { 5714 register_md_personality(&raid6_personality); 5715 register_md_personality(&raid5_personality); 5716 register_md_personality(&raid4_personality); 5717 return 0; 5718 } 5719 5720 static void raid5_exit(void) 5721 { 5722 unregister_md_personality(&raid6_personality); 5723 unregister_md_personality(&raid5_personality); 5724 unregister_md_personality(&raid4_personality); 5725 } 5726 5727 module_init(raid5_init); 5728 module_exit(raid5_exit); 5729 MODULE_LICENSE("GPL"); 5730 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 5731 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 5732 MODULE_ALIAS("md-raid5"); 5733 MODULE_ALIAS("md-raid4"); 5734 MODULE_ALIAS("md-level-5"); 5735 MODULE_ALIAS("md-level-4"); 5736 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 5737 MODULE_ALIAS("md-raid6"); 5738 MODULE_ALIAS("md-level-6"); 5739 5740 /* This used to be two separate modules, they were: */ 5741 MODULE_ALIAS("raid5"); 5742 MODULE_ALIAS("raid6"); 5743