/*
 * raid5.c : Multiple Devices driver for Linux
 *	  Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	  Copyright (C) 1999, 2000 Ingo Molnar
 *	  Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 * we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 * batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

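/*
 * The stripe hash table occupies a single page.  As an illustration (not
 * normative): with 4 KiB pages and 8-byte list heads, NR_HASH is 512 and
 * stripe_hash() below reduces a stripe's first sector to one of those 512
 * buckets via (sector >> STRIPE_SHIFT) & HASH_MASK.
 */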
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the
 * sector of the current stripe+device
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio->bi_size >> 9;
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
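/*
 * Illustrative example of the packing above: a bi_phys_segments value of
 * 0x00020005 encodes a "hw" (processed) count of 2 in the upper 16 bits and
 * an active ("phys") count of 5 in the lower 16 bits, which is what the
 * accessors below extract and update.
 */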
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline int raid5_dec_bi_hw_segments(struct bio *bio)
{
	unsigned short val = raid5_bi_hw_segments(bio);

	--val;
	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
	return val;
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always start from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
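/*
 * Example (md-native layout, 5 devices, pd_idx == 3, qd_idx == 4): walking
 * from raid6_d0() == 0, the data disks 0, 1 and 2 map to slots 0, 1 and 2,
 * the P disk maps to slot syndrome_disks (3) and the Q disk to slot
 * syndrome_disks + 1 (4).
 */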
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
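/*
 * Drop the last reference to a stripe.  A stripe that still needs handling
 * is queued for the raid5d thread: on conf->delayed_list if preread must be
 * deferred, on conf->bitmap_list if it is waiting for a bitmap batch to be
 * written, otherwise on conf->handle_list.  An idle stripe goes back to
 * conf->inactive_list (unless it is part of an expansion).
 */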
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count)) {
		BUG_ON(!list_empty(&sh->lru));
		BUG_ON(atomic_read(&conf->active_stripes)==0);
		if (test_bit(STRIPE_HANDLE, &sh->state)) {
			if (test_bit(STRIPE_DELAYED, &sh->state))
				list_add_tail(&sh->lru, &conf->delayed_list);
			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
				 sh->bm_seq - conf->seq_write > 0)
				list_add_tail(&sh->lru, &conf->bitmap_list);
			else {
				clear_bit(STRIPE_BIT_DELAY, &sh->state);
				list_add_tail(&sh->lru, &conf->handle_list);
			}
			md_wakeup_thread(conf->mddev->thread);
		} else {
			BUG_ON(stripe_operations_active(sh));
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
			atomic_dec(&conf->active_stripes);
			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
				list_add_tail(&sh->lru, &conf->inactive_list);
				wake_up(&conf->wait_for_stripe);
				if (conf->retry_read_aligned)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}
}

static void release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	__release_stripe(conf, sh);
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}


/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}

static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			    struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;


	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	insert_hash(conf, sh);
}

static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;
	struct hlist_node *hn;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be in_sync in the section most affected by failed devices.
 */
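/*
 * For example: in a RAID6 array max_degraded is 2, so two failed devices
 * leave the array degraded but usable, while a third failure makes
 * has_failed() below return 1.
 */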
static int calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    conf->device_lock, /* nothing */);
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock,
						    );
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			if (atomic_read(&sh->count)) {
				BUG_ON(!list_empty(&sh->lru)
				    && !test_bit(STRIPE_EXPANDING, &sh->state));
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);
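/*
 * Issue the reads and writes that handle_stripe scheduled for this stripe.
 * For each device the request may go to the main rdev, to the replacement
 * rrdev, or to both (writes while a replacement is being rebuilt); known-bad
 * blocks on the target cause the write to be skipped or the rdev to be
 * waited on, as handled below.
 */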
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			continue;

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		bi->bi_rw = rw;
		rbi->bi_rw = rw;
		if (rw & WRITE) {
			bi->bi_end_io = raid5_end_write_request;
			rbi->bi_end_io = raid5_end_write_request;
		} else
			bi->bi_end_io = raid5_end_read_request;

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (rw & WRITE) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while ((rw & WRITE) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bi->bi_bdev = rdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			bi->bi_sector = sh->sector + rdev->data_offset;
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_idx = 0;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
			generic_make_request(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			rbi->bi_bdev = rrdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_rw, i);
			atomic_inc(&sh->count);
			rbi->bi_sector = sh->sector + rrdev->data_offset;
			rbi->bi_flags = 1 << BIO_UPTODATE;
			rbi->bi_idx = 0;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_size = STRIPE_SIZE;
			rbi->bi_next = NULL;
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}
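/*
 * Copy data between a bio and a stripe page.  page_offset below is the byte
 * offset of the bio within the stripe page and may start negative: for
 * example, a bio beginning 8 sectors before this device's sector yields
 * page_offset == -4096, so the first 4096 bytes of the bio are skipped via
 * b_offset before any copying starts.
 */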
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	struct r5conf *conf = sh->raid_conf;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	spin_lock_irq(&conf->device_lock);
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_phys_segments(rbi)) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				rbi = rbi2;
			}
		}
	}
	spin_unlock_irq(&conf->device_lock);
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct r5conf *conf = sh->raid_conf;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&conf->device_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&conf->device_lock);
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, dev->page,
					dev->sector, tx);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}
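/*
 * Rebuild a single missing block for RAID4/5: XOR together the pages of all
 * other devices into the target page.  When only one source remains the XOR
 * degenerates to a plain copy, which is why the count == 1 case below uses
 * async_memcpy() instead of async_xor().
 */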
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}
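/*
 * For an md-native 6-device RAID6 stripe, set_syndrome_sources() places the
 * four data pages in srcs[0..3], P in srcs[4] and Q in srcs[5], and returns
 * 4; callers then pass count + 2 sources to async_gen_syndrome().
 */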
static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = percpu->scribble;
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}
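/*
 * Rebuild two missing blocks for RAID6.  Three cases are handled below:
 * P and Q missing (regenerate the full syndrome), one data block plus Q
 * missing (rebuild the data block by XOR, then regenerate Q), and one data
 * block plus P or two data blocks missing (use the raid6 recovery routines).
 */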
static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}


static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);
}

static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		struct bio *chosen;

		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock_irq(&sh->raid_conf->device_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->raid_conf->device_lock);

			while (wbi && wbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_rw & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				tx = async_copy_data(1, wbi, dev->page,
					dev->sector, tx);
				wbi = r5_next_bio(wbi, dev->sector);
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; )
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			set_bit(R5_UPTODATE, &dev->flags);
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
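/*
 * Recompute parity for a RAID4/5 stripe.  When the prexor (read-modify-write)
 * path ran, only the freshly written blocks are XORed back into the parity
 * page and ASYNC_TX_XOR_DROP_DST reuses it as a source; otherwise parity is
 * rebuilt from scratch over all data blocks with ASYNC_TX_XOR_ZERO_DST.
 */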
static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (dev->written)
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	flags = ASYNC_TX_ACK |
		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

	atomic_inc(&sh->count);

	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
			  to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
}

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks = percpu->scribble;
	int count;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	count = set_syndrome_sources(blocks, sh);

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
			  sh, to_addr_conv(sh, percpu));
	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
}

static void ops_complete_check(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	sh->check_state = check_state_check_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
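/*
 * Parity-check operations used by the 'check'/'repair' sync paths:
 * ops_run_check_p() XOR-validates P against the data blocks (the result in
 * sh->ops.zero_sum_result should be zero), while ops_run_check_pq() uses a
 * syndrome validation to check Q and, when checkp is set, P as well.
 */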
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct page *xor_dest;
	struct page **xor_srcs = percpu->scribble;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int count;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	count = 0;
	xor_dest = sh->dev[pd_idx].page;
	xor_srcs[count++] = xor_dest;
	for (i = disks; i--; ) {
		if (i == pd_idx || i == qd_idx)
			continue;
		xor_srcs[count++] = sh->dev[i].page;
	}

	init_async_submit(&submit, 0, NULL, NULL, NULL,
			  to_addr_conv(sh, percpu));
	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, &submit);

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
	tx = async_trigger_callback(&submit);
}

static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
	struct page **srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count;

	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
		(unsigned long long)sh->sector, checkp);

	count = set_syndrome_sources(srcs, sh);
	if (!checkp)
		srcs[count] = NULL;

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
			  sh, to_addr_conv(sh, percpu));
	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
}

static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	int overlap_clear = 0, i, disks = sh->disks;
	struct dma_async_tx_descriptor *tx = NULL;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;
	struct raid5_percpu *percpu;
	unsigned long cpu;

	cpu = get_cpu();
	percpu = per_cpu_ptr(conf->percpu, cpu);
	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
		ops_run_biofill(sh);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
		if (level < 6)
			tx = ops_run_compute5(sh, percpu);
		else {
			if (sh->ops.target2 < 0 || sh->ops.target < 0)
				tx = ops_run_compute6_1(sh, percpu);
			else
				tx = ops_run_compute6_2(sh, percpu);
		}
		/* terminate the chain if reconstruct is not set to be run */
		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
			async_tx_ack(tx);
	}

	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
		tx = ops_run_prexor(sh, percpu, tx);

	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
		tx = ops_run_biodrain(sh, tx);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
		if (level < 6)
			ops_run_reconstruct5(sh, percpu, tx);
		else
			ops_run_reconstruct6(sh, percpu, tx);
	}

	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
		if (sh->check_state == check_state_run)
			ops_run_check_p(sh, percpu);
		else if (sh->check_state == check_state_run_q)
			ops_run_check_pq(sh, percpu, 0);
		else if (sh->check_state == check_state_run_pq)
			ops_run_check_pq(sh, percpu, 1);
		else
			BUG();
	}

	if (overlap_clear)
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_Overlap, &dev->flags))
				wake_up(&sh->raid_conf->wait_for_overlap);
		}
	put_cpu();
}

#ifdef CONFIG_MULTICORE_RAID456
static void async_run_ops(void *param, async_cookie_t cookie)
{
	struct stripe_head *sh = param;
	unsigned long ops_request = sh->ops.request;

	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
	wake_up(&sh->ops.wait_for_ops);

	__raid_run_ops(sh, ops_request);
	release_stripe(sh);
}

static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	/* since handle_stripe can be called outside of raid5d context
	 * we need to ensure sh->ops.request is de-staged before another
	 * request arrives
	 */
	wait_event(sh->ops.wait_for_ops,
		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
	sh->ops.request = ops_request;

	atomic_inc(&sh->count);
	async_schedule(async_run_ops, sh);
}
#else
#define raid_run_ops __raid_run_ops
#endif
static int grow_one_stripe(struct r5conf *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;

	sh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
#endif

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(struct r5conf *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}
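/*
 * Illustration: for a 6-device array, scribble_len(6) reserves room for
 * eight struct page pointers followed by eight addr_conv_t entries, matching
 * the layout that to_addr_conv() assumes above.
 */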
static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	unsigned long cpu;
	int err;
	struct kmem_cache *sc;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		nsh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
		init_waitqueue_head(&nsh->ops.wait_for_ops);
#endif

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(nsh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock,
				    );
		osh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);
		atomic_set(&nsh->count, 1);
		for(i=0; i<conf->pool_size; i++)
			nsh->dev[i].page = osh->dev[i].page;
		for( ; i<newsize; i++)
			nsh->dev[i].page = NULL;
		kmem_cache_free(conf->slab_cache, osh);
	}
	kmem_cache_destroy(conf->slab_cache);
	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks and the scribble region
	 */
	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
	if (ndisks) {
		for (i=0; i<conf->raid_disks; i++)
			ndisks[i] = conf->disks[i];
		kfree(conf->disks);
		conf->disks = ndisks;
	} else
		err = -ENOMEM;

	get_online_cpus();
	conf->scribble_len = scribble_len(newsize);
	for_each_present_cpu(cpu) {
		struct raid5_percpu *percpu;
		void *scribble;

		percpu = per_cpu_ptr(conf->percpu, cpu);
		scribble = kmalloc(conf->scribble_len, GFP_NOIO);

		if (scribble) {
			kfree(percpu->scribble);
			percpu->scribble = scribble;
		} else {
			err = -ENOMEM;
			break;
		}
	}
	put_online_cpus();

	/* Step 4, return new stripes to service */
	while(!list_empty(&newstripes)) {
		nsh = list_entry(newstripes.next, struct stripe_head, lru);
		list_del_init(&nsh->lru);

		for (i=conf->raid_disks; i < newsize; i++)
			if (nsh->dev[i].page == NULL) {
				struct page *p = alloc_page(GFP_NOIO);
				nsh->dev[i].page = p;
				if (!p)
					err = -ENOMEM;
			}
		release_stripe(nsh);
	}
	/* critical section passed, GFP_NOIO no longer needed */

	conf->slab_cache = sc;
	conf->active_name = 1-conf->active_name;
	conf->pool_size = newsize;
	return err;
}

static int drop_one_stripe(struct r5conf *conf)
{
	struct stripe_head *sh;

	spin_lock_irq(&conf->device_lock);
	sh = get_free_stripe(conf);
	spin_unlock_irq(&conf->device_lock);
	if (!sh)
		return 0;
	BUG_ON(atomic_read(&sh->count));
	shrink_buffers(sh);
	kmem_cache_free(conf->slab_cache, sh);
	atomic_dec(&conf->active_stripes);
	return 1;
}

static void shrink_stripes(struct r5conf *conf)
{
	while (drop_one_stripe(conf))
		;

	if (conf->slab_cache)
		kmem_cache_destroy(conf->slab_cache);
	conf->slab_cache = NULL;
}
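/*
 * Completion handler for the reads issued by ops_run_io().  A successful
 * read that had previously failed is reported as corrected; on failure the
 * block is either retried (R5_ReadError stays set) or, when retrying is
 * pointless, the device is failed via md_error().
 */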
static void raid5_end_read_request(struct bio * bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	char b[BDEVNAME_SIZE];
	struct md_rdev *rdev = NULL;


	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return;
	}
	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
		/* If replacement finished while this request was outstanding,
		 * 'replacement' might be NULL already.
		 * In that case it moved down to 'rdev'.
		 * rdev is not removed until all requests are finished.
		 */
		rdev = conf->disks[i].replacement;
	if (!rdev)
		rdev = conf->disks[i].rdev;

	if (uptodate) {
		set_bit(R5_UPTODATE, &sh->dev[i].flags);
		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/* Note that this cannot happen on a
			 * replacement device.  We just fail those on
			 * any error
			 */
			printk_ratelimited(
				KERN_INFO
				"md/raid:%s: read error corrected"
				" (%lu sectors at %llu on %s)\n",
				mdname(conf->mddev), STRIPE_SECTORS,
				(unsigned long long)(sh->sector
						     + rdev->data_offset),
				bdevname(rdev->bdev, b));
			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
		}
		if (atomic_read(&rdev->read_errors))
			atomic_set(&rdev->read_errors, 0);
	} else {
		const char *bdn = bdevname(rdev->bdev, b);
		int retry = 0;

		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
		atomic_inc(&rdev->read_errors);
		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error on replacement device "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)(sh->sector
						     + rdev->data_offset),
				bdn);
		else if (conf->mddev->degraded >= conf->max_degraded)
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error not correctable "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)(sh->sector
						     + rdev->data_offset),
				bdn);
		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
			/* Oh, no!!! */
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error NOT corrected!! "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)(sh->sector
						     + rdev->data_offset),
				bdn);
		else if (atomic_read(&rdev->read_errors)
			 > conf->max_nr_stripes)
			printk(KERN_WARNING
			       "md/raid:%s: Too many read errors, failing device %s.\n",
			       mdname(conf->mddev), bdn);
		else
			retry = 1;
		if (retry)
			set_bit(R5_ReadError, &sh->dev[i].flags);
		else {
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
			md_error(conf->mddev, rdev);
		}
	}
	rdev_dec_pending(rdev, conf->mddev);
	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
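/*
 * Completion handler for writes.  The bio is matched back to either the main
 * request (dev->req) or the replacement request (dev->rreq); failures on the
 * main device are recorded as write errors and request a replacement, while
 * writes that covered a known bad block are flagged MadeGood/MadeGoodRepl so
 * the bad-block record can be cleared later.
 */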
static void raid5_end_write_request(struct bio *bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks, i;
	struct md_rdev *uninitialized_var(rdev);
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	sector_t first_bad;
	int bad_sectors;
	int replacement = 0;

	for (i = 0 ; i < disks; i++) {
		if (bi == &sh->dev[i].req) {
			rdev = conf->disks[i].rdev;
			break;
		}
		if (bi == &sh->dev[i].rreq) {
			rdev = conf->disks[i].replacement;
			if (rdev)
				replacement = 1;
			else
				/* rdev was removed and 'replacement'
				 * replaced it.  rdev is not removed
				 * until all requests are finished.
				 */
				rdev = conf->disks[i].rdev;
			break;
		}
	}
	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return;
	}

	if (replacement) {
		if (!uptodate)
			md_error(conf->mddev, rdev);
		else if (is_badblock(rdev, sh->sector,
				     STRIPE_SECTORS,
				     &first_bad, &bad_sectors))
			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
	} else {
		if (!uptodate) {
			set_bit(WriteErrorSeen, &rdev->flags);
			set_bit(R5_WriteError, &sh->dev[i].flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
		} else if (is_badblock(rdev, sh->sector,
				       STRIPE_SECTORS,
				       &first_bad, &bad_sectors))
			set_bit(R5_MadeGood, &sh->dev[i].flags);
	}
	rdev_dec_pending(rdev, conf->mddev);

	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
		clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);

static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
	struct r5dev *dev = &sh->dev[i];

	bio_init(&dev->req);
	dev->req.bi_io_vec = &dev->vec;
	dev->req.bi_vcnt++;
	dev->req.bi_max_vecs++;
	dev->req.bi_private = sh;
	dev->vec.bv_page = dev->page;

	bio_init(&dev->rreq);
	dev->rreq.bi_io_vec = &dev->rvec;
	dev->rreq.bi_vcnt++;
	dev->rreq.bi_max_vecs++;
	dev->rreq.bi_private = sh;
	dev->rvec.bv_page = dev->page;

	dev->flags = 0;
	dev->sector = compute_blocknr(sh, i, previous);
}

static void error(struct mddev *mddev, struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	struct r5conf *conf = mddev->private;
	unsigned long flags;
	pr_debug("raid456: error called\n");

	spin_lock_irqsave(&conf->device_lock, flags);
	clear_bit(In_sync, &rdev->flags);
	mddev->degraded = calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);

	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	printk(KERN_ALERT
	       "md/raid:%s: Disk failure on %s, disabling device.\n"
	       "md/raid:%s: Operation continuing on %d devices.\n",
	       mdname(mddev),
	       bdevname(rdev->bdev, b),
	       mdname(mddev),
	       conf->raid_disks - mddev->degraded);
}

/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
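/*
 * Worked example (illustrative numbers): for a 4-device RAID5 using the
 * left-symmetric algorithm and 64 KiB chunks (128 sectors), array sector 300
 * gives chunk_offset 44 and chunk_number 2, hence stripe 0 and dd_idx 2; the
 * parity for that stripe lands on device 3 and the data on device 2 at
 * device sector 44.
 */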
static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
				     int previous, int *dd_idx,
				     struct stripe_head *sh)
{
	sector_t stripe, stripe2;
	sector_t chunk_number;
	unsigned int chunk_offset;
	int pd_idx, qd_idx;
	int ddf_layout = 0;
	sector_t new_sector;
	int algorithm = previous ? conf->prev_algo
				 : conf->algorithm;
	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
					 : conf->chunk_sectors;
	int raid_disks = previous ? conf->previous_raid_disks
				  : conf->raid_disks;
	int data_disks = raid_disks - conf->max_degraded;

	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
	chunk_offset = sector_div(r_sector, sectors_per_chunk);
	chunk_number = r_sector;

	/*
	 * Compute the stripe number
	 */
	stripe = chunk_number;
	*dd_idx = sector_div(stripe, data_disks);
	stripe2 = stripe;
	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
	pd_idx = qd_idx = -1;
	switch(conf->level) {
	case 4:
		pd_idx = data_disks;
		break;
	case 5:
		switch (algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			pd_idx = data_disks - sector_div(stripe2, raid_disks);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			if (*dd_idx >= pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			pd_idx = data_disks - sector_div(stripe2, raid_disks);
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			(*dd_idx)++;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			break;
		default:
			BUG();
		}
		break;
	case 6:

		switch (algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = pd_idx + 1;
			if (pd_idx == raid_disks-1) {
				(*dd_idx)++;	/* Q D D D P */
				qd_idx = 0;
			} else if (*dd_idx >= pd_idx)
				(*dd_idx) += 2; /* D D P Q D */
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + 1) % raid_disks;
			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			pd_idx = sector_div(stripe2, raid_disks);
			qd_idx = (pd_idx + 1) % raid_disks;
			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
			break;

		case ALGORITHM_PARITY_0:
			pd_idx = 0;
			qd_idx = 1;
			(*dd_idx) += 2;
			break;
		case ALGORITHM_PARITY_N:
			pd_idx = data_disks;
			qd_idx = data_disks + 1;
			break;

		case ALGORITHM_ROTATING_ZERO_RESTART:
			/* Exactly the same as RIGHT_ASYMMETRIC, but the
			 * order of blocks for computing Q is different.
			 */
1984 */ 1985 pd_idx = sector_div(stripe2, raid_disks); 1986 qd_idx = pd_idx + 1; 1987 if (pd_idx == raid_disks-1) { 1988 (*dd_idx)++; /* Q D D D P */ 1989 qd_idx = 0; 1990 } else if (*dd_idx >= pd_idx) 1991 (*dd_idx) += 2; /* D D P Q D */ 1992 ddf_layout = 1; 1993 break; 1994 1995 case ALGORITHM_ROTATING_N_RESTART: 1996 /* Same a left_asymmetric, by first stripe is 1997 * D D D P Q rather than 1998 * Q D D D P 1999 */ 2000 stripe2 += 1; 2001 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2002 qd_idx = pd_idx + 1; 2003 if (pd_idx == raid_disks-1) { 2004 (*dd_idx)++; /* Q D D D P */ 2005 qd_idx = 0; 2006 } else if (*dd_idx >= pd_idx) 2007 (*dd_idx) += 2; /* D D P Q D */ 2008 ddf_layout = 1; 2009 break; 2010 2011 case ALGORITHM_ROTATING_N_CONTINUE: 2012 /* Same as left_symmetric but Q is before P */ 2013 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2014 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2015 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2016 ddf_layout = 1; 2017 break; 2018 2019 case ALGORITHM_LEFT_ASYMMETRIC_6: 2020 /* RAID5 left_asymmetric, with Q on last device */ 2021 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2022 if (*dd_idx >= pd_idx) 2023 (*dd_idx)++; 2024 qd_idx = raid_disks - 1; 2025 break; 2026 2027 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2028 pd_idx = sector_div(stripe2, raid_disks-1); 2029 if (*dd_idx >= pd_idx) 2030 (*dd_idx)++; 2031 qd_idx = raid_disks - 1; 2032 break; 2033 2034 case ALGORITHM_LEFT_SYMMETRIC_6: 2035 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2036 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2037 qd_idx = raid_disks - 1; 2038 break; 2039 2040 case ALGORITHM_RIGHT_SYMMETRIC_6: 2041 pd_idx = sector_div(stripe2, raid_disks-1); 2042 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2043 qd_idx = raid_disks - 1; 2044 break; 2045 2046 case ALGORITHM_PARITY_0_6: 2047 pd_idx = 0; 2048 (*dd_idx)++; 2049 qd_idx = raid_disks - 1; 2050 break; 2051 2052 default: 2053 BUG(); 2054 } 2055 break; 2056 } 2057 2058 if (sh) { 2059 sh->pd_idx = pd_idx; 2060 sh->qd_idx = qd_idx; 2061 sh->ddf_layout = ddf_layout; 2062 } 2063 /* 2064 * Finally, compute the new sector number 2065 */ 2066 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2067 return new_sector; 2068 } 2069 2070 2071 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2072 { 2073 struct r5conf *conf = sh->raid_conf; 2074 int raid_disks = sh->disks; 2075 int data_disks = raid_disks - conf->max_degraded; 2076 sector_t new_sector = sh->sector, check; 2077 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2078 : conf->chunk_sectors; 2079 int algorithm = previous ? 
conf->prev_algo 2080 : conf->algorithm; 2081 sector_t stripe; 2082 int chunk_offset; 2083 sector_t chunk_number; 2084 int dummy1, dd_idx = i; 2085 sector_t r_sector; 2086 struct stripe_head sh2; 2087 2088 2089 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2090 stripe = new_sector; 2091 2092 if (i == sh->pd_idx) 2093 return 0; 2094 switch(conf->level) { 2095 case 4: break; 2096 case 5: 2097 switch (algorithm) { 2098 case ALGORITHM_LEFT_ASYMMETRIC: 2099 case ALGORITHM_RIGHT_ASYMMETRIC: 2100 if (i > sh->pd_idx) 2101 i--; 2102 break; 2103 case ALGORITHM_LEFT_SYMMETRIC: 2104 case ALGORITHM_RIGHT_SYMMETRIC: 2105 if (i < sh->pd_idx) 2106 i += raid_disks; 2107 i -= (sh->pd_idx + 1); 2108 break; 2109 case ALGORITHM_PARITY_0: 2110 i -= 1; 2111 break; 2112 case ALGORITHM_PARITY_N: 2113 break; 2114 default: 2115 BUG(); 2116 } 2117 break; 2118 case 6: 2119 if (i == sh->qd_idx) 2120 return 0; /* It is the Q disk */ 2121 switch (algorithm) { 2122 case ALGORITHM_LEFT_ASYMMETRIC: 2123 case ALGORITHM_RIGHT_ASYMMETRIC: 2124 case ALGORITHM_ROTATING_ZERO_RESTART: 2125 case ALGORITHM_ROTATING_N_RESTART: 2126 if (sh->pd_idx == raid_disks-1) 2127 i--; /* Q D D D P */ 2128 else if (i > sh->pd_idx) 2129 i -= 2; /* D D P Q D */ 2130 break; 2131 case ALGORITHM_LEFT_SYMMETRIC: 2132 case ALGORITHM_RIGHT_SYMMETRIC: 2133 if (sh->pd_idx == raid_disks-1) 2134 i--; /* Q D D D P */ 2135 else { 2136 /* D D P Q D */ 2137 if (i < sh->pd_idx) 2138 i += raid_disks; 2139 i -= (sh->pd_idx + 2); 2140 } 2141 break; 2142 case ALGORITHM_PARITY_0: 2143 i -= 2; 2144 break; 2145 case ALGORITHM_PARITY_N: 2146 break; 2147 case ALGORITHM_ROTATING_N_CONTINUE: 2148 /* Like left_symmetric, but P is before Q */ 2149 if (sh->pd_idx == 0) 2150 i--; /* P D D D Q */ 2151 else { 2152 /* D D Q P D */ 2153 if (i < sh->pd_idx) 2154 i += raid_disks; 2155 i -= (sh->pd_idx + 1); 2156 } 2157 break; 2158 case ALGORITHM_LEFT_ASYMMETRIC_6: 2159 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2160 if (i > sh->pd_idx) 2161 i--; 2162 break; 2163 case ALGORITHM_LEFT_SYMMETRIC_6: 2164 case ALGORITHM_RIGHT_SYMMETRIC_6: 2165 if (i < sh->pd_idx) 2166 i += data_disks + 1; 2167 i -= (sh->pd_idx + 1); 2168 break; 2169 case ALGORITHM_PARITY_0_6: 2170 i -= 1; 2171 break; 2172 default: 2173 BUG(); 2174 } 2175 break; 2176 } 2177 2178 chunk_number = stripe * data_disks + i; 2179 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2180 2181 check = raid5_compute_sector(conf, r_sector, 2182 previous, &dummy1, &sh2); 2183 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2184 || sh2.qd_idx != sh->qd_idx) { 2185 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2186 mdname(conf->mddev)); 2187 return 0; 2188 } 2189 return r_sector; 2190 } 2191 2192 2193 static void 2194 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2195 int rcw, int expand) 2196 { 2197 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2198 struct r5conf *conf = sh->raid_conf; 2199 int level = conf->level; 2200 2201 if (rcw) { 2202 /* if we are not expanding this is a proper write request, and 2203 * there will be bios with new data to be drained into the 2204 * stripe cache 2205 */ 2206 if (!expand) { 2207 sh->reconstruct_state = reconstruct_state_drain_run; 2208 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2209 } else 2210 sh->reconstruct_state = reconstruct_state_run; 2211 2212 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2213 2214 for (i = disks; i--; ) { 2215 struct r5dev *dev = &sh->dev[i]; 2216 2217 if (dev->towrite) { 2218 
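/* new data will be drained into this block: lock it, flag it for the
 * biodrain pass and, unless we are expanding, drop R5_UPTODATE as the
 * cached page is about to change
 */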
set_bit(R5_LOCKED, &dev->flags); 2219 set_bit(R5_Wantdrain, &dev->flags); 2220 if (!expand) 2221 clear_bit(R5_UPTODATE, &dev->flags); 2222 s->locked++; 2223 } 2224 } 2225 if (s->locked + conf->max_degraded == disks) 2226 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2227 atomic_inc(&conf->pending_full_writes); 2228 } else { 2229 BUG_ON(level == 6); 2230 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2231 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2232 2233 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2234 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2235 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2236 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2237 2238 for (i = disks; i--; ) { 2239 struct r5dev *dev = &sh->dev[i]; 2240 if (i == pd_idx) 2241 continue; 2242 2243 if (dev->towrite && 2244 (test_bit(R5_UPTODATE, &dev->flags) || 2245 test_bit(R5_Wantcompute, &dev->flags))) { 2246 set_bit(R5_Wantdrain, &dev->flags); 2247 set_bit(R5_LOCKED, &dev->flags); 2248 clear_bit(R5_UPTODATE, &dev->flags); 2249 s->locked++; 2250 } 2251 } 2252 } 2253 2254 /* keep the parity disk(s) locked while asynchronous operations 2255 * are in flight 2256 */ 2257 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2258 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2259 s->locked++; 2260 2261 if (level == 6) { 2262 int qd_idx = sh->qd_idx; 2263 struct r5dev *dev = &sh->dev[qd_idx]; 2264 2265 set_bit(R5_LOCKED, &dev->flags); 2266 clear_bit(R5_UPTODATE, &dev->flags); 2267 s->locked++; 2268 } 2269 2270 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2271 __func__, (unsigned long long)sh->sector, 2272 s->locked, s->ops_request); 2273 } 2274 2275 /* 2276 * Each stripe/dev can have one or more bion attached. 2277 * toread/towrite point to the first in a chain. 2278 * The bi_next chain must be in order. 
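 * add_stripe_bio() below inserts a bio at the position that keeps that
 * order, refuses a bio that would overlap one already queued (setting
 * R5_Overlap so the submitter can wait and retry), and sets
 * R5_OVERWRITE once the queued writes cover this device's whole
 * STRIPE_SECTORS range.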
2279 */ 2280 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2281 { 2282 struct bio **bip; 2283 struct r5conf *conf = sh->raid_conf; 2284 int firstwrite=0; 2285 2286 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2287 (unsigned long long)bi->bi_sector, 2288 (unsigned long long)sh->sector); 2289 2290 2291 spin_lock_irq(&conf->device_lock); 2292 if (forwrite) { 2293 bip = &sh->dev[dd_idx].towrite; 2294 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2295 firstwrite = 1; 2296 } else 2297 bip = &sh->dev[dd_idx].toread; 2298 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2299 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2300 goto overlap; 2301 bip = & (*bip)->bi_next; 2302 } 2303 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2304 goto overlap; 2305 2306 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2307 if (*bip) 2308 bi->bi_next = *bip; 2309 *bip = bi; 2310 bi->bi_phys_segments++; 2311 2312 if (forwrite) { 2313 /* check if page is covered */ 2314 sector_t sector = sh->dev[dd_idx].sector; 2315 for (bi=sh->dev[dd_idx].towrite; 2316 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2317 bi && bi->bi_sector <= sector; 2318 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2319 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2320 sector = bi->bi_sector + (bi->bi_size>>9); 2321 } 2322 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2323 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2324 } 2325 spin_unlock_irq(&conf->device_lock); 2326 2327 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2328 (unsigned long long)(*bip)->bi_sector, 2329 (unsigned long long)sh->sector, dd_idx); 2330 2331 if (conf->mddev->bitmap && firstwrite) { 2332 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2333 STRIPE_SECTORS, 0); 2334 sh->bm_seq = conf->seq_flush+1; 2335 set_bit(STRIPE_BIT_DELAY, &sh->state); 2336 } 2337 return 1; 2338 2339 overlap: 2340 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2341 spin_unlock_irq(&conf->device_lock); 2342 return 0; 2343 } 2344 2345 static void end_reshape(struct r5conf *conf); 2346 2347 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2348 struct stripe_head *sh) 2349 { 2350 int sectors_per_chunk = 2351 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2352 int dd_idx; 2353 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2354 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2355 2356 raid5_compute_sector(conf, 2357 stripe * (disks - conf->max_degraded) 2358 *sectors_per_chunk + chunk_offset, 2359 previous, 2360 &dd_idx, sh); 2361 } 2362 2363 static void 2364 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2365 struct stripe_head_state *s, int disks, 2366 struct bio **return_bi) 2367 { 2368 int i; 2369 for (i = disks; i--; ) { 2370 struct bio *bi; 2371 int bitmap_end = 0; 2372 2373 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2374 struct md_rdev *rdev; 2375 rcu_read_lock(); 2376 rdev = rcu_dereference(conf->disks[i].rdev); 2377 if (rdev && test_bit(In_sync, &rdev->flags)) 2378 atomic_inc(&rdev->nr_pending); 2379 else 2380 rdev = NULL; 2381 rcu_read_unlock(); 2382 if (rdev) { 2383 if (!rdev_set_badblocks( 2384 rdev, 2385 sh->sector, 2386 STRIPE_SECTORS, 0)) 2387 md_error(conf->mddev, rdev); 2388 rdev_dec_pending(rdev, conf->mddev); 2389 } 2390 } 2391 spin_lock_irq(&conf->device_lock); 2392 /* fail all writes first */ 2393 bi = sh->dev[i].towrite; 2394 sh->dev[i].towrite = NULL; 2395 if (bi) { 2396 s->to_write--; 2397 bitmap_end = 1; 2398 } 2399 2400 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2401 wake_up(&conf->wait_for_overlap); 2402 2403 while (bi && bi->bi_sector < 2404 sh->dev[i].sector + STRIPE_SECTORS) { 2405 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2406 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2407 if (!raid5_dec_bi_phys_segments(bi)) { 2408 md_write_end(conf->mddev); 2409 bi->bi_next = *return_bi; 2410 *return_bi = bi; 2411 } 2412 bi = nextbi; 2413 } 2414 /* and fail all 'written' */ 2415 bi = sh->dev[i].written; 2416 sh->dev[i].written = NULL; 2417 if (bi) bitmap_end = 1; 2418 while (bi && bi->bi_sector < 2419 sh->dev[i].sector + STRIPE_SECTORS) { 2420 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2421 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2422 if (!raid5_dec_bi_phys_segments(bi)) { 2423 md_write_end(conf->mddev); 2424 bi->bi_next = *return_bi; 2425 *return_bi = bi; 2426 } 2427 bi = bi2; 2428 } 2429 2430 /* fail any reads if this device is non-operational and 2431 * the data has not reached the cache yet. 
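 * (Reads with R5_Wantfill set are left alone: that flag is only set
 * once the data is already uptodate in the stripe cache, so the
 * biofill pass can still complete them.)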
2432 */ 2433 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2434 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2435 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2436 bi = sh->dev[i].toread; 2437 sh->dev[i].toread = NULL; 2438 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2439 wake_up(&conf->wait_for_overlap); 2440 if (bi) s->to_read--; 2441 while (bi && bi->bi_sector < 2442 sh->dev[i].sector + STRIPE_SECTORS) { 2443 struct bio *nextbi = 2444 r5_next_bio(bi, sh->dev[i].sector); 2445 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2446 if (!raid5_dec_bi_phys_segments(bi)) { 2447 bi->bi_next = *return_bi; 2448 *return_bi = bi; 2449 } 2450 bi = nextbi; 2451 } 2452 } 2453 spin_unlock_irq(&conf->device_lock); 2454 if (bitmap_end) 2455 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2456 STRIPE_SECTORS, 0, 0); 2457 /* If we were in the middle of a write the parity block might 2458 * still be locked - so just clear all R5_LOCKED flags 2459 */ 2460 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2461 } 2462 2463 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2464 if (atomic_dec_and_test(&conf->pending_full_writes)) 2465 md_wakeup_thread(conf->mddev->thread); 2466 } 2467 2468 static void 2469 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2470 struct stripe_head_state *s) 2471 { 2472 int abort = 0; 2473 int i; 2474 2475 md_done_sync(conf->mddev, STRIPE_SECTORS, 0); 2476 clear_bit(STRIPE_SYNCING, &sh->state); 2477 s->syncing = 0; 2478 s->replacing = 0; 2479 /* There is nothing more to do for sync/check/repair. 2480 * For recover/replace we need to record a bad block on all 2481 * non-sync devices, or abort the recovery 2482 */ 2483 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) 2484 return; 2485 /* During recovery devices cannot be removed, so locking and 2486 * refcounting of rdevs is not needed 2487 */ 2488 for (i = 0; i < conf->raid_disks; i++) { 2489 struct md_rdev *rdev = conf->disks[i].rdev; 2490 if (rdev 2491 && !test_bit(Faulty, &rdev->flags) 2492 && !test_bit(In_sync, &rdev->flags) 2493 && !rdev_set_badblocks(rdev, sh->sector, 2494 STRIPE_SECTORS, 0)) 2495 abort = 1; 2496 rdev = conf->disks[i].replacement; 2497 if (rdev 2498 && !test_bit(Faulty, &rdev->flags) 2499 && !test_bit(In_sync, &rdev->flags) 2500 && !rdev_set_badblocks(rdev, sh->sector, 2501 STRIPE_SECTORS, 0)) 2502 abort = 1; 2503 } 2504 if (abort) { 2505 conf->recovery_disabled = conf->mddev->recovery_disabled; 2506 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); 2507 } 2508 } 2509 2510 static int want_replace(struct stripe_head *sh, int disk_idx) 2511 { 2512 struct md_rdev *rdev; 2513 int rv = 0; 2514 /* Doing recovery so rcu locking not required */ 2515 rdev = sh->raid_conf->disks[disk_idx].replacement; 2516 if (rdev 2517 && !test_bit(Faulty, &rdev->flags) 2518 && !test_bit(In_sync, &rdev->flags) 2519 && (rdev->recovery_offset <= sh->sector 2520 || rdev->mddev->recovery_cp <= sh->sector)) 2521 rv = 1; 2522 2523 return rv; 2524 } 2525 2526 /* fetch_block - checks the given member device to see if its data needs 2527 * to be read or computed to satisfy a request. 
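 * A block is wanted when it has a pending read, a partial
 * (non-overwrite) write, when the stripe is being synced, expanded or
 * replaced, or when it is needed to rebuild data for a failed device.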
2528 * 2529 * Returns 1 when no more member devices need to be checked, otherwise returns 2530 * 0 to tell the loop in handle_stripe_fill to continue 2531 */ 2532 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2533 int disk_idx, int disks) 2534 { 2535 struct r5dev *dev = &sh->dev[disk_idx]; 2536 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2537 &sh->dev[s->failed_num[1]] }; 2538 2539 /* is the data in this block needed, and can we get it? */ 2540 if (!test_bit(R5_LOCKED, &dev->flags) && 2541 !test_bit(R5_UPTODATE, &dev->flags) && 2542 (dev->toread || 2543 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2544 s->syncing || s->expanding || 2545 (s->replacing && want_replace(sh, disk_idx)) || 2546 (s->failed >= 1 && fdev[0]->toread) || 2547 (s->failed >= 2 && fdev[1]->toread) || 2548 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2549 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2550 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2551 /* we would like to get this block, possibly by computing it, 2552 * otherwise read it if the backing disk is insync 2553 */ 2554 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2555 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2556 if ((s->uptodate == disks - 1) && 2557 (s->failed && (disk_idx == s->failed_num[0] || 2558 disk_idx == s->failed_num[1]))) { 2559 /* have disk failed, and we're requested to fetch it; 2560 * do compute it 2561 */ 2562 pr_debug("Computing stripe %llu block %d\n", 2563 (unsigned long long)sh->sector, disk_idx); 2564 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2565 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2566 set_bit(R5_Wantcompute, &dev->flags); 2567 sh->ops.target = disk_idx; 2568 sh->ops.target2 = -1; /* no 2nd target */ 2569 s->req_compute = 1; 2570 /* Careful: from this point on 'uptodate' is in the eye 2571 * of raid_run_ops which services 'compute' operations 2572 * before writes. R5_Wantcompute flags a block that will 2573 * be R5_UPTODATE by the time it is needed for a 2574 * subsequent operation. 2575 */ 2576 s->uptodate++; 2577 return 1; 2578 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2579 /* Computing 2-failure is *very* expensive; only 2580 * do it if failed >= 2 2581 */ 2582 int other; 2583 for (other = disks; other--; ) { 2584 if (other == disk_idx) 2585 continue; 2586 if (!test_bit(R5_UPTODATE, 2587 &sh->dev[other].flags)) 2588 break; 2589 } 2590 BUG_ON(other < 0); 2591 pr_debug("Computing stripe %llu blocks %d,%d\n", 2592 (unsigned long long)sh->sector, 2593 disk_idx, other); 2594 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2595 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2596 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2597 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2598 sh->ops.target = disk_idx; 2599 sh->ops.target2 = other; 2600 s->uptodate += 2; 2601 s->req_compute = 1; 2602 return 1; 2603 } else if (test_bit(R5_Insync, &dev->flags)) { 2604 set_bit(R5_LOCKED, &dev->flags); 2605 set_bit(R5_Wantread, &dev->flags); 2606 s->locked++; 2607 pr_debug("Reading block %d (sync=%d)\n", 2608 disk_idx, s->syncing); 2609 } 2610 } 2611 2612 return 0; 2613 } 2614 2615 /** 2616 * handle_stripe_fill - read or compute data to satisfy pending requests. 
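 *
 * Walks each device in the stripe and calls fetch_block() until it
 * reports that no further devices need to be checked (for instance
 * once a compute operation has been scheduled).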
2617 */ 2618 static void handle_stripe_fill(struct stripe_head *sh, 2619 struct stripe_head_state *s, 2620 int disks) 2621 { 2622 int i; 2623 2624 /* look for blocks to read/compute, skip this if a compute 2625 * is already in flight, or if the stripe contents are in the 2626 * midst of changing due to a write 2627 */ 2628 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2629 !sh->reconstruct_state) 2630 for (i = disks; i--; ) 2631 if (fetch_block(sh, s, i, disks)) 2632 break; 2633 set_bit(STRIPE_HANDLE, &sh->state); 2634 } 2635 2636 2637 /* handle_stripe_clean_event 2638 * any written block on an uptodate or failed drive can be returned. 2639 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2640 * never LOCKED, so we don't need to test 'failed' directly. 2641 */ 2642 static void handle_stripe_clean_event(struct r5conf *conf, 2643 struct stripe_head *sh, int disks, struct bio **return_bi) 2644 { 2645 int i; 2646 struct r5dev *dev; 2647 2648 for (i = disks; i--; ) 2649 if (sh->dev[i].written) { 2650 dev = &sh->dev[i]; 2651 if (!test_bit(R5_LOCKED, &dev->flags) && 2652 test_bit(R5_UPTODATE, &dev->flags)) { 2653 /* We can return any write requests */ 2654 struct bio *wbi, *wbi2; 2655 int bitmap_end = 0; 2656 pr_debug("Return write for disc %d\n", i); 2657 spin_lock_irq(&conf->device_lock); 2658 wbi = dev->written; 2659 dev->written = NULL; 2660 while (wbi && wbi->bi_sector < 2661 dev->sector + STRIPE_SECTORS) { 2662 wbi2 = r5_next_bio(wbi, dev->sector); 2663 if (!raid5_dec_bi_phys_segments(wbi)) { 2664 md_write_end(conf->mddev); 2665 wbi->bi_next = *return_bi; 2666 *return_bi = wbi; 2667 } 2668 wbi = wbi2; 2669 } 2670 if (dev->towrite == NULL) 2671 bitmap_end = 1; 2672 spin_unlock_irq(&conf->device_lock); 2673 if (bitmap_end) 2674 bitmap_endwrite(conf->mddev->bitmap, 2675 sh->sector, 2676 STRIPE_SECTORS, 2677 !test_bit(STRIPE_DEGRADED, &sh->state), 2678 0); 2679 } 2680 } 2681 2682 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2683 if (atomic_dec_and_test(&conf->pending_full_writes)) 2684 md_wakeup_thread(conf->mddev->thread); 2685 } 2686 2687 static void handle_stripe_dirtying(struct r5conf *conf, 2688 struct stripe_head *sh, 2689 struct stripe_head_state *s, 2690 int disks) 2691 { 2692 int rmw = 0, rcw = 0, i; 2693 if (conf->max_degraded == 2) { 2694 /* RAID6 requires 'rcw' in current implementation 2695 * Calculate the real rcw later - for now fake it 2696 * look like rcw is cheaper 2697 */ 2698 rcw = 1; rmw = 2; 2699 } else for (i = disks; i--; ) { 2700 /* would I have to read this buffer for read_modify_write */ 2701 struct r5dev *dev = &sh->dev[i]; 2702 if ((dev->towrite || i == sh->pd_idx) && 2703 !test_bit(R5_LOCKED, &dev->flags) && 2704 !(test_bit(R5_UPTODATE, &dev->flags) || 2705 test_bit(R5_Wantcompute, &dev->flags))) { 2706 if (test_bit(R5_Insync, &dev->flags)) 2707 rmw++; 2708 else 2709 rmw += 2*disks; /* cannot read it */ 2710 } 2711 /* Would I have to read this buffer for reconstruct_write */ 2712 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2713 !test_bit(R5_LOCKED, &dev->flags) && 2714 !(test_bit(R5_UPTODATE, &dev->flags) || 2715 test_bit(R5_Wantcompute, &dev->flags))) { 2716 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2717 else 2718 rcw += 2*disks; 2719 } 2720 } 2721 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2722 (unsigned long long)sh->sector, rmw, rcw); 2723 set_bit(STRIPE_HANDLE, &sh->state); 2724 if (rmw < rcw && rmw > 0) 2725 /* prefer read-modify-write, but need to get some data */ 2726 for (i = 
disks; i--; ) { 2727 struct r5dev *dev = &sh->dev[i]; 2728 if ((dev->towrite || i == sh->pd_idx) && 2729 !test_bit(R5_LOCKED, &dev->flags) && 2730 !(test_bit(R5_UPTODATE, &dev->flags) || 2731 test_bit(R5_Wantcompute, &dev->flags)) && 2732 test_bit(R5_Insync, &dev->flags)) { 2733 if ( 2734 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2735 pr_debug("Read_old block " 2736 "%d for r-m-w\n", i); 2737 set_bit(R5_LOCKED, &dev->flags); 2738 set_bit(R5_Wantread, &dev->flags); 2739 s->locked++; 2740 } else { 2741 set_bit(STRIPE_DELAYED, &sh->state); 2742 set_bit(STRIPE_HANDLE, &sh->state); 2743 } 2744 } 2745 } 2746 if (rcw <= rmw && rcw > 0) { 2747 /* want reconstruct write, but need to get some data */ 2748 rcw = 0; 2749 for (i = disks; i--; ) { 2750 struct r5dev *dev = &sh->dev[i]; 2751 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2752 i != sh->pd_idx && i != sh->qd_idx && 2753 !test_bit(R5_LOCKED, &dev->flags) && 2754 !(test_bit(R5_UPTODATE, &dev->flags) || 2755 test_bit(R5_Wantcompute, &dev->flags))) { 2756 rcw++; 2757 if (!test_bit(R5_Insync, &dev->flags)) 2758 continue; /* it's a failed drive */ 2759 if ( 2760 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2761 pr_debug("Read_old block " 2762 "%d for Reconstruct\n", i); 2763 set_bit(R5_LOCKED, &dev->flags); 2764 set_bit(R5_Wantread, &dev->flags); 2765 s->locked++; 2766 } else { 2767 set_bit(STRIPE_DELAYED, &sh->state); 2768 set_bit(STRIPE_HANDLE, &sh->state); 2769 } 2770 } 2771 } 2772 } 2773 /* now if nothing is locked, and if we have enough data, 2774 * we can start a write request 2775 */ 2776 /* since handle_stripe can be called at any time we need to handle the 2777 * case where a compute block operation has been submitted and then a 2778 * subsequent call wants to start a write request. raid_run_ops only 2779 * handles the case where compute block and reconstruct are requested 2780 * simultaneously. If this is not the case then new writes need to be 2781 * held off until the compute completes. 
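 * The test below therefore only schedules the reconstruct when nothing
 * in the stripe is locked, when at least one of rcw/rmw requires no
 * further reads, and when the stripe is not waiting on a bitmap batch
 * (STRIPE_BIT_DELAY).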
2782 */ 2783 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2784 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2785 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2786 schedule_reconstruction(sh, s, rcw == 0, 0); 2787 } 2788 2789 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 2790 struct stripe_head_state *s, int disks) 2791 { 2792 struct r5dev *dev = NULL; 2793 2794 set_bit(STRIPE_HANDLE, &sh->state); 2795 2796 switch (sh->check_state) { 2797 case check_state_idle: 2798 /* start a new check operation if there are no failures */ 2799 if (s->failed == 0) { 2800 BUG_ON(s->uptodate != disks); 2801 sh->check_state = check_state_run; 2802 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2803 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2804 s->uptodate--; 2805 break; 2806 } 2807 dev = &sh->dev[s->failed_num[0]]; 2808 /* fall through */ 2809 case check_state_compute_result: 2810 sh->check_state = check_state_idle; 2811 if (!dev) 2812 dev = &sh->dev[sh->pd_idx]; 2813 2814 /* check that a write has not made the stripe insync */ 2815 if (test_bit(STRIPE_INSYNC, &sh->state)) 2816 break; 2817 2818 /* either failed parity check, or recovery is happening */ 2819 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2820 BUG_ON(s->uptodate != disks); 2821 2822 set_bit(R5_LOCKED, &dev->flags); 2823 s->locked++; 2824 set_bit(R5_Wantwrite, &dev->flags); 2825 2826 clear_bit(STRIPE_DEGRADED, &sh->state); 2827 set_bit(STRIPE_INSYNC, &sh->state); 2828 break; 2829 case check_state_run: 2830 break; /* we will be called again upon completion */ 2831 case check_state_check_result: 2832 sh->check_state = check_state_idle; 2833 2834 /* if a failure occurred during the check operation, leave 2835 * STRIPE_INSYNC not set and let the stripe be handled again 2836 */ 2837 if (s->failed) 2838 break; 2839 2840 /* handle a successful check operation, if parity is correct 2841 * we are done. Otherwise update the mismatch count and repair 2842 * parity if !MD_RECOVERY_CHECK 2843 */ 2844 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2845 /* parity is correct (on disc, 2846 * not in buffer any more) 2847 */ 2848 set_bit(STRIPE_INSYNC, &sh->state); 2849 else { 2850 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2851 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2852 /* don't try to repair!! */ 2853 set_bit(STRIPE_INSYNC, &sh->state); 2854 else { 2855 sh->check_state = check_state_compute_run; 2856 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2857 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2858 set_bit(R5_Wantcompute, 2859 &sh->dev[sh->pd_idx].flags); 2860 sh->ops.target = sh->pd_idx; 2861 sh->ops.target2 = -1; 2862 s->uptodate++; 2863 } 2864 } 2865 break; 2866 case check_state_compute_run: 2867 break; 2868 default: 2869 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2870 __func__, sh->check_state, 2871 (unsigned long long) sh->sector); 2872 BUG(); 2873 } 2874 } 2875 2876 2877 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 2878 struct stripe_head_state *s, 2879 int disks) 2880 { 2881 int pd_idx = sh->pd_idx; 2882 int qd_idx = sh->qd_idx; 2883 struct r5dev *dev; 2884 2885 set_bit(STRIPE_HANDLE, &sh->state); 2886 2887 BUG_ON(s->failed > 2); 2888 2889 /* Want to check and possibly repair P and Q. 
2890 * However there could be one 'failed' device, in which 2891 * case we can only check one of them, possibly using the 2892 * other to generate missing data 2893 */ 2894 2895 switch (sh->check_state) { 2896 case check_state_idle: 2897 /* start a new check operation if there are < 2 failures */ 2898 if (s->failed == s->q_failed) { 2899 /* The only possible failed device holds Q, so it 2900 * makes sense to check P (If anything else were failed, 2901 * we would have used P to recreate it). 2902 */ 2903 sh->check_state = check_state_run; 2904 } 2905 if (!s->q_failed && s->failed < 2) { 2906 /* Q is not failed, and we didn't use it to generate 2907 * anything, so it makes sense to check it 2908 */ 2909 if (sh->check_state == check_state_run) 2910 sh->check_state = check_state_run_pq; 2911 else 2912 sh->check_state = check_state_run_q; 2913 } 2914 2915 /* discard potentially stale zero_sum_result */ 2916 sh->ops.zero_sum_result = 0; 2917 2918 if (sh->check_state == check_state_run) { 2919 /* async_xor_zero_sum destroys the contents of P */ 2920 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2921 s->uptodate--; 2922 } 2923 if (sh->check_state >= check_state_run && 2924 sh->check_state <= check_state_run_pq) { 2925 /* async_syndrome_zero_sum preserves P and Q, so 2926 * no need to mark them !uptodate here 2927 */ 2928 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2929 break; 2930 } 2931 2932 /* we have 2-disk failure */ 2933 BUG_ON(s->failed != 2); 2934 /* fall through */ 2935 case check_state_compute_result: 2936 sh->check_state = check_state_idle; 2937 2938 /* check that a write has not made the stripe insync */ 2939 if (test_bit(STRIPE_INSYNC, &sh->state)) 2940 break; 2941 2942 /* now write out any block on a failed drive, 2943 * or P or Q if they were recomputed 2944 */ 2945 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2946 if (s->failed == 2) { 2947 dev = &sh->dev[s->failed_num[1]]; 2948 s->locked++; 2949 set_bit(R5_LOCKED, &dev->flags); 2950 set_bit(R5_Wantwrite, &dev->flags); 2951 } 2952 if (s->failed >= 1) { 2953 dev = &sh->dev[s->failed_num[0]]; 2954 s->locked++; 2955 set_bit(R5_LOCKED, &dev->flags); 2956 set_bit(R5_Wantwrite, &dev->flags); 2957 } 2958 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2959 dev = &sh->dev[pd_idx]; 2960 s->locked++; 2961 set_bit(R5_LOCKED, &dev->flags); 2962 set_bit(R5_Wantwrite, &dev->flags); 2963 } 2964 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2965 dev = &sh->dev[qd_idx]; 2966 s->locked++; 2967 set_bit(R5_LOCKED, &dev->flags); 2968 set_bit(R5_Wantwrite, &dev->flags); 2969 } 2970 clear_bit(STRIPE_DEGRADED, &sh->state); 2971 2972 set_bit(STRIPE_INSYNC, &sh->state); 2973 break; 2974 case check_state_run: 2975 case check_state_run_q: 2976 case check_state_run_pq: 2977 break; /* we will be called again upon completion */ 2978 case check_state_check_result: 2979 sh->check_state = check_state_idle; 2980 2981 /* handle a successful check operation, if parity is correct 2982 * we are done. 
Otherwise update the mismatch count and repair 2983 * parity if !MD_RECOVERY_CHECK 2984 */ 2985 if (sh->ops.zero_sum_result == 0) { 2986 /* both parities are correct */ 2987 if (!s->failed) 2988 set_bit(STRIPE_INSYNC, &sh->state); 2989 else { 2990 /* in contrast to the raid5 case we can validate 2991 * parity, but still have a failure to write 2992 * back 2993 */ 2994 sh->check_state = check_state_compute_result; 2995 /* Returning at this point means that we may go 2996 * off and bring p and/or q uptodate again so 2997 * we make sure to check zero_sum_result again 2998 * to verify if p or q need writeback 2999 */ 3000 } 3001 } else { 3002 conf->mddev->resync_mismatches += STRIPE_SECTORS; 3003 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3004 /* don't try to repair!! */ 3005 set_bit(STRIPE_INSYNC, &sh->state); 3006 else { 3007 int *target = &sh->ops.target; 3008 3009 sh->ops.target = -1; 3010 sh->ops.target2 = -1; 3011 sh->check_state = check_state_compute_run; 3012 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3013 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3014 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3015 set_bit(R5_Wantcompute, 3016 &sh->dev[pd_idx].flags); 3017 *target = pd_idx; 3018 target = &sh->ops.target2; 3019 s->uptodate++; 3020 } 3021 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3022 set_bit(R5_Wantcompute, 3023 &sh->dev[qd_idx].flags); 3024 *target = qd_idx; 3025 s->uptodate++; 3026 } 3027 } 3028 } 3029 break; 3030 case check_state_compute_run: 3031 break; 3032 default: 3033 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3034 __func__, sh->check_state, 3035 (unsigned long long) sh->sector); 3036 BUG(); 3037 } 3038 } 3039 3040 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3041 { 3042 int i; 3043 3044 /* We have read all the blocks in this stripe and now we need to 3045 * copy some of them into a target stripe for expand. 3046 */ 3047 struct dma_async_tx_descriptor *tx = NULL; 3048 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3049 for (i = 0; i < sh->disks; i++) 3050 if (i != sh->pd_idx && i != sh->qd_idx) { 3051 int dd_idx, j; 3052 struct stripe_head *sh2; 3053 struct async_submit_ctl submit; 3054 3055 sector_t bn = compute_blocknr(sh, i, 1); 3056 sector_t s = raid5_compute_sector(conf, bn, 0, 3057 &dd_idx, NULL); 3058 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3059 if (sh2 == NULL) 3060 /* so far only the early blocks of this stripe 3061 * have been requested. 
When later blocks 3062 * get requested, we will try again 3063 */ 3064 continue; 3065 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3066 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3067 /* must have already done this block */ 3068 release_stripe(sh2); 3069 continue; 3070 } 3071 3072 /* place all the copies on one channel */ 3073 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3074 tx = async_memcpy(sh2->dev[dd_idx].page, 3075 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3076 &submit); 3077 3078 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3079 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3080 for (j = 0; j < conf->raid_disks; j++) 3081 if (j != sh2->pd_idx && 3082 j != sh2->qd_idx && 3083 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3084 break; 3085 if (j == conf->raid_disks) { 3086 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3087 set_bit(STRIPE_HANDLE, &sh2->state); 3088 } 3089 release_stripe(sh2); 3090 3091 } 3092 /* done submitting copies, wait for them to complete */ 3093 if (tx) { 3094 async_tx_ack(tx); 3095 dma_wait_for_async_tx(tx); 3096 } 3097 } 3098 3099 /* 3100 * handle_stripe - do things to a stripe. 3101 * 3102 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3103 * state of various bits to see what needs to be done. 3104 * Possible results: 3105 * return some read requests which now have data 3106 * return some write requests which are safely on storage 3107 * schedule a read on some buffers 3108 * schedule a write of some buffers 3109 * return confirmation of parity correctness 3110 * 3111 */ 3112 3113 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3114 { 3115 struct r5conf *conf = sh->raid_conf; 3116 int disks = sh->disks; 3117 struct r5dev *dev; 3118 int i; 3119 int do_recovery = 0; 3120 3121 memset(s, 0, sizeof(*s)); 3122 3123 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3124 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3125 s->failed_num[0] = -1; 3126 s->failed_num[1] = -1; 3127 3128 /* Now to look around and see what can be done */ 3129 rcu_read_lock(); 3130 spin_lock_irq(&conf->device_lock); 3131 for (i=disks; i--; ) { 3132 struct md_rdev *rdev; 3133 sector_t first_bad; 3134 int bad_sectors; 3135 int is_bad = 0; 3136 3137 dev = &sh->dev[i]; 3138 3139 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3140 i, dev->flags, 3141 dev->toread, dev->towrite, dev->written); 3142 /* maybe we can reply to a read 3143 * 3144 * new wantfill requests are only permitted while 3145 * ops_complete_biofill is guaranteed to be inactive 3146 */ 3147 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3148 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3149 set_bit(R5_Wantfill, &dev->flags); 3150 3151 /* now count some things */ 3152 if (test_bit(R5_LOCKED, &dev->flags)) 3153 s->locked++; 3154 if (test_bit(R5_UPTODATE, &dev->flags)) 3155 s->uptodate++; 3156 if (test_bit(R5_Wantcompute, &dev->flags)) { 3157 s->compute++; 3158 BUG_ON(s->compute > 2); 3159 } 3160 3161 if (test_bit(R5_Wantfill, &dev->flags)) 3162 s->to_fill++; 3163 else if (dev->toread) 3164 s->to_read++; 3165 if (dev->towrite) { 3166 s->to_write++; 3167 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3168 s->non_overwrite++; 3169 } 3170 if (dev->written) 3171 s->written++; 3172 /* Prefer to use the replacement for reads, but only 3173 * if it is recovered enough and has no bad blocks. 
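 * R5_ReadRepl marks a device whose replacement can service this read;
 * otherwise, if a replacement exists but has not recovered this far
 * yet, R5_NeedReplace records that the data still has to be written
 * out to it.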
3174 */ 3175 rdev = rcu_dereference(conf->disks[i].replacement); 3176 if (rdev && !test_bit(Faulty, &rdev->flags) && 3177 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3178 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3179 &first_bad, &bad_sectors)) 3180 set_bit(R5_ReadRepl, &dev->flags); 3181 else { 3182 if (rdev) 3183 set_bit(R5_NeedReplace, &dev->flags); 3184 rdev = rcu_dereference(conf->disks[i].rdev); 3185 clear_bit(R5_ReadRepl, &dev->flags); 3186 } 3187 if (rdev && test_bit(Faulty, &rdev->flags)) 3188 rdev = NULL; 3189 if (rdev) { 3190 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3191 &first_bad, &bad_sectors); 3192 if (s->blocked_rdev == NULL 3193 && (test_bit(Blocked, &rdev->flags) 3194 || is_bad < 0)) { 3195 if (is_bad < 0) 3196 set_bit(BlockedBadBlocks, 3197 &rdev->flags); 3198 s->blocked_rdev = rdev; 3199 atomic_inc(&rdev->nr_pending); 3200 } 3201 } 3202 clear_bit(R5_Insync, &dev->flags); 3203 if (!rdev) 3204 /* Not in-sync */; 3205 else if (is_bad) { 3206 /* also not in-sync */ 3207 if (!test_bit(WriteErrorSeen, &rdev->flags)) { 3208 /* treat as in-sync, but with a read error 3209 * which we can now try to correct 3210 */ 3211 set_bit(R5_Insync, &dev->flags); 3212 set_bit(R5_ReadError, &dev->flags); 3213 } 3214 } else if (test_bit(In_sync, &rdev->flags)) 3215 set_bit(R5_Insync, &dev->flags); 3216 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3217 /* in sync if before recovery_offset */ 3218 set_bit(R5_Insync, &dev->flags); 3219 else if (test_bit(R5_UPTODATE, &dev->flags) && 3220 test_bit(R5_Expanded, &dev->flags)) 3221 /* If we've reshaped into here, we assume it is Insync. 3222 * We will shortly update recovery_offset to make 3223 * it official. 3224 */ 3225 set_bit(R5_Insync, &dev->flags); 3226 3227 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3228 /* This flag does not apply to '.replacement' 3229 * only to .rdev, so make sure to check that*/ 3230 struct md_rdev *rdev2 = rcu_dereference( 3231 conf->disks[i].rdev); 3232 if (rdev2 == rdev) 3233 clear_bit(R5_Insync, &dev->flags); 3234 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3235 s->handle_bad_blocks = 1; 3236 atomic_inc(&rdev2->nr_pending); 3237 } else 3238 clear_bit(R5_WriteError, &dev->flags); 3239 } 3240 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3241 /* This flag does not apply to '.replacement' 3242 * only to .rdev, so make sure to check that*/ 3243 struct md_rdev *rdev2 = rcu_dereference( 3244 conf->disks[i].rdev); 3245 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3246 s->handle_bad_blocks = 1; 3247 atomic_inc(&rdev2->nr_pending); 3248 } else 3249 clear_bit(R5_MadeGood, &dev->flags); 3250 } 3251 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3252 struct md_rdev *rdev2 = rcu_dereference( 3253 conf->disks[i].replacement); 3254 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3255 s->handle_bad_blocks = 1; 3256 atomic_inc(&rdev2->nr_pending); 3257 } else 3258 clear_bit(R5_MadeGoodRepl, &dev->flags); 3259 } 3260 if (!test_bit(R5_Insync, &dev->flags)) { 3261 /* The ReadError flag will just be confusing now */ 3262 clear_bit(R5_ReadError, &dev->flags); 3263 clear_bit(R5_ReWrite, &dev->flags); 3264 } 3265 if (test_bit(R5_ReadError, &dev->flags)) 3266 clear_bit(R5_Insync, &dev->flags); 3267 if (!test_bit(R5_Insync, &dev->flags)) { 3268 if (s->failed < 2) 3269 s->failed_num[s->failed] = i; 3270 s->failed++; 3271 if (rdev && !test_bit(Faulty, &rdev->flags)) 3272 do_recovery = 1; 3273 } 3274 } 3275 spin_unlock_irq(&conf->device_lock); 3276 if (test_bit(STRIPE_SYNCING, 
&sh->state)) { 3277 /* If there is a failed device being replaced, 3278 * we must be recovering. 3279 * else if we are after recovery_cp, we must be syncing 3280 * else we can only be replacing 3281 * sync and recovery both need to read all devices, and so 3282 * use the same flag. 3283 */ 3284 if (do_recovery || 3285 sh->sector >= conf->mddev->recovery_cp) 3286 s->syncing = 1; 3287 else 3288 s->replacing = 1; 3289 } 3290 rcu_read_unlock(); 3291 } 3292 3293 static void handle_stripe(struct stripe_head *sh) 3294 { 3295 struct stripe_head_state s; 3296 struct r5conf *conf = sh->raid_conf; 3297 int i; 3298 int prexor; 3299 int disks = sh->disks; 3300 struct r5dev *pdev, *qdev; 3301 3302 clear_bit(STRIPE_HANDLE, &sh->state); 3303 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3304 /* already being handled, ensure it gets handled 3305 * again when current action finishes */ 3306 set_bit(STRIPE_HANDLE, &sh->state); 3307 return; 3308 } 3309 3310 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3311 set_bit(STRIPE_SYNCING, &sh->state); 3312 clear_bit(STRIPE_INSYNC, &sh->state); 3313 } 3314 clear_bit(STRIPE_DELAYED, &sh->state); 3315 3316 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3317 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n", 3318 (unsigned long long)sh->sector, sh->state, 3319 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3320 sh->check_state, sh->reconstruct_state); 3321 3322 analyse_stripe(sh, &s); 3323 3324 if (s.handle_bad_blocks) { 3325 set_bit(STRIPE_HANDLE, &sh->state); 3326 goto finish; 3327 } 3328 3329 if (unlikely(s.blocked_rdev)) { 3330 if (s.syncing || s.expanding || s.expanded || 3331 s.replacing || s.to_write || s.written) { 3332 set_bit(STRIPE_HANDLE, &sh->state); 3333 goto finish; 3334 } 3335 /* There is nothing for the blocked_rdev to block */ 3336 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3337 s.blocked_rdev = NULL; 3338 } 3339 3340 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3341 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3342 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3343 } 3344 3345 pr_debug("locked=%d uptodate=%d to_read=%d" 3346 " to_write=%d failed=%d failed_num=%d,%d\n", 3347 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3348 s.failed_num[0], s.failed_num[1]); 3349 /* check if the array has lost more than max_degraded devices and, 3350 * if so, some requests might need to be failed. 
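 * (max_degraded is 1 for raid4/5 and 2 for raid6, so e.g. a raid6
 * stripe with three failed members cannot be reconstructed: its queued
 * bios are completed with an error and any resync of this stripe is
 * given up.)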
3351 */ 3352 if (s.failed > conf->max_degraded) { 3353 sh->check_state = 0; 3354 sh->reconstruct_state = 0; 3355 if (s.to_read+s.to_write+s.written) 3356 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3357 if (s.syncing + s.replacing) 3358 handle_failed_sync(conf, sh, &s); 3359 } 3360 3361 /* 3362 * might be able to return some write requests if the parity blocks 3363 * are safe, or on a failed drive 3364 */ 3365 pdev = &sh->dev[sh->pd_idx]; 3366 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3367 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3368 qdev = &sh->dev[sh->qd_idx]; 3369 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3370 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3371 || conf->level < 6; 3372 3373 if (s.written && 3374 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3375 && !test_bit(R5_LOCKED, &pdev->flags) 3376 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3377 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3378 && !test_bit(R5_LOCKED, &qdev->flags) 3379 && test_bit(R5_UPTODATE, &qdev->flags))))) 3380 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3381 3382 /* Now we might consider reading some blocks, either to check/generate 3383 * parity, or to satisfy requests 3384 * or to load a block that is being partially written. 3385 */ 3386 if (s.to_read || s.non_overwrite 3387 || (conf->level == 6 && s.to_write && s.failed) 3388 || (s.syncing && (s.uptodate + s.compute < disks)) 3389 || s.replacing 3390 || s.expanding) 3391 handle_stripe_fill(sh, &s, disks); 3392 3393 /* Now we check to see if any write operations have recently 3394 * completed 3395 */ 3396 prexor = 0; 3397 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3398 prexor = 1; 3399 if (sh->reconstruct_state == reconstruct_state_drain_result || 3400 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3401 sh->reconstruct_state = reconstruct_state_idle; 3402 3403 /* All the 'written' buffers and the parity block are ready to 3404 * be written back to disk 3405 */ 3406 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3407 BUG_ON(sh->qd_idx >= 0 && 3408 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); 3409 for (i = disks; i--; ) { 3410 struct r5dev *dev = &sh->dev[i]; 3411 if (test_bit(R5_LOCKED, &dev->flags) && 3412 (i == sh->pd_idx || i == sh->qd_idx || 3413 dev->written)) { 3414 pr_debug("Writing block %d\n", i); 3415 set_bit(R5_Wantwrite, &dev->flags); 3416 if (prexor) 3417 continue; 3418 if (!test_bit(R5_Insync, &dev->flags) || 3419 ((i == sh->pd_idx || i == sh->qd_idx) && 3420 s.failed == 0)) 3421 set_bit(STRIPE_INSYNC, &sh->state); 3422 } 3423 } 3424 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3425 s.dec_preread_active = 1; 3426 } 3427 3428 /* Now to consider new write requests and what else, if anything 3429 * should be read. We do not handle new writes when: 3430 * 1/ A 'write' operation (copy+xor) is already in flight. 3431 * 2/ A 'check' operation is in flight, as it may clobber the parity 3432 * block. 3433 */ 3434 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3435 handle_stripe_dirtying(conf, sh, &s, disks); 3436 3437 /* maybe we need to check and possibly fix the parity for this stripe 3438 * Any reads will already have been scheduled, so we just see if enough 3439 * data is available. The parity check is held off while parity 3440 * dependent operations are in flight. 
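 * In practice a new check is only started while syncing with nothing
 * locked, no compute running and the stripe not yet known to be in
 * sync.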
3441 */ 3442 if (sh->check_state || 3443 (s.syncing && s.locked == 0 && 3444 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3445 !test_bit(STRIPE_INSYNC, &sh->state))) { 3446 if (conf->level == 6) 3447 handle_parity_checks6(conf, sh, &s, disks); 3448 else 3449 handle_parity_checks5(conf, sh, &s, disks); 3450 } 3451 3452 if (s.replacing && s.locked == 0 3453 && !test_bit(STRIPE_INSYNC, &sh->state)) { 3454 /* Write out to replacement devices where possible */ 3455 for (i = 0; i < conf->raid_disks; i++) 3456 if (test_bit(R5_UPTODATE, &sh->dev[i].flags) && 3457 test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3458 set_bit(R5_WantReplace, &sh->dev[i].flags); 3459 set_bit(R5_LOCKED, &sh->dev[i].flags); 3460 s.locked++; 3461 } 3462 set_bit(STRIPE_INSYNC, &sh->state); 3463 } 3464 if ((s.syncing || s.replacing) && s.locked == 0 && 3465 test_bit(STRIPE_INSYNC, &sh->state)) { 3466 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3467 clear_bit(STRIPE_SYNCING, &sh->state); 3468 } 3469 3470 /* If the failed drives are just a ReadError, then we might need 3471 * to progress the repair/check process 3472 */ 3473 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3474 for (i = 0; i < s.failed; i++) { 3475 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3476 if (test_bit(R5_ReadError, &dev->flags) 3477 && !test_bit(R5_LOCKED, &dev->flags) 3478 && test_bit(R5_UPTODATE, &dev->flags) 3479 ) { 3480 if (!test_bit(R5_ReWrite, &dev->flags)) { 3481 set_bit(R5_Wantwrite, &dev->flags); 3482 set_bit(R5_ReWrite, &dev->flags); 3483 set_bit(R5_LOCKED, &dev->flags); 3484 s.locked++; 3485 } else { 3486 /* let's read it back */ 3487 set_bit(R5_Wantread, &dev->flags); 3488 set_bit(R5_LOCKED, &dev->flags); 3489 s.locked++; 3490 } 3491 } 3492 } 3493 3494 3495 /* Finish reconstruct operations initiated by the expansion process */ 3496 if (sh->reconstruct_state == reconstruct_state_result) { 3497 struct stripe_head *sh_src 3498 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3499 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3500 /* sh cannot be written until sh_src has been read. 
3501 * so arrange for sh to be delayed a little 3502 */ 3503 set_bit(STRIPE_DELAYED, &sh->state); 3504 set_bit(STRIPE_HANDLE, &sh->state); 3505 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3506 &sh_src->state)) 3507 atomic_inc(&conf->preread_active_stripes); 3508 release_stripe(sh_src); 3509 goto finish; 3510 } 3511 if (sh_src) 3512 release_stripe(sh_src); 3513 3514 sh->reconstruct_state = reconstruct_state_idle; 3515 clear_bit(STRIPE_EXPANDING, &sh->state); 3516 for (i = conf->raid_disks; i--; ) { 3517 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3518 set_bit(R5_LOCKED, &sh->dev[i].flags); 3519 s.locked++; 3520 } 3521 } 3522 3523 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3524 !sh->reconstruct_state) { 3525 /* Need to write out all blocks after computing parity */ 3526 sh->disks = conf->raid_disks; 3527 stripe_set_idx(sh->sector, conf, 0, sh); 3528 schedule_reconstruction(sh, &s, 1, 1); 3529 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3530 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3531 atomic_dec(&conf->reshape_stripes); 3532 wake_up(&conf->wait_for_overlap); 3533 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3534 } 3535 3536 if (s.expanding && s.locked == 0 && 3537 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3538 handle_stripe_expansion(conf, sh); 3539 3540 finish: 3541 /* wait for this device to become unblocked */ 3542 if (conf->mddev->external && unlikely(s.blocked_rdev)) 3543 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); 3544 3545 if (s.handle_bad_blocks) 3546 for (i = disks; i--; ) { 3547 struct md_rdev *rdev; 3548 struct r5dev *dev = &sh->dev[i]; 3549 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3550 /* We own a safe reference to the rdev */ 3551 rdev = conf->disks[i].rdev; 3552 if (!rdev_set_badblocks(rdev, sh->sector, 3553 STRIPE_SECTORS, 0)) 3554 md_error(conf->mddev, rdev); 3555 rdev_dec_pending(rdev, conf->mddev); 3556 } 3557 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3558 rdev = conf->disks[i].rdev; 3559 rdev_clear_badblocks(rdev, sh->sector, 3560 STRIPE_SECTORS); 3561 rdev_dec_pending(rdev, conf->mddev); 3562 } 3563 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3564 rdev = conf->disks[i].replacement; 3565 if (!rdev) 3566 /* rdev have been moved down */ 3567 rdev = conf->disks[i].rdev; 3568 rdev_clear_badblocks(rdev, sh->sector, 3569 STRIPE_SECTORS); 3570 rdev_dec_pending(rdev, conf->mddev); 3571 } 3572 } 3573 3574 if (s.ops_request) 3575 raid_run_ops(sh, s.ops_request); 3576 3577 ops_run_io(sh, &s); 3578 3579 if (s.dec_preread_active) { 3580 /* We delay this until after ops_run_io so that if make_request 3581 * is waiting on a flush, it won't continue until the writes 3582 * have actually been submitted. 
3583 */ 3584 atomic_dec(&conf->preread_active_stripes); 3585 if (atomic_read(&conf->preread_active_stripes) < 3586 IO_THRESHOLD) 3587 md_wakeup_thread(conf->mddev->thread); 3588 } 3589 3590 return_io(s.return_bi); 3591 3592 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3593 } 3594 3595 static void raid5_activate_delayed(struct r5conf *conf) 3596 { 3597 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3598 while (!list_empty(&conf->delayed_list)) { 3599 struct list_head *l = conf->delayed_list.next; 3600 struct stripe_head *sh; 3601 sh = list_entry(l, struct stripe_head, lru); 3602 list_del_init(l); 3603 clear_bit(STRIPE_DELAYED, &sh->state); 3604 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3605 atomic_inc(&conf->preread_active_stripes); 3606 list_add_tail(&sh->lru, &conf->hold_list); 3607 } 3608 } 3609 } 3610 3611 static void activate_bit_delay(struct r5conf *conf) 3612 { 3613 /* device_lock is held */ 3614 struct list_head head; 3615 list_add(&head, &conf->bitmap_list); 3616 list_del_init(&conf->bitmap_list); 3617 while (!list_empty(&head)) { 3618 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3619 list_del_init(&sh->lru); 3620 atomic_inc(&sh->count); 3621 __release_stripe(conf, sh); 3622 } 3623 } 3624 3625 int md_raid5_congested(struct mddev *mddev, int bits) 3626 { 3627 struct r5conf *conf = mddev->private; 3628 3629 /* No difference between reads and writes. Just check 3630 * how busy the stripe_cache is 3631 */ 3632 3633 if (conf->inactive_blocked) 3634 return 1; 3635 if (conf->quiesce) 3636 return 1; 3637 if (list_empty_careful(&conf->inactive_list)) 3638 return 1; 3639 3640 return 0; 3641 } 3642 EXPORT_SYMBOL_GPL(md_raid5_congested); 3643 3644 static int raid5_congested(void *data, int bits) 3645 { 3646 struct mddev *mddev = data; 3647 3648 return mddev_congested(mddev, bits) || 3649 md_raid5_congested(mddev, bits); 3650 } 3651 3652 /* We want read requests to align with chunks where possible, 3653 * but write requests don't need to. 3654 */ 3655 static int raid5_mergeable_bvec(struct request_queue *q, 3656 struct bvec_merge_data *bvm, 3657 struct bio_vec *biovec) 3658 { 3659 struct mddev *mddev = q->queuedata; 3660 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3661 int max; 3662 unsigned int chunk_sectors = mddev->chunk_sectors; 3663 unsigned int bio_sectors = bvm->bi_size >> 9; 3664 3665 if ((bvm->bi_rw & 1) == WRITE) 3666 return biovec->bv_len; /* always allow writes to be mergeable */ 3667 3668 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3669 chunk_sectors = mddev->new_chunk_sectors; 3670 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3671 if (max < 0) max = 0; 3672 if (max <= biovec->bv_len && bio_sectors == 0) 3673 return biovec->bv_len; 3674 else 3675 return max; 3676 } 3677 3678 3679 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3680 { 3681 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3682 unsigned int chunk_sectors = mddev->chunk_sectors; 3683 unsigned int bio_sectors = bio->bi_size >> 9; 3684 3685 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3686 chunk_sectors = mddev->new_chunk_sectors; 3687 return chunk_sectors >= 3688 ((sector & (chunk_sectors - 1)) + bio_sectors); 3689 } 3690 3691 /* 3692 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3693 * later sampled by raid5d. 
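 * raid5d drains the list again through remove_bio_from_retry() below
 * so the failed aligned read can be retried through the normal
 * stripe-cache path.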
3694 */ 3695 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3696 { 3697 unsigned long flags; 3698 3699 spin_lock_irqsave(&conf->device_lock, flags); 3700 3701 bi->bi_next = conf->retry_read_aligned_list; 3702 conf->retry_read_aligned_list = bi; 3703 3704 spin_unlock_irqrestore(&conf->device_lock, flags); 3705 md_wakeup_thread(conf->mddev->thread); 3706 } 3707 3708 3709 static struct bio *remove_bio_from_retry(struct r5conf *conf) 3710 { 3711 struct bio *bi; 3712 3713 bi = conf->retry_read_aligned; 3714 if (bi) { 3715 conf->retry_read_aligned = NULL; 3716 return bi; 3717 } 3718 bi = conf->retry_read_aligned_list; 3719 if(bi) { 3720 conf->retry_read_aligned_list = bi->bi_next; 3721 bi->bi_next = NULL; 3722 /* 3723 * this sets the active stripe count to 1 and the processed 3724 * stripe count to zero (upper 16 bits) 3725 */ 3726 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3727 } 3728 3729 return bi; 3730 } 3731 3732 3733 /* 3734 * The "raid5_align_endio" should check if the read succeeded and if it 3735 * did, call bio_endio on the original bio (having bio_put the new bio 3736 * first). 3737 * If the read failed, the original bio is queued for a retry via add_bio_to_retry(). 3738 */ 3739 static void raid5_align_endio(struct bio *bi, int error) 3740 { 3741 struct bio* raid_bi = bi->bi_private; 3742 struct mddev *mddev; 3743 struct r5conf *conf; 3744 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3745 struct md_rdev *rdev; 3746 3747 bio_put(bi); 3748 3749 rdev = (void*)raid_bi->bi_next; 3750 raid_bi->bi_next = NULL; 3751 mddev = rdev->mddev; 3752 conf = mddev->private; 3753 3754 rdev_dec_pending(rdev, conf->mddev); 3755 3756 if (!error && uptodate) { 3757 bio_endio(raid_bi, 0); 3758 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3759 wake_up(&conf->wait_for_stripe); 3760 return; 3761 } 3762 3763 3764 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3765 3766 add_bio_to_retry(raid_bi, conf); 3767 } 3768 3769 static int bio_fits_rdev(struct bio *bi) 3770 { 3771 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3772 3773 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3774 return 0; 3775 blk_recount_segments(q, bi); 3776 if (bi->bi_phys_segments > queue_max_segments(q)) 3777 return 0; 3778 3779 if (q->merge_bvec_fn) 3780 /* it's too hard to apply the merge_bvec_fn at this stage, 3781 * just give up 3782 */ 3783 return 0; 3784 3785 return 1; 3786 } 3787 3788 3789 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 3790 { 3791 struct r5conf *conf = mddev->private; 3792 int dd_idx; 3793 struct bio* align_bi; 3794 struct md_rdev *rdev; 3795 sector_t end_sector; 3796 3797 if (!in_chunk_boundary(mddev, raid_bio)) { 3798 pr_debug("chunk_aligned_read : non aligned\n"); 3799 return 0; 3800 } 3801 /* 3802 * use bio_clone_mddev to make a copy of the bio 3803 */ 3804 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 3805 if (!align_bi) 3806 return 0; 3807 /* 3808 * set bi_end_io to a new function, and set bi_private to the 3809 * original bio. 
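 * The target rdev is stashed in the original bio's bi_next (unused
 * while the clone is in flight) so that raid5_align_endio() can find
 * it and drop the pending reference once the clone completes.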
3810 */ 3811 align_bi->bi_end_io = raid5_align_endio; 3812 align_bi->bi_private = raid_bio; 3813 /* 3814 * compute position 3815 */ 3816 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3817 0, 3818 &dd_idx, NULL); 3819 3820 end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); 3821 rcu_read_lock(); 3822 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 3823 if (!rdev || test_bit(Faulty, &rdev->flags) || 3824 rdev->recovery_offset < end_sector) { 3825 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3826 if (rdev && 3827 (test_bit(Faulty, &rdev->flags) || 3828 !(test_bit(In_sync, &rdev->flags) || 3829 rdev->recovery_offset >= end_sector))) 3830 rdev = NULL; 3831 } 3832 if (rdev) { 3833 sector_t first_bad; 3834 int bad_sectors; 3835 3836 atomic_inc(&rdev->nr_pending); 3837 rcu_read_unlock(); 3838 raid_bio->bi_next = (void*)rdev; 3839 align_bi->bi_bdev = rdev->bdev; 3840 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3841 align_bi->bi_sector += rdev->data_offset; 3842 3843 if (!bio_fits_rdev(align_bi) || 3844 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, 3845 &first_bad, &bad_sectors)) { 3846 /* too big in some way, or has a known bad block */ 3847 bio_put(align_bi); 3848 rdev_dec_pending(rdev, mddev); 3849 return 0; 3850 } 3851 3852 spin_lock_irq(&conf->device_lock); 3853 wait_event_lock_irq(conf->wait_for_stripe, 3854 conf->quiesce == 0, 3855 conf->device_lock, /* nothing */); 3856 atomic_inc(&conf->active_aligned_reads); 3857 spin_unlock_irq(&conf->device_lock); 3858 3859 generic_make_request(align_bi); 3860 return 1; 3861 } else { 3862 rcu_read_unlock(); 3863 bio_put(align_bi); 3864 return 0; 3865 } 3866 } 3867 3868 /* __get_priority_stripe - get the next stripe to process 3869 * 3870 * Full stripe writes are allowed to pass preread active stripes up until 3871 * the bypass_threshold is exceeded. In general the bypass_count 3872 * increments when the handle_list is handled before the hold_list; however, it 3873 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 3874 * stripe with in flight i/o. The bypass_count will be reset when the 3875 * head of the hold_list has changed, i.e. the head was promoted to the 3876 * handle_list. 3877 */ 3878 static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 3879 { 3880 struct stripe_head *sh; 3881 3882 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3883 __func__, 3884 list_empty(&conf->handle_list) ? "empty" : "busy", 3885 list_empty(&conf->hold_list) ? 
"empty" : "busy", 3886 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3887 3888 if (!list_empty(&conf->handle_list)) { 3889 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3890 3891 if (list_empty(&conf->hold_list)) 3892 conf->bypass_count = 0; 3893 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3894 if (conf->hold_list.next == conf->last_hold) 3895 conf->bypass_count++; 3896 else { 3897 conf->last_hold = conf->hold_list.next; 3898 conf->bypass_count -= conf->bypass_threshold; 3899 if (conf->bypass_count < 0) 3900 conf->bypass_count = 0; 3901 } 3902 } 3903 } else if (!list_empty(&conf->hold_list) && 3904 ((conf->bypass_threshold && 3905 conf->bypass_count > conf->bypass_threshold) || 3906 atomic_read(&conf->pending_full_writes) == 0)) { 3907 sh = list_entry(conf->hold_list.next, 3908 typeof(*sh), lru); 3909 conf->bypass_count -= conf->bypass_threshold; 3910 if (conf->bypass_count < 0) 3911 conf->bypass_count = 0; 3912 } else 3913 return NULL; 3914 3915 list_del_init(&sh->lru); 3916 atomic_inc(&sh->count); 3917 BUG_ON(atomic_read(&sh->count) != 1); 3918 return sh; 3919 } 3920 3921 static void make_request(struct mddev *mddev, struct bio * bi) 3922 { 3923 struct r5conf *conf = mddev->private; 3924 int dd_idx; 3925 sector_t new_sector; 3926 sector_t logical_sector, last_sector; 3927 struct stripe_head *sh; 3928 const int rw = bio_data_dir(bi); 3929 int remaining; 3930 int plugged; 3931 3932 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 3933 md_flush_request(mddev, bi); 3934 return; 3935 } 3936 3937 md_write_start(mddev, bi); 3938 3939 if (rw == READ && 3940 mddev->reshape_position == MaxSector && 3941 chunk_aligned_read(mddev,bi)) 3942 return; 3943 3944 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3945 last_sector = bi->bi_sector + (bi->bi_size>>9); 3946 bi->bi_next = NULL; 3947 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3948 3949 plugged = mddev_check_plugged(mddev); 3950 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3951 DEFINE_WAIT(w); 3952 int disks, data_disks; 3953 int previous; 3954 3955 retry: 3956 previous = 0; 3957 disks = conf->raid_disks; 3958 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3959 if (unlikely(conf->reshape_progress != MaxSector)) { 3960 /* spinlock is needed as reshape_progress may be 3961 * 64bit on a 32bit platform, and so it might be 3962 * possible to see a half-updated value 3963 * Of course reshape_progress could change after 3964 * the lock is dropped, so once we get a reference 3965 * to the stripe that we think it is, we will have 3966 * to check again. 3967 */ 3968 spin_lock_irq(&conf->device_lock); 3969 if (mddev->delta_disks < 0 3970 ? logical_sector < conf->reshape_progress 3971 : logical_sector >= conf->reshape_progress) { 3972 disks = conf->previous_raid_disks; 3973 previous = 1; 3974 } else { 3975 if (mddev->delta_disks < 0 3976 ? 
logical_sector < conf->reshape_safe 3977 : logical_sector >= conf->reshape_safe) { 3978 spin_unlock_irq(&conf->device_lock); 3979 schedule(); 3980 goto retry; 3981 } 3982 } 3983 spin_unlock_irq(&conf->device_lock); 3984 } 3985 data_disks = disks - conf->max_degraded; 3986 3987 new_sector = raid5_compute_sector(conf, logical_sector, 3988 previous, 3989 &dd_idx, NULL); 3990 pr_debug("raid456: make_request, sector %llu logical %llu\n", 3991 (unsigned long long)new_sector, 3992 (unsigned long long)logical_sector); 3993 3994 sh = get_active_stripe(conf, new_sector, previous, 3995 (bi->bi_rw&RWA_MASK), 0); 3996 if (sh) { 3997 if (unlikely(previous)) { 3998 /* expansion might have moved on while waiting for a 3999 * stripe, so we must do the range check again. 4000 * Expansion could still move past after this 4001 * test, but as we are holding a reference to 4002 * 'sh', we know that if that happens, 4003 * STRIPE_EXPANDING will get set and the expansion 4004 * won't proceed until we finish with the stripe. 4005 */ 4006 int must_retry = 0; 4007 spin_lock_irq(&conf->device_lock); 4008 if (mddev->delta_disks < 0 4009 ? logical_sector >= conf->reshape_progress 4010 : logical_sector < conf->reshape_progress) 4011 /* mismatch, need to try again */ 4012 must_retry = 1; 4013 spin_unlock_irq(&conf->device_lock); 4014 if (must_retry) { 4015 release_stripe(sh); 4016 schedule(); 4017 goto retry; 4018 } 4019 } 4020 4021 if (rw == WRITE && 4022 logical_sector >= mddev->suspend_lo && 4023 logical_sector < mddev->suspend_hi) { 4024 release_stripe(sh); 4025 /* As the suspend_* range is controlled by 4026 * userspace, we want an interruptible 4027 * wait. 4028 */ 4029 flush_signals(current); 4030 prepare_to_wait(&conf->wait_for_overlap, 4031 &w, TASK_INTERRUPTIBLE); 4032 if (logical_sector >= mddev->suspend_lo && 4033 logical_sector < mddev->suspend_hi) 4034 schedule(); 4035 goto retry; 4036 } 4037 4038 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4039 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4040 /* Stripe is busy expanding or 4041 * add failed due to overlap. Flush everything 4042 * and wait a while 4043 */ 4044 md_wakeup_thread(mddev->thread); 4045 release_stripe(sh); 4046 schedule(); 4047 goto retry; 4048 } 4049 finish_wait(&conf->wait_for_overlap, &w); 4050 set_bit(STRIPE_HANDLE, &sh->state); 4051 clear_bit(STRIPE_DELAYED, &sh->state); 4052 if ((bi->bi_rw & REQ_SYNC) && 4053 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4054 atomic_inc(&conf->preread_active_stripes); 4055 release_stripe(sh); 4056 } else { 4057 /* cannot get stripe for read-ahead, just give-up */ 4058 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4059 finish_wait(&conf->wait_for_overlap, &w); 4060 break; 4061 } 4062 4063 } 4064 if (!plugged) 4065 md_wakeup_thread(mddev->thread); 4066 4067 spin_lock_irq(&conf->device_lock); 4068 remaining = raid5_dec_bi_phys_segments(bi); 4069 spin_unlock_irq(&conf->device_lock); 4070 if (remaining == 0) { 4071 4072 if ( rw == WRITE ) 4073 md_write_end(mddev); 4074 4075 bio_endio(bi, 0); 4076 } 4077 } 4078 4079 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4080 4081 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4082 { 4083 /* reshaping is quite different to recovery/resync so it is 4084 * handled quite separately ... here. 4085 * 4086 * On each call to sync_request, we gather one chunk worth of 4087 * destination stripes and flag them as expanding. 4088 * Then we find all the source stripes and request reads. 
4089 * As the reads complete, handle_stripe will copy the data 4090 * into the destination stripe and release that stripe. 4091 */ 4092 struct r5conf *conf = mddev->private; 4093 struct stripe_head *sh; 4094 sector_t first_sector, last_sector; 4095 int raid_disks = conf->previous_raid_disks; 4096 int data_disks = raid_disks - conf->max_degraded; 4097 int new_data_disks = conf->raid_disks - conf->max_degraded; 4098 int i; 4099 int dd_idx; 4100 sector_t writepos, readpos, safepos; 4101 sector_t stripe_addr; 4102 int reshape_sectors; 4103 struct list_head stripes; 4104 4105 if (sector_nr == 0) { 4106 /* If restarting in the middle, skip the initial sectors */ 4107 if (mddev->delta_disks < 0 && 4108 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4109 sector_nr = raid5_size(mddev, 0, 0) 4110 - conf->reshape_progress; 4111 } else if (mddev->delta_disks >= 0 && 4112 conf->reshape_progress > 0) 4113 sector_nr = conf->reshape_progress; 4114 sector_div(sector_nr, new_data_disks); 4115 if (sector_nr) { 4116 mddev->curr_resync_completed = sector_nr; 4117 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4118 *skipped = 1; 4119 return sector_nr; 4120 } 4121 } 4122 4123 /* We need to process a full chunk at a time. 4124 * If old and new chunk sizes differ, we need to process the 4125 * largest of these 4126 */ 4127 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4128 reshape_sectors = mddev->new_chunk_sectors; 4129 else 4130 reshape_sectors = mddev->chunk_sectors; 4131 4132 /* we update the metadata when there is more than 3Meg 4133 * in the block range (that is rather arbitrary, should 4134 * probably be time based) or when the data about to be 4135 * copied would over-write the source of the data at 4136 * the front of the range. 4137 * i.e. one new_stripe along from reshape_progress new_maps 4138 * to after where reshape_safe old_maps to 4139 */ 4140 writepos = conf->reshape_progress; 4141 sector_div(writepos, new_data_disks); 4142 readpos = conf->reshape_progress; 4143 sector_div(readpos, data_disks); 4144 safepos = conf->reshape_safe; 4145 sector_div(safepos, data_disks); 4146 if (mddev->delta_disks < 0) { 4147 writepos -= min_t(sector_t, reshape_sectors, writepos); 4148 readpos += reshape_sectors; 4149 safepos += reshape_sectors; 4150 } else { 4151 writepos += reshape_sectors; 4152 readpos -= min_t(sector_t, reshape_sectors, readpos); 4153 safepos -= min_t(sector_t, reshape_sectors, safepos); 4154 } 4155 4156 /* 'writepos' is the most advanced device address we might write. 4157 * 'readpos' is the least advanced device address we might read. 4158 * 'safepos' is the least address recorded in the metadata as having 4159 * been reshaped. 4160 * If 'readpos' is behind 'writepos', then there is no way that we can 4161 * ensure safety in the face of a crash - that must be done by userspace 4162 * making a backup of the data. So in that case there is no particular 4163 * rush to update metadata. 4164 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4165 * update the metadata to advance 'safepos' to match 'readpos' so that 4166 * we can be safe in the event of a crash. 4167 * So we insist on updating metadata if safepos is behind writepos and 4168 * readpos is beyond writepos. 4169 * In any case, update the metadata every 10 seconds. 4170 * Maybe that number should be configurable, but I'm not sure it is 4171 * worth it.... maybe it could be a multiple of safemode_delay??? 4172 */ 4173 if ((mddev->delta_disks < 0 4174 ? 
(safepos > writepos && readpos < writepos) 4175 : (safepos < writepos && readpos > writepos)) || 4176 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4177 /* Cannot proceed until we've updated the superblock... */ 4178 wait_event(conf->wait_for_overlap, 4179 atomic_read(&conf->reshape_stripes)==0); 4180 mddev->reshape_position = conf->reshape_progress; 4181 mddev->curr_resync_completed = sector_nr; 4182 conf->reshape_checkpoint = jiffies; 4183 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4184 md_wakeup_thread(mddev->thread); 4185 wait_event(mddev->sb_wait, mddev->flags == 0 || 4186 kthread_should_stop()); 4187 spin_lock_irq(&conf->device_lock); 4188 conf->reshape_safe = mddev->reshape_position; 4189 spin_unlock_irq(&conf->device_lock); 4190 wake_up(&conf->wait_for_overlap); 4191 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4192 } 4193 4194 if (mddev->delta_disks < 0) { 4195 BUG_ON(conf->reshape_progress == 0); 4196 stripe_addr = writepos; 4197 BUG_ON((mddev->dev_sectors & 4198 ~((sector_t)reshape_sectors - 1)) 4199 - reshape_sectors - stripe_addr 4200 != sector_nr); 4201 } else { 4202 BUG_ON(writepos != sector_nr + reshape_sectors); 4203 stripe_addr = sector_nr; 4204 } 4205 INIT_LIST_HEAD(&stripes); 4206 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4207 int j; 4208 int skipped_disk = 0; 4209 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4210 set_bit(STRIPE_EXPANDING, &sh->state); 4211 atomic_inc(&conf->reshape_stripes); 4212 /* If any of this stripe is beyond the end of the old 4213 * array, then we need to zero those blocks 4214 */ 4215 for (j=sh->disks; j--;) { 4216 sector_t s; 4217 if (j == sh->pd_idx) 4218 continue; 4219 if (conf->level == 6 && 4220 j == sh->qd_idx) 4221 continue; 4222 s = compute_blocknr(sh, j, 0); 4223 if (s < raid5_size(mddev, 0, 0)) { 4224 skipped_disk = 1; 4225 continue; 4226 } 4227 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4228 set_bit(R5_Expanded, &sh->dev[j].flags); 4229 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4230 } 4231 if (!skipped_disk) { 4232 set_bit(STRIPE_EXPAND_READY, &sh->state); 4233 set_bit(STRIPE_HANDLE, &sh->state); 4234 } 4235 list_add(&sh->lru, &stripes); 4236 } 4237 spin_lock_irq(&conf->device_lock); 4238 if (mddev->delta_disks < 0) 4239 conf->reshape_progress -= reshape_sectors * new_data_disks; 4240 else 4241 conf->reshape_progress += reshape_sectors * new_data_disks; 4242 spin_unlock_irq(&conf->device_lock); 4243 /* Ok, those stripes are ready. We can start scheduling 4244 * reads on the source stripes. 4245 * The source stripes are determined by mapping the first and last 4246 * block on the destination stripes.
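 * For example (hypothetical numbers): with 4 data disks in the new
 * geometry, destination stripes starting at array address stripe_addr
 * cover logical sectors stripe_addr*4 up to
 * (stripe_addr + reshape_sectors)*4 - 1; raid5_compute_sector() is then
 * called with previous==1 to translate those two addresses into device
 * sectors of the old geometry.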
4247 */ 4248 first_sector = 4249 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4250 1, &dd_idx, NULL); 4251 last_sector = 4252 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4253 * new_data_disks - 1), 4254 1, &dd_idx, NULL); 4255 if (last_sector >= mddev->dev_sectors) 4256 last_sector = mddev->dev_sectors - 1; 4257 while (first_sector <= last_sector) { 4258 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4259 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4260 set_bit(STRIPE_HANDLE, &sh->state); 4261 release_stripe(sh); 4262 first_sector += STRIPE_SECTORS; 4263 } 4264 /* Now that the sources are clearly marked, we can release 4265 * the destination stripes 4266 */ 4267 while (!list_empty(&stripes)) { 4268 sh = list_entry(stripes.next, struct stripe_head, lru); 4269 list_del_init(&sh->lru); 4270 release_stripe(sh); 4271 } 4272 /* If this takes us to the resync_max point where we have to pause, 4273 * then we need to write out the superblock. 4274 */ 4275 sector_nr += reshape_sectors; 4276 if ((sector_nr - mddev->curr_resync_completed) * 2 4277 >= mddev->resync_max - mddev->curr_resync_completed) { 4278 /* Cannot proceed until we've updated the superblock... */ 4279 wait_event(conf->wait_for_overlap, 4280 atomic_read(&conf->reshape_stripes) == 0); 4281 mddev->reshape_position = conf->reshape_progress; 4282 mddev->curr_resync_completed = sector_nr; 4283 conf->reshape_checkpoint = jiffies; 4284 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4285 md_wakeup_thread(mddev->thread); 4286 wait_event(mddev->sb_wait, 4287 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4288 || kthread_should_stop()); 4289 spin_lock_irq(&conf->device_lock); 4290 conf->reshape_safe = mddev->reshape_position; 4291 spin_unlock_irq(&conf->device_lock); 4292 wake_up(&conf->wait_for_overlap); 4293 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4294 } 4295 return reshape_sectors; 4296 } 4297 4298 /* FIXME go_faster isn't used */ 4299 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4300 { 4301 struct r5conf *conf = mddev->private; 4302 struct stripe_head *sh; 4303 sector_t max_sector = mddev->dev_sectors; 4304 sector_t sync_blocks; 4305 int still_degraded = 0; 4306 int i; 4307 4308 if (sector_nr >= max_sector) { 4309 /* just being told to finish up .. nothing much to do */ 4310 4311 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4312 end_reshape(conf); 4313 return 0; 4314 } 4315 4316 if (mddev->curr_resync < max_sector) /* aborted */ 4317 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4318 &sync_blocks, 1); 4319 else /* completed sync */ 4320 conf->fullsync = 0; 4321 bitmap_close_sync(mddev->bitmap); 4322 4323 return 0; 4324 } 4325 4326 /* Allow raid5_quiesce to complete */ 4327 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4328 4329 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4330 return reshape_request(mddev, sector_nr, skipped); 4331 4332 /* No need to check resync_max as we never do more than one 4333 * stripe, and as resync_max will always be on a chunk boundary, 4334 * if the check in md_do_sync didn't fire, there is no chance 4335 * of overstepping resync_max here 4336 */ 4337 4338 /* if there is too many failed drives and we are trying 4339 * to resync, then assert that we are finished, because there is 4340 * nothing we can do. 
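 * We report the rest of the device as handled (*skipped = 1 and a
 * return value covering every remaining sector) so that md_do_sync()
 * finishes instead of looping.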
*/ 4342 if (mddev->degraded >= conf->max_degraded && 4343 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4344 sector_t rv = mddev->dev_sectors - sector_nr; 4345 *skipped = 1; 4346 return rv; 4347 } 4348 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4349 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4350 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4351 /* we can skip this block, and probably more */ 4352 sync_blocks /= STRIPE_SECTORS; 4353 *skipped = 1; 4354 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4355 } 4356 4357 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4358 4359 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4360 if (sh == NULL) { 4361 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4362 /* make sure we don't swamp the stripe cache if someone else 4363 * is trying to get access 4364 */ 4365 schedule_timeout_uninterruptible(1); 4366 } 4367 /* Need to check if array will still be degraded after recovery/resync 4368 * We don't need to check the 'failed' flag as when that gets set, 4369 * recovery aborts. 4370 */ 4371 for (i = 0; i < conf->raid_disks; i++) 4372 if (conf->disks[i].rdev == NULL) 4373 still_degraded = 1; 4374 4375 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4376 4377 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4378 4379 handle_stripe(sh); 4380 release_stripe(sh); 4381 4382 return STRIPE_SECTORS; 4383 } 4384 4385 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4386 { 4387 /* We may not be able to submit a whole bio at once as there 4388 * may not be enough stripe_heads available. 4389 * We cannot pre-allocate enough stripe_heads as we may need 4390 * more than exist in the cache (if we allow ever-larger chunks). 4391 * So we do one stripe head at a time and record in 4392 * ->bi_hw_segments how many have been done. 4393 * 4394 * We *know* that this entire raid_bio is in one chunk, so 4395 * there will be only one 'dd_idx' and we need only one call to raid5_compute_sector.
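 * If we run out of stripe_heads part way through, the number of stripes
 * already handled is saved with raid5_set_bi_hw_segments() and the bio is
 * parked in conf->retry_read_aligned, so the next pass resumes exactly
 * where this one stopped.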
4396 */ 4397 struct stripe_head *sh; 4398 int dd_idx; 4399 sector_t sector, logical_sector, last_sector; 4400 int scnt = 0; 4401 int remaining; 4402 int handled = 0; 4403 4404 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4405 sector = raid5_compute_sector(conf, logical_sector, 4406 0, &dd_idx, NULL); 4407 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4408 4409 for (; logical_sector < last_sector; 4410 logical_sector += STRIPE_SECTORS, 4411 sector += STRIPE_SECTORS, 4412 scnt++) { 4413 4414 if (scnt < raid5_bi_hw_segments(raid_bio)) 4415 /* already done this stripe */ 4416 continue; 4417 4418 sh = get_active_stripe(conf, sector, 0, 1, 0); 4419 4420 if (!sh) { 4421 /* failed to get a stripe - must wait */ 4422 raid5_set_bi_hw_segments(raid_bio, scnt); 4423 conf->retry_read_aligned = raid_bio; 4424 return handled; 4425 } 4426 4427 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4428 release_stripe(sh); 4429 raid5_set_bi_hw_segments(raid_bio, scnt); 4430 conf->retry_read_aligned = raid_bio; 4431 return handled; 4432 } 4433 4434 handle_stripe(sh); 4435 release_stripe(sh); 4436 handled++; 4437 } 4438 spin_lock_irq(&conf->device_lock); 4439 remaining = raid5_dec_bi_phys_segments(raid_bio); 4440 spin_unlock_irq(&conf->device_lock); 4441 if (remaining == 0) 4442 bio_endio(raid_bio, 0); 4443 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4444 wake_up(&conf->wait_for_stripe); 4445 return handled; 4446 } 4447 4448 4449 /* 4450 * This is our raid5 kernel thread. 4451 * 4452 * We scan the hash table for stripes which can be handled now. 4453 * During the scan, completed stripes are saved for us by the interrupt 4454 * handler, so that they will not have to wait for our next wakeup. 4455 */ 4456 static void raid5d(struct mddev *mddev) 4457 { 4458 struct stripe_head *sh; 4459 struct r5conf *conf = mddev->private; 4460 int handled; 4461 struct blk_plug plug; 4462 4463 pr_debug("+++ raid5d active\n"); 4464 4465 md_check_recovery(mddev); 4466 4467 blk_start_plug(&plug); 4468 handled = 0; 4469 spin_lock_irq(&conf->device_lock); 4470 while (1) { 4471 struct bio *bio; 4472 4473 if (atomic_read(&mddev->plug_cnt) == 0 && 4474 !list_empty(&conf->bitmap_list)) { 4475 /* Now is a good time to flush some bitmap updates */ 4476 conf->seq_flush++; 4477 spin_unlock_irq(&conf->device_lock); 4478 bitmap_unplug(mddev->bitmap); 4479 spin_lock_irq(&conf->device_lock); 4480 conf->seq_write = conf->seq_flush; 4481 activate_bit_delay(conf); 4482 } 4483 if (atomic_read(&mddev->plug_cnt) == 0) 4484 raid5_activate_delayed(conf); 4485 4486 while ((bio = remove_bio_from_retry(conf))) { 4487 int ok; 4488 spin_unlock_irq(&conf->device_lock); 4489 ok = retry_aligned_read(conf, bio); 4490 spin_lock_irq(&conf->device_lock); 4491 if (!ok) 4492 break; 4493 handled++; 4494 } 4495 4496 sh = __get_priority_stripe(conf); 4497 4498 if (!sh) 4499 break; 4500 spin_unlock_irq(&conf->device_lock); 4501 4502 handled++; 4503 handle_stripe(sh); 4504 release_stripe(sh); 4505 cond_resched(); 4506 4507 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 4508 md_check_recovery(mddev); 4509 4510 spin_lock_irq(&conf->device_lock); 4511 } 4512 pr_debug("%d stripes handled\n", handled); 4513 4514 spin_unlock_irq(&conf->device_lock); 4515 4516 async_tx_issue_pending_all(); 4517 blk_finish_plug(&plug); 4518 4519 pr_debug("--- raid5d inactive\n"); 4520 } 4521 4522 static ssize_t 4523 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 4524 { 4525 struct r5conf *conf = mddev->private; 4526 if (conf) 4527 return 
sprintf(page, "%d\n", conf->max_nr_stripes); 4528 else 4529 return 0; 4530 } 4531 4532 int 4533 raid5_set_cache_size(struct mddev *mddev, int size) 4534 { 4535 struct r5conf *conf = mddev->private; 4536 int err; 4537 4538 if (size <= 16 || size > 32768) 4539 return -EINVAL; 4540 while (size < conf->max_nr_stripes) { 4541 if (drop_one_stripe(conf)) 4542 conf->max_nr_stripes--; 4543 else 4544 break; 4545 } 4546 err = md_allow_write(mddev); 4547 if (err) 4548 return err; 4549 while (size > conf->max_nr_stripes) { 4550 if (grow_one_stripe(conf)) 4551 conf->max_nr_stripes++; 4552 else break; 4553 } 4554 return 0; 4555 } 4556 EXPORT_SYMBOL(raid5_set_cache_size); 4557 4558 static ssize_t 4559 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 4560 { 4561 struct r5conf *conf = mddev->private; 4562 unsigned long new; 4563 int err; 4564 4565 if (len >= PAGE_SIZE) 4566 return -EINVAL; 4567 if (!conf) 4568 return -ENODEV; 4569 4570 if (strict_strtoul(page, 10, &new)) 4571 return -EINVAL; 4572 err = raid5_set_cache_size(mddev, new); 4573 if (err) 4574 return err; 4575 return len; 4576 } 4577 4578 static struct md_sysfs_entry 4579 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4580 raid5_show_stripe_cache_size, 4581 raid5_store_stripe_cache_size); 4582 4583 static ssize_t 4584 raid5_show_preread_threshold(struct mddev *mddev, char *page) 4585 { 4586 struct r5conf *conf = mddev->private; 4587 if (conf) 4588 return sprintf(page, "%d\n", conf->bypass_threshold); 4589 else 4590 return 0; 4591 } 4592 4593 static ssize_t 4594 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 4595 { 4596 struct r5conf *conf = mddev->private; 4597 unsigned long new; 4598 if (len >= PAGE_SIZE) 4599 return -EINVAL; 4600 if (!conf) 4601 return -ENODEV; 4602 4603 if (strict_strtoul(page, 10, &new)) 4604 return -EINVAL; 4605 if (new > conf->max_nr_stripes) 4606 return -EINVAL; 4607 conf->bypass_threshold = new; 4608 return len; 4609 } 4610 4611 static struct md_sysfs_entry 4612 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4613 S_IRUGO | S_IWUSR, 4614 raid5_show_preread_threshold, 4615 raid5_store_preread_threshold); 4616 4617 static ssize_t 4618 stripe_cache_active_show(struct mddev *mddev, char *page) 4619 { 4620 struct r5conf *conf = mddev->private; 4621 if (conf) 4622 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4623 else 4624 return 0; 4625 } 4626 4627 static struct md_sysfs_entry 4628 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4629 4630 static struct attribute *raid5_attrs[] = { 4631 &raid5_stripecache_size.attr, 4632 &raid5_stripecache_active.attr, 4633 &raid5_preread_bypass_threshold.attr, 4634 NULL, 4635 }; 4636 static struct attribute_group raid5_attrs_group = { 4637 .name = NULL, 4638 .attrs = raid5_attrs, 4639 }; 4640 4641 static sector_t 4642 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 4643 { 4644 struct r5conf *conf = mddev->private; 4645 4646 if (!sectors) 4647 sectors = mddev->dev_sectors; 4648 if (!raid_disks) 4649 /* size is defined by the smallest of previous and new size */ 4650 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 4651 4652 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4653 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4654 return sectors * (raid_disks - conf->max_degraded); 4655 } 4656 4657 static void raid5_free_percpu(struct r5conf *conf) 4658 { 4659 struct raid5_percpu *percpu; 4660 unsigned long cpu; 4661 4662 
if (!conf->percpu) 4663 return; 4664 4665 get_online_cpus(); 4666 for_each_possible_cpu(cpu) { 4667 percpu = per_cpu_ptr(conf->percpu, cpu); 4668 safe_put_page(percpu->spare_page); 4669 kfree(percpu->scribble); 4670 } 4671 #ifdef CONFIG_HOTPLUG_CPU 4672 unregister_cpu_notifier(&conf->cpu_notify); 4673 #endif 4674 put_online_cpus(); 4675 4676 free_percpu(conf->percpu); 4677 } 4678 4679 static void free_conf(struct r5conf *conf) 4680 { 4681 shrink_stripes(conf); 4682 raid5_free_percpu(conf); 4683 kfree(conf->disks); 4684 kfree(conf->stripe_hashtbl); 4685 kfree(conf); 4686 } 4687 4688 #ifdef CONFIG_HOTPLUG_CPU 4689 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4690 void *hcpu) 4691 { 4692 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 4693 long cpu = (long)hcpu; 4694 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4695 4696 switch (action) { 4697 case CPU_UP_PREPARE: 4698 case CPU_UP_PREPARE_FROZEN: 4699 if (conf->level == 6 && !percpu->spare_page) 4700 percpu->spare_page = alloc_page(GFP_KERNEL); 4701 if (!percpu->scribble) 4702 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4703 4704 if (!percpu->scribble || 4705 (conf->level == 6 && !percpu->spare_page)) { 4706 safe_put_page(percpu->spare_page); 4707 kfree(percpu->scribble); 4708 pr_err("%s: failed memory allocation for cpu%ld\n", 4709 __func__, cpu); 4710 return notifier_from_errno(-ENOMEM); 4711 } 4712 break; 4713 case CPU_DEAD: 4714 case CPU_DEAD_FROZEN: 4715 safe_put_page(percpu->spare_page); 4716 kfree(percpu->scribble); 4717 percpu->spare_page = NULL; 4718 percpu->scribble = NULL; 4719 break; 4720 default: 4721 break; 4722 } 4723 return NOTIFY_OK; 4724 } 4725 #endif 4726 4727 static int raid5_alloc_percpu(struct r5conf *conf) 4728 { 4729 unsigned long cpu; 4730 struct page *spare_page; 4731 struct raid5_percpu __percpu *allcpus; 4732 void *scribble; 4733 int err; 4734 4735 allcpus = alloc_percpu(struct raid5_percpu); 4736 if (!allcpus) 4737 return -ENOMEM; 4738 conf->percpu = allcpus; 4739 4740 get_online_cpus(); 4741 err = 0; 4742 for_each_present_cpu(cpu) { 4743 if (conf->level == 6) { 4744 spare_page = alloc_page(GFP_KERNEL); 4745 if (!spare_page) { 4746 err = -ENOMEM; 4747 break; 4748 } 4749 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4750 } 4751 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4752 if (!scribble) { 4753 err = -ENOMEM; 4754 break; 4755 } 4756 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4757 } 4758 #ifdef CONFIG_HOTPLUG_CPU 4759 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4760 conf->cpu_notify.priority = 0; 4761 if (err == 0) 4762 err = register_cpu_notifier(&conf->cpu_notify); 4763 #endif 4764 put_online_cpus(); 4765 4766 return err; 4767 } 4768 4769 static struct r5conf *setup_conf(struct mddev *mddev) 4770 { 4771 struct r5conf *conf; 4772 int raid_disk, memory, max_disks; 4773 struct md_rdev *rdev; 4774 struct disk_info *disk; 4775 4776 if (mddev->new_level != 5 4777 && mddev->new_level != 4 4778 && mddev->new_level != 6) { 4779 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4780 mdname(mddev), mddev->new_level); 4781 return ERR_PTR(-EIO); 4782 } 4783 if ((mddev->new_level == 5 4784 && !algorithm_valid_raid5(mddev->new_layout)) || 4785 (mddev->new_level == 6 4786 && !algorithm_valid_raid6(mddev->new_layout))) { 4787 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4788 mdname(mddev), mddev->new_layout); 4789 return ERR_PTR(-EIO); 4790 } 4791 if (mddev->new_level == 6 && 
mddev->raid_disks < 4) { 4792 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4793 mdname(mddev), mddev->raid_disks); 4794 return ERR_PTR(-EINVAL); 4795 } 4796 4797 if (!mddev->new_chunk_sectors || 4798 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4799 !is_power_of_2(mddev->new_chunk_sectors)) { 4800 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4801 mdname(mddev), mddev->new_chunk_sectors << 9); 4802 return ERR_PTR(-EINVAL); 4803 } 4804 4805 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 4806 if (conf == NULL) 4807 goto abort; 4808 spin_lock_init(&conf->device_lock); 4809 init_waitqueue_head(&conf->wait_for_stripe); 4810 init_waitqueue_head(&conf->wait_for_overlap); 4811 INIT_LIST_HEAD(&conf->handle_list); 4812 INIT_LIST_HEAD(&conf->hold_list); 4813 INIT_LIST_HEAD(&conf->delayed_list); 4814 INIT_LIST_HEAD(&conf->bitmap_list); 4815 INIT_LIST_HEAD(&conf->inactive_list); 4816 atomic_set(&conf->active_stripes, 0); 4817 atomic_set(&conf->preread_active_stripes, 0); 4818 atomic_set(&conf->active_aligned_reads, 0); 4819 conf->bypass_threshold = BYPASS_THRESHOLD; 4820 conf->recovery_disabled = mddev->recovery_disabled - 1; 4821 4822 conf->raid_disks = mddev->raid_disks; 4823 if (mddev->reshape_position == MaxSector) 4824 conf->previous_raid_disks = mddev->raid_disks; 4825 else 4826 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4827 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 4828 conf->scribble_len = scribble_len(max_disks); 4829 4830 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 4831 GFP_KERNEL); 4832 if (!conf->disks) 4833 goto abort; 4834 4835 conf->mddev = mddev; 4836 4837 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4838 goto abort; 4839 4840 conf->level = mddev->new_level; 4841 if (raid5_alloc_percpu(conf) != 0) 4842 goto abort; 4843 4844 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4845 4846 list_for_each_entry(rdev, &mddev->disks, same_set) { 4847 raid_disk = rdev->raid_disk; 4848 if (raid_disk >= max_disks 4849 || raid_disk < 0) 4850 continue; 4851 disk = conf->disks + raid_disk; 4852 4853 if (test_bit(Replacement, &rdev->flags)) { 4854 if (disk->replacement) 4855 goto abort; 4856 disk->replacement = rdev; 4857 } else { 4858 if (disk->rdev) 4859 goto abort; 4860 disk->rdev = rdev; 4861 } 4862 4863 if (test_bit(In_sync, &rdev->flags)) { 4864 char b[BDEVNAME_SIZE]; 4865 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4866 " disk %d\n", 4867 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4868 } else if (rdev->saved_raid_disk != raid_disk) 4869 /* Cannot rely on bitmap to complete recovery */ 4870 conf->fullsync = 1; 4871 } 4872 4873 conf->chunk_sectors = mddev->new_chunk_sectors; 4874 conf->level = mddev->new_level; 4875 if (conf->level == 6) 4876 conf->max_degraded = 2; 4877 else 4878 conf->max_degraded = 1; 4879 conf->algorithm = mddev->new_layout; 4880 conf->max_nr_stripes = NR_STRIPES; 4881 conf->reshape_progress = mddev->reshape_position; 4882 if (conf->reshape_progress != MaxSector) { 4883 conf->prev_chunk_sectors = mddev->chunk_sectors; 4884 conf->prev_algo = mddev->layout; 4885 } 4886 4887 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4888 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4889 if (grow_stripes(conf, conf->max_nr_stripes)) { 4890 printk(KERN_ERR 4891 "md/raid:%s: couldn't allocate %dkB for buffers\n", 4892 mdname(mddev), memory); 4893 goto abort; 4894 } else 4895 printk(KERN_INFO "md/raid:%s: 
allocated %dkB\n", 4896 mdname(mddev), memory); 4897 4898 conf->thread = md_register_thread(raid5d, mddev, NULL); 4899 if (!conf->thread) { 4900 printk(KERN_ERR 4901 "md/raid:%s: couldn't allocate thread.\n", 4902 mdname(mddev)); 4903 goto abort; 4904 } 4905 4906 return conf; 4907 4908 abort: 4909 if (conf) { 4910 free_conf(conf); 4911 return ERR_PTR(-EIO); 4912 } else 4913 return ERR_PTR(-ENOMEM); 4914 } 4915 4916 4917 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 4918 { 4919 switch (algo) { 4920 case ALGORITHM_PARITY_0: 4921 if (raid_disk < max_degraded) 4922 return 1; 4923 break; 4924 case ALGORITHM_PARITY_N: 4925 if (raid_disk >= raid_disks - max_degraded) 4926 return 1; 4927 break; 4928 case ALGORITHM_PARITY_0_6: 4929 if (raid_disk == 0 || 4930 raid_disk == raid_disks - 1) 4931 return 1; 4932 break; 4933 case ALGORITHM_LEFT_ASYMMETRIC_6: 4934 case ALGORITHM_RIGHT_ASYMMETRIC_6: 4935 case ALGORITHM_LEFT_SYMMETRIC_6: 4936 case ALGORITHM_RIGHT_SYMMETRIC_6: 4937 if (raid_disk == raid_disks - 1) 4938 return 1; 4939 } 4940 return 0; 4941 } 4942 4943 static int run(struct mddev *mddev) 4944 { 4945 struct r5conf *conf; 4946 int working_disks = 0; 4947 int dirty_parity_disks = 0; 4948 struct md_rdev *rdev; 4949 sector_t reshape_offset = 0; 4950 int i; 4951 4952 if (mddev->recovery_cp != MaxSector) 4953 printk(KERN_NOTICE "md/raid:%s: not clean" 4954 " -- starting background reconstruction\n", 4955 mdname(mddev)); 4956 if (mddev->reshape_position != MaxSector) { 4957 /* Check that we can continue the reshape. 4958 * Currently only disks can change, it must 4959 * increase, and we must be past the point where 4960 * a stripe over-writes itself 4961 */ 4962 sector_t here_new, here_old; 4963 int old_disks; 4964 int max_degraded = (mddev->level == 6 ? 2 : 1); 4965 4966 if (mddev->new_level != mddev->level) { 4967 printk(KERN_ERR "md/raid:%s: unsupported reshape " 4968 "required - aborting.\n", 4969 mdname(mddev)); 4970 return -EINVAL; 4971 } 4972 old_disks = mddev->raid_disks - mddev->delta_disks; 4973 /* reshape_position must be on a new-stripe boundary, and one 4974 * further up in new geometry must map after here in old 4975 * geometry. 4976 */ 4977 here_new = mddev->reshape_position; 4978 if (sector_div(here_new, mddev->new_chunk_sectors * 4979 (mddev->raid_disks - max_degraded))) { 4980 printk(KERN_ERR "md/raid:%s: reshape_position not " 4981 "on a stripe boundary\n", mdname(mddev)); 4982 return -EINVAL; 4983 } 4984 reshape_offset = here_new * mddev->new_chunk_sectors; 4985 /* here_new is the stripe we will write to */ 4986 here_old = mddev->reshape_position; 4987 sector_div(here_old, mddev->chunk_sectors * 4988 (old_disks-max_degraded)); 4989 /* here_old is the first stripe that we might need to read 4990 * from */ 4991 if (mddev->delta_disks == 0) { 4992 /* We cannot be sure it is safe to start an in-place 4993 * reshape. It is only safe if user-space if monitoring 4994 * and taking constant backups. 4995 * mdadm always starts a situation like this in 4996 * readonly mode so it can take control before 4997 * allowing any writes. So just check for that. 4998 */ 4999 if ((here_new * mddev->new_chunk_sectors != 5000 here_old * mddev->chunk_sectors) || 5001 mddev->ro == 0) { 5002 printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 5003 " in read-only mode - aborting\n", 5004 mdname(mddev)); 5005 return -EINVAL; 5006 } 5007 } else if (mddev->delta_disks < 0 5008 ? 
(here_new * mddev->new_chunk_sectors <= 5009 here_old * mddev->chunk_sectors) 5010 : (here_new * mddev->new_chunk_sectors >= 5011 here_old * mddev->chunk_sectors)) { 5012 /* Reading from the same stripe as writing to - bad */ 5013 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5014 "auto-recovery - aborting.\n", 5015 mdname(mddev)); 5016 return -EINVAL; 5017 } 5018 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5019 mdname(mddev)); 5020 /* OK, we should be able to continue; */ 5021 } else { 5022 BUG_ON(mddev->level != mddev->new_level); 5023 BUG_ON(mddev->layout != mddev->new_layout); 5024 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5025 BUG_ON(mddev->delta_disks != 0); 5026 } 5027 5028 if (mddev->private == NULL) 5029 conf = setup_conf(mddev); 5030 else 5031 conf = mddev->private; 5032 5033 if (IS_ERR(conf)) 5034 return PTR_ERR(conf); 5035 5036 mddev->thread = conf->thread; 5037 conf->thread = NULL; 5038 mddev->private = conf; 5039 5040 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5041 i++) { 5042 rdev = conf->disks[i].rdev; 5043 if (!rdev && conf->disks[i].replacement) { 5044 /* The replacement is all we have yet */ 5045 rdev = conf->disks[i].replacement; 5046 conf->disks[i].replacement = NULL; 5047 clear_bit(Replacement, &rdev->flags); 5048 conf->disks[i].rdev = rdev; 5049 } 5050 if (!rdev) 5051 continue; 5052 if (conf->disks[i].replacement && 5053 conf->reshape_progress != MaxSector) { 5054 /* replacements and reshape simply do not mix. */ 5055 printk(KERN_ERR "md: cannot handle concurrent " 5056 "replacement and reshape.\n"); 5057 goto abort; 5058 } 5059 if (test_bit(In_sync, &rdev->flags)) { 5060 working_disks++; 5061 continue; 5062 } 5063 /* This disc is not fully in-sync. However if it 5064 * just stored parity (beyond the recovery_offset), 5065 * then we don't need to be concerned about the 5066 * array being dirty. 5067 * When reshape goes 'backwards', we never have 5068 * partially completed devices, so we only need 5069 * to worry about reshape going forwards. 5070 */ 5071 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5072 if (mddev->major_version == 0 && 5073 mddev->minor_version > 90) 5074 rdev->recovery_offset = reshape_offset; 5075 5076 if (rdev->recovery_offset < reshape_offset) { 5077 /* We need to check old and new layout */ 5078 if (!only_parity(rdev->raid_disk, 5079 conf->algorithm, 5080 conf->raid_disks, 5081 conf->max_degraded)) 5082 continue; 5083 } 5084 if (!only_parity(rdev->raid_disk, 5085 conf->prev_algo, 5086 conf->previous_raid_disks, 5087 conf->max_degraded)) 5088 continue; 5089 dirty_parity_disks++; 5090 } 5091 5092 /* 5093 * 0 for a fully functional array, 1 or 2 for a degraded array.
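 * calc_degraded() works this out from the member devices and is
 * re-evaluated (under device_lock) whenever devices are added, removed
 * or replaced.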
5094 */ 5095 mddev->degraded = calc_degraded(conf); 5096 5097 if (has_failed(conf)) { 5098 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5099 " (%d/%d failed)\n", 5100 mdname(mddev), mddev->degraded, conf->raid_disks); 5101 goto abort; 5102 } 5103 5104 /* device size must be a multiple of chunk size */ 5105 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5106 mddev->resync_max_sectors = mddev->dev_sectors; 5107 5108 if (mddev->degraded > dirty_parity_disks && 5109 mddev->recovery_cp != MaxSector) { 5110 if (mddev->ok_start_degraded) 5111 printk(KERN_WARNING 5112 "md/raid:%s: starting dirty degraded array" 5113 " - data corruption possible.\n", 5114 mdname(mddev)); 5115 else { 5116 printk(KERN_ERR 5117 "md/raid:%s: cannot start dirty degraded array.\n", 5118 mdname(mddev)); 5119 goto abort; 5120 } 5121 } 5122 5123 if (mddev->degraded == 0) 5124 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5125 " devices, algorithm %d\n", mdname(mddev), conf->level, 5126 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5127 mddev->new_layout); 5128 else 5129 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5130 " out of %d devices, algorithm %d\n", 5131 mdname(mddev), conf->level, 5132 mddev->raid_disks - mddev->degraded, 5133 mddev->raid_disks, mddev->new_layout); 5134 5135 print_raid5_conf(conf); 5136 5137 if (conf->reshape_progress != MaxSector) { 5138 conf->reshape_safe = conf->reshape_progress; 5139 atomic_set(&conf->reshape_stripes, 0); 5140 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5141 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5142 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5143 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5144 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5145 "reshape"); 5146 } 5147 5148 5149 /* Ok, everything is just fine now */ 5150 if (mddev->to_remove == &raid5_attrs_group) 5151 mddev->to_remove = NULL; 5152 else if (mddev->kobj.sd && 5153 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5154 printk(KERN_WARNING 5155 "raid5: failed to create sysfs attributes for %s\n", 5156 mdname(mddev)); 5157 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5158 5159 if (mddev->queue) { 5160 int chunk_size; 5161 /* read-ahead size must cover two whole stripes, which 5162 * is 2 * (datadisks) * chunksize where 'n' is the 5163 * number of raid devices 5164 */ 5165 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5166 int stripe = data_disks * 5167 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5168 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5169 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5170 5171 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5172 5173 mddev->queue->backing_dev_info.congested_data = mddev; 5174 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5175 5176 chunk_size = mddev->chunk_sectors << 9; 5177 blk_queue_io_min(mddev->queue, chunk_size); 5178 blk_queue_io_opt(mddev->queue, chunk_size * 5179 (conf->raid_disks - conf->max_degraded)); 5180 5181 list_for_each_entry(rdev, &mddev->disks, same_set) 5182 disk_stack_limits(mddev->gendisk, rdev->bdev, 5183 rdev->data_offset << 9); 5184 } 5185 5186 return 0; 5187 abort: 5188 md_unregister_thread(&mddev->thread); 5189 print_raid5_conf(conf); 5190 free_conf(conf); 5191 mddev->private = NULL; 5192 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5193 return -EIO; 5194 } 5195 5196 static int stop(struct mddev *mddev) 5197 { 5198 struct r5conf *conf = 
mddev->private; 5199 5200 md_unregister_thread(&mddev->thread); 5201 if (mddev->queue) 5202 mddev->queue->backing_dev_info.congested_fn = NULL; 5203 free_conf(conf); 5204 mddev->private = NULL; 5205 mddev->to_remove = &raid5_attrs_group; 5206 return 0; 5207 } 5208 5209 static void status(struct seq_file *seq, struct mddev *mddev) 5210 { 5211 struct r5conf *conf = mddev->private; 5212 int i; 5213 5214 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5215 mddev->chunk_sectors / 2, mddev->layout); 5216 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5217 for (i = 0; i < conf->raid_disks; i++) 5218 seq_printf (seq, "%s", 5219 conf->disks[i].rdev && 5220 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 5221 seq_printf (seq, "]"); 5222 } 5223 5224 static void print_raid5_conf (struct r5conf *conf) 5225 { 5226 int i; 5227 struct disk_info *tmp; 5228 5229 printk(KERN_DEBUG "RAID conf printout:\n"); 5230 if (!conf) { 5231 printk("(conf==NULL)\n"); 5232 return; 5233 } 5234 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5235 conf->raid_disks, 5236 conf->raid_disks - conf->mddev->degraded); 5237 5238 for (i = 0; i < conf->raid_disks; i++) { 5239 char b[BDEVNAME_SIZE]; 5240 tmp = conf->disks + i; 5241 if (tmp->rdev) 5242 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5243 i, !test_bit(Faulty, &tmp->rdev->flags), 5244 bdevname(tmp->rdev->bdev, b)); 5245 } 5246 } 5247 5248 static int raid5_spare_active(struct mddev *mddev) 5249 { 5250 int i; 5251 struct r5conf *conf = mddev->private; 5252 struct disk_info *tmp; 5253 int count = 0; 5254 unsigned long flags; 5255 5256 for (i = 0; i < conf->raid_disks; i++) { 5257 tmp = conf->disks + i; 5258 if (tmp->replacement 5259 && tmp->replacement->recovery_offset == MaxSector 5260 && !test_bit(Faulty, &tmp->replacement->flags) 5261 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 5262 /* Replacement has just become active. */ 5263 if (!tmp->rdev 5264 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 5265 count++; 5266 if (tmp->rdev) { 5267 /* Replaced device not technically faulty, 5268 * but we need to be sure it gets removed 5269 * and never re-added. 
5270 */ 5271 set_bit(Faulty, &tmp->rdev->flags); 5272 sysfs_notify_dirent_safe( 5273 tmp->rdev->sysfs_state); 5274 } 5275 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 5276 } else if (tmp->rdev 5277 && tmp->rdev->recovery_offset == MaxSector 5278 && !test_bit(Faulty, &tmp->rdev->flags) 5279 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5280 count++; 5281 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 5282 } 5283 } 5284 spin_lock_irqsave(&conf->device_lock, flags); 5285 mddev->degraded = calc_degraded(conf); 5286 spin_unlock_irqrestore(&conf->device_lock, flags); 5287 print_raid5_conf(conf); 5288 return count; 5289 } 5290 5291 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 5292 { 5293 struct r5conf *conf = mddev->private; 5294 int err = 0; 5295 int number = rdev->raid_disk; 5296 struct md_rdev **rdevp; 5297 struct disk_info *p = conf->disks + number; 5298 5299 print_raid5_conf(conf); 5300 if (rdev == p->rdev) 5301 rdevp = &p->rdev; 5302 else if (rdev == p->replacement) 5303 rdevp = &p->replacement; 5304 else 5305 return 0; 5306 5307 if (number >= conf->raid_disks && 5308 conf->reshape_progress == MaxSector) 5309 clear_bit(In_sync, &rdev->flags); 5310 5311 if (test_bit(In_sync, &rdev->flags) || 5312 atomic_read(&rdev->nr_pending)) { 5313 err = -EBUSY; 5314 goto abort; 5315 } 5316 /* Only remove non-faulty devices if recovery 5317 * isn't possible. 5318 */ 5319 if (!test_bit(Faulty, &rdev->flags) && 5320 mddev->recovery_disabled != conf->recovery_disabled && 5321 !has_failed(conf) && 5322 (!p->replacement || p->replacement == rdev) && 5323 number < conf->raid_disks) { 5324 err = -EBUSY; 5325 goto abort; 5326 } 5327 *rdevp = NULL; 5328 synchronize_rcu(); 5329 if (atomic_read(&rdev->nr_pending)) { 5330 /* lost the race, try later */ 5331 err = -EBUSY; 5332 *rdevp = rdev; 5333 } else if (p->replacement) { 5334 /* We must have just cleared 'rdev' */ 5335 p->rdev = p->replacement; 5336 clear_bit(Replacement, &p->replacement->flags); 5337 smp_mb(); /* Make sure other CPUs may see both as identical 5338 * but will never see neither - if they are careful 5339 */ 5340 p->replacement = NULL; 5341 clear_bit(WantReplacement, &rdev->flags); 5342 } else 5343 /* We might have just removed the Replacement as faulty- 5344 * clear the bit just in case 5345 */ 5346 clear_bit(WantReplacement, &rdev->flags); 5347 abort: 5348 5349 print_raid5_conf(conf); 5350 return err; 5351 } 5352 5353 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 5354 { 5355 struct r5conf *conf = mddev->private; 5356 int err = -EEXIST; 5357 int disk; 5358 struct disk_info *p; 5359 int first = 0; 5360 int last = conf->raid_disks - 1; 5361 5362 if (mddev->recovery_disabled == conf->recovery_disabled) 5363 return -EBUSY; 5364 5365 if (has_failed(conf)) 5366 /* no point adding a device */ 5367 return -EINVAL; 5368 5369 if (rdev->raid_disk >= 0) 5370 first = last = rdev->raid_disk; 5371 5372 /* 5373 * find the disk ... but prefer rdev->saved_raid_disk 5374 * if possible. 
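 * Re-using the slot the device previously occupied lets recovery rely
 * on the bitmap; if it ends up anywhere else, conf->fullsync is set
 * below so the whole device is rebuilt.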
5375 */ 5376 if (rdev->saved_raid_disk >= 0 && 5377 rdev->saved_raid_disk >= first && 5378 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5379 disk = rdev->saved_raid_disk; 5380 else 5381 disk = first; 5382 for ( ; disk <= last ; disk++) { 5383 p = conf->disks + disk; 5384 if (p->rdev == NULL) { 5385 clear_bit(In_sync, &rdev->flags); 5386 rdev->raid_disk = disk; 5387 err = 0; 5388 if (rdev->saved_raid_disk != disk) 5389 conf->fullsync = 1; 5390 rcu_assign_pointer(p->rdev, rdev); 5391 break; 5392 } 5393 if (test_bit(WantReplacement, &p->rdev->flags) && 5394 p->replacement == NULL) { 5395 clear_bit(In_sync, &rdev->flags); 5396 set_bit(Replacement, &rdev->flags); 5397 rdev->raid_disk = disk; 5398 err = 0; 5399 conf->fullsync = 1; 5400 rcu_assign_pointer(p->replacement, rdev); 5401 break; 5402 } 5403 } 5404 print_raid5_conf(conf); 5405 return err; 5406 } 5407 5408 static int raid5_resize(struct mddev *mddev, sector_t sectors) 5409 { 5410 /* no resync is happening, and there is enough space 5411 * on all devices, so we can resize. 5412 * We need to make sure resync covers any new space. 5413 * If the array is shrinking we should possibly wait until 5414 * any io in the removed space completes, but it hardly seems 5415 * worth it. 5416 */ 5417 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5418 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5419 mddev->raid_disks)); 5420 if (mddev->array_sectors > 5421 raid5_size(mddev, sectors, mddev->raid_disks)) 5422 return -EINVAL; 5423 set_capacity(mddev->gendisk, mddev->array_sectors); 5424 revalidate_disk(mddev->gendisk); 5425 if (sectors > mddev->dev_sectors && 5426 mddev->recovery_cp > mddev->dev_sectors) { 5427 mddev->recovery_cp = mddev->dev_sectors; 5428 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5429 } 5430 mddev->dev_sectors = sectors; 5431 mddev->resync_max_sectors = sectors; 5432 return 0; 5433 } 5434 5435 static int check_stripe_cache(struct mddev *mddev) 5436 { 5437 /* Can only proceed if there are plenty of stripe_heads. 5438 * We need a minimum of one full stripe,, and for sensible progress 5439 * it is best to have about 4 times that. 5440 * If we require 4 times, then the default 256 4K stripe_heads will 5441 * allow for chunk sizes up to 256K, which is probably OK. 5442 * If the chunk size is greater, user-space should request more 5443 * stripe_heads first. 5444 */ 5445 struct r5conf *conf = mddev->private; 5446 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5447 > conf->max_nr_stripes || 5448 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5449 > conf->max_nr_stripes) { 5450 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5451 mdname(mddev), 5452 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5453 / STRIPE_SIZE)*4); 5454 return 0; 5455 } 5456 return 1; 5457 } 5458 5459 static int check_reshape(struct mddev *mddev) 5460 { 5461 struct r5conf *conf = mddev->private; 5462 5463 if (mddev->delta_disks == 0 && 5464 mddev->new_layout == mddev->layout && 5465 mddev->new_chunk_sectors == mddev->chunk_sectors) 5466 return 0; /* nothing to do */ 5467 if (mddev->bitmap) 5468 /* Cannot grow a bitmap yet */ 5469 return -EBUSY; 5470 if (has_failed(conf)) 5471 return -EINVAL; 5472 if (mddev->delta_disks < 0) { 5473 /* We might be able to shrink, but the devices must 5474 * be made bigger first. 5475 * For raid6, 4 is the minimum size. 
5476 * Otherwise 2 is the minimum 5477 */ 5478 int min = 2; 5479 if (mddev->level == 6) 5480 min = 4; 5481 if (mddev->raid_disks + mddev->delta_disks < min) 5482 return -EINVAL; 5483 } 5484 5485 if (!check_stripe_cache(mddev)) 5486 return -ENOSPC; 5487 5488 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5489 } 5490 5491 static int raid5_start_reshape(struct mddev *mddev) 5492 { 5493 struct r5conf *conf = mddev->private; 5494 struct md_rdev *rdev; 5495 int spares = 0; 5496 unsigned long flags; 5497 5498 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5499 return -EBUSY; 5500 5501 if (!check_stripe_cache(mddev)) 5502 return -ENOSPC; 5503 5504 list_for_each_entry(rdev, &mddev->disks, same_set) 5505 if (!test_bit(In_sync, &rdev->flags) 5506 && !test_bit(Faulty, &rdev->flags)) 5507 spares++; 5508 5509 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5510 /* Not enough devices even to make a degraded array 5511 * of that size 5512 */ 5513 return -EINVAL; 5514 5515 /* Refuse to reduce size of the array. Any reductions in 5516 * array size must be through explicit setting of array_size 5517 * attribute. 5518 */ 5519 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5520 < mddev->array_sectors) { 5521 printk(KERN_ERR "md/raid:%s: array size must be reduced " 5522 "before number of disks\n", mdname(mddev)); 5523 return -EINVAL; 5524 } 5525 5526 atomic_set(&conf->reshape_stripes, 0); 5527 spin_lock_irq(&conf->device_lock); 5528 conf->previous_raid_disks = conf->raid_disks; 5529 conf->raid_disks += mddev->delta_disks; 5530 conf->prev_chunk_sectors = conf->chunk_sectors; 5531 conf->chunk_sectors = mddev->new_chunk_sectors; 5532 conf->prev_algo = conf->algorithm; 5533 conf->algorithm = mddev->new_layout; 5534 if (mddev->delta_disks < 0) 5535 conf->reshape_progress = raid5_size(mddev, 0, 0); 5536 else 5537 conf->reshape_progress = 0; 5538 conf->reshape_safe = conf->reshape_progress; 5539 conf->generation++; 5540 spin_unlock_irq(&conf->device_lock); 5541 5542 /* Add some new drives, as many as will fit. 5543 * We know there are enough to make the newly sized array work. 5544 * Don't add devices if we are reducing the number of 5545 * devices in the array. This is because it is not possible 5546 * to correctly record the "partially reconstructed" state of 5547 * such devices during the reshape and confusion could result. 5548 */ 5549 if (mddev->delta_disks >= 0) { 5550 int added_devices = 0; 5551 list_for_each_entry(rdev, &mddev->disks, same_set) 5552 if (rdev->raid_disk < 0 && 5553 !test_bit(Faulty, &rdev->flags)) { 5554 if (raid5_add_disk(mddev, rdev) == 0) { 5555 if (rdev->raid_disk 5556 >= conf->previous_raid_disks) { 5557 set_bit(In_sync, &rdev->flags); 5558 added_devices++; 5559 } else 5560 rdev->recovery_offset = 0; 5561 5562 if (sysfs_link_rdev(mddev, rdev)) 5563 /* Failure here is OK */; 5564 } 5565 } else if (rdev->raid_disk >= conf->previous_raid_disks 5566 && !test_bit(Faulty, &rdev->flags)) { 5567 /* This is a spare that was manually added */ 5568 set_bit(In_sync, &rdev->flags); 5569 added_devices++; 5570 } 5571 5572 /* When a reshape changes the number of devices, 5573 * ->degraded is measured against the larger of the 5574 * pre and post number of devices. 
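 * That is why it is recalculated here, under device_lock, once the new
 * devices have been added above.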
*/ 5576 spin_lock_irqsave(&conf->device_lock, flags); 5577 mddev->degraded = calc_degraded(conf); 5578 spin_unlock_irqrestore(&conf->device_lock, flags); 5579 } 5580 mddev->raid_disks = conf->raid_disks; 5581 mddev->reshape_position = conf->reshape_progress; 5582 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5583 5584 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5585 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5586 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5587 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5588 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5589 "reshape"); 5590 if (!mddev->sync_thread) { 5591 mddev->recovery = 0; 5592 spin_lock_irq(&conf->device_lock); 5593 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5594 conf->reshape_progress = MaxSector; 5595 spin_unlock_irq(&conf->device_lock); 5596 return -EAGAIN; 5597 } 5598 conf->reshape_checkpoint = jiffies; 5599 md_wakeup_thread(mddev->sync_thread); 5600 md_new_event(mddev); 5601 return 0; 5602 } 5603 5604 /* This is called from the reshape thread and should make any 5605 * changes needed in 'conf' 5606 */ 5607 static void end_reshape(struct r5conf *conf) 5608 { 5609 5610 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5611 5612 spin_lock_irq(&conf->device_lock); 5613 conf->previous_raid_disks = conf->raid_disks; 5614 conf->reshape_progress = MaxSector; 5615 spin_unlock_irq(&conf->device_lock); 5616 wake_up(&conf->wait_for_overlap); 5617 5618 /* read-ahead size must cover two whole stripes, which is 5619 * 2 * (datadisks) * chunksize, where 'datadisks' is the number of data devices (raid disks minus parity devices) 5620 */ 5621 if (conf->mddev->queue) { 5622 int data_disks = conf->raid_disks - conf->max_degraded; 5623 int stripe = data_disks * ((conf->chunk_sectors << 9) 5624 / PAGE_SIZE); 5625 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5626 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5627 } 5628 } 5629 } 5630 5631 /* This is called from the raid5d thread with mddev_lock held. 5632 * It makes config changes to the device.
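 * When the array grew we publish the new capacity; when it shrank we
 * also drop the now-unused trailing devices from the configuration.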

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks ;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev &&
				    raid5_remove_disk(mddev, rdev) == 0) {
					sysfs_unlink_rdev(mddev, rdev);
					rdev->raid_disk = -1;
				}
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch(state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		spin_lock_irq(&conf->device_lock);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_lock_irq(conf->wait_for_stripe,
				    atomic_read(&conf->active_stripes) == 0 &&
				    atomic_read(&conf->active_aligned_reads) == 0,
				    conf->device_lock, /* nothing */);
		conf->quiesce = 1;
		spin_unlock_irq(&conf->device_lock);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		spin_lock_irq(&conf->device_lock);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_stripe);
		wake_up(&conf->wait_for_overlap);
		spin_unlock_irq(&conf->device_lock);
		break;
	}
}


static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}
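
/*
 * For illustration (hypothetical numbers) of the chunk-size search in
 * raid5_takeover_raid1() below: the candidate starts at 128 sectors
 * (64KiB) and is halved until it divides the array size exactly.  An
 * array of 1,000,000 sectors has 64 sectors as its largest
 * power-of-two divisor, so a 32KiB chunk is chosen; if the candidate
 * ever drops below one page (STRIPE_SIZE), the takeover is refused.
 */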

static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}


static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value, after validation,
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}
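
/*
 * For illustration (hypothetical numbers) of the chunk validation
 * above: a requested chunk of 384KiB (768 sectors) fails the
 * power-of-two test, while 256KiB (512 sectors) is accepted as long as
 * it is at least one page and the array size is a multiple of 512
 * sectors; only then is the request handed on to check_reshape().
 */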

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}
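
/*
 * For illustration (hypothetical configuration): raid6_takeover() on a
 * 4-device RAID5 using ALGORITHM_LEFT_SYMMETRIC produces a 5-device
 * RAID6 with ALGORITHM_LEFT_SYMMETRIC_6, keeping the existing data and
 * parity layout and placing the Q syndrome on the added last device,
 * which is initially missing, so the new array runs degraded until
 * that device is recovered.
 */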

static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
};
static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
};

static int __init raid5_init(void)
{
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules; they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");