/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 *    miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static struct workqueue_struct *raid5_wq;
/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)
#define MAX_STRIPE_BATCH	8

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the
 * sector of the current stripe+device.
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio_sectors(bio);
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * We need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1. This helper does that mapping.
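 *
 * For example (md layout, non-DDF), with 5 devices, pd_idx == 1 and
 * qd_idx == 2: raid6_d0() returns 3, and walking the devices from there
 * with raid6_next_disk() maps device 3 -> slot 0, 4 -> slot 1, 0 -> slot 2,
 * 1 (P) -> slot syndrome_disks and 2 (Q) -> slot syndrome_disks+1, i.e.
 * data in slots 0..syndrome_disks-1 and P/Q in the last two slots.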
168 */ 169 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 170 int *count, int syndrome_disks) 171 { 172 int slot = *count; 173 174 if (sh->ddf_layout) 175 (*count)++; 176 if (idx == sh->pd_idx) 177 return syndrome_disks; 178 if (idx == sh->qd_idx) 179 return syndrome_disks + 1; 180 if (!sh->ddf_layout) 181 (*count)++; 182 return slot; 183 } 184 185 static void return_io(struct bio *return_bi) 186 { 187 struct bio *bi = return_bi; 188 while (bi) { 189 190 return_bi = bi->bi_next; 191 bi->bi_next = NULL; 192 bi->bi_size = 0; 193 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 194 bi, 0); 195 bio_endio(bi, 0); 196 bi = return_bi; 197 } 198 } 199 200 static void print_raid5_conf (struct r5conf *conf); 201 202 static int stripe_operations_active(struct stripe_head *sh) 203 { 204 return sh->check_state || sh->reconstruct_state || 205 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 206 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 207 } 208 209 static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 210 { 211 struct r5conf *conf = sh->raid_conf; 212 struct r5worker_group *group; 213 int thread_cnt; 214 int i, cpu = sh->cpu; 215 216 if (!cpu_online(cpu)) { 217 cpu = cpumask_any(cpu_online_mask); 218 sh->cpu = cpu; 219 } 220 221 if (list_empty(&sh->lru)) { 222 struct r5worker_group *group; 223 group = conf->worker_groups + cpu_to_group(cpu); 224 list_add_tail(&sh->lru, &group->handle_list); 225 group->stripes_cnt++; 226 sh->group = group; 227 } 228 229 if (conf->worker_cnt_per_group == 0) { 230 md_wakeup_thread(conf->mddev->thread); 231 return; 232 } 233 234 group = conf->worker_groups + cpu_to_group(sh->cpu); 235 236 group->workers[0].working = true; 237 /* at least one worker should run to avoid race */ 238 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 239 240 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 241 /* wakeup more workers */ 242 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 243 if (group->workers[i].working == false) { 244 group->workers[i].working = true; 245 queue_work_on(sh->cpu, raid5_wq, 246 &group->workers[i].work); 247 thread_cnt--; 248 } 249 } 250 } 251 252 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 253 { 254 BUG_ON(!list_empty(&sh->lru)); 255 BUG_ON(atomic_read(&conf->active_stripes)==0); 256 if (test_bit(STRIPE_HANDLE, &sh->state)) { 257 if (test_bit(STRIPE_DELAYED, &sh->state) && 258 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 259 list_add_tail(&sh->lru, &conf->delayed_list); 260 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 261 sh->bm_seq - conf->seq_write > 0) 262 list_add_tail(&sh->lru, &conf->bitmap_list); 263 else { 264 clear_bit(STRIPE_DELAYED, &sh->state); 265 clear_bit(STRIPE_BIT_DELAY, &sh->state); 266 if (conf->worker_cnt_per_group == 0) { 267 list_add_tail(&sh->lru, &conf->handle_list); 268 } else { 269 raid5_wakeup_stripe_thread(sh); 270 return; 271 } 272 } 273 md_wakeup_thread(conf->mddev->thread); 274 } else { 275 BUG_ON(stripe_operations_active(sh)); 276 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 277 if (atomic_dec_return(&conf->preread_active_stripes) 278 < IO_THRESHOLD) 279 md_wakeup_thread(conf->mddev->thread); 280 atomic_dec(&conf->active_stripes); 281 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 282 list_add_tail(&sh->lru, &conf->inactive_list); 283 wake_up(&conf->wait_for_stripe); 284 if (conf->retry_read_aligned) 285 md_wakeup_thread(conf->mddev->thread); 286 } 287 } 288 } 289 290 static void __release_stripe(struct r5conf *conf, 
			     struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh);
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf)
{
	struct stripe_head *sh;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	while (head) {
		sh = llist_entry(head, struct stripe_head, release_list);
		head = llist_next(head);
		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry if the bit is set here: if it is set again,
		 * the stripe's count is always > 1.  The same is true for
		 * the STRIPE_ON_UNPLUG_LIST bit.
		 */
		__release_stripe(conf, sh);
		count++;
	}

	return count;
}

static void release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	bool wakeup;

	if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	local_irq_save(flags);
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		do_release_stripe(conf, sh);
		spin_unlock(&conf->device_lock);
	}
	local_irq_restore(flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}


/* find an idle stripe, make sure it is unhashed, and return it.
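 * All callers hold conf->device_lock around this; see for example
 * drop_one_stripe() further down:
 *
 *	spin_lock_irq(&conf->device_lock);
 *	sh = get_free_stripe(conf);
 *	spin_unlock_irq(&conf->device_lock);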
*/ 365 static struct stripe_head *get_free_stripe(struct r5conf *conf) 366 { 367 struct stripe_head *sh = NULL; 368 struct list_head *first; 369 370 if (list_empty(&conf->inactive_list)) 371 goto out; 372 first = conf->inactive_list.next; 373 sh = list_entry(first, struct stripe_head, lru); 374 list_del_init(first); 375 remove_hash(sh); 376 atomic_inc(&conf->active_stripes); 377 out: 378 return sh; 379 } 380 381 static void shrink_buffers(struct stripe_head *sh) 382 { 383 struct page *p; 384 int i; 385 int num = sh->raid_conf->pool_size; 386 387 for (i = 0; i < num ; i++) { 388 p = sh->dev[i].page; 389 if (!p) 390 continue; 391 sh->dev[i].page = NULL; 392 put_page(p); 393 } 394 } 395 396 static int grow_buffers(struct stripe_head *sh) 397 { 398 int i; 399 int num = sh->raid_conf->pool_size; 400 401 for (i = 0; i < num; i++) { 402 struct page *page; 403 404 if (!(page = alloc_page(GFP_KERNEL))) { 405 return 1; 406 } 407 sh->dev[i].page = page; 408 } 409 return 0; 410 } 411 412 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 413 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 414 struct stripe_head *sh); 415 416 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 417 { 418 struct r5conf *conf = sh->raid_conf; 419 int i; 420 421 BUG_ON(atomic_read(&sh->count) != 0); 422 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 423 BUG_ON(stripe_operations_active(sh)); 424 425 pr_debug("init_stripe called, stripe %llu\n", 426 (unsigned long long)sh->sector); 427 428 remove_hash(sh); 429 430 sh->generation = conf->generation - previous; 431 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 432 sh->sector = sector; 433 stripe_set_idx(sector, conf, previous, sh); 434 sh->state = 0; 435 436 437 for (i = sh->disks; i--; ) { 438 struct r5dev *dev = &sh->dev[i]; 439 440 if (dev->toread || dev->read || dev->towrite || dev->written || 441 test_bit(R5_LOCKED, &dev->flags)) { 442 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 443 (unsigned long long)sh->sector, i, dev->toread, 444 dev->read, dev->towrite, dev->written, 445 test_bit(R5_LOCKED, &dev->flags)); 446 WARN_ON(1); 447 } 448 dev->flags = 0; 449 raid5_build_block(sh, i, previous); 450 } 451 insert_hash(conf, sh); 452 sh->cpu = smp_processor_id(); 453 } 454 455 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 456 short generation) 457 { 458 struct stripe_head *sh; 459 460 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 461 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 462 if (sh->sector == sector && sh->generation == generation) 463 return sh; 464 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 465 return NULL; 466 } 467 468 /* 469 * Need to check if array has failed when deciding whether to: 470 * - start an array 471 * - remove non-faulty devices 472 * - add a spare 473 * - allow a reshape 474 * This determination is simple when no reshape is happening. 475 * However if there is a reshape, we need to carefully check 476 * both the before and after sections. 477 * This is because some failed devices may only affect one 478 * of the two sections, and some non-in_sync devices may 479 * be insync in the section most affected by failed devices. 
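 *
 * For example, while growing a 4-device RAID5 to 5 devices, the newly
 * added device is typically not In_sync: it is not part of the 'previous'
 * section at all, and in the 'new' section the stripes below
 * reshape_progress have already been rebuilt onto it by the reshape, so
 * neither loop below counts it and the array is not considered degraded.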
480 */ 481 static int calc_degraded(struct r5conf *conf) 482 { 483 int degraded, degraded2; 484 int i; 485 486 rcu_read_lock(); 487 degraded = 0; 488 for (i = 0; i < conf->previous_raid_disks; i++) { 489 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 490 if (rdev && test_bit(Faulty, &rdev->flags)) 491 rdev = rcu_dereference(conf->disks[i].replacement); 492 if (!rdev || test_bit(Faulty, &rdev->flags)) 493 degraded++; 494 else if (test_bit(In_sync, &rdev->flags)) 495 ; 496 else 497 /* not in-sync or faulty. 498 * If the reshape increases the number of devices, 499 * this is being recovered by the reshape, so 500 * this 'previous' section is not in_sync. 501 * If the number of devices is being reduced however, 502 * the device can only be part of the array if 503 * we are reverting a reshape, so this section will 504 * be in-sync. 505 */ 506 if (conf->raid_disks >= conf->previous_raid_disks) 507 degraded++; 508 } 509 rcu_read_unlock(); 510 if (conf->raid_disks == conf->previous_raid_disks) 511 return degraded; 512 rcu_read_lock(); 513 degraded2 = 0; 514 for (i = 0; i < conf->raid_disks; i++) { 515 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 516 if (rdev && test_bit(Faulty, &rdev->flags)) 517 rdev = rcu_dereference(conf->disks[i].replacement); 518 if (!rdev || test_bit(Faulty, &rdev->flags)) 519 degraded2++; 520 else if (test_bit(In_sync, &rdev->flags)) 521 ; 522 else 523 /* not in-sync or faulty. 524 * If reshape increases the number of devices, this 525 * section has already been recovered, else it 526 * almost certainly hasn't. 527 */ 528 if (conf->raid_disks <= conf->previous_raid_disks) 529 degraded2++; 530 } 531 rcu_read_unlock(); 532 if (degraded2 > degraded) 533 return degraded2; 534 return degraded; 535 } 536 537 static int has_failed(struct r5conf *conf) 538 { 539 int degraded; 540 541 if (conf->mddev->reshape_position == MaxSector) 542 return conf->mddev->degraded > conf->max_degraded; 543 544 degraded = calc_degraded(conf); 545 if (degraded > conf->max_degraded) 546 return 1; 547 return 0; 548 } 549 550 static struct stripe_head * 551 get_active_stripe(struct r5conf *conf, sector_t sector, 552 int previous, int noblock, int noquiesce) 553 { 554 struct stripe_head *sh; 555 556 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 557 558 spin_lock_irq(&conf->device_lock); 559 560 do { 561 wait_event_lock_irq(conf->wait_for_stripe, 562 conf->quiesce == 0 || noquiesce, 563 conf->device_lock); 564 sh = __find_stripe(conf, sector, conf->generation - previous); 565 if (!sh) { 566 if (!conf->inactive_blocked) 567 sh = get_free_stripe(conf); 568 if (noblock && sh == NULL) 569 break; 570 if (!sh) { 571 conf->inactive_blocked = 1; 572 wait_event_lock_irq(conf->wait_for_stripe, 573 !list_empty(&conf->inactive_list) && 574 (atomic_read(&conf->active_stripes) 575 < (conf->max_nr_stripes *3/4) 576 || !conf->inactive_blocked), 577 conf->device_lock); 578 conf->inactive_blocked = 0; 579 } else 580 init_stripe(sh, sector, previous); 581 } else { 582 if (atomic_read(&sh->count)) { 583 BUG_ON(!list_empty(&sh->lru) 584 && !test_bit(STRIPE_EXPANDING, &sh->state) 585 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) 586 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 587 } else { 588 if (!test_bit(STRIPE_HANDLE, &sh->state)) 589 atomic_inc(&conf->active_stripes); 590 if (list_empty(&sh->lru) && 591 !test_bit(STRIPE_EXPANDING, &sh->state)) 592 BUG(); 593 list_del_init(&sh->lru); 594 if (sh->group) { 595 sh->group->stripes_cnt--; 596 sh->group = NULL; 597 } 598 
} 599 } 600 } while (sh == NULL); 601 602 if (sh) 603 atomic_inc(&sh->count); 604 605 spin_unlock_irq(&conf->device_lock); 606 return sh; 607 } 608 609 /* Determine if 'data_offset' or 'new_data_offset' should be used 610 * in this stripe_head. 611 */ 612 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 613 { 614 sector_t progress = conf->reshape_progress; 615 /* Need a memory barrier to make sure we see the value 616 * of conf->generation, or ->data_offset that was set before 617 * reshape_progress was updated. 618 */ 619 smp_rmb(); 620 if (progress == MaxSector) 621 return 0; 622 if (sh->generation == conf->generation - 1) 623 return 0; 624 /* We are in a reshape, and this is a new-generation stripe, 625 * so use new_data_offset. 626 */ 627 return 1; 628 } 629 630 static void 631 raid5_end_read_request(struct bio *bi, int error); 632 static void 633 raid5_end_write_request(struct bio *bi, int error); 634 635 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 636 { 637 struct r5conf *conf = sh->raid_conf; 638 int i, disks = sh->disks; 639 640 might_sleep(); 641 642 for (i = disks; i--; ) { 643 int rw; 644 int replace_only = 0; 645 struct bio *bi, *rbi; 646 struct md_rdev *rdev, *rrdev = NULL; 647 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 648 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 649 rw = WRITE_FUA; 650 else 651 rw = WRITE; 652 if (test_bit(R5_Discard, &sh->dev[i].flags)) 653 rw |= REQ_DISCARD; 654 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 655 rw = READ; 656 else if (test_and_clear_bit(R5_WantReplace, 657 &sh->dev[i].flags)) { 658 rw = WRITE; 659 replace_only = 1; 660 } else 661 continue; 662 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 663 rw |= REQ_SYNC; 664 665 bi = &sh->dev[i].req; 666 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 667 668 rcu_read_lock(); 669 rrdev = rcu_dereference(conf->disks[i].replacement); 670 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 671 rdev = rcu_dereference(conf->disks[i].rdev); 672 if (!rdev) { 673 rdev = rrdev; 674 rrdev = NULL; 675 } 676 if (rw & WRITE) { 677 if (replace_only) 678 rdev = NULL; 679 if (rdev == rrdev) 680 /* We raced and saw duplicates */ 681 rrdev = NULL; 682 } else { 683 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) 684 rdev = rrdev; 685 rrdev = NULL; 686 } 687 688 if (rdev && test_bit(Faulty, &rdev->flags)) 689 rdev = NULL; 690 if (rdev) 691 atomic_inc(&rdev->nr_pending); 692 if (rrdev && test_bit(Faulty, &rrdev->flags)) 693 rrdev = NULL; 694 if (rrdev) 695 atomic_inc(&rrdev->nr_pending); 696 rcu_read_unlock(); 697 698 /* We have already checked bad blocks for reads. Now 699 * need to check for writes. We never accept write errors 700 * on the replacement, so we don't to check rrdev. 701 */ 702 while ((rw & WRITE) && rdev && 703 test_bit(WriteErrorSeen, &rdev->flags)) { 704 sector_t first_bad; 705 int bad_sectors; 706 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 707 &first_bad, &bad_sectors); 708 if (!bad) 709 break; 710 711 if (bad < 0) { 712 set_bit(BlockedBadBlocks, &rdev->flags); 713 if (!conf->mddev->external && 714 conf->mddev->flags) { 715 /* It is very unlikely, but we might 716 * still need to write out the 717 * bad block log - better give it 718 * a chance*/ 719 md_check_recovery(conf->mddev); 720 } 721 /* 722 * Because md_wait_for_blocked_rdev 723 * will dec nr_pending, we must 724 * increment it first. 
725 */ 726 atomic_inc(&rdev->nr_pending); 727 md_wait_for_blocked_rdev(rdev, conf->mddev); 728 } else { 729 /* Acknowledged bad block - skip the write */ 730 rdev_dec_pending(rdev, conf->mddev); 731 rdev = NULL; 732 } 733 } 734 735 if (rdev) { 736 if (s->syncing || s->expanding || s->expanded 737 || s->replacing) 738 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 739 740 set_bit(STRIPE_IO_STARTED, &sh->state); 741 742 bio_reset(bi); 743 bi->bi_bdev = rdev->bdev; 744 bi->bi_rw = rw; 745 bi->bi_end_io = (rw & WRITE) 746 ? raid5_end_write_request 747 : raid5_end_read_request; 748 bi->bi_private = sh; 749 750 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 751 __func__, (unsigned long long)sh->sector, 752 bi->bi_rw, i); 753 atomic_inc(&sh->count); 754 if (use_new_offset(conf, sh)) 755 bi->bi_sector = (sh->sector 756 + rdev->new_data_offset); 757 else 758 bi->bi_sector = (sh->sector 759 + rdev->data_offset); 760 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 761 bi->bi_rw |= REQ_FLUSH; 762 763 bi->bi_vcnt = 1; 764 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 765 bi->bi_io_vec[0].bv_offset = 0; 766 bi->bi_size = STRIPE_SIZE; 767 /* 768 * If this is discard request, set bi_vcnt 0. We don't 769 * want to confuse SCSI because SCSI will replace payload 770 */ 771 if (rw & REQ_DISCARD) 772 bi->bi_vcnt = 0; 773 if (rrdev) 774 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 775 776 if (conf->mddev->gendisk) 777 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 778 bi, disk_devt(conf->mddev->gendisk), 779 sh->dev[i].sector); 780 generic_make_request(bi); 781 } 782 if (rrdev) { 783 if (s->syncing || s->expanding || s->expanded 784 || s->replacing) 785 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 786 787 set_bit(STRIPE_IO_STARTED, &sh->state); 788 789 bio_reset(rbi); 790 rbi->bi_bdev = rrdev->bdev; 791 rbi->bi_rw = rw; 792 BUG_ON(!(rw & WRITE)); 793 rbi->bi_end_io = raid5_end_write_request; 794 rbi->bi_private = sh; 795 796 pr_debug("%s: for %llu schedule op %ld on " 797 "replacement disc %d\n", 798 __func__, (unsigned long long)sh->sector, 799 rbi->bi_rw, i); 800 atomic_inc(&sh->count); 801 if (use_new_offset(conf, sh)) 802 rbi->bi_sector = (sh->sector 803 + rrdev->new_data_offset); 804 else 805 rbi->bi_sector = (sh->sector 806 + rrdev->data_offset); 807 rbi->bi_vcnt = 1; 808 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 809 rbi->bi_io_vec[0].bv_offset = 0; 810 rbi->bi_size = STRIPE_SIZE; 811 /* 812 * If this is discard request, set bi_vcnt 0. 
We don't 813 * want to confuse SCSI because SCSI will replace payload 814 */ 815 if (rw & REQ_DISCARD) 816 rbi->bi_vcnt = 0; 817 if (conf->mddev->gendisk) 818 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 819 rbi, disk_devt(conf->mddev->gendisk), 820 sh->dev[i].sector); 821 generic_make_request(rbi); 822 } 823 if (!rdev && !rrdev) { 824 if (rw & WRITE) 825 set_bit(STRIPE_DEGRADED, &sh->state); 826 pr_debug("skip op %ld on disc %d for sector %llu\n", 827 bi->bi_rw, i, (unsigned long long)sh->sector); 828 clear_bit(R5_LOCKED, &sh->dev[i].flags); 829 set_bit(STRIPE_HANDLE, &sh->state); 830 } 831 } 832 } 833 834 static struct dma_async_tx_descriptor * 835 async_copy_data(int frombio, struct bio *bio, struct page *page, 836 sector_t sector, struct dma_async_tx_descriptor *tx) 837 { 838 struct bio_vec *bvl; 839 struct page *bio_page; 840 int i; 841 int page_offset; 842 struct async_submit_ctl submit; 843 enum async_tx_flags flags = 0; 844 845 if (bio->bi_sector >= sector) 846 page_offset = (signed)(bio->bi_sector - sector) * 512; 847 else 848 page_offset = (signed)(sector - bio->bi_sector) * -512; 849 850 if (frombio) 851 flags |= ASYNC_TX_FENCE; 852 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 853 854 bio_for_each_segment(bvl, bio, i) { 855 int len = bvl->bv_len; 856 int clen; 857 int b_offset = 0; 858 859 if (page_offset < 0) { 860 b_offset = -page_offset; 861 page_offset += b_offset; 862 len -= b_offset; 863 } 864 865 if (len > 0 && page_offset + len > STRIPE_SIZE) 866 clen = STRIPE_SIZE - page_offset; 867 else 868 clen = len; 869 870 if (clen > 0) { 871 b_offset += bvl->bv_offset; 872 bio_page = bvl->bv_page; 873 if (frombio) 874 tx = async_memcpy(page, bio_page, page_offset, 875 b_offset, clen, &submit); 876 else 877 tx = async_memcpy(bio_page, page, b_offset, 878 page_offset, clen, &submit); 879 } 880 /* chain the operations */ 881 submit.depend_tx = tx; 882 883 if (clen < len) /* hit end of page */ 884 break; 885 page_offset += len; 886 } 887 888 return tx; 889 } 890 891 static void ops_complete_biofill(void *stripe_head_ref) 892 { 893 struct stripe_head *sh = stripe_head_ref; 894 struct bio *return_bi = NULL; 895 int i; 896 897 pr_debug("%s: stripe %llu\n", __func__, 898 (unsigned long long)sh->sector); 899 900 /* clear completed biofills */ 901 for (i = sh->disks; i--; ) { 902 struct r5dev *dev = &sh->dev[i]; 903 904 /* acknowledge completion of a biofill operation */ 905 /* and check if we need to reply to a read request, 906 * new R5_Wantfill requests are held off until 907 * !STRIPE_BIOFILL_RUN 908 */ 909 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 910 struct bio *rbi, *rbi2; 911 912 BUG_ON(!dev->read); 913 rbi = dev->read; 914 dev->read = NULL; 915 while (rbi && rbi->bi_sector < 916 dev->sector + STRIPE_SECTORS) { 917 rbi2 = r5_next_bio(rbi, dev->sector); 918 if (!raid5_dec_bi_active_stripes(rbi)) { 919 rbi->bi_next = return_bi; 920 return_bi = rbi; 921 } 922 rbi = rbi2; 923 } 924 } 925 } 926 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 927 928 return_io(return_bi); 929 930 set_bit(STRIPE_HANDLE, &sh->state); 931 release_stripe(sh); 932 } 933 934 static void ops_run_biofill(struct stripe_head *sh) 935 { 936 struct dma_async_tx_descriptor *tx = NULL; 937 struct async_submit_ctl submit; 938 int i; 939 940 pr_debug("%s: stripe %llu\n", __func__, 941 (unsigned long long)sh->sector); 942 943 for (i = sh->disks; i--; ) { 944 struct r5dev *dev = &sh->dev[i]; 945 if (test_bit(R5_Wantfill, &dev->flags)) { 946 struct bio *rbi; 947 spin_lock_irq(&sh->stripe_lock); 948 
dev->read = rbi = dev->toread; 949 dev->toread = NULL; 950 spin_unlock_irq(&sh->stripe_lock); 951 while (rbi && rbi->bi_sector < 952 dev->sector + STRIPE_SECTORS) { 953 tx = async_copy_data(0, rbi, dev->page, 954 dev->sector, tx); 955 rbi = r5_next_bio(rbi, dev->sector); 956 } 957 } 958 } 959 960 atomic_inc(&sh->count); 961 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 962 async_trigger_callback(&submit); 963 } 964 965 static void mark_target_uptodate(struct stripe_head *sh, int target) 966 { 967 struct r5dev *tgt; 968 969 if (target < 0) 970 return; 971 972 tgt = &sh->dev[target]; 973 set_bit(R5_UPTODATE, &tgt->flags); 974 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 975 clear_bit(R5_Wantcompute, &tgt->flags); 976 } 977 978 static void ops_complete_compute(void *stripe_head_ref) 979 { 980 struct stripe_head *sh = stripe_head_ref; 981 982 pr_debug("%s: stripe %llu\n", __func__, 983 (unsigned long long)sh->sector); 984 985 /* mark the computed target(s) as uptodate */ 986 mark_target_uptodate(sh, sh->ops.target); 987 mark_target_uptodate(sh, sh->ops.target2); 988 989 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 990 if (sh->check_state == check_state_compute_run) 991 sh->check_state = check_state_compute_result; 992 set_bit(STRIPE_HANDLE, &sh->state); 993 release_stripe(sh); 994 } 995 996 /* return a pointer to the address conversion region of the scribble buffer */ 997 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 998 struct raid5_percpu *percpu) 999 { 1000 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 1001 } 1002 1003 static struct dma_async_tx_descriptor * 1004 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1005 { 1006 int disks = sh->disks; 1007 struct page **xor_srcs = percpu->scribble; 1008 int target = sh->ops.target; 1009 struct r5dev *tgt = &sh->dev[target]; 1010 struct page *xor_dest = tgt->page; 1011 int count = 0; 1012 struct dma_async_tx_descriptor *tx; 1013 struct async_submit_ctl submit; 1014 int i; 1015 1016 pr_debug("%s: stripe %llu block: %d\n", 1017 __func__, (unsigned long long)sh->sector, target); 1018 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1019 1020 for (i = disks; i--; ) 1021 if (i != target) 1022 xor_srcs[count++] = sh->dev[i].page; 1023 1024 atomic_inc(&sh->count); 1025 1026 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1027 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 1028 if (unlikely(count == 1)) 1029 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1030 else 1031 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1032 1033 return tx; 1034 } 1035 1036 /* set_syndrome_sources - populate source buffers for gen_syndrome 1037 * @srcs - (struct page *) array of size sh->disks 1038 * @sh - stripe_head to parse 1039 * 1040 * Populates srcs in proper layout order for the stripe and returns the 1041 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1042 * destination buffer is recorded in srcs[count] and the Q destination 1043 * is recorded in srcs[count+1]]. 1044 */ 1045 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 1046 { 1047 int disks = sh->disks; 1048 int syndrome_disks = sh->ddf_layout ? 
disks : (disks - 2); 1049 int d0_idx = raid6_d0(sh); 1050 int count; 1051 int i; 1052 1053 for (i = 0; i < disks; i++) 1054 srcs[i] = NULL; 1055 1056 count = 0; 1057 i = d0_idx; 1058 do { 1059 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1060 1061 srcs[slot] = sh->dev[i].page; 1062 i = raid6_next_disk(i, disks); 1063 } while (i != d0_idx); 1064 1065 return syndrome_disks; 1066 } 1067 1068 static struct dma_async_tx_descriptor * 1069 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1070 { 1071 int disks = sh->disks; 1072 struct page **blocks = percpu->scribble; 1073 int target; 1074 int qd_idx = sh->qd_idx; 1075 struct dma_async_tx_descriptor *tx; 1076 struct async_submit_ctl submit; 1077 struct r5dev *tgt; 1078 struct page *dest; 1079 int i; 1080 int count; 1081 1082 if (sh->ops.target < 0) 1083 target = sh->ops.target2; 1084 else if (sh->ops.target2 < 0) 1085 target = sh->ops.target; 1086 else 1087 /* we should only have one valid target */ 1088 BUG(); 1089 BUG_ON(target < 0); 1090 pr_debug("%s: stripe %llu block: %d\n", 1091 __func__, (unsigned long long)sh->sector, target); 1092 1093 tgt = &sh->dev[target]; 1094 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1095 dest = tgt->page; 1096 1097 atomic_inc(&sh->count); 1098 1099 if (target == qd_idx) { 1100 count = set_syndrome_sources(blocks, sh); 1101 blocks[count] = NULL; /* regenerating p is not necessary */ 1102 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1103 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1104 ops_complete_compute, sh, 1105 to_addr_conv(sh, percpu)); 1106 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1107 } else { 1108 /* Compute any data- or p-drive using XOR */ 1109 count = 0; 1110 for (i = disks; i-- ; ) { 1111 if (i == target || i == qd_idx) 1112 continue; 1113 blocks[count++] = sh->dev[i].page; 1114 } 1115 1116 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1117 NULL, ops_complete_compute, sh, 1118 to_addr_conv(sh, percpu)); 1119 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1120 } 1121 1122 return tx; 1123 } 1124 1125 static struct dma_async_tx_descriptor * 1126 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1127 { 1128 int i, count, disks = sh->disks; 1129 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 1130 int d0_idx = raid6_d0(sh); 1131 int faila = -1, failb = -1; 1132 int target = sh->ops.target; 1133 int target2 = sh->ops.target2; 1134 struct r5dev *tgt = &sh->dev[target]; 1135 struct r5dev *tgt2 = &sh->dev[target2]; 1136 struct dma_async_tx_descriptor *tx; 1137 struct page **blocks = percpu->scribble; 1138 struct async_submit_ctl submit; 1139 1140 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1141 __func__, (unsigned long long)sh->sector, target, target2); 1142 BUG_ON(target < 0 || target2 < 0); 1143 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1144 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1145 1146 /* we need to open-code set_syndrome_sources to handle the 1147 * slot number conversion for 'faila' and 'failb' 1148 */ 1149 for (i = 0; i < disks ; i++) 1150 blocks[i] = NULL; 1151 count = 0; 1152 i = d0_idx; 1153 do { 1154 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1155 1156 blocks[slot] = sh->dev[i].page; 1157 1158 if (i == target) 1159 faila = slot; 1160 if (i == target2) 1161 failb = slot; 1162 i = raid6_next_disk(i, disks); 1163 } while (i != d0_idx); 1164 1165 BUG_ON(faila == failb); 1166 if (failb < faila) 1167 swap(faila, failb); 1168 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1169 __func__, (unsigned long long)sh->sector, faila, failb); 1170 1171 atomic_inc(&sh->count); 1172 1173 if (failb == syndrome_disks+1) { 1174 /* Q disk is one of the missing disks */ 1175 if (faila == syndrome_disks) { 1176 /* Missing P+Q, just recompute */ 1177 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1178 ops_complete_compute, sh, 1179 to_addr_conv(sh, percpu)); 1180 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1181 STRIPE_SIZE, &submit); 1182 } else { 1183 struct page *dest; 1184 int data_target; 1185 int qd_idx = sh->qd_idx; 1186 1187 /* Missing D+Q: recompute D from P, then recompute Q */ 1188 if (target == qd_idx) 1189 data_target = target2; 1190 else 1191 data_target = target; 1192 1193 count = 0; 1194 for (i = disks; i-- ; ) { 1195 if (i == data_target || i == qd_idx) 1196 continue; 1197 blocks[count++] = sh->dev[i].page; 1198 } 1199 dest = sh->dev[data_target].page; 1200 init_async_submit(&submit, 1201 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1202 NULL, NULL, NULL, 1203 to_addr_conv(sh, percpu)); 1204 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1205 &submit); 1206 1207 count = set_syndrome_sources(blocks, sh); 1208 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1209 ops_complete_compute, sh, 1210 to_addr_conv(sh, percpu)); 1211 return async_gen_syndrome(blocks, 0, count+2, 1212 STRIPE_SIZE, &submit); 1213 } 1214 } else { 1215 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1216 ops_complete_compute, sh, 1217 to_addr_conv(sh, percpu)); 1218 if (failb == syndrome_disks) { 1219 /* We're missing D+P. */ 1220 return async_raid6_datap_recov(syndrome_disks+2, 1221 STRIPE_SIZE, faila, 1222 blocks, &submit); 1223 } else { 1224 /* We're missing D+D. 
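 * async_raid6_2data_recov() rebuilds both missing data blocks, in slots
 * faila and failb, from the surviving data blocks plus P and Q.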
*/ 1225 return async_raid6_2data_recov(syndrome_disks+2, 1226 STRIPE_SIZE, faila, failb, 1227 blocks, &submit); 1228 } 1229 } 1230 } 1231 1232 1233 static void ops_complete_prexor(void *stripe_head_ref) 1234 { 1235 struct stripe_head *sh = stripe_head_ref; 1236 1237 pr_debug("%s: stripe %llu\n", __func__, 1238 (unsigned long long)sh->sector); 1239 } 1240 1241 static struct dma_async_tx_descriptor * 1242 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 1243 struct dma_async_tx_descriptor *tx) 1244 { 1245 int disks = sh->disks; 1246 struct page **xor_srcs = percpu->scribble; 1247 int count = 0, pd_idx = sh->pd_idx, i; 1248 struct async_submit_ctl submit; 1249 1250 /* existing parity data subtracted */ 1251 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1252 1253 pr_debug("%s: stripe %llu\n", __func__, 1254 (unsigned long long)sh->sector); 1255 1256 for (i = disks; i--; ) { 1257 struct r5dev *dev = &sh->dev[i]; 1258 /* Only process blocks that are known to be uptodate */ 1259 if (test_bit(R5_Wantdrain, &dev->flags)) 1260 xor_srcs[count++] = dev->page; 1261 } 1262 1263 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1264 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1265 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1266 1267 return tx; 1268 } 1269 1270 static struct dma_async_tx_descriptor * 1271 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1272 { 1273 int disks = sh->disks; 1274 int i; 1275 1276 pr_debug("%s: stripe %llu\n", __func__, 1277 (unsigned long long)sh->sector); 1278 1279 for (i = disks; i--; ) { 1280 struct r5dev *dev = &sh->dev[i]; 1281 struct bio *chosen; 1282 1283 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1284 struct bio *wbi; 1285 1286 spin_lock_irq(&sh->stripe_lock); 1287 chosen = dev->towrite; 1288 dev->towrite = NULL; 1289 BUG_ON(dev->written); 1290 wbi = dev->written = chosen; 1291 spin_unlock_irq(&sh->stripe_lock); 1292 1293 while (wbi && wbi->bi_sector < 1294 dev->sector + STRIPE_SECTORS) { 1295 if (wbi->bi_rw & REQ_FUA) 1296 set_bit(R5_WantFUA, &dev->flags); 1297 if (wbi->bi_rw & REQ_SYNC) 1298 set_bit(R5_SyncIO, &dev->flags); 1299 if (wbi->bi_rw & REQ_DISCARD) 1300 set_bit(R5_Discard, &dev->flags); 1301 else 1302 tx = async_copy_data(1, wbi, dev->page, 1303 dev->sector, tx); 1304 wbi = r5_next_bio(wbi, dev->sector); 1305 } 1306 } 1307 } 1308 1309 return tx; 1310 } 1311 1312 static void ops_complete_reconstruct(void *stripe_head_ref) 1313 { 1314 struct stripe_head *sh = stripe_head_ref; 1315 int disks = sh->disks; 1316 int pd_idx = sh->pd_idx; 1317 int qd_idx = sh->qd_idx; 1318 int i; 1319 bool fua = false, sync = false, discard = false; 1320 1321 pr_debug("%s: stripe %llu\n", __func__, 1322 (unsigned long long)sh->sector); 1323 1324 for (i = disks; i--; ) { 1325 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1326 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1327 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1328 } 1329 1330 for (i = disks; i--; ) { 1331 struct r5dev *dev = &sh->dev[i]; 1332 1333 if (dev->written || i == pd_idx || i == qd_idx) { 1334 if (!discard) 1335 set_bit(R5_UPTODATE, &dev->flags); 1336 if (fua) 1337 set_bit(R5_WantFUA, &dev->flags); 1338 if (sync) 1339 set_bit(R5_SyncIO, &dev->flags); 1340 } 1341 } 1342 1343 if (sh->reconstruct_state == reconstruct_state_drain_run) 1344 sh->reconstruct_state = reconstruct_state_drain_result; 1345 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1346 
sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1347 else { 1348 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1349 sh->reconstruct_state = reconstruct_state_result; 1350 } 1351 1352 set_bit(STRIPE_HANDLE, &sh->state); 1353 release_stripe(sh); 1354 } 1355 1356 static void 1357 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1358 struct dma_async_tx_descriptor *tx) 1359 { 1360 int disks = sh->disks; 1361 struct page **xor_srcs = percpu->scribble; 1362 struct async_submit_ctl submit; 1363 int count = 0, pd_idx = sh->pd_idx, i; 1364 struct page *xor_dest; 1365 int prexor = 0; 1366 unsigned long flags; 1367 1368 pr_debug("%s: stripe %llu\n", __func__, 1369 (unsigned long long)sh->sector); 1370 1371 for (i = 0; i < sh->disks; i++) { 1372 if (pd_idx == i) 1373 continue; 1374 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1375 break; 1376 } 1377 if (i >= sh->disks) { 1378 atomic_inc(&sh->count); 1379 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1380 ops_complete_reconstruct(sh); 1381 return; 1382 } 1383 /* check if prexor is active which means only process blocks 1384 * that are part of a read-modify-write (written) 1385 */ 1386 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1387 prexor = 1; 1388 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1389 for (i = disks; i--; ) { 1390 struct r5dev *dev = &sh->dev[i]; 1391 if (dev->written) 1392 xor_srcs[count++] = dev->page; 1393 } 1394 } else { 1395 xor_dest = sh->dev[pd_idx].page; 1396 for (i = disks; i--; ) { 1397 struct r5dev *dev = &sh->dev[i]; 1398 if (i != pd_idx) 1399 xor_srcs[count++] = dev->page; 1400 } 1401 } 1402 1403 /* 1/ if we prexor'd then the dest is reused as a source 1404 * 2/ if we did not prexor then we are redoing the parity 1405 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1406 * for the synchronous xor case 1407 */ 1408 flags = ASYNC_TX_ACK | 1409 (prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1410 1411 atomic_inc(&sh->count); 1412 1413 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1414 to_addr_conv(sh, percpu)); 1415 if (unlikely(count == 1)) 1416 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1417 else 1418 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1419 } 1420 1421 static void 1422 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1423 struct dma_async_tx_descriptor *tx) 1424 { 1425 struct async_submit_ctl submit; 1426 struct page **blocks = percpu->scribble; 1427 int count, i; 1428 1429 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1430 1431 for (i = 0; i < sh->disks; i++) { 1432 if (sh->pd_idx == i || sh->qd_idx == i) 1433 continue; 1434 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1435 break; 1436 } 1437 if (i >= sh->disks) { 1438 atomic_inc(&sh->count); 1439 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1440 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1441 ops_complete_reconstruct(sh); 1442 return; 1443 } 1444 1445 count = set_syndrome_sources(blocks, sh); 1446 1447 atomic_inc(&sh->count); 1448 1449 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1450 sh, to_addr_conv(sh, percpu)); 1451 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1452 } 1453 1454 static void ops_complete_check(void *stripe_head_ref) 1455 { 1456 struct stripe_head *sh = stripe_head_ref; 1457 1458 pr_debug("%s: stripe %llu\n", __func__, 1459 (unsigned long long)sh->sector); 1460 1461 sh->check_state = check_state_check_result; 1462 set_bit(STRIPE_HANDLE, &sh->state); 1463 release_stripe(sh); 1464 } 1465 1466 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1467 { 1468 int disks = sh->disks; 1469 int pd_idx = sh->pd_idx; 1470 int qd_idx = sh->qd_idx; 1471 struct page *xor_dest; 1472 struct page **xor_srcs = percpu->scribble; 1473 struct dma_async_tx_descriptor *tx; 1474 struct async_submit_ctl submit; 1475 int count; 1476 int i; 1477 1478 pr_debug("%s: stripe %llu\n", __func__, 1479 (unsigned long long)sh->sector); 1480 1481 count = 0; 1482 xor_dest = sh->dev[pd_idx].page; 1483 xor_srcs[count++] = xor_dest; 1484 for (i = disks; i--; ) { 1485 if (i == pd_idx || i == qd_idx) 1486 continue; 1487 xor_srcs[count++] = sh->dev[i].page; 1488 } 1489 1490 init_async_submit(&submit, 0, NULL, NULL, NULL, 1491 to_addr_conv(sh, percpu)); 1492 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1493 &sh->ops.zero_sum_result, &submit); 1494 1495 atomic_inc(&sh->count); 1496 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1497 tx = async_trigger_callback(&submit); 1498 } 1499 1500 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1501 { 1502 struct page **srcs = percpu->scribble; 1503 struct async_submit_ctl submit; 1504 int count; 1505 1506 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1507 (unsigned long long)sh->sector, checkp); 1508 1509 count = set_syndrome_sources(srcs, sh); 1510 if (!checkp) 1511 srcs[count] = NULL; 1512 1513 atomic_inc(&sh->count); 1514 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1515 sh, to_addr_conv(sh, percpu)); 1516 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1517 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1518 } 1519 1520 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1521 { 1522 int 
overlap_clear = 0, i, disks = sh->disks; 1523 struct dma_async_tx_descriptor *tx = NULL; 1524 struct r5conf *conf = sh->raid_conf; 1525 int level = conf->level; 1526 struct raid5_percpu *percpu; 1527 unsigned long cpu; 1528 1529 cpu = get_cpu(); 1530 percpu = per_cpu_ptr(conf->percpu, cpu); 1531 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1532 ops_run_biofill(sh); 1533 overlap_clear++; 1534 } 1535 1536 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1537 if (level < 6) 1538 tx = ops_run_compute5(sh, percpu); 1539 else { 1540 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1541 tx = ops_run_compute6_1(sh, percpu); 1542 else 1543 tx = ops_run_compute6_2(sh, percpu); 1544 } 1545 /* terminate the chain if reconstruct is not set to be run */ 1546 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1547 async_tx_ack(tx); 1548 } 1549 1550 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1551 tx = ops_run_prexor(sh, percpu, tx); 1552 1553 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1554 tx = ops_run_biodrain(sh, tx); 1555 overlap_clear++; 1556 } 1557 1558 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1559 if (level < 6) 1560 ops_run_reconstruct5(sh, percpu, tx); 1561 else 1562 ops_run_reconstruct6(sh, percpu, tx); 1563 } 1564 1565 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1566 if (sh->check_state == check_state_run) 1567 ops_run_check_p(sh, percpu); 1568 else if (sh->check_state == check_state_run_q) 1569 ops_run_check_pq(sh, percpu, 0); 1570 else if (sh->check_state == check_state_run_pq) 1571 ops_run_check_pq(sh, percpu, 1); 1572 else 1573 BUG(); 1574 } 1575 1576 if (overlap_clear) 1577 for (i = disks; i--; ) { 1578 struct r5dev *dev = &sh->dev[i]; 1579 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1580 wake_up(&sh->raid_conf->wait_for_overlap); 1581 } 1582 put_cpu(); 1583 } 1584 1585 static int grow_one_stripe(struct r5conf *conf) 1586 { 1587 struct stripe_head *sh; 1588 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1589 if (!sh) 1590 return 0; 1591 1592 sh->raid_conf = conf; 1593 1594 spin_lock_init(&sh->stripe_lock); 1595 1596 if (grow_buffers(sh)) { 1597 shrink_buffers(sh); 1598 kmem_cache_free(conf->slab_cache, sh); 1599 return 0; 1600 } 1601 /* we just created an active stripe so... 
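 * account for it as active, then hand the single reference to
 * release_stripe(); once that release is processed, do_release_stripe()
 * drops active_stripes again and parks the stripe on conf->inactive_list,
 * leaving one more idle stripe in the cache.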
 */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(struct r5conf *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}

static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
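	 *
	 * (A GFP_KERNEL allocation at that point could block waiting for
	 * writeback, and that writeback may be destined for this very
	 * array, which cannot make progress while every stripe_head is
	 * held; hence GFP_NOIO.)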
1682 */ 1683 struct stripe_head *osh, *nsh; 1684 LIST_HEAD(newstripes); 1685 struct disk_info *ndisks; 1686 unsigned long cpu; 1687 int err; 1688 struct kmem_cache *sc; 1689 int i; 1690 1691 if (newsize <= conf->pool_size) 1692 return 0; /* never bother to shrink */ 1693 1694 err = md_allow_write(conf->mddev); 1695 if (err) 1696 return err; 1697 1698 /* Step 1 */ 1699 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1700 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1701 0, 0, NULL); 1702 if (!sc) 1703 return -ENOMEM; 1704 1705 for (i = conf->max_nr_stripes; i; i--) { 1706 nsh = kmem_cache_zalloc(sc, GFP_KERNEL); 1707 if (!nsh) 1708 break; 1709 1710 nsh->raid_conf = conf; 1711 spin_lock_init(&nsh->stripe_lock); 1712 1713 list_add(&nsh->lru, &newstripes); 1714 } 1715 if (i) { 1716 /* didn't get enough, give up */ 1717 while (!list_empty(&newstripes)) { 1718 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1719 list_del(&nsh->lru); 1720 kmem_cache_free(sc, nsh); 1721 } 1722 kmem_cache_destroy(sc); 1723 return -ENOMEM; 1724 } 1725 /* Step 2 - Must use GFP_NOIO now. 1726 * OK, we have enough stripes, start collecting inactive 1727 * stripes and copying them over 1728 */ 1729 list_for_each_entry(nsh, &newstripes, lru) { 1730 spin_lock_irq(&conf->device_lock); 1731 wait_event_lock_irq(conf->wait_for_stripe, 1732 !list_empty(&conf->inactive_list), 1733 conf->device_lock); 1734 osh = get_free_stripe(conf); 1735 spin_unlock_irq(&conf->device_lock); 1736 atomic_set(&nsh->count, 1); 1737 for(i=0; i<conf->pool_size; i++) 1738 nsh->dev[i].page = osh->dev[i].page; 1739 for( ; i<newsize; i++) 1740 nsh->dev[i].page = NULL; 1741 kmem_cache_free(conf->slab_cache, osh); 1742 } 1743 kmem_cache_destroy(conf->slab_cache); 1744 1745 /* Step 3. 
1746 * At this point, we are holding all the stripes so the array 1747 * is completely stalled, so now is a good time to resize 1748 * conf->disks and the scribble region 1749 */ 1750 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1751 if (ndisks) { 1752 for (i=0; i<conf->raid_disks; i++) 1753 ndisks[i] = conf->disks[i]; 1754 kfree(conf->disks); 1755 conf->disks = ndisks; 1756 } else 1757 err = -ENOMEM; 1758 1759 get_online_cpus(); 1760 conf->scribble_len = scribble_len(newsize); 1761 for_each_present_cpu(cpu) { 1762 struct raid5_percpu *percpu; 1763 void *scribble; 1764 1765 percpu = per_cpu_ptr(conf->percpu, cpu); 1766 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1767 1768 if (scribble) { 1769 kfree(percpu->scribble); 1770 percpu->scribble = scribble; 1771 } else { 1772 err = -ENOMEM; 1773 break; 1774 } 1775 } 1776 put_online_cpus(); 1777 1778 /* Step 4, return new stripes to service */ 1779 while(!list_empty(&newstripes)) { 1780 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1781 list_del_init(&nsh->lru); 1782 1783 for (i=conf->raid_disks; i < newsize; i++) 1784 if (nsh->dev[i].page == NULL) { 1785 struct page *p = alloc_page(GFP_NOIO); 1786 nsh->dev[i].page = p; 1787 if (!p) 1788 err = -ENOMEM; 1789 } 1790 release_stripe(nsh); 1791 } 1792 /* critical section pass, GFP_NOIO no longer needed */ 1793 1794 conf->slab_cache = sc; 1795 conf->active_name = 1-conf->active_name; 1796 conf->pool_size = newsize; 1797 return err; 1798 } 1799 1800 static int drop_one_stripe(struct r5conf *conf) 1801 { 1802 struct stripe_head *sh; 1803 1804 spin_lock_irq(&conf->device_lock); 1805 sh = get_free_stripe(conf); 1806 spin_unlock_irq(&conf->device_lock); 1807 if (!sh) 1808 return 0; 1809 BUG_ON(atomic_read(&sh->count)); 1810 shrink_buffers(sh); 1811 kmem_cache_free(conf->slab_cache, sh); 1812 atomic_dec(&conf->active_stripes); 1813 return 1; 1814 } 1815 1816 static void shrink_stripes(struct r5conf *conf) 1817 { 1818 while (drop_one_stripe(conf)) 1819 ; 1820 1821 if (conf->slab_cache) 1822 kmem_cache_destroy(conf->slab_cache); 1823 conf->slab_cache = NULL; 1824 } 1825 1826 static void raid5_end_read_request(struct bio * bi, int error) 1827 { 1828 struct stripe_head *sh = bi->bi_private; 1829 struct r5conf *conf = sh->raid_conf; 1830 int disks = sh->disks, i; 1831 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1832 char b[BDEVNAME_SIZE]; 1833 struct md_rdev *rdev = NULL; 1834 sector_t s; 1835 1836 for (i=0 ; i<disks; i++) 1837 if (bi == &sh->dev[i].req) 1838 break; 1839 1840 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1841 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1842 uptodate); 1843 if (i == disks) { 1844 BUG(); 1845 return; 1846 } 1847 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1848 /* If replacement finished while this request was outstanding, 1849 * 'replacement' might be NULL already. 1850 * In that case it moved down to 'rdev'. 1851 * rdev is not removed until all requests are finished. 1852 */ 1853 rdev = conf->disks[i].replacement; 1854 if (!rdev) 1855 rdev = conf->disks[i].rdev; 1856 1857 if (use_new_offset(conf, sh)) 1858 s = sh->sector + rdev->new_data_offset; 1859 else 1860 s = sh->sector + rdev->data_offset; 1861 if (uptodate) { 1862 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1863 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1864 /* Note that this cannot happen on a 1865 * replacement device. 
We just fail those on 1866 * any error 1867 */ 1868 printk_ratelimited( 1869 KERN_INFO 1870 "md/raid:%s: read error corrected" 1871 " (%lu sectors at %llu on %s)\n", 1872 mdname(conf->mddev), STRIPE_SECTORS, 1873 (unsigned long long)s, 1874 bdevname(rdev->bdev, b)); 1875 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1876 clear_bit(R5_ReadError, &sh->dev[i].flags); 1877 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1878 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1879 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1880 1881 if (atomic_read(&rdev->read_errors)) 1882 atomic_set(&rdev->read_errors, 0); 1883 } else { 1884 const char *bdn = bdevname(rdev->bdev, b); 1885 int retry = 0; 1886 int set_bad = 0; 1887 1888 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1889 atomic_inc(&rdev->read_errors); 1890 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1891 printk_ratelimited( 1892 KERN_WARNING 1893 "md/raid:%s: read error on replacement device " 1894 "(sector %llu on %s).\n", 1895 mdname(conf->mddev), 1896 (unsigned long long)s, 1897 bdn); 1898 else if (conf->mddev->degraded >= conf->max_degraded) { 1899 set_bad = 1; 1900 printk_ratelimited( 1901 KERN_WARNING 1902 "md/raid:%s: read error not correctable " 1903 "(sector %llu on %s).\n", 1904 mdname(conf->mddev), 1905 (unsigned long long)s, 1906 bdn); 1907 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 1908 /* Oh, no!!! */ 1909 set_bad = 1; 1910 printk_ratelimited( 1911 KERN_WARNING 1912 "md/raid:%s: read error NOT corrected!! " 1913 "(sector %llu on %s).\n", 1914 mdname(conf->mddev), 1915 (unsigned long long)s, 1916 bdn); 1917 } else if (atomic_read(&rdev->read_errors) 1918 > conf->max_nr_stripes) 1919 printk(KERN_WARNING 1920 "md/raid:%s: Too many read errors, failing device %s.\n", 1921 mdname(conf->mddev), bdn); 1922 else 1923 retry = 1; 1924 if (retry) 1925 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 1926 set_bit(R5_ReadError, &sh->dev[i].flags); 1927 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1928 } else 1929 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1930 else { 1931 clear_bit(R5_ReadError, &sh->dev[i].flags); 1932 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1933 if (!(set_bad 1934 && test_bit(In_sync, &rdev->flags) 1935 && rdev_set_badblocks( 1936 rdev, sh->sector, STRIPE_SECTORS, 0))) 1937 md_error(conf->mddev, rdev); 1938 } 1939 } 1940 rdev_dec_pending(rdev, conf->mddev); 1941 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1942 set_bit(STRIPE_HANDLE, &sh->state); 1943 release_stripe(sh); 1944 } 1945 1946 static void raid5_end_write_request(struct bio *bi, int error) 1947 { 1948 struct stripe_head *sh = bi->bi_private; 1949 struct r5conf *conf = sh->raid_conf; 1950 int disks = sh->disks, i; 1951 struct md_rdev *uninitialized_var(rdev); 1952 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1953 sector_t first_bad; 1954 int bad_sectors; 1955 int replacement = 0; 1956 1957 for (i = 0 ; i < disks; i++) { 1958 if (bi == &sh->dev[i].req) { 1959 rdev = conf->disks[i].rdev; 1960 break; 1961 } 1962 if (bi == &sh->dev[i].rreq) { 1963 rdev = conf->disks[i].replacement; 1964 if (rdev) 1965 replacement = 1; 1966 else 1967 /* rdev was removed and 'replacement' 1968 * replaced it. rdev is not removed 1969 * until all requests are finished. 
1970 */ 1971 rdev = conf->disks[i].rdev; 1972 break; 1973 } 1974 } 1975 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1976 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1977 uptodate); 1978 if (i == disks) { 1979 BUG(); 1980 return; 1981 } 1982 1983 if (replacement) { 1984 if (!uptodate) 1985 md_error(conf->mddev, rdev); 1986 else if (is_badblock(rdev, sh->sector, 1987 STRIPE_SECTORS, 1988 &first_bad, &bad_sectors)) 1989 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1990 } else { 1991 if (!uptodate) { 1992 set_bit(WriteErrorSeen, &rdev->flags); 1993 set_bit(R5_WriteError, &sh->dev[i].flags); 1994 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1995 set_bit(MD_RECOVERY_NEEDED, 1996 &rdev->mddev->recovery); 1997 } else if (is_badblock(rdev, sh->sector, 1998 STRIPE_SECTORS, 1999 &first_bad, &bad_sectors)) { 2000 set_bit(R5_MadeGood, &sh->dev[i].flags); 2001 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2002 /* That was a successful write so make 2003 * sure it looks like we already did 2004 * a re-write. 2005 */ 2006 set_bit(R5_ReWrite, &sh->dev[i].flags); 2007 } 2008 } 2009 rdev_dec_pending(rdev, conf->mddev); 2010 2011 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2012 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2013 set_bit(STRIPE_HANDLE, &sh->state); 2014 release_stripe(sh); 2015 } 2016 2017 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 2018 2019 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2020 { 2021 struct r5dev *dev = &sh->dev[i]; 2022 2023 bio_init(&dev->req); 2024 dev->req.bi_io_vec = &dev->vec; 2025 dev->req.bi_vcnt++; 2026 dev->req.bi_max_vecs++; 2027 dev->req.bi_private = sh; 2028 dev->vec.bv_page = dev->page; 2029 2030 bio_init(&dev->rreq); 2031 dev->rreq.bi_io_vec = &dev->rvec; 2032 dev->rreq.bi_vcnt++; 2033 dev->rreq.bi_max_vecs++; 2034 dev->rreq.bi_private = sh; 2035 dev->rvec.bv_page = dev->page; 2036 2037 dev->flags = 0; 2038 dev->sector = compute_blocknr(sh, i, previous); 2039 } 2040 2041 static void error(struct mddev *mddev, struct md_rdev *rdev) 2042 { 2043 char b[BDEVNAME_SIZE]; 2044 struct r5conf *conf = mddev->private; 2045 unsigned long flags; 2046 pr_debug("raid456: error called\n"); 2047 2048 spin_lock_irqsave(&conf->device_lock, flags); 2049 clear_bit(In_sync, &rdev->flags); 2050 mddev->degraded = calc_degraded(conf); 2051 spin_unlock_irqrestore(&conf->device_lock, flags); 2052 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2053 2054 set_bit(Blocked, &rdev->flags); 2055 set_bit(Faulty, &rdev->flags); 2056 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2057 printk(KERN_ALERT 2058 "md/raid:%s: Disk failure on %s, disabling device.\n" 2059 "md/raid:%s: Operation continuing on %d devices.\n", 2060 mdname(mddev), 2061 bdevname(rdev->bdev, b), 2062 mdname(mddev), 2063 conf->raid_disks - mddev->degraded); 2064 } 2065 2066 /* 2067 * Input: a 'big' sector number, 2068 * Output: index of the data and parity disk, and the sector # in them. 2069 */ 2070 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2071 int previous, int *dd_idx, 2072 struct stripe_head *sh) 2073 { 2074 sector_t stripe, stripe2; 2075 sector_t chunk_number; 2076 unsigned int chunk_offset; 2077 int pd_idx, qd_idx; 2078 int ddf_layout = 0; 2079 sector_t new_sector; 2080 int algorithm = previous ? conf->prev_algo 2081 : conf->algorithm; 2082 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2083 : conf->chunk_sectors; 2084 int raid_disks = previous ? 
conf->previous_raid_disks 2085 : conf->raid_disks; 2086 int data_disks = raid_disks - conf->max_degraded; 2087 2088 /* First compute the information on this sector */ 2089 2090 /* 2091 * Compute the chunk number and the sector offset inside the chunk 2092 */ 2093 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2094 chunk_number = r_sector; 2095 2096 /* 2097 * Compute the stripe number 2098 */ 2099 stripe = chunk_number; 2100 *dd_idx = sector_div(stripe, data_disks); 2101 stripe2 = stripe; 2102 /* 2103 * Select the parity disk based on the user selected algorithm. 2104 */ 2105 pd_idx = qd_idx = -1; 2106 switch(conf->level) { 2107 case 4: 2108 pd_idx = data_disks; 2109 break; 2110 case 5: 2111 switch (algorithm) { 2112 case ALGORITHM_LEFT_ASYMMETRIC: 2113 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2114 if (*dd_idx >= pd_idx) 2115 (*dd_idx)++; 2116 break; 2117 case ALGORITHM_RIGHT_ASYMMETRIC: 2118 pd_idx = sector_div(stripe2, raid_disks); 2119 if (*dd_idx >= pd_idx) 2120 (*dd_idx)++; 2121 break; 2122 case ALGORITHM_LEFT_SYMMETRIC: 2123 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2124 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2125 break; 2126 case ALGORITHM_RIGHT_SYMMETRIC: 2127 pd_idx = sector_div(stripe2, raid_disks); 2128 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2129 break; 2130 case ALGORITHM_PARITY_0: 2131 pd_idx = 0; 2132 (*dd_idx)++; 2133 break; 2134 case ALGORITHM_PARITY_N: 2135 pd_idx = data_disks; 2136 break; 2137 default: 2138 BUG(); 2139 } 2140 break; 2141 case 6: 2142 2143 switch (algorithm) { 2144 case ALGORITHM_LEFT_ASYMMETRIC: 2145 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2146 qd_idx = pd_idx + 1; 2147 if (pd_idx == raid_disks-1) { 2148 (*dd_idx)++; /* Q D D D P */ 2149 qd_idx = 0; 2150 } else if (*dd_idx >= pd_idx) 2151 (*dd_idx) += 2; /* D D P Q D */ 2152 break; 2153 case ALGORITHM_RIGHT_ASYMMETRIC: 2154 pd_idx = sector_div(stripe2, raid_disks); 2155 qd_idx = pd_idx + 1; 2156 if (pd_idx == raid_disks-1) { 2157 (*dd_idx)++; /* Q D D D P */ 2158 qd_idx = 0; 2159 } else if (*dd_idx >= pd_idx) 2160 (*dd_idx) += 2; /* D D P Q D */ 2161 break; 2162 case ALGORITHM_LEFT_SYMMETRIC: 2163 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2164 qd_idx = (pd_idx + 1) % raid_disks; 2165 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2166 break; 2167 case ALGORITHM_RIGHT_SYMMETRIC: 2168 pd_idx = sector_div(stripe2, raid_disks); 2169 qd_idx = (pd_idx + 1) % raid_disks; 2170 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2171 break; 2172 2173 case ALGORITHM_PARITY_0: 2174 pd_idx = 0; 2175 qd_idx = 1; 2176 (*dd_idx) += 2; 2177 break; 2178 case ALGORITHM_PARITY_N: 2179 pd_idx = data_disks; 2180 qd_idx = data_disks + 1; 2181 break; 2182 2183 case ALGORITHM_ROTATING_ZERO_RESTART: 2184 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2185 * of blocks for computing Q is different. 
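			 * (i.e. the order in which the blocks feed the Q
			 * computation differs, which is what setting
			 * ddf_layout below records)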
2186 */ 2187 pd_idx = sector_div(stripe2, raid_disks); 2188 qd_idx = pd_idx + 1; 2189 if (pd_idx == raid_disks-1) { 2190 (*dd_idx)++; /* Q D D D P */ 2191 qd_idx = 0; 2192 } else if (*dd_idx >= pd_idx) 2193 (*dd_idx) += 2; /* D D P Q D */ 2194 ddf_layout = 1; 2195 break; 2196 2197 case ALGORITHM_ROTATING_N_RESTART: 2198 /* Same a left_asymmetric, by first stripe is 2199 * D D D P Q rather than 2200 * Q D D D P 2201 */ 2202 stripe2 += 1; 2203 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2204 qd_idx = pd_idx + 1; 2205 if (pd_idx == raid_disks-1) { 2206 (*dd_idx)++; /* Q D D D P */ 2207 qd_idx = 0; 2208 } else if (*dd_idx >= pd_idx) 2209 (*dd_idx) += 2; /* D D P Q D */ 2210 ddf_layout = 1; 2211 break; 2212 2213 case ALGORITHM_ROTATING_N_CONTINUE: 2214 /* Same as left_symmetric but Q is before P */ 2215 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2216 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2217 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2218 ddf_layout = 1; 2219 break; 2220 2221 case ALGORITHM_LEFT_ASYMMETRIC_6: 2222 /* RAID5 left_asymmetric, with Q on last device */ 2223 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2224 if (*dd_idx >= pd_idx) 2225 (*dd_idx)++; 2226 qd_idx = raid_disks - 1; 2227 break; 2228 2229 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2230 pd_idx = sector_div(stripe2, raid_disks-1); 2231 if (*dd_idx >= pd_idx) 2232 (*dd_idx)++; 2233 qd_idx = raid_disks - 1; 2234 break; 2235 2236 case ALGORITHM_LEFT_SYMMETRIC_6: 2237 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2238 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2239 qd_idx = raid_disks - 1; 2240 break; 2241 2242 case ALGORITHM_RIGHT_SYMMETRIC_6: 2243 pd_idx = sector_div(stripe2, raid_disks-1); 2244 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2245 qd_idx = raid_disks - 1; 2246 break; 2247 2248 case ALGORITHM_PARITY_0_6: 2249 pd_idx = 0; 2250 (*dd_idx)++; 2251 qd_idx = raid_disks - 1; 2252 break; 2253 2254 default: 2255 BUG(); 2256 } 2257 break; 2258 } 2259 2260 if (sh) { 2261 sh->pd_idx = pd_idx; 2262 sh->qd_idx = qd_idx; 2263 sh->ddf_layout = ddf_layout; 2264 } 2265 /* 2266 * Finally, compute the new sector number 2267 */ 2268 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2269 return new_sector; 2270 } 2271 2272 2273 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2274 { 2275 struct r5conf *conf = sh->raid_conf; 2276 int raid_disks = sh->disks; 2277 int data_disks = raid_disks - conf->max_degraded; 2278 sector_t new_sector = sh->sector, check; 2279 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2280 : conf->chunk_sectors; 2281 int algorithm = previous ? 
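	/*
	 * Illustrative sketch only, kept under "#if 0" so it is never built:
	 * a user-space rendering of the mapping arithmetic above for a plain
	 * RAID5 array using ALGORITHM_LEFT_SYMMETRIC.  demo_map() and the
	 * sample parameters are made up for the example, they are not driver
	 * interfaces.
	 */
#if 0
#include <stdio.h>

static void demo_map(unsigned long long r_sector,
		     unsigned int sectors_per_chunk,
		     unsigned int raid_disks)
{
	unsigned int data_disks = raid_disks - 1;	/* RAID5: one parity */
	unsigned long long chunk = r_sector / sectors_per_chunk;
	unsigned int chunk_offset = r_sector % sectors_per_chunk;
	unsigned long long stripe = chunk / data_disks;
	unsigned int dd_idx = chunk % data_disks;
	unsigned int pd_idx = data_disks - (unsigned int)(stripe % raid_disks);

	/* left-symmetric: data slots rotate to follow the parity device */
	dd_idx = (pd_idx + 1 + dd_idx) % raid_disks;
	printf("sector %llu -> device sector %llu, data disk %u, parity disk %u\n",
	       r_sector, stripe * sectors_per_chunk + chunk_offset,
	       dd_idx, pd_idx);
}

int main(void)
{
	/* 256KiB chunks, 4 disks: sector 3200 lands on disk 2, parity on disk 1 */
	demo_map(3200, 512, 4);
	return 0;
}
#endif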
conf->prev_algo 2282 : conf->algorithm; 2283 sector_t stripe; 2284 int chunk_offset; 2285 sector_t chunk_number; 2286 int dummy1, dd_idx = i; 2287 sector_t r_sector; 2288 struct stripe_head sh2; 2289 2290 2291 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2292 stripe = new_sector; 2293 2294 if (i == sh->pd_idx) 2295 return 0; 2296 switch(conf->level) { 2297 case 4: break; 2298 case 5: 2299 switch (algorithm) { 2300 case ALGORITHM_LEFT_ASYMMETRIC: 2301 case ALGORITHM_RIGHT_ASYMMETRIC: 2302 if (i > sh->pd_idx) 2303 i--; 2304 break; 2305 case ALGORITHM_LEFT_SYMMETRIC: 2306 case ALGORITHM_RIGHT_SYMMETRIC: 2307 if (i < sh->pd_idx) 2308 i += raid_disks; 2309 i -= (sh->pd_idx + 1); 2310 break; 2311 case ALGORITHM_PARITY_0: 2312 i -= 1; 2313 break; 2314 case ALGORITHM_PARITY_N: 2315 break; 2316 default: 2317 BUG(); 2318 } 2319 break; 2320 case 6: 2321 if (i == sh->qd_idx) 2322 return 0; /* It is the Q disk */ 2323 switch (algorithm) { 2324 case ALGORITHM_LEFT_ASYMMETRIC: 2325 case ALGORITHM_RIGHT_ASYMMETRIC: 2326 case ALGORITHM_ROTATING_ZERO_RESTART: 2327 case ALGORITHM_ROTATING_N_RESTART: 2328 if (sh->pd_idx == raid_disks-1) 2329 i--; /* Q D D D P */ 2330 else if (i > sh->pd_idx) 2331 i -= 2; /* D D P Q D */ 2332 break; 2333 case ALGORITHM_LEFT_SYMMETRIC: 2334 case ALGORITHM_RIGHT_SYMMETRIC: 2335 if (sh->pd_idx == raid_disks-1) 2336 i--; /* Q D D D P */ 2337 else { 2338 /* D D P Q D */ 2339 if (i < sh->pd_idx) 2340 i += raid_disks; 2341 i -= (sh->pd_idx + 2); 2342 } 2343 break; 2344 case ALGORITHM_PARITY_0: 2345 i -= 2; 2346 break; 2347 case ALGORITHM_PARITY_N: 2348 break; 2349 case ALGORITHM_ROTATING_N_CONTINUE: 2350 /* Like left_symmetric, but P is before Q */ 2351 if (sh->pd_idx == 0) 2352 i--; /* P D D D Q */ 2353 else { 2354 /* D D Q P D */ 2355 if (i < sh->pd_idx) 2356 i += raid_disks; 2357 i -= (sh->pd_idx + 1); 2358 } 2359 break; 2360 case ALGORITHM_LEFT_ASYMMETRIC_6: 2361 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2362 if (i > sh->pd_idx) 2363 i--; 2364 break; 2365 case ALGORITHM_LEFT_SYMMETRIC_6: 2366 case ALGORITHM_RIGHT_SYMMETRIC_6: 2367 if (i < sh->pd_idx) 2368 i += data_disks + 1; 2369 i -= (sh->pd_idx + 1); 2370 break; 2371 case ALGORITHM_PARITY_0_6: 2372 i -= 1; 2373 break; 2374 default: 2375 BUG(); 2376 } 2377 break; 2378 } 2379 2380 chunk_number = stripe * data_disks + i; 2381 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2382 2383 check = raid5_compute_sector(conf, r_sector, 2384 previous, &dummy1, &sh2); 2385 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2386 || sh2.qd_idx != sh->qd_idx) { 2387 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2388 mdname(conf->mddev)); 2389 return 0; 2390 } 2391 return r_sector; 2392 } 2393 2394 2395 static void 2396 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2397 int rcw, int expand) 2398 { 2399 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2400 struct r5conf *conf = sh->raid_conf; 2401 int level = conf->level; 2402 2403 if (rcw) { 2404 2405 for (i = disks; i--; ) { 2406 struct r5dev *dev = &sh->dev[i]; 2407 2408 if (dev->towrite) { 2409 set_bit(R5_LOCKED, &dev->flags); 2410 set_bit(R5_Wantdrain, &dev->flags); 2411 if (!expand) 2412 clear_bit(R5_UPTODATE, &dev->flags); 2413 s->locked++; 2414 } 2415 } 2416 /* if we are not expanding this is a proper write request, and 2417 * there will be bios with new data to be drained into the 2418 * stripe cache 2419 */ 2420 if (!expand) { 2421 if (!s->locked) 2422 /* False alarm, nothing to do */ 2423 return; 
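			/*
			 * Reconstruct-write path: the new data is drained into
			 * the stripe cache and parity is recomputed from all
			 * data blocks; reconstruct_state_prexor_drain_run
			 * further down is the read-modify-write variant, which
			 * first xors the old data back out of the parity.
			 */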
2424 sh->reconstruct_state = reconstruct_state_drain_run; 2425 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2426 } else 2427 sh->reconstruct_state = reconstruct_state_run; 2428 2429 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2430 2431 if (s->locked + conf->max_degraded == disks) 2432 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2433 atomic_inc(&conf->pending_full_writes); 2434 } else { 2435 BUG_ON(level == 6); 2436 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2437 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2438 2439 for (i = disks; i--; ) { 2440 struct r5dev *dev = &sh->dev[i]; 2441 if (i == pd_idx) 2442 continue; 2443 2444 if (dev->towrite && 2445 (test_bit(R5_UPTODATE, &dev->flags) || 2446 test_bit(R5_Wantcompute, &dev->flags))) { 2447 set_bit(R5_Wantdrain, &dev->flags); 2448 set_bit(R5_LOCKED, &dev->flags); 2449 clear_bit(R5_UPTODATE, &dev->flags); 2450 s->locked++; 2451 } 2452 } 2453 if (!s->locked) 2454 /* False alarm - nothing to do */ 2455 return; 2456 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2457 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2458 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2459 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2460 } 2461 2462 /* keep the parity disk(s) locked while asynchronous operations 2463 * are in flight 2464 */ 2465 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2466 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2467 s->locked++; 2468 2469 if (level == 6) { 2470 int qd_idx = sh->qd_idx; 2471 struct r5dev *dev = &sh->dev[qd_idx]; 2472 2473 set_bit(R5_LOCKED, &dev->flags); 2474 clear_bit(R5_UPTODATE, &dev->flags); 2475 s->locked++; 2476 } 2477 2478 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2479 __func__, (unsigned long long)sh->sector, 2480 s->locked, s->ops_request); 2481 } 2482 2483 /* 2484 * Each stripe/dev can have one or more bion attached. 2485 * toread/towrite point to the first in a chain. 2486 * The bi_next chain must be in order. 2487 */ 2488 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2489 { 2490 struct bio **bip; 2491 struct r5conf *conf = sh->raid_conf; 2492 int firstwrite=0; 2493 2494 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2495 (unsigned long long)bi->bi_sector, 2496 (unsigned long long)sh->sector); 2497 2498 /* 2499 * If several bio share a stripe. The bio bi_phys_segments acts as a 2500 * reference count to avoid race. The reference count should already be 2501 * increased before this function is called (for example, in 2502 * make_request()), so other bio sharing this stripe will not free the 2503 * stripe. If a stripe is owned by one stripe, the stripe lock will 2504 * protect it. 
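	 *
	 * (raid5_inc_bi_active_stripes() below then adds a reference for this
	 * stripe+device attachment; it is dropped again on the completion
	 * paths, e.g. in handle_stripe_clean_event() and handle_failed_stripe().)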
2505 */ 2506 spin_lock_irq(&sh->stripe_lock); 2507 if (forwrite) { 2508 bip = &sh->dev[dd_idx].towrite; 2509 if (*bip == NULL) 2510 firstwrite = 1; 2511 } else 2512 bip = &sh->dev[dd_idx].toread; 2513 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2514 if (bio_end_sector(*bip) > bi->bi_sector) 2515 goto overlap; 2516 bip = & (*bip)->bi_next; 2517 } 2518 if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) 2519 goto overlap; 2520 2521 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2522 if (*bip) 2523 bi->bi_next = *bip; 2524 *bip = bi; 2525 raid5_inc_bi_active_stripes(bi); 2526 2527 if (forwrite) { 2528 /* check if page is covered */ 2529 sector_t sector = sh->dev[dd_idx].sector; 2530 for (bi=sh->dev[dd_idx].towrite; 2531 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2532 bi && bi->bi_sector <= sector; 2533 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2534 if (bio_end_sector(bi) >= sector) 2535 sector = bio_end_sector(bi); 2536 } 2537 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2538 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2539 } 2540 2541 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2542 (unsigned long long)(*bip)->bi_sector, 2543 (unsigned long long)sh->sector, dd_idx); 2544 spin_unlock_irq(&sh->stripe_lock); 2545 2546 if (conf->mddev->bitmap && firstwrite) { 2547 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2548 STRIPE_SECTORS, 0); 2549 sh->bm_seq = conf->seq_flush+1; 2550 set_bit(STRIPE_BIT_DELAY, &sh->state); 2551 } 2552 return 1; 2553 2554 overlap: 2555 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2556 spin_unlock_irq(&sh->stripe_lock); 2557 return 0; 2558 } 2559 2560 static void end_reshape(struct r5conf *conf); 2561 2562 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2563 struct stripe_head *sh) 2564 { 2565 int sectors_per_chunk = 2566 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2567 int dd_idx; 2568 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2569 int disks = previous ? 
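	/*
	 * Illustrative sketch only, never built: the R5_OVERWRITE test in
	 * add_stripe_bio() above walks the sorted 'towrite' chain to see
	 * whether the queued writes cover the whole STRIPE_SECTORS window.
	 * covers_stripe() and struct range are made up for the example, they
	 * are not driver types; with 4KiB pages the window would be 8 sectors.
	 */
#if 0
#include <stdbool.h>

struct range {
	unsigned long long start, end;	/* [start, end) in sectors */
};

/* 'w' must be sorted by start and non-overlapping, like the towrite chain */
static bool covers_stripe(const struct range *w, int n,
			  unsigned long long dev_sector,
			  unsigned int stripe_sectors)
{
	unsigned long long sector = dev_sector;
	int i;

	for (i = 0; i < n && w[i].start <= sector; i++)
		if (w[i].end >= sector)
			sector = w[i].end;

	return sector >= dev_sector + stripe_sectors;
}
#endif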
conf->previous_raid_disks : conf->raid_disks; 2570 2571 raid5_compute_sector(conf, 2572 stripe * (disks - conf->max_degraded) 2573 *sectors_per_chunk + chunk_offset, 2574 previous, 2575 &dd_idx, sh); 2576 } 2577 2578 static void 2579 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2580 struct stripe_head_state *s, int disks, 2581 struct bio **return_bi) 2582 { 2583 int i; 2584 for (i = disks; i--; ) { 2585 struct bio *bi; 2586 int bitmap_end = 0; 2587 2588 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2589 struct md_rdev *rdev; 2590 rcu_read_lock(); 2591 rdev = rcu_dereference(conf->disks[i].rdev); 2592 if (rdev && test_bit(In_sync, &rdev->flags)) 2593 atomic_inc(&rdev->nr_pending); 2594 else 2595 rdev = NULL; 2596 rcu_read_unlock(); 2597 if (rdev) { 2598 if (!rdev_set_badblocks( 2599 rdev, 2600 sh->sector, 2601 STRIPE_SECTORS, 0)) 2602 md_error(conf->mddev, rdev); 2603 rdev_dec_pending(rdev, conf->mddev); 2604 } 2605 } 2606 spin_lock_irq(&sh->stripe_lock); 2607 /* fail all writes first */ 2608 bi = sh->dev[i].towrite; 2609 sh->dev[i].towrite = NULL; 2610 spin_unlock_irq(&sh->stripe_lock); 2611 if (bi) 2612 bitmap_end = 1; 2613 2614 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2615 wake_up(&conf->wait_for_overlap); 2616 2617 while (bi && bi->bi_sector < 2618 sh->dev[i].sector + STRIPE_SECTORS) { 2619 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2620 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2621 if (!raid5_dec_bi_active_stripes(bi)) { 2622 md_write_end(conf->mddev); 2623 bi->bi_next = *return_bi; 2624 *return_bi = bi; 2625 } 2626 bi = nextbi; 2627 } 2628 if (bitmap_end) 2629 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2630 STRIPE_SECTORS, 0, 0); 2631 bitmap_end = 0; 2632 /* and fail all 'written' */ 2633 bi = sh->dev[i].written; 2634 sh->dev[i].written = NULL; 2635 if (bi) bitmap_end = 1; 2636 while (bi && bi->bi_sector < 2637 sh->dev[i].sector + STRIPE_SECTORS) { 2638 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2639 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2640 if (!raid5_dec_bi_active_stripes(bi)) { 2641 md_write_end(conf->mddev); 2642 bi->bi_next = *return_bi; 2643 *return_bi = bi; 2644 } 2645 bi = bi2; 2646 } 2647 2648 /* fail any reads if this device is non-operational and 2649 * the data has not reached the cache yet. 
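		 * (reads whose data is already in the cache have R5_Wantfill
		 * set and will still be completed by the biofill path)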
2650 */ 2651 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2652 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2653 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2654 spin_lock_irq(&sh->stripe_lock); 2655 bi = sh->dev[i].toread; 2656 sh->dev[i].toread = NULL; 2657 spin_unlock_irq(&sh->stripe_lock); 2658 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2659 wake_up(&conf->wait_for_overlap); 2660 while (bi && bi->bi_sector < 2661 sh->dev[i].sector + STRIPE_SECTORS) { 2662 struct bio *nextbi = 2663 r5_next_bio(bi, sh->dev[i].sector); 2664 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2665 if (!raid5_dec_bi_active_stripes(bi)) { 2666 bi->bi_next = *return_bi; 2667 *return_bi = bi; 2668 } 2669 bi = nextbi; 2670 } 2671 } 2672 if (bitmap_end) 2673 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2674 STRIPE_SECTORS, 0, 0); 2675 /* If we were in the middle of a write the parity block might 2676 * still be locked - so just clear all R5_LOCKED flags 2677 */ 2678 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2679 } 2680 2681 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2682 if (atomic_dec_and_test(&conf->pending_full_writes)) 2683 md_wakeup_thread(conf->mddev->thread); 2684 } 2685 2686 static void 2687 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2688 struct stripe_head_state *s) 2689 { 2690 int abort = 0; 2691 int i; 2692 2693 clear_bit(STRIPE_SYNCING, &sh->state); 2694 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 2695 wake_up(&conf->wait_for_overlap); 2696 s->syncing = 0; 2697 s->replacing = 0; 2698 /* There is nothing more to do for sync/check/repair. 2699 * Don't even need to abort as that is handled elsewhere 2700 * if needed, and not always wanted e.g. if there is a known 2701 * bad block here. 2702 * For recover/replace we need to record a bad block on all 2703 * non-sync devices, or abort the recovery 2704 */ 2705 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2706 /* During recovery devices cannot be removed, so 2707 * locking and refcounting of rdevs is not needed 2708 */ 2709 for (i = 0; i < conf->raid_disks; i++) { 2710 struct md_rdev *rdev = conf->disks[i].rdev; 2711 if (rdev 2712 && !test_bit(Faulty, &rdev->flags) 2713 && !test_bit(In_sync, &rdev->flags) 2714 && !rdev_set_badblocks(rdev, sh->sector, 2715 STRIPE_SECTORS, 0)) 2716 abort = 1; 2717 rdev = conf->disks[i].replacement; 2718 if (rdev 2719 && !test_bit(Faulty, &rdev->flags) 2720 && !test_bit(In_sync, &rdev->flags) 2721 && !rdev_set_badblocks(rdev, sh->sector, 2722 STRIPE_SECTORS, 0)) 2723 abort = 1; 2724 } 2725 if (abort) 2726 conf->recovery_disabled = 2727 conf->mddev->recovery_disabled; 2728 } 2729 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2730 } 2731 2732 static int want_replace(struct stripe_head *sh, int disk_idx) 2733 { 2734 struct md_rdev *rdev; 2735 int rv = 0; 2736 /* Doing recovery so rcu locking not required */ 2737 rdev = sh->raid_conf->disks[disk_idx].replacement; 2738 if (rdev 2739 && !test_bit(Faulty, &rdev->flags) 2740 && !test_bit(In_sync, &rdev->flags) 2741 && (rdev->recovery_offset <= sh->sector 2742 || rdev->mddev->recovery_cp <= sh->sector)) 2743 rv = 1; 2744 2745 return rv; 2746 } 2747 2748 /* fetch_block - checks the given member device to see if its data needs 2749 * to be read or computed to satisfy a request. 
2750 * 2751 * Returns 1 when no more member devices need to be checked, otherwise returns 2752 * 0 to tell the loop in handle_stripe_fill to continue 2753 */ 2754 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2755 int disk_idx, int disks) 2756 { 2757 struct r5dev *dev = &sh->dev[disk_idx]; 2758 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2759 &sh->dev[s->failed_num[1]] }; 2760 2761 /* is the data in this block needed, and can we get it? */ 2762 if (!test_bit(R5_LOCKED, &dev->flags) && 2763 !test_bit(R5_UPTODATE, &dev->flags) && 2764 (dev->toread || 2765 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2766 s->syncing || s->expanding || 2767 (s->replacing && want_replace(sh, disk_idx)) || 2768 (s->failed >= 1 && fdev[0]->toread) || 2769 (s->failed >= 2 && fdev[1]->toread) || 2770 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2771 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2772 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2773 /* we would like to get this block, possibly by computing it, 2774 * otherwise read it if the backing disk is insync 2775 */ 2776 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2777 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2778 if ((s->uptodate == disks - 1) && 2779 (s->failed && (disk_idx == s->failed_num[0] || 2780 disk_idx == s->failed_num[1]))) { 2781 /* have disk failed, and we're requested to fetch it; 2782 * do compute it 2783 */ 2784 pr_debug("Computing stripe %llu block %d\n", 2785 (unsigned long long)sh->sector, disk_idx); 2786 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2787 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2788 set_bit(R5_Wantcompute, &dev->flags); 2789 sh->ops.target = disk_idx; 2790 sh->ops.target2 = -1; /* no 2nd target */ 2791 s->req_compute = 1; 2792 /* Careful: from this point on 'uptodate' is in the eye 2793 * of raid_run_ops which services 'compute' operations 2794 * before writes. R5_Wantcompute flags a block that will 2795 * be R5_UPTODATE by the time it is needed for a 2796 * subsequent operation. 2797 */ 2798 s->uptodate++; 2799 return 1; 2800 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2801 /* Computing 2-failure is *very* expensive; only 2802 * do it if failed >= 2 2803 */ 2804 int other; 2805 for (other = disks; other--; ) { 2806 if (other == disk_idx) 2807 continue; 2808 if (!test_bit(R5_UPTODATE, 2809 &sh->dev[other].flags)) 2810 break; 2811 } 2812 BUG_ON(other < 0); 2813 pr_debug("Computing stripe %llu blocks %d,%d\n", 2814 (unsigned long long)sh->sector, 2815 disk_idx, other); 2816 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2817 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2818 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2819 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2820 sh->ops.target = disk_idx; 2821 sh->ops.target2 = other; 2822 s->uptodate += 2; 2823 s->req_compute = 1; 2824 return 1; 2825 } else if (test_bit(R5_Insync, &dev->flags)) { 2826 set_bit(R5_LOCKED, &dev->flags); 2827 set_bit(R5_Wantread, &dev->flags); 2828 s->locked++; 2829 pr_debug("Reading block %d (sync=%d)\n", 2830 disk_idx, s->syncing); 2831 } 2832 } 2833 2834 return 0; 2835 } 2836 2837 /** 2838 * handle_stripe_fill - read or compute data to satisfy pending requests. 
2839 */ 2840 static void handle_stripe_fill(struct stripe_head *sh, 2841 struct stripe_head_state *s, 2842 int disks) 2843 { 2844 int i; 2845 2846 /* look for blocks to read/compute, skip this if a compute 2847 * is already in flight, or if the stripe contents are in the 2848 * midst of changing due to a write 2849 */ 2850 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2851 !sh->reconstruct_state) 2852 for (i = disks; i--; ) 2853 if (fetch_block(sh, s, i, disks)) 2854 break; 2855 set_bit(STRIPE_HANDLE, &sh->state); 2856 } 2857 2858 2859 /* handle_stripe_clean_event 2860 * any written block on an uptodate or failed drive can be returned. 2861 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2862 * never LOCKED, so we don't need to test 'failed' directly. 2863 */ 2864 static void handle_stripe_clean_event(struct r5conf *conf, 2865 struct stripe_head *sh, int disks, struct bio **return_bi) 2866 { 2867 int i; 2868 struct r5dev *dev; 2869 int discard_pending = 0; 2870 2871 for (i = disks; i--; ) 2872 if (sh->dev[i].written) { 2873 dev = &sh->dev[i]; 2874 if (!test_bit(R5_LOCKED, &dev->flags) && 2875 (test_bit(R5_UPTODATE, &dev->flags) || 2876 test_bit(R5_Discard, &dev->flags))) { 2877 /* We can return any write requests */ 2878 struct bio *wbi, *wbi2; 2879 pr_debug("Return write for disc %d\n", i); 2880 if (test_and_clear_bit(R5_Discard, &dev->flags)) 2881 clear_bit(R5_UPTODATE, &dev->flags); 2882 wbi = dev->written; 2883 dev->written = NULL; 2884 while (wbi && wbi->bi_sector < 2885 dev->sector + STRIPE_SECTORS) { 2886 wbi2 = r5_next_bio(wbi, dev->sector); 2887 if (!raid5_dec_bi_active_stripes(wbi)) { 2888 md_write_end(conf->mddev); 2889 wbi->bi_next = *return_bi; 2890 *return_bi = wbi; 2891 } 2892 wbi = wbi2; 2893 } 2894 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2895 STRIPE_SECTORS, 2896 !test_bit(STRIPE_DEGRADED, &sh->state), 2897 0); 2898 } else if (test_bit(R5_Discard, &dev->flags)) 2899 discard_pending = 1; 2900 } 2901 if (!discard_pending && 2902 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 2903 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 2904 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2905 if (sh->qd_idx >= 0) { 2906 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 2907 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 2908 } 2909 /* now that discard is done we can proceed with any sync */ 2910 clear_bit(STRIPE_DISCARD, &sh->state); 2911 /* 2912 * SCSI discard will change some bio fields and the stripe has 2913 * no updated data, so remove it from hash list and the stripe 2914 * will be reinitialized 2915 */ 2916 spin_lock_irq(&conf->device_lock); 2917 remove_hash(sh); 2918 spin_unlock_irq(&conf->device_lock); 2919 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 2920 set_bit(STRIPE_HANDLE, &sh->state); 2921 2922 } 2923 2924 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2925 if (atomic_dec_and_test(&conf->pending_full_writes)) 2926 md_wakeup_thread(conf->mddev->thread); 2927 } 2928 2929 static void handle_stripe_dirtying(struct r5conf *conf, 2930 struct stripe_head *sh, 2931 struct stripe_head_state *s, 2932 int disks) 2933 { 2934 int rmw = 0, rcw = 0, i; 2935 sector_t recovery_cp = conf->mddev->recovery_cp; 2936 2937 /* RAID6 requires 'rcw' in current implementation. 2938 * Otherwise, check whether resync is now happening or should start. 2939 * If yes, then the array is dirty (after unclean shutdown or 2940 * initial creation), so parity in some stripes might be inconsistent. 
2941 * In this case, we need to always do reconstruct-write, to ensure 2942 * that in case of drive failure or read-error correction, we 2943 * generate correct data from the parity. 2944 */ 2945 if (conf->max_degraded == 2 || 2946 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { 2947 /* Calculate the real rcw later - for now make it 2948 * look like rcw is cheaper 2949 */ 2950 rcw = 1; rmw = 2; 2951 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 2952 conf->max_degraded, (unsigned long long)recovery_cp, 2953 (unsigned long long)sh->sector); 2954 } else for (i = disks; i--; ) { 2955 /* would I have to read this buffer for read_modify_write */ 2956 struct r5dev *dev = &sh->dev[i]; 2957 if ((dev->towrite || i == sh->pd_idx) && 2958 !test_bit(R5_LOCKED, &dev->flags) && 2959 !(test_bit(R5_UPTODATE, &dev->flags) || 2960 test_bit(R5_Wantcompute, &dev->flags))) { 2961 if (test_bit(R5_Insync, &dev->flags)) 2962 rmw++; 2963 else 2964 rmw += 2*disks; /* cannot read it */ 2965 } 2966 /* Would I have to read this buffer for reconstruct_write */ 2967 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2968 !test_bit(R5_LOCKED, &dev->flags) && 2969 !(test_bit(R5_UPTODATE, &dev->flags) || 2970 test_bit(R5_Wantcompute, &dev->flags))) { 2971 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2972 else 2973 rcw += 2*disks; 2974 } 2975 } 2976 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2977 (unsigned long long)sh->sector, rmw, rcw); 2978 set_bit(STRIPE_HANDLE, &sh->state); 2979 if (rmw < rcw && rmw > 0) { 2980 /* prefer read-modify-write, but need to get some data */ 2981 if (conf->mddev->queue) 2982 blk_add_trace_msg(conf->mddev->queue, 2983 "raid5 rmw %llu %d", 2984 (unsigned long long)sh->sector, rmw); 2985 for (i = disks; i--; ) { 2986 struct r5dev *dev = &sh->dev[i]; 2987 if ((dev->towrite || i == sh->pd_idx) && 2988 !test_bit(R5_LOCKED, &dev->flags) && 2989 !(test_bit(R5_UPTODATE, &dev->flags) || 2990 test_bit(R5_Wantcompute, &dev->flags)) && 2991 test_bit(R5_Insync, &dev->flags)) { 2992 if ( 2993 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2994 pr_debug("Read_old block " 2995 "%d for r-m-w\n", i); 2996 set_bit(R5_LOCKED, &dev->flags); 2997 set_bit(R5_Wantread, &dev->flags); 2998 s->locked++; 2999 } else { 3000 set_bit(STRIPE_DELAYED, &sh->state); 3001 set_bit(STRIPE_HANDLE, &sh->state); 3002 } 3003 } 3004 } 3005 } 3006 if (rcw <= rmw && rcw > 0) { 3007 /* want reconstruct write, but need to get some data */ 3008 int qread =0; 3009 rcw = 0; 3010 for (i = disks; i--; ) { 3011 struct r5dev *dev = &sh->dev[i]; 3012 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3013 i != sh->pd_idx && i != sh->qd_idx && 3014 !test_bit(R5_LOCKED, &dev->flags) && 3015 !(test_bit(R5_UPTODATE, &dev->flags) || 3016 test_bit(R5_Wantcompute, &dev->flags))) { 3017 rcw++; 3018 if (!test_bit(R5_Insync, &dev->flags)) 3019 continue; /* it's a failed drive */ 3020 if ( 3021 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3022 pr_debug("Read_old block " 3023 "%d for Reconstruct\n", i); 3024 set_bit(R5_LOCKED, &dev->flags); 3025 set_bit(R5_Wantread, &dev->flags); 3026 s->locked++; 3027 qread++; 3028 } else { 3029 set_bit(STRIPE_DELAYED, &sh->state); 3030 set_bit(STRIPE_HANDLE, &sh->state); 3031 } 3032 } 3033 } 3034 if (rcw && conf->mddev->queue) 3035 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3036 (unsigned long long)sh->sector, 3037 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3038 } 3039 /* now if nothing is locked, and if we have enough data, 3040 * we can start a 
write request 3041 */ 3042 /* since handle_stripe can be called at any time we need to handle the 3043 * case where a compute block operation has been submitted and then a 3044 * subsequent call wants to start a write request. raid_run_ops only 3045 * handles the case where compute block and reconstruct are requested 3046 * simultaneously. If this is not the case then new writes need to be 3047 * held off until the compute completes. 3048 */ 3049 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 3050 (s->locked == 0 && (rcw == 0 || rmw == 0) && 3051 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 3052 schedule_reconstruction(sh, s, rcw == 0, 0); 3053 } 3054 3055 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 3056 struct stripe_head_state *s, int disks) 3057 { 3058 struct r5dev *dev = NULL; 3059 3060 set_bit(STRIPE_HANDLE, &sh->state); 3061 3062 switch (sh->check_state) { 3063 case check_state_idle: 3064 /* start a new check operation if there are no failures */ 3065 if (s->failed == 0) { 3066 BUG_ON(s->uptodate != disks); 3067 sh->check_state = check_state_run; 3068 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3069 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3070 s->uptodate--; 3071 break; 3072 } 3073 dev = &sh->dev[s->failed_num[0]]; 3074 /* fall through */ 3075 case check_state_compute_result: 3076 sh->check_state = check_state_idle; 3077 if (!dev) 3078 dev = &sh->dev[sh->pd_idx]; 3079 3080 /* check that a write has not made the stripe insync */ 3081 if (test_bit(STRIPE_INSYNC, &sh->state)) 3082 break; 3083 3084 /* either failed parity check, or recovery is happening */ 3085 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3086 BUG_ON(s->uptodate != disks); 3087 3088 set_bit(R5_LOCKED, &dev->flags); 3089 s->locked++; 3090 set_bit(R5_Wantwrite, &dev->flags); 3091 3092 clear_bit(STRIPE_DEGRADED, &sh->state); 3093 set_bit(STRIPE_INSYNC, &sh->state); 3094 break; 3095 case check_state_run: 3096 break; /* we will be called again upon completion */ 3097 case check_state_check_result: 3098 sh->check_state = check_state_idle; 3099 3100 /* if a failure occurred during the check operation, leave 3101 * STRIPE_INSYNC not set and let the stripe be handled again 3102 */ 3103 if (s->failed) 3104 break; 3105 3106 /* handle a successful check operation, if parity is correct 3107 * we are done. Otherwise update the mismatch count and repair 3108 * parity if !MD_RECOVERY_CHECK 3109 */ 3110 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 3111 /* parity is correct (on disc, 3112 * not in buffer any more) 3113 */ 3114 set_bit(STRIPE_INSYNC, &sh->state); 3115 else { 3116 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3117 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3118 /* don't try to repair!! 
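				 * (a read-only "check" pass was requested,
				 * so only the mismatch count is updated)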
*/ 3119 set_bit(STRIPE_INSYNC, &sh->state); 3120 else { 3121 sh->check_state = check_state_compute_run; 3122 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3123 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3124 set_bit(R5_Wantcompute, 3125 &sh->dev[sh->pd_idx].flags); 3126 sh->ops.target = sh->pd_idx; 3127 sh->ops.target2 = -1; 3128 s->uptodate++; 3129 } 3130 } 3131 break; 3132 case check_state_compute_run: 3133 break; 3134 default: 3135 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3136 __func__, sh->check_state, 3137 (unsigned long long) sh->sector); 3138 BUG(); 3139 } 3140 } 3141 3142 3143 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3144 struct stripe_head_state *s, 3145 int disks) 3146 { 3147 int pd_idx = sh->pd_idx; 3148 int qd_idx = sh->qd_idx; 3149 struct r5dev *dev; 3150 3151 set_bit(STRIPE_HANDLE, &sh->state); 3152 3153 BUG_ON(s->failed > 2); 3154 3155 /* Want to check and possibly repair P and Q. 3156 * However there could be one 'failed' device, in which 3157 * case we can only check one of them, possibly using the 3158 * other to generate missing data 3159 */ 3160 3161 switch (sh->check_state) { 3162 case check_state_idle: 3163 /* start a new check operation if there are < 2 failures */ 3164 if (s->failed == s->q_failed) { 3165 /* The only possible failed device holds Q, so it 3166 * makes sense to check P (If anything else were failed, 3167 * we would have used P to recreate it). 3168 */ 3169 sh->check_state = check_state_run; 3170 } 3171 if (!s->q_failed && s->failed < 2) { 3172 /* Q is not failed, and we didn't use it to generate 3173 * anything, so it makes sense to check it 3174 */ 3175 if (sh->check_state == check_state_run) 3176 sh->check_state = check_state_run_pq; 3177 else 3178 sh->check_state = check_state_run_q; 3179 } 3180 3181 /* discard potentially stale zero_sum_result */ 3182 sh->ops.zero_sum_result = 0; 3183 3184 if (sh->check_state == check_state_run) { 3185 /* async_xor_zero_sum destroys the contents of P */ 3186 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3187 s->uptodate--; 3188 } 3189 if (sh->check_state >= check_state_run && 3190 sh->check_state <= check_state_run_pq) { 3191 /* async_syndrome_zero_sum preserves P and Q, so 3192 * no need to mark them !uptodate here 3193 */ 3194 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3195 break; 3196 } 3197 3198 /* we have 2-disk failure */ 3199 BUG_ON(s->failed != 2); 3200 /* fall through */ 3201 case check_state_compute_result: 3202 sh->check_state = check_state_idle; 3203 3204 /* check that a write has not made the stripe insync */ 3205 if (test_bit(STRIPE_INSYNC, &sh->state)) 3206 break; 3207 3208 /* now write out any block on a failed drive, 3209 * or P or Q if they were recomputed 3210 */ 3211 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3212 if (s->failed == 2) { 3213 dev = &sh->dev[s->failed_num[1]]; 3214 s->locked++; 3215 set_bit(R5_LOCKED, &dev->flags); 3216 set_bit(R5_Wantwrite, &dev->flags); 3217 } 3218 if (s->failed >= 1) { 3219 dev = &sh->dev[s->failed_num[0]]; 3220 s->locked++; 3221 set_bit(R5_LOCKED, &dev->flags); 3222 set_bit(R5_Wantwrite, &dev->flags); 3223 } 3224 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3225 dev = &sh->dev[pd_idx]; 3226 s->locked++; 3227 set_bit(R5_LOCKED, &dev->flags); 3228 set_bit(R5_Wantwrite, &dev->flags); 3229 } 3230 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3231 dev = &sh->dev[qd_idx]; 3232 s->locked++; 3233 set_bit(R5_LOCKED, &dev->flags); 3234 set_bit(R5_Wantwrite, &dev->flags); 
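			/*
			 * (the SUM_CHECK_P_RESULT/SUM_CHECK_Q_RESULT bits are
			 *  set by the asynchronous zero-sum check scheduled
			 *  above; a set bit means that parity block failed
			 *  verification)
			 */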
3235 } 3236 clear_bit(STRIPE_DEGRADED, &sh->state); 3237 3238 set_bit(STRIPE_INSYNC, &sh->state); 3239 break; 3240 case check_state_run: 3241 case check_state_run_q: 3242 case check_state_run_pq: 3243 break; /* we will be called again upon completion */ 3244 case check_state_check_result: 3245 sh->check_state = check_state_idle; 3246 3247 /* handle a successful check operation, if parity is correct 3248 * we are done. Otherwise update the mismatch count and repair 3249 * parity if !MD_RECOVERY_CHECK 3250 */ 3251 if (sh->ops.zero_sum_result == 0) { 3252 /* both parities are correct */ 3253 if (!s->failed) 3254 set_bit(STRIPE_INSYNC, &sh->state); 3255 else { 3256 /* in contrast to the raid5 case we can validate 3257 * parity, but still have a failure to write 3258 * back 3259 */ 3260 sh->check_state = check_state_compute_result; 3261 /* Returning at this point means that we may go 3262 * off and bring p and/or q uptodate again so 3263 * we make sure to check zero_sum_result again 3264 * to verify if p or q need writeback 3265 */ 3266 } 3267 } else { 3268 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3269 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3270 /* don't try to repair!! */ 3271 set_bit(STRIPE_INSYNC, &sh->state); 3272 else { 3273 int *target = &sh->ops.target; 3274 3275 sh->ops.target = -1; 3276 sh->ops.target2 = -1; 3277 sh->check_state = check_state_compute_run; 3278 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3279 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3280 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3281 set_bit(R5_Wantcompute, 3282 &sh->dev[pd_idx].flags); 3283 *target = pd_idx; 3284 target = &sh->ops.target2; 3285 s->uptodate++; 3286 } 3287 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3288 set_bit(R5_Wantcompute, 3289 &sh->dev[qd_idx].flags); 3290 *target = qd_idx; 3291 s->uptodate++; 3292 } 3293 } 3294 } 3295 break; 3296 case check_state_compute_run: 3297 break; 3298 default: 3299 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3300 __func__, sh->check_state, 3301 (unsigned long long) sh->sector); 3302 BUG(); 3303 } 3304 } 3305 3306 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3307 { 3308 int i; 3309 3310 /* We have read all the blocks in this stripe and now we need to 3311 * copy some of them into a target stripe for expand. 3312 */ 3313 struct dma_async_tx_descriptor *tx = NULL; 3314 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3315 for (i = 0; i < sh->disks; i++) 3316 if (i != sh->pd_idx && i != sh->qd_idx) { 3317 int dd_idx, j; 3318 struct stripe_head *sh2; 3319 struct async_submit_ctl submit; 3320 3321 sector_t bn = compute_blocknr(sh, i, 1); 3322 sector_t s = raid5_compute_sector(conf, bn, 0, 3323 &dd_idx, NULL); 3324 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3325 if (sh2 == NULL) 3326 /* so far only the early blocks of this stripe 3327 * have been requested. 
When later blocks 3328 * get requested, we will try again 3329 */ 3330 continue; 3331 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3332 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3333 /* must have already done this block */ 3334 release_stripe(sh2); 3335 continue; 3336 } 3337 3338 /* place all the copies on one channel */ 3339 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3340 tx = async_memcpy(sh2->dev[dd_idx].page, 3341 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3342 &submit); 3343 3344 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3345 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3346 for (j = 0; j < conf->raid_disks; j++) 3347 if (j != sh2->pd_idx && 3348 j != sh2->qd_idx && 3349 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3350 break; 3351 if (j == conf->raid_disks) { 3352 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3353 set_bit(STRIPE_HANDLE, &sh2->state); 3354 } 3355 release_stripe(sh2); 3356 3357 } 3358 /* done submitting copies, wait for them to complete */ 3359 async_tx_quiesce(&tx); 3360 } 3361 3362 /* 3363 * handle_stripe - do things to a stripe. 3364 * 3365 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3366 * state of various bits to see what needs to be done. 3367 * Possible results: 3368 * return some read requests which now have data 3369 * return some write requests which are safely on storage 3370 * schedule a read on some buffers 3371 * schedule a write of some buffers 3372 * return confirmation of parity correctness 3373 * 3374 */ 3375 3376 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3377 { 3378 struct r5conf *conf = sh->raid_conf; 3379 int disks = sh->disks; 3380 struct r5dev *dev; 3381 int i; 3382 int do_recovery = 0; 3383 3384 memset(s, 0, sizeof(*s)); 3385 3386 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3387 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3388 s->failed_num[0] = -1; 3389 s->failed_num[1] = -1; 3390 3391 /* Now to look around and see what can be done */ 3392 rcu_read_lock(); 3393 for (i=disks; i--; ) { 3394 struct md_rdev *rdev; 3395 sector_t first_bad; 3396 int bad_sectors; 3397 int is_bad = 0; 3398 3399 dev = &sh->dev[i]; 3400 3401 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3402 i, dev->flags, 3403 dev->toread, dev->towrite, dev->written); 3404 /* maybe we can reply to a read 3405 * 3406 * new wantfill requests are only permitted while 3407 * ops_complete_biofill is guaranteed to be inactive 3408 */ 3409 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3410 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3411 set_bit(R5_Wantfill, &dev->flags); 3412 3413 /* now count some things */ 3414 if (test_bit(R5_LOCKED, &dev->flags)) 3415 s->locked++; 3416 if (test_bit(R5_UPTODATE, &dev->flags)) 3417 s->uptodate++; 3418 if (test_bit(R5_Wantcompute, &dev->flags)) { 3419 s->compute++; 3420 BUG_ON(s->compute > 2); 3421 } 3422 3423 if (test_bit(R5_Wantfill, &dev->flags)) 3424 s->to_fill++; 3425 else if (dev->toread) 3426 s->to_read++; 3427 if (dev->towrite) { 3428 s->to_write++; 3429 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3430 s->non_overwrite++; 3431 } 3432 if (dev->written) 3433 s->written++; 3434 /* Prefer to use the replacement for reads, but only 3435 * if it is recovered enough and has no bad blocks. 
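		 * (when the replacement cannot be used for the read,
		 * R5_NeedReplace is set below so handle_stripe() knows the
		 * block still has to be copied out to it)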
3436 */ 3437 rdev = rcu_dereference(conf->disks[i].replacement); 3438 if (rdev && !test_bit(Faulty, &rdev->flags) && 3439 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3440 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3441 &first_bad, &bad_sectors)) 3442 set_bit(R5_ReadRepl, &dev->flags); 3443 else { 3444 if (rdev) 3445 set_bit(R5_NeedReplace, &dev->flags); 3446 rdev = rcu_dereference(conf->disks[i].rdev); 3447 clear_bit(R5_ReadRepl, &dev->flags); 3448 } 3449 if (rdev && test_bit(Faulty, &rdev->flags)) 3450 rdev = NULL; 3451 if (rdev) { 3452 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3453 &first_bad, &bad_sectors); 3454 if (s->blocked_rdev == NULL 3455 && (test_bit(Blocked, &rdev->flags) 3456 || is_bad < 0)) { 3457 if (is_bad < 0) 3458 set_bit(BlockedBadBlocks, 3459 &rdev->flags); 3460 s->blocked_rdev = rdev; 3461 atomic_inc(&rdev->nr_pending); 3462 } 3463 } 3464 clear_bit(R5_Insync, &dev->flags); 3465 if (!rdev) 3466 /* Not in-sync */; 3467 else if (is_bad) { 3468 /* also not in-sync */ 3469 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3470 test_bit(R5_UPTODATE, &dev->flags)) { 3471 /* treat as in-sync, but with a read error 3472 * which we can now try to correct 3473 */ 3474 set_bit(R5_Insync, &dev->flags); 3475 set_bit(R5_ReadError, &dev->flags); 3476 } 3477 } else if (test_bit(In_sync, &rdev->flags)) 3478 set_bit(R5_Insync, &dev->flags); 3479 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3480 /* in sync if before recovery_offset */ 3481 set_bit(R5_Insync, &dev->flags); 3482 else if (test_bit(R5_UPTODATE, &dev->flags) && 3483 test_bit(R5_Expanded, &dev->flags)) 3484 /* If we've reshaped into here, we assume it is Insync. 3485 * We will shortly update recovery_offset to make 3486 * it official. 3487 */ 3488 set_bit(R5_Insync, &dev->flags); 3489 3490 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3491 /* This flag does not apply to '.replacement' 3492 * only to .rdev, so make sure to check that*/ 3493 struct md_rdev *rdev2 = rcu_dereference( 3494 conf->disks[i].rdev); 3495 if (rdev2 == rdev) 3496 clear_bit(R5_Insync, &dev->flags); 3497 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3498 s->handle_bad_blocks = 1; 3499 atomic_inc(&rdev2->nr_pending); 3500 } else 3501 clear_bit(R5_WriteError, &dev->flags); 3502 } 3503 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3504 /* This flag does not apply to '.replacement' 3505 * only to .rdev, so make sure to check that*/ 3506 struct md_rdev *rdev2 = rcu_dereference( 3507 conf->disks[i].rdev); 3508 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3509 s->handle_bad_blocks = 1; 3510 atomic_inc(&rdev2->nr_pending); 3511 } else 3512 clear_bit(R5_MadeGood, &dev->flags); 3513 } 3514 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3515 struct md_rdev *rdev2 = rcu_dereference( 3516 conf->disks[i].replacement); 3517 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3518 s->handle_bad_blocks = 1; 3519 atomic_inc(&rdev2->nr_pending); 3520 } else 3521 clear_bit(R5_MadeGoodRepl, &dev->flags); 3522 } 3523 if (!test_bit(R5_Insync, &dev->flags)) { 3524 /* The ReadError flag will just be confusing now */ 3525 clear_bit(R5_ReadError, &dev->flags); 3526 clear_bit(R5_ReWrite, &dev->flags); 3527 } 3528 if (test_bit(R5_ReadError, &dev->flags)) 3529 clear_bit(R5_Insync, &dev->flags); 3530 if (!test_bit(R5_Insync, &dev->flags)) { 3531 if (s->failed < 2) 3532 s->failed_num[s->failed] = i; 3533 s->failed++; 3534 if (rdev && !test_bit(Faulty, &rdev->flags)) 3535 do_recovery = 1; 3536 } 3537 } 3538 if (test_bit(STRIPE_SYNCING, 
&sh->state)) { 3539 /* If there is a failed device being replaced, 3540 * we must be recovering. 3541 * else if we are after recovery_cp, we must be syncing 3542 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 3543 * else we can only be replacing 3544 * sync and recovery both need to read all devices, and so 3545 * use the same flag. 3546 */ 3547 if (do_recovery || 3548 sh->sector >= conf->mddev->recovery_cp || 3549 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 3550 s->syncing = 1; 3551 else 3552 s->replacing = 1; 3553 } 3554 rcu_read_unlock(); 3555 } 3556 3557 static void handle_stripe(struct stripe_head *sh) 3558 { 3559 struct stripe_head_state s; 3560 struct r5conf *conf = sh->raid_conf; 3561 int i; 3562 int prexor; 3563 int disks = sh->disks; 3564 struct r5dev *pdev, *qdev; 3565 3566 clear_bit(STRIPE_HANDLE, &sh->state); 3567 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3568 /* already being handled, ensure it gets handled 3569 * again when current action finishes */ 3570 set_bit(STRIPE_HANDLE, &sh->state); 3571 return; 3572 } 3573 3574 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3575 spin_lock(&sh->stripe_lock); 3576 /* Cannot process 'sync' concurrently with 'discard' */ 3577 if (!test_bit(STRIPE_DISCARD, &sh->state) && 3578 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3579 set_bit(STRIPE_SYNCING, &sh->state); 3580 clear_bit(STRIPE_INSYNC, &sh->state); 3581 clear_bit(STRIPE_REPLACED, &sh->state); 3582 } 3583 spin_unlock(&sh->stripe_lock); 3584 } 3585 clear_bit(STRIPE_DELAYED, &sh->state); 3586 3587 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3588 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3589 (unsigned long long)sh->sector, sh->state, 3590 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3591 sh->check_state, sh->reconstruct_state); 3592 3593 analyse_stripe(sh, &s); 3594 3595 if (s.handle_bad_blocks) { 3596 set_bit(STRIPE_HANDLE, &sh->state); 3597 goto finish; 3598 } 3599 3600 if (unlikely(s.blocked_rdev)) { 3601 if (s.syncing || s.expanding || s.expanded || 3602 s.replacing || s.to_write || s.written) { 3603 set_bit(STRIPE_HANDLE, &sh->state); 3604 goto finish; 3605 } 3606 /* There is nothing for the blocked_rdev to block */ 3607 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3608 s.blocked_rdev = NULL; 3609 } 3610 3611 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3612 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3613 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3614 } 3615 3616 pr_debug("locked=%d uptodate=%d to_read=%d" 3617 " to_write=%d failed=%d failed_num=%d,%d\n", 3618 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3619 s.failed_num[0], s.failed_num[1]); 3620 /* check if the array has lost more than max_degraded devices and, 3621 * if so, some requests might need to be failed. 
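	 * (e.g. a RAID6 stripe with three failed devices cannot be
	 * reconstructed, so handle_failed_stripe()/handle_failed_sync()
	 * below complete its pending bios with an error)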
3622 */ 3623 if (s.failed > conf->max_degraded) { 3624 sh->check_state = 0; 3625 sh->reconstruct_state = 0; 3626 if (s.to_read+s.to_write+s.written) 3627 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3628 if (s.syncing + s.replacing) 3629 handle_failed_sync(conf, sh, &s); 3630 } 3631 3632 /* Now we check to see if any write operations have recently 3633 * completed 3634 */ 3635 prexor = 0; 3636 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3637 prexor = 1; 3638 if (sh->reconstruct_state == reconstruct_state_drain_result || 3639 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3640 sh->reconstruct_state = reconstruct_state_idle; 3641 3642 /* All the 'written' buffers and the parity block are ready to 3643 * be written back to disk 3644 */ 3645 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3646 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 3647 BUG_ON(sh->qd_idx >= 0 && 3648 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3649 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 3650 for (i = disks; i--; ) { 3651 struct r5dev *dev = &sh->dev[i]; 3652 if (test_bit(R5_LOCKED, &dev->flags) && 3653 (i == sh->pd_idx || i == sh->qd_idx || 3654 dev->written)) { 3655 pr_debug("Writing block %d\n", i); 3656 set_bit(R5_Wantwrite, &dev->flags); 3657 if (prexor) 3658 continue; 3659 if (!test_bit(R5_Insync, &dev->flags) || 3660 ((i == sh->pd_idx || i == sh->qd_idx) && 3661 s.failed == 0)) 3662 set_bit(STRIPE_INSYNC, &sh->state); 3663 } 3664 } 3665 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3666 s.dec_preread_active = 1; 3667 } 3668 3669 /* 3670 * might be able to return some write requests if the parity blocks 3671 * are safe, or on a failed drive 3672 */ 3673 pdev = &sh->dev[sh->pd_idx]; 3674 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3675 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3676 qdev = &sh->dev[sh->qd_idx]; 3677 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3678 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3679 || conf->level < 6; 3680 3681 if (s.written && 3682 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3683 && !test_bit(R5_LOCKED, &pdev->flags) 3684 && (test_bit(R5_UPTODATE, &pdev->flags) || 3685 test_bit(R5_Discard, &pdev->flags))))) && 3686 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3687 && !test_bit(R5_LOCKED, &qdev->flags) 3688 && (test_bit(R5_UPTODATE, &qdev->flags) || 3689 test_bit(R5_Discard, &qdev->flags)))))) 3690 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3691 3692 /* Now we might consider reading some blocks, either to check/generate 3693 * parity, or to satisfy requests 3694 * or to load a block that is being partially written. 3695 */ 3696 if (s.to_read || s.non_overwrite 3697 || (conf->level == 6 && s.to_write && s.failed) 3698 || (s.syncing && (s.uptodate + s.compute < disks)) 3699 || s.replacing 3700 || s.expanding) 3701 handle_stripe_fill(sh, &s, disks); 3702 3703 /* Now to consider new write requests and what else, if anything 3704 * should be read. We do not handle new writes when: 3705 * 1/ A 'write' operation (copy+xor) is already in flight. 3706 * 2/ A 'check' operation is in flight, as it may clobber the parity 3707 * block. 
3708 */ 3709 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3710 handle_stripe_dirtying(conf, sh, &s, disks); 3711 3712 /* maybe we need to check and possibly fix the parity for this stripe 3713 * Any reads will already have been scheduled, so we just see if enough 3714 * data is available. The parity check is held off while parity 3715 * dependent operations are in flight. 3716 */ 3717 if (sh->check_state || 3718 (s.syncing && s.locked == 0 && 3719 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3720 !test_bit(STRIPE_INSYNC, &sh->state))) { 3721 if (conf->level == 6) 3722 handle_parity_checks6(conf, sh, &s, disks); 3723 else 3724 handle_parity_checks5(conf, sh, &s, disks); 3725 } 3726 3727 if ((s.replacing || s.syncing) && s.locked == 0 3728 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 3729 && !test_bit(STRIPE_REPLACED, &sh->state)) { 3730 /* Write out to replacement devices where possible */ 3731 for (i = 0; i < conf->raid_disks; i++) 3732 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3733 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3734 set_bit(R5_WantReplace, &sh->dev[i].flags); 3735 set_bit(R5_LOCKED, &sh->dev[i].flags); 3736 s.locked++; 3737 } 3738 if (s.replacing) 3739 set_bit(STRIPE_INSYNC, &sh->state); 3740 set_bit(STRIPE_REPLACED, &sh->state); 3741 } 3742 if ((s.syncing || s.replacing) && s.locked == 0 && 3743 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3744 test_bit(STRIPE_INSYNC, &sh->state)) { 3745 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3746 clear_bit(STRIPE_SYNCING, &sh->state); 3747 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3748 wake_up(&conf->wait_for_overlap); 3749 } 3750 3751 /* If the failed drives are just a ReadError, then we might need 3752 * to progress the repair/check process 3753 */ 3754 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3755 for (i = 0; i < s.failed; i++) { 3756 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3757 if (test_bit(R5_ReadError, &dev->flags) 3758 && !test_bit(R5_LOCKED, &dev->flags) 3759 && test_bit(R5_UPTODATE, &dev->flags) 3760 ) { 3761 if (!test_bit(R5_ReWrite, &dev->flags)) { 3762 set_bit(R5_Wantwrite, &dev->flags); 3763 set_bit(R5_ReWrite, &dev->flags); 3764 set_bit(R5_LOCKED, &dev->flags); 3765 s.locked++; 3766 } else { 3767 /* let's read it back */ 3768 set_bit(R5_Wantread, &dev->flags); 3769 set_bit(R5_LOCKED, &dev->flags); 3770 s.locked++; 3771 } 3772 } 3773 } 3774 3775 3776 /* Finish reconstruct operations initiated by the expansion process */ 3777 if (sh->reconstruct_state == reconstruct_state_result) { 3778 struct stripe_head *sh_src 3779 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3780 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3781 /* sh cannot be written until sh_src has been read. 
3782 * so arrange for sh to be delayed a little 3783 */ 3784 set_bit(STRIPE_DELAYED, &sh->state); 3785 set_bit(STRIPE_HANDLE, &sh->state); 3786 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3787 &sh_src->state)) 3788 atomic_inc(&conf->preread_active_stripes); 3789 release_stripe(sh_src); 3790 goto finish; 3791 } 3792 if (sh_src) 3793 release_stripe(sh_src); 3794 3795 sh->reconstruct_state = reconstruct_state_idle; 3796 clear_bit(STRIPE_EXPANDING, &sh->state); 3797 for (i = conf->raid_disks; i--; ) { 3798 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3799 set_bit(R5_LOCKED, &sh->dev[i].flags); 3800 s.locked++; 3801 } 3802 } 3803 3804 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3805 !sh->reconstruct_state) { 3806 /* Need to write out all blocks after computing parity */ 3807 sh->disks = conf->raid_disks; 3808 stripe_set_idx(sh->sector, conf, 0, sh); 3809 schedule_reconstruction(sh, &s, 1, 1); 3810 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3811 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3812 atomic_dec(&conf->reshape_stripes); 3813 wake_up(&conf->wait_for_overlap); 3814 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3815 } 3816 3817 if (s.expanding && s.locked == 0 && 3818 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3819 handle_stripe_expansion(conf, sh); 3820 3821 finish: 3822 /* wait for this device to become unblocked */ 3823 if (unlikely(s.blocked_rdev)) { 3824 if (conf->mddev->external) 3825 md_wait_for_blocked_rdev(s.blocked_rdev, 3826 conf->mddev); 3827 else 3828 /* Internal metadata will immediately 3829 * be written by raid5d, so we don't 3830 * need to wait here. 3831 */ 3832 rdev_dec_pending(s.blocked_rdev, 3833 conf->mddev); 3834 } 3835 3836 if (s.handle_bad_blocks) 3837 for (i = disks; i--; ) { 3838 struct md_rdev *rdev; 3839 struct r5dev *dev = &sh->dev[i]; 3840 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3841 /* We own a safe reference to the rdev */ 3842 rdev = conf->disks[i].rdev; 3843 if (!rdev_set_badblocks(rdev, sh->sector, 3844 STRIPE_SECTORS, 0)) 3845 md_error(conf->mddev, rdev); 3846 rdev_dec_pending(rdev, conf->mddev); 3847 } 3848 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3849 rdev = conf->disks[i].rdev; 3850 rdev_clear_badblocks(rdev, sh->sector, 3851 STRIPE_SECTORS, 0); 3852 rdev_dec_pending(rdev, conf->mddev); 3853 } 3854 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3855 rdev = conf->disks[i].replacement; 3856 if (!rdev) 3857 /* rdev have been moved down */ 3858 rdev = conf->disks[i].rdev; 3859 rdev_clear_badblocks(rdev, sh->sector, 3860 STRIPE_SECTORS, 0); 3861 rdev_dec_pending(rdev, conf->mddev); 3862 } 3863 } 3864 3865 if (s.ops_request) 3866 raid_run_ops(sh, s.ops_request); 3867 3868 ops_run_io(sh, &s); 3869 3870 if (s.dec_preread_active) { 3871 /* We delay this until after ops_run_io so that if make_request 3872 * is waiting on a flush, it won't continue until the writes 3873 * have actually been submitted. 
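 * Dropping below IO_THRESHOLD also wakes the raid5d thread so that delayed stripes can be activated again.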
3874 */ 3875 atomic_dec(&conf->preread_active_stripes); 3876 if (atomic_read(&conf->preread_active_stripes) < 3877 IO_THRESHOLD) 3878 md_wakeup_thread(conf->mddev->thread); 3879 } 3880 3881 return_io(s.return_bi); 3882 3883 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3884 } 3885 3886 static void raid5_activate_delayed(struct r5conf *conf) 3887 { 3888 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3889 while (!list_empty(&conf->delayed_list)) { 3890 struct list_head *l = conf->delayed_list.next; 3891 struct stripe_head *sh; 3892 sh = list_entry(l, struct stripe_head, lru); 3893 list_del_init(l); 3894 clear_bit(STRIPE_DELAYED, &sh->state); 3895 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3896 atomic_inc(&conf->preread_active_stripes); 3897 list_add_tail(&sh->lru, &conf->hold_list); 3898 raid5_wakeup_stripe_thread(sh); 3899 } 3900 } 3901 } 3902 3903 static void activate_bit_delay(struct r5conf *conf) 3904 { 3905 /* device_lock is held */ 3906 struct list_head head; 3907 list_add(&head, &conf->bitmap_list); 3908 list_del_init(&conf->bitmap_list); 3909 while (!list_empty(&head)) { 3910 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3911 list_del_init(&sh->lru); 3912 atomic_inc(&sh->count); 3913 __release_stripe(conf, sh); 3914 } 3915 } 3916 3917 int md_raid5_congested(struct mddev *mddev, int bits) 3918 { 3919 struct r5conf *conf = mddev->private; 3920 3921 /* No difference between reads and writes. Just check 3922 * how busy the stripe_cache is 3923 */ 3924 3925 if (conf->inactive_blocked) 3926 return 1; 3927 if (conf->quiesce) 3928 return 1; 3929 if (list_empty_careful(&conf->inactive_list)) 3930 return 1; 3931 3932 return 0; 3933 } 3934 EXPORT_SYMBOL_GPL(md_raid5_congested); 3935 3936 static int raid5_congested(void *data, int bits) 3937 { 3938 struct mddev *mddev = data; 3939 3940 return mddev_congested(mddev, bits) || 3941 md_raid5_congested(mddev, bits); 3942 } 3943 3944 /* We want read requests to align with chunks where possible, 3945 * but write requests don't need to. 3946 */ 3947 static int raid5_mergeable_bvec(struct request_queue *q, 3948 struct bvec_merge_data *bvm, 3949 struct bio_vec *biovec) 3950 { 3951 struct mddev *mddev = q->queuedata; 3952 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3953 int max; 3954 unsigned int chunk_sectors = mddev->chunk_sectors; 3955 unsigned int bio_sectors = bvm->bi_size >> 9; 3956 3957 if ((bvm->bi_rw & 1) == WRITE) 3958 return biovec->bv_len; /* always allow writes to be mergeable */ 3959 3960 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3961 chunk_sectors = mddev->new_chunk_sectors; 3962 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3963 if (max < 0) max = 0; 3964 if (max <= biovec->bv_len && bio_sectors == 0) 3965 return biovec->bv_len; 3966 else 3967 return max; 3968 } 3969 3970 3971 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3972 { 3973 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3974 unsigned int chunk_sectors = mddev->chunk_sectors; 3975 unsigned int bio_sectors = bio_sectors(bio); 3976 3977 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3978 chunk_sectors = mddev->new_chunk_sectors; 3979 return chunk_sectors >= 3980 ((sector & (chunk_sectors - 1)) + bio_sectors); 3981 } 3982 3983 /* 3984 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3985 * later sampled by raid5d. 
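 * raid5d later drains this list via remove_bio_from_retry() and feeds each bio back through retry_aligned_read().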
3986 */ 3987 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3988 { 3989 unsigned long flags; 3990 3991 spin_lock_irqsave(&conf->device_lock, flags); 3992 3993 bi->bi_next = conf->retry_read_aligned_list; 3994 conf->retry_read_aligned_list = bi; 3995 3996 spin_unlock_irqrestore(&conf->device_lock, flags); 3997 md_wakeup_thread(conf->mddev->thread); 3998 } 3999 4000 4001 static struct bio *remove_bio_from_retry(struct r5conf *conf) 4002 { 4003 struct bio *bi; 4004 4005 bi = conf->retry_read_aligned; 4006 if (bi) { 4007 conf->retry_read_aligned = NULL; 4008 return bi; 4009 } 4010 bi = conf->retry_read_aligned_list; 4011 if(bi) { 4012 conf->retry_read_aligned_list = bi->bi_next; 4013 bi->bi_next = NULL; 4014 /* 4015 * this sets the active stripe count to 1 and the processed 4016 * stripe count to zero (upper 16 bits) 4017 */ 4018 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 4019 } 4020 4021 return bi; 4022 } 4023 4024 4025 /* 4026 * The "raid5_align_endio" should check if the read succeeded and if it 4027 * did, call bio_endio on the original bio (having bio_put the new bio 4028 * first). 4029 * If the read failed, queue the original bio for a retry via add_bio_to_retry(). 4030 */ 4031 static void raid5_align_endio(struct bio *bi, int error) 4032 { 4033 struct bio* raid_bi = bi->bi_private; 4034 struct mddev *mddev; 4035 struct r5conf *conf; 4036 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 4037 struct md_rdev *rdev; 4038 4039 bio_put(bi); 4040 4041 rdev = (void*)raid_bi->bi_next; 4042 raid_bi->bi_next = NULL; 4043 mddev = rdev->mddev; 4044 conf = mddev->private; 4045 4046 rdev_dec_pending(rdev, conf->mddev); 4047 4048 if (!error && uptodate) { 4049 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 4050 raid_bi, 0); 4051 bio_endio(raid_bi, 0); 4052 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4053 wake_up(&conf->wait_for_stripe); 4054 return; 4055 } 4056 4057 4058 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 4059 4060 add_bio_to_retry(raid_bi, conf); 4061 } 4062 4063 static int bio_fits_rdev(struct bio *bi) 4064 { 4065 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 4066 4067 if (bio_sectors(bi) > queue_max_sectors(q)) 4068 return 0; 4069 blk_recount_segments(q, bi); 4070 if (bi->bi_phys_segments > queue_max_segments(q)) 4071 return 0; 4072 4073 if (q->merge_bvec_fn) 4074 /* it's too hard to apply the merge_bvec_fn at this stage, 4075 * just give up 4076 */ 4077 return 0; 4078 4079 return 1; 4080 } 4081 4082 4083 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 4084 { 4085 struct r5conf *conf = mddev->private; 4086 int dd_idx; 4087 struct bio* align_bi; 4088 struct md_rdev *rdev; 4089 sector_t end_sector; 4090 4091 if (!in_chunk_boundary(mddev, raid_bio)) { 4092 pr_debug("chunk_aligned_read : non aligned\n"); 4093 return 0; 4094 } 4095 /* 4096 * use bio_clone_mddev to make a copy of the bio 4097 */ 4098 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 4099 if (!align_bi) 4100 return 0; 4101 /* 4102 * set bi_end_io to a new function, and set bi_private to the 4103 * original bio.
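 * If no suitable rdev is found, or the cloned bio does not fit the device or hits a known bad block, we return 0 and the caller falls back to the normal stripe-cache path.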
4104 */ 4105 align_bi->bi_end_io = raid5_align_endio; 4106 align_bi->bi_private = raid_bio; 4107 /* 4108 * compute position 4109 */ 4110 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 4111 0, 4112 &dd_idx, NULL); 4113 4114 end_sector = bio_end_sector(align_bi); 4115 rcu_read_lock(); 4116 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 4117 if (!rdev || test_bit(Faulty, &rdev->flags) || 4118 rdev->recovery_offset < end_sector) { 4119 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 4120 if (rdev && 4121 (test_bit(Faulty, &rdev->flags) || 4122 !(test_bit(In_sync, &rdev->flags) || 4123 rdev->recovery_offset >= end_sector))) 4124 rdev = NULL; 4125 } 4126 if (rdev) { 4127 sector_t first_bad; 4128 int bad_sectors; 4129 4130 atomic_inc(&rdev->nr_pending); 4131 rcu_read_unlock(); 4132 raid_bio->bi_next = (void*)rdev; 4133 align_bi->bi_bdev = rdev->bdev; 4134 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4135 4136 if (!bio_fits_rdev(align_bi) || 4137 is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), 4138 &first_bad, &bad_sectors)) { 4139 /* too big in some way, or has a known bad block */ 4140 bio_put(align_bi); 4141 rdev_dec_pending(rdev, mddev); 4142 return 0; 4143 } 4144 4145 /* No reshape active, so we can trust rdev->data_offset */ 4146 align_bi->bi_sector += rdev->data_offset; 4147 4148 spin_lock_irq(&conf->device_lock); 4149 wait_event_lock_irq(conf->wait_for_stripe, 4150 conf->quiesce == 0, 4151 conf->device_lock); 4152 atomic_inc(&conf->active_aligned_reads); 4153 spin_unlock_irq(&conf->device_lock); 4154 4155 if (mddev->gendisk) 4156 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4157 align_bi, disk_devt(mddev->gendisk), 4158 raid_bio->bi_sector); 4159 generic_make_request(align_bi); 4160 return 1; 4161 } else { 4162 rcu_read_unlock(); 4163 bio_put(align_bi); 4164 return 0; 4165 } 4166 } 4167 4168 /* __get_priority_stripe - get the next stripe to process 4169 * 4170 * Full stripe writes are allowed to pass preread active stripes up until 4171 * the bypass_threshold is exceeded. In general the bypass_count 4172 * increments when the handle_list is handled before the hold_list; however, it 4173 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 4174 * stripe with in flight i/o. The bypass_count will be reset when the 4175 * head of the hold_list has changed, i.e. the head was promoted to the 4176 * handle_list. 4177 */ 4178 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 4179 { 4180 struct stripe_head *sh = NULL, *tmp; 4181 struct list_head *handle_list = NULL; 4182 struct r5worker_group *wg = NULL; 4183 4184 if (conf->worker_cnt_per_group == 0) { 4185 handle_list = &conf->handle_list; 4186 } else if (group != ANY_GROUP) { 4187 handle_list = &conf->worker_groups[group].handle_list; 4188 wg = &conf->worker_groups[group]; 4189 } else { 4190 int i; 4191 for (i = 0; i < conf->group_cnt; i++) { 4192 handle_list = &conf->worker_groups[i].handle_list; 4193 wg = &conf->worker_groups[i]; 4194 if (!list_empty(handle_list)) 4195 break; 4196 } 4197 } 4198 4199 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4200 __func__, 4201 list_empty(handle_list) ? "empty" : "busy", 4202 list_empty(&conf->hold_list) ? 
"empty" : "busy", 4203 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4204 4205 if (!list_empty(handle_list)) { 4206 sh = list_entry(handle_list->next, typeof(*sh), lru); 4207 4208 if (list_empty(&conf->hold_list)) 4209 conf->bypass_count = 0; 4210 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 4211 if (conf->hold_list.next == conf->last_hold) 4212 conf->bypass_count++; 4213 else { 4214 conf->last_hold = conf->hold_list.next; 4215 conf->bypass_count -= conf->bypass_threshold; 4216 if (conf->bypass_count < 0) 4217 conf->bypass_count = 0; 4218 } 4219 } 4220 } else if (!list_empty(&conf->hold_list) && 4221 ((conf->bypass_threshold && 4222 conf->bypass_count > conf->bypass_threshold) || 4223 atomic_read(&conf->pending_full_writes) == 0)) { 4224 4225 list_for_each_entry(tmp, &conf->hold_list, lru) { 4226 if (conf->worker_cnt_per_group == 0 || 4227 group == ANY_GROUP || 4228 !cpu_online(tmp->cpu) || 4229 cpu_to_group(tmp->cpu) == group) { 4230 sh = tmp; 4231 break; 4232 } 4233 } 4234 4235 if (sh) { 4236 conf->bypass_count -= conf->bypass_threshold; 4237 if (conf->bypass_count < 0) 4238 conf->bypass_count = 0; 4239 } 4240 wg = NULL; 4241 } 4242 4243 if (!sh) 4244 return NULL; 4245 4246 if (wg) { 4247 wg->stripes_cnt--; 4248 sh->group = NULL; 4249 } 4250 list_del_init(&sh->lru); 4251 atomic_inc(&sh->count); 4252 BUG_ON(atomic_read(&sh->count) != 1); 4253 return sh; 4254 } 4255 4256 struct raid5_plug_cb { 4257 struct blk_plug_cb cb; 4258 struct list_head list; 4259 }; 4260 4261 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4262 { 4263 struct raid5_plug_cb *cb = container_of( 4264 blk_cb, struct raid5_plug_cb, cb); 4265 struct stripe_head *sh; 4266 struct mddev *mddev = cb->cb.data; 4267 struct r5conf *conf = mddev->private; 4268 int cnt = 0; 4269 4270 if (cb->list.next && !list_empty(&cb->list)) { 4271 spin_lock_irq(&conf->device_lock); 4272 while (!list_empty(&cb->list)) { 4273 sh = list_first_entry(&cb->list, struct stripe_head, lru); 4274 list_del_init(&sh->lru); 4275 /* 4276 * avoid race release_stripe_plug() sees 4277 * STRIPE_ON_UNPLUG_LIST clear but the stripe 4278 * is still in our list 4279 */ 4280 smp_mb__before_clear_bit(); 4281 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4282 /* 4283 * STRIPE_ON_RELEASE_LIST could be set here. 
In that 4284 * case, the count is always > 1 here 4285 */ 4286 __release_stripe(conf, sh); 4287 cnt++; 4288 } 4289 spin_unlock_irq(&conf->device_lock); 4290 } 4291 if (mddev->queue) 4292 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4293 kfree(cb); 4294 } 4295 4296 static void release_stripe_plug(struct mddev *mddev, 4297 struct stripe_head *sh) 4298 { 4299 struct blk_plug_cb *blk_cb = blk_check_plugged( 4300 raid5_unplug, mddev, 4301 sizeof(struct raid5_plug_cb)); 4302 struct raid5_plug_cb *cb; 4303 4304 if (!blk_cb) { 4305 release_stripe(sh); 4306 return; 4307 } 4308 4309 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4310 4311 if (cb->list.next == NULL) 4312 INIT_LIST_HEAD(&cb->list); 4313 4314 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4315 list_add_tail(&sh->lru, &cb->list); 4316 else 4317 release_stripe(sh); 4318 } 4319 4320 static void make_discard_request(struct mddev *mddev, struct bio *bi) 4321 { 4322 struct r5conf *conf = mddev->private; 4323 sector_t logical_sector, last_sector; 4324 struct stripe_head *sh; 4325 int remaining; 4326 int stripe_sectors; 4327 4328 if (mddev->reshape_position != MaxSector) 4329 /* Skip discard while reshape is happening */ 4330 return; 4331 4332 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4333 last_sector = bi->bi_sector + (bi->bi_size>>9); 4334 4335 bi->bi_next = NULL; 4336 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4337 4338 stripe_sectors = conf->chunk_sectors * 4339 (conf->raid_disks - conf->max_degraded); 4340 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 4341 stripe_sectors); 4342 sector_div(last_sector, stripe_sectors); 4343 4344 logical_sector *= conf->chunk_sectors; 4345 last_sector *= conf->chunk_sectors; 4346 4347 for (; logical_sector < last_sector; 4348 logical_sector += STRIPE_SECTORS) { 4349 DEFINE_WAIT(w); 4350 int d; 4351 again: 4352 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4353 prepare_to_wait(&conf->wait_for_overlap, &w, 4354 TASK_UNINTERRUPTIBLE); 4355 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4356 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4357 release_stripe(sh); 4358 schedule(); 4359 goto again; 4360 } 4361 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4362 spin_lock_irq(&sh->stripe_lock); 4363 for (d = 0; d < conf->raid_disks; d++) { 4364 if (d == sh->pd_idx || d == sh->qd_idx) 4365 continue; 4366 if (sh->dev[d].towrite || sh->dev[d].toread) { 4367 set_bit(R5_Overlap, &sh->dev[d].flags); 4368 spin_unlock_irq(&sh->stripe_lock); 4369 release_stripe(sh); 4370 schedule(); 4371 goto again; 4372 } 4373 } 4374 set_bit(STRIPE_DISCARD, &sh->state); 4375 finish_wait(&conf->wait_for_overlap, &w); 4376 for (d = 0; d < conf->raid_disks; d++) { 4377 if (d == sh->pd_idx || d == sh->qd_idx) 4378 continue; 4379 sh->dev[d].towrite = bi; 4380 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 4381 raid5_inc_bi_active_stripes(bi); 4382 } 4383 spin_unlock_irq(&sh->stripe_lock); 4384 if (conf->mddev->bitmap) { 4385 for (d = 0; 4386 d < conf->raid_disks - conf->max_degraded; 4387 d++) 4388 bitmap_startwrite(mddev->bitmap, 4389 sh->sector, 4390 STRIPE_SECTORS, 4391 0); 4392 sh->bm_seq = conf->seq_flush + 1; 4393 set_bit(STRIPE_BIT_DELAY, &sh->state); 4394 } 4395 4396 set_bit(STRIPE_HANDLE, &sh->state); 4397 clear_bit(STRIPE_DELAYED, &sh->state); 4398 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4399 atomic_inc(&conf->preread_active_stripes); 4400 release_stripe_plug(mddev, sh); 4401 } 4402 4403 remaining = 
raid5_dec_bi_active_stripes(bi); 4404 if (remaining == 0) { 4405 md_write_end(mddev); 4406 bio_endio(bi, 0); 4407 } 4408 } 4409 4410 static void make_request(struct mddev *mddev, struct bio * bi) 4411 { 4412 struct r5conf *conf = mddev->private; 4413 int dd_idx; 4414 sector_t new_sector; 4415 sector_t logical_sector, last_sector; 4416 struct stripe_head *sh; 4417 const int rw = bio_data_dir(bi); 4418 int remaining; 4419 4420 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4421 md_flush_request(mddev, bi); 4422 return; 4423 } 4424 4425 md_write_start(mddev, bi); 4426 4427 if (rw == READ && 4428 mddev->reshape_position == MaxSector && 4429 chunk_aligned_read(mddev,bi)) 4430 return; 4431 4432 if (unlikely(bi->bi_rw & REQ_DISCARD)) { 4433 make_discard_request(mddev, bi); 4434 return; 4435 } 4436 4437 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4438 last_sector = bio_end_sector(bi); 4439 bi->bi_next = NULL; 4440 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4441 4442 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4443 DEFINE_WAIT(w); 4444 int previous; 4445 int seq; 4446 4447 retry: 4448 seq = read_seqcount_begin(&conf->gen_lock); 4449 previous = 0; 4450 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4451 if (unlikely(conf->reshape_progress != MaxSector)) { 4452 /* spinlock is needed as reshape_progress may be 4453 * 64bit on a 32bit platform, and so it might be 4454 * possible to see a half-updated value 4455 * Of course reshape_progress could change after 4456 * the lock is dropped, so once we get a reference 4457 * to the stripe that we think it is, we will have 4458 * to check again. 4459 */ 4460 spin_lock_irq(&conf->device_lock); 4461 if (mddev->reshape_backwards 4462 ? logical_sector < conf->reshape_progress 4463 : logical_sector >= conf->reshape_progress) { 4464 previous = 1; 4465 } else { 4466 if (mddev->reshape_backwards 4467 ? logical_sector < conf->reshape_safe 4468 : logical_sector >= conf->reshape_safe) { 4469 spin_unlock_irq(&conf->device_lock); 4470 schedule(); 4471 goto retry; 4472 } 4473 } 4474 spin_unlock_irq(&conf->device_lock); 4475 } 4476 4477 new_sector = raid5_compute_sector(conf, logical_sector, 4478 previous, 4479 &dd_idx, NULL); 4480 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4481 (unsigned long long)new_sector, 4482 (unsigned long long)logical_sector); 4483 4484 sh = get_active_stripe(conf, new_sector, previous, 4485 (bi->bi_rw&RWA_MASK), 0); 4486 if (sh) { 4487 if (unlikely(previous)) { 4488 /* expansion might have moved on while waiting for a 4489 * stripe, so we must do the range check again. 4490 * Expansion could still move past after this 4491 * test, but as we are holding a reference to 4492 * 'sh', we know that if that happens, 4493 * STRIPE_EXPANDING will get set and the expansion 4494 * won't proceed until we finish with the stripe. 4495 */ 4496 int must_retry = 0; 4497 spin_lock_irq(&conf->device_lock); 4498 if (mddev->reshape_backwards 4499 ? 
logical_sector >= conf->reshape_progress 4500 : logical_sector < conf->reshape_progress) 4501 /* mismatch, need to try again */ 4502 must_retry = 1; 4503 spin_unlock_irq(&conf->device_lock); 4504 if (must_retry) { 4505 release_stripe(sh); 4506 schedule(); 4507 goto retry; 4508 } 4509 } 4510 if (read_seqcount_retry(&conf->gen_lock, seq)) { 4511 /* Might have got the wrong stripe_head 4512 * by accident 4513 */ 4514 release_stripe(sh); 4515 goto retry; 4516 } 4517 4518 if (rw == WRITE && 4519 logical_sector >= mddev->suspend_lo && 4520 logical_sector < mddev->suspend_hi) { 4521 release_stripe(sh); 4522 /* As the suspend_* range is controlled by 4523 * userspace, we want an interruptible 4524 * wait. 4525 */ 4526 flush_signals(current); 4527 prepare_to_wait(&conf->wait_for_overlap, 4528 &w, TASK_INTERRUPTIBLE); 4529 if (logical_sector >= mddev->suspend_lo && 4530 logical_sector < mddev->suspend_hi) 4531 schedule(); 4532 goto retry; 4533 } 4534 4535 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4536 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4537 /* Stripe is busy expanding or 4538 * add failed due to overlap. Flush everything 4539 * and wait a while 4540 */ 4541 md_wakeup_thread(mddev->thread); 4542 release_stripe(sh); 4543 schedule(); 4544 goto retry; 4545 } 4546 finish_wait(&conf->wait_for_overlap, &w); 4547 set_bit(STRIPE_HANDLE, &sh->state); 4548 clear_bit(STRIPE_DELAYED, &sh->state); 4549 if ((bi->bi_rw & REQ_SYNC) && 4550 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4551 atomic_inc(&conf->preread_active_stripes); 4552 release_stripe_plug(mddev, sh); 4553 } else { 4554 /* cannot get stripe for read-ahead, just give-up */ 4555 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4556 finish_wait(&conf->wait_for_overlap, &w); 4557 break; 4558 } 4559 } 4560 4561 remaining = raid5_dec_bi_active_stripes(bi); 4562 if (remaining == 0) { 4563 4564 if ( rw == WRITE ) 4565 md_write_end(mddev); 4566 4567 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 4568 bi, 0); 4569 bio_endio(bi, 0); 4570 } 4571 } 4572 4573 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4574 4575 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4576 { 4577 /* reshaping is quite different to recovery/resync so it is 4578 * handled quite separately ... here. 4579 * 4580 * On each call to sync_request, we gather one chunk worth of 4581 * destination stripes and flag them as expanding. 4582 * Then we find all the source stripes and request reads. 4583 * As the reads complete, handle_stripe will copy the data 4584 * into the destination stripe and release that stripe. 
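 * The destination stripes are collected on a local 'stripes' list and only released after the corresponding source stripes have been marked STRIPE_EXPAND_SOURCE; conf->reshape_progress is advanced under device_lock.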
4585 */ 4586 struct r5conf *conf = mddev->private; 4587 struct stripe_head *sh; 4588 sector_t first_sector, last_sector; 4589 int raid_disks = conf->previous_raid_disks; 4590 int data_disks = raid_disks - conf->max_degraded; 4591 int new_data_disks = conf->raid_disks - conf->max_degraded; 4592 int i; 4593 int dd_idx; 4594 sector_t writepos, readpos, safepos; 4595 sector_t stripe_addr; 4596 int reshape_sectors; 4597 struct list_head stripes; 4598 4599 if (sector_nr == 0) { 4600 /* If restarting in the middle, skip the initial sectors */ 4601 if (mddev->reshape_backwards && 4602 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4603 sector_nr = raid5_size(mddev, 0, 0) 4604 - conf->reshape_progress; 4605 } else if (!mddev->reshape_backwards && 4606 conf->reshape_progress > 0) 4607 sector_nr = conf->reshape_progress; 4608 sector_div(sector_nr, new_data_disks); 4609 if (sector_nr) { 4610 mddev->curr_resync_completed = sector_nr; 4611 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4612 *skipped = 1; 4613 return sector_nr; 4614 } 4615 } 4616 4617 /* We need to process a full chunk at a time. 4618 * If old and new chunk sizes differ, we need to process the 4619 * largest of these 4620 */ 4621 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4622 reshape_sectors = mddev->new_chunk_sectors; 4623 else 4624 reshape_sectors = mddev->chunk_sectors; 4625 4626 /* We update the metadata at least every 10 seconds, or when 4627 * the data about to be copied would over-write the source of 4628 * the data at the front of the range. i.e. one new_stripe 4629 * along from reshape_progress new_maps to after where 4630 * reshape_safe old_maps to 4631 */ 4632 writepos = conf->reshape_progress; 4633 sector_div(writepos, new_data_disks); 4634 readpos = conf->reshape_progress; 4635 sector_div(readpos, data_disks); 4636 safepos = conf->reshape_safe; 4637 sector_div(safepos, data_disks); 4638 if (mddev->reshape_backwards) { 4639 writepos -= min_t(sector_t, reshape_sectors, writepos); 4640 readpos += reshape_sectors; 4641 safepos += reshape_sectors; 4642 } else { 4643 writepos += reshape_sectors; 4644 readpos -= min_t(sector_t, reshape_sectors, readpos); 4645 safepos -= min_t(sector_t, reshape_sectors, safepos); 4646 } 4647 4648 /* Having calculated the 'writepos' possibly use it 4649 * to set 'stripe_addr' which is where we will write to. 4650 */ 4651 if (mddev->reshape_backwards) { 4652 BUG_ON(conf->reshape_progress == 0); 4653 stripe_addr = writepos; 4654 BUG_ON((mddev->dev_sectors & 4655 ~((sector_t)reshape_sectors - 1)) 4656 - reshape_sectors - stripe_addr 4657 != sector_nr); 4658 } else { 4659 BUG_ON(writepos != sector_nr + reshape_sectors); 4660 stripe_addr = sector_nr; 4661 } 4662 4663 /* 'writepos' is the most advanced device address we might write. 4664 * 'readpos' is the least advanced device address we might read. 4665 * 'safepos' is the least address recorded in the metadata as having 4666 * been reshaped. 4667 * If there is a min_offset_diff, these are adjusted either by 4668 * increasing the safepos/readpos if diff is negative, or 4669 * increasing writepos if diff is positive. 4670 * If 'readpos' is then behind 'writepos', there is no way that we can 4671 * ensure safety in the face of a crash - that must be done by userspace 4672 * making a backup of the data. So in that case there is no particular 4673 * rush to update metadata. 
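 * (Hypothetical worked example, assuming min_offset_diff == 0: growing from 4 to 5 data disks with reshape_sectors == 1024, reshape_progress == 51200 and reshape_safe == 40960 gives writepos == 51200/5 + 1024 == 11264, readpos == 51200/4 - 1024 == 11776 and safepos == 40960/4 - 1024 == 9216; here safepos is behind writepos while readpos is beyond it, which is exactly the case described next where the metadata must be updated.)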
4674 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4675 * update the metadata to advance 'safepos' to match 'readpos' so that 4676 * we can be safe in the event of a crash. 4677 * So we insist on updating metadata if safepos is behind writepos and 4678 * readpos is beyond writepos. 4679 * In any case, update the metadata every 10 seconds. 4680 * Maybe that number should be configurable, but I'm not sure it is 4681 * worth it.... maybe it could be a multiple of safemode_delay??? 4682 */ 4683 if (conf->min_offset_diff < 0) { 4684 safepos += -conf->min_offset_diff; 4685 readpos += -conf->min_offset_diff; 4686 } else 4687 writepos += conf->min_offset_diff; 4688 4689 if ((mddev->reshape_backwards 4690 ? (safepos > writepos && readpos < writepos) 4691 : (safepos < writepos && readpos > writepos)) || 4692 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4693 /* Cannot proceed until we've updated the superblock... */ 4694 wait_event(conf->wait_for_overlap, 4695 atomic_read(&conf->reshape_stripes)==0); 4696 mddev->reshape_position = conf->reshape_progress; 4697 mddev->curr_resync_completed = sector_nr; 4698 conf->reshape_checkpoint = jiffies; 4699 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4700 md_wakeup_thread(mddev->thread); 4701 wait_event(mddev->sb_wait, mddev->flags == 0 || 4702 kthread_should_stop()); 4703 spin_lock_irq(&conf->device_lock); 4704 conf->reshape_safe = mddev->reshape_position; 4705 spin_unlock_irq(&conf->device_lock); 4706 wake_up(&conf->wait_for_overlap); 4707 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4708 } 4709 4710 INIT_LIST_HEAD(&stripes); 4711 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4712 int j; 4713 int skipped_disk = 0; 4714 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4715 set_bit(STRIPE_EXPANDING, &sh->state); 4716 atomic_inc(&conf->reshape_stripes); 4717 /* If any of this stripe is beyond the end of the old 4718 * array, then we need to zero those blocks 4719 */ 4720 for (j=sh->disks; j--;) { 4721 sector_t s; 4722 if (j == sh->pd_idx) 4723 continue; 4724 if (conf->level == 6 && 4725 j == sh->qd_idx) 4726 continue; 4727 s = compute_blocknr(sh, j, 0); 4728 if (s < raid5_size(mddev, 0, 0)) { 4729 skipped_disk = 1; 4730 continue; 4731 } 4732 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4733 set_bit(R5_Expanded, &sh->dev[j].flags); 4734 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4735 } 4736 if (!skipped_disk) { 4737 set_bit(STRIPE_EXPAND_READY, &sh->state); 4738 set_bit(STRIPE_HANDLE, &sh->state); 4739 } 4740 list_add(&sh->lru, &stripes); 4741 } 4742 spin_lock_irq(&conf->device_lock); 4743 if (mddev->reshape_backwards) 4744 conf->reshape_progress -= reshape_sectors * new_data_disks; 4745 else 4746 conf->reshape_progress += reshape_sectors * new_data_disks; 4747 spin_unlock_irq(&conf->device_lock); 4748 /* Ok, those stripe are ready. We can start scheduling 4749 * reads on the source stripes. 4750 * The source stripes are determined by mapping the first and last 4751 * block on the destination stripes. 
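 * The mapping is done with 'previous' set in raid5_compute_sector(), i.e. in the old geometry, and the resulting range is walked STRIPE_SECTORS at a time.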
4752 */ 4753 first_sector = 4754 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4755 1, &dd_idx, NULL); 4756 last_sector = 4757 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4758 * new_data_disks - 1), 4759 1, &dd_idx, NULL); 4760 if (last_sector >= mddev->dev_sectors) 4761 last_sector = mddev->dev_sectors - 1; 4762 while (first_sector <= last_sector) { 4763 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4764 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4765 set_bit(STRIPE_HANDLE, &sh->state); 4766 release_stripe(sh); 4767 first_sector += STRIPE_SECTORS; 4768 } 4769 /* Now that the sources are clearly marked, we can release 4770 * the destination stripes 4771 */ 4772 while (!list_empty(&stripes)) { 4773 sh = list_entry(stripes.next, struct stripe_head, lru); 4774 list_del_init(&sh->lru); 4775 release_stripe(sh); 4776 } 4777 /* If this takes us to the resync_max point where we have to pause, 4778 * then we need to write out the superblock. 4779 */ 4780 sector_nr += reshape_sectors; 4781 if ((sector_nr - mddev->curr_resync_completed) * 2 4782 >= mddev->resync_max - mddev->curr_resync_completed) { 4783 /* Cannot proceed until we've updated the superblock... */ 4784 wait_event(conf->wait_for_overlap, 4785 atomic_read(&conf->reshape_stripes) == 0); 4786 mddev->reshape_position = conf->reshape_progress; 4787 mddev->curr_resync_completed = sector_nr; 4788 conf->reshape_checkpoint = jiffies; 4789 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4790 md_wakeup_thread(mddev->thread); 4791 wait_event(mddev->sb_wait, 4792 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4793 || kthread_should_stop()); 4794 spin_lock_irq(&conf->device_lock); 4795 conf->reshape_safe = mddev->reshape_position; 4796 spin_unlock_irq(&conf->device_lock); 4797 wake_up(&conf->wait_for_overlap); 4798 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4799 } 4800 return reshape_sectors; 4801 } 4802 4803 /* FIXME go_faster isn't used */ 4804 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4805 { 4806 struct r5conf *conf = mddev->private; 4807 struct stripe_head *sh; 4808 sector_t max_sector = mddev->dev_sectors; 4809 sector_t sync_blocks; 4810 int still_degraded = 0; 4811 int i; 4812 4813 if (sector_nr >= max_sector) { 4814 /* just being told to finish up .. nothing much to do */ 4815 4816 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4817 end_reshape(conf); 4818 return 0; 4819 } 4820 4821 if (mddev->curr_resync < max_sector) /* aborted */ 4822 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4823 &sync_blocks, 1); 4824 else /* completed sync */ 4825 conf->fullsync = 0; 4826 bitmap_close_sync(mddev->bitmap); 4827 4828 return 0; 4829 } 4830 4831 /* Allow raid5_quiesce to complete */ 4832 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4833 4834 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4835 return reshape_request(mddev, sector_nr, skipped); 4836 4837 /* No need to check resync_max as we never do more than one 4838 * stripe, and as resync_max will always be on a chunk boundary, 4839 * if the check in md_do_sync didn't fire, there is no chance 4840 * of overstepping resync_max here 4841 */ 4842 4843 /* if there is too many failed drives and we are trying 4844 * to resync, then assert that we are finished, because there is 4845 * nothing we can do. 
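 * Returning the remaining sector count with *skipped set lets the caller account the rest of the range as done without issuing any I/O.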
4846 */ 4847 if (mddev->degraded >= conf->max_degraded && 4848 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4849 sector_t rv = mddev->dev_sectors - sector_nr; 4850 *skipped = 1; 4851 return rv; 4852 } 4853 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4854 !conf->fullsync && 4855 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4856 sync_blocks >= STRIPE_SECTORS) { 4857 /* we can skip this block, and probably more */ 4858 sync_blocks /= STRIPE_SECTORS; 4859 *skipped = 1; 4860 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4861 } 4862 4863 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4864 4865 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4866 if (sh == NULL) { 4867 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4868 /* make sure we don't swamp the stripe cache if someone else 4869 * is trying to get access 4870 */ 4871 schedule_timeout_uninterruptible(1); 4872 } 4873 /* Need to check if array will still be degraded after recovery/resync 4874 * We don't need to check the 'failed' flag as when that gets set, 4875 * recovery aborts. 4876 */ 4877 for (i = 0; i < conf->raid_disks; i++) 4878 if (conf->disks[i].rdev == NULL) 4879 still_degraded = 1; 4880 4881 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4882 4883 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4884 4885 handle_stripe(sh); 4886 release_stripe(sh); 4887 4888 return STRIPE_SECTORS; 4889 } 4890 4891 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4892 { 4893 /* We may not be able to submit a whole bio at once as there 4894 * may not be enough stripe_heads available. 4895 * We cannot pre-allocate enough stripe_heads as we may need 4896 * more than exist in the cache (if we ever allow large chunks). 4897 * So we do one stripe head at a time and record in 4898 * ->bi_phys_segments how many have been done. 4899 * 4900 * We *know* that this entire raid_bio is in one chunk, so 4901 * there will be only one 'dd_idx' and we only need one call to raid5_compute_sector.
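 * If a stripe_head cannot be obtained part way through, progress is recorded with raid5_set_bi_processed_stripes() and the bio is parked in conf->retry_read_aligned so that the next pass resumes where this one stopped.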
4902 */ 4903 struct stripe_head *sh; 4904 int dd_idx; 4905 sector_t sector, logical_sector, last_sector; 4906 int scnt = 0; 4907 int remaining; 4908 int handled = 0; 4909 4910 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4911 sector = raid5_compute_sector(conf, logical_sector, 4912 0, &dd_idx, NULL); 4913 last_sector = bio_end_sector(raid_bio); 4914 4915 for (; logical_sector < last_sector; 4916 logical_sector += STRIPE_SECTORS, 4917 sector += STRIPE_SECTORS, 4918 scnt++) { 4919 4920 if (scnt < raid5_bi_processed_stripes(raid_bio)) 4921 /* already done this stripe */ 4922 continue; 4923 4924 sh = get_active_stripe(conf, sector, 0, 1, 0); 4925 4926 if (!sh) { 4927 /* failed to get a stripe - must wait */ 4928 raid5_set_bi_processed_stripes(raid_bio, scnt); 4929 conf->retry_read_aligned = raid_bio; 4930 return handled; 4931 } 4932 4933 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4934 release_stripe(sh); 4935 raid5_set_bi_processed_stripes(raid_bio, scnt); 4936 conf->retry_read_aligned = raid_bio; 4937 return handled; 4938 } 4939 4940 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 4941 handle_stripe(sh); 4942 release_stripe(sh); 4943 handled++; 4944 } 4945 remaining = raid5_dec_bi_active_stripes(raid_bio); 4946 if (remaining == 0) { 4947 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 4948 raid_bio, 0); 4949 bio_endio(raid_bio, 0); 4950 } 4951 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4952 wake_up(&conf->wait_for_stripe); 4953 return handled; 4954 } 4955 4956 static int handle_active_stripes(struct r5conf *conf, int group, 4957 struct r5worker *worker) 4958 { 4959 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 4960 int i, batch_size = 0; 4961 4962 while (batch_size < MAX_STRIPE_BATCH && 4963 (sh = __get_priority_stripe(conf, group)) != NULL) 4964 batch[batch_size++] = sh; 4965 4966 if (batch_size == 0) 4967 return batch_size; 4968 spin_unlock_irq(&conf->device_lock); 4969 4970 for (i = 0; i < batch_size; i++) 4971 handle_stripe(batch[i]); 4972 4973 cond_resched(); 4974 4975 spin_lock_irq(&conf->device_lock); 4976 for (i = 0; i < batch_size; i++) 4977 __release_stripe(conf, batch[i]); 4978 return batch_size; 4979 } 4980 4981 static void raid5_do_work(struct work_struct *work) 4982 { 4983 struct r5worker *worker = container_of(work, struct r5worker, work); 4984 struct r5worker_group *group = worker->group; 4985 struct r5conf *conf = group->conf; 4986 int group_id = group - conf->worker_groups; 4987 int handled; 4988 struct blk_plug plug; 4989 4990 pr_debug("+++ raid5worker active\n"); 4991 4992 blk_start_plug(&plug); 4993 handled = 0; 4994 spin_lock_irq(&conf->device_lock); 4995 while (1) { 4996 int batch_size, released; 4997 4998 released = release_stripe_list(conf); 4999 5000 batch_size = handle_active_stripes(conf, group_id, worker); 5001 worker->working = false; 5002 if (!batch_size && !released) 5003 break; 5004 handled += batch_size; 5005 } 5006 pr_debug("%d stripes handled\n", handled); 5007 5008 spin_unlock_irq(&conf->device_lock); 5009 blk_finish_plug(&plug); 5010 5011 pr_debug("--- raid5worker inactive\n"); 5012 } 5013 5014 /* 5015 * This is our raid5 kernel thread. 5016 * 5017 * We scan the hash table for stripes which can be handled now. 5018 * During the scan, completed stripes are saved for us by the interrupt 5019 * handler, so that they will not have to wait for our next wakeup. 
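 * Each pass also flushes any batched bitmap updates, re-activates delayed and bitmap-delayed stripes, and retries queued aligned reads before handling the next batch of stripes.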
5020 */ 5021 static void raid5d(struct md_thread *thread) 5022 { 5023 struct mddev *mddev = thread->mddev; 5024 struct r5conf *conf = mddev->private; 5025 int handled; 5026 struct blk_plug plug; 5027 5028 pr_debug("+++ raid5d active\n"); 5029 5030 md_check_recovery(mddev); 5031 5032 blk_start_plug(&plug); 5033 handled = 0; 5034 spin_lock_irq(&conf->device_lock); 5035 while (1) { 5036 struct bio *bio; 5037 int batch_size, released; 5038 5039 released = release_stripe_list(conf); 5040 5041 if ( 5042 !list_empty(&conf->bitmap_list)) { 5043 /* Now is a good time to flush some bitmap updates */ 5044 conf->seq_flush++; 5045 spin_unlock_irq(&conf->device_lock); 5046 bitmap_unplug(mddev->bitmap); 5047 spin_lock_irq(&conf->device_lock); 5048 conf->seq_write = conf->seq_flush; 5049 activate_bit_delay(conf); 5050 } 5051 raid5_activate_delayed(conf); 5052 5053 while ((bio = remove_bio_from_retry(conf))) { 5054 int ok; 5055 spin_unlock_irq(&conf->device_lock); 5056 ok = retry_aligned_read(conf, bio); 5057 spin_lock_irq(&conf->device_lock); 5058 if (!ok) 5059 break; 5060 handled++; 5061 } 5062 5063 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5064 if (!batch_size && !released) 5065 break; 5066 handled += batch_size; 5067 5068 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 5069 spin_unlock_irq(&conf->device_lock); 5070 md_check_recovery(mddev); 5071 spin_lock_irq(&conf->device_lock); 5072 } 5073 } 5074 pr_debug("%d stripes handled\n", handled); 5075 5076 spin_unlock_irq(&conf->device_lock); 5077 5078 async_tx_issue_pending_all(); 5079 blk_finish_plug(&plug); 5080 5081 pr_debug("--- raid5d inactive\n"); 5082 } 5083 5084 static ssize_t 5085 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 5086 { 5087 struct r5conf *conf = mddev->private; 5088 if (conf) 5089 return sprintf(page, "%d\n", conf->max_nr_stripes); 5090 else 5091 return 0; 5092 } 5093 5094 int 5095 raid5_set_cache_size(struct mddev *mddev, int size) 5096 { 5097 struct r5conf *conf = mddev->private; 5098 int err; 5099 5100 if (size <= 16 || size > 32768) 5101 return -EINVAL; 5102 while (size < conf->max_nr_stripes) { 5103 if (drop_one_stripe(conf)) 5104 conf->max_nr_stripes--; 5105 else 5106 break; 5107 } 5108 err = md_allow_write(mddev); 5109 if (err) 5110 return err; 5111 while (size > conf->max_nr_stripes) { 5112 if (grow_one_stripe(conf)) 5113 conf->max_nr_stripes++; 5114 else break; 5115 } 5116 return 0; 5117 } 5118 EXPORT_SYMBOL(raid5_set_cache_size); 5119 5120 static ssize_t 5121 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 5122 { 5123 struct r5conf *conf = mddev->private; 5124 unsigned long new; 5125 int err; 5126 5127 if (len >= PAGE_SIZE) 5128 return -EINVAL; 5129 if (!conf) 5130 return -ENODEV; 5131 5132 if (kstrtoul(page, 10, &new)) 5133 return -EINVAL; 5134 err = raid5_set_cache_size(mddev, new); 5135 if (err) 5136 return err; 5137 return len; 5138 } 5139 5140 static struct md_sysfs_entry 5141 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 5142 raid5_show_stripe_cache_size, 5143 raid5_store_stripe_cache_size); 5144 5145 static ssize_t 5146 raid5_show_preread_threshold(struct mddev *mddev, char *page) 5147 { 5148 struct r5conf *conf = mddev->private; 5149 if (conf) 5150 return sprintf(page, "%d\n", conf->bypass_threshold); 5151 else 5152 return 0; 5153 } 5154 5155 static ssize_t 5156 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 5157 { 5158 struct r5conf *conf = mddev->private; 5159 unsigned long new; 5160 if (len >= 
PAGE_SIZE) 5161 return -EINVAL; 5162 if (!conf) 5163 return -ENODEV; 5164 5165 if (kstrtoul(page, 10, &new)) 5166 return -EINVAL; 5167 if (new > conf->max_nr_stripes) 5168 return -EINVAL; 5169 conf->bypass_threshold = new; 5170 return len; 5171 } 5172 5173 static struct md_sysfs_entry 5174 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 5175 S_IRUGO | S_IWUSR, 5176 raid5_show_preread_threshold, 5177 raid5_store_preread_threshold); 5178 5179 static ssize_t 5180 stripe_cache_active_show(struct mddev *mddev, char *page) 5181 { 5182 struct r5conf *conf = mddev->private; 5183 if (conf) 5184 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 5185 else 5186 return 0; 5187 } 5188 5189 static struct md_sysfs_entry 5190 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 5191 5192 static ssize_t 5193 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 5194 { 5195 struct r5conf *conf = mddev->private; 5196 if (conf) 5197 return sprintf(page, "%d\n", conf->worker_cnt_per_group); 5198 else 5199 return 0; 5200 } 5201 5202 static int alloc_thread_groups(struct r5conf *conf, int cnt); 5203 static ssize_t 5204 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5205 { 5206 struct r5conf *conf = mddev->private; 5207 unsigned long new; 5208 int err; 5209 struct r5worker_group *old_groups; 5210 int old_group_cnt; 5211 5212 if (len >= PAGE_SIZE) 5213 return -EINVAL; 5214 if (!conf) 5215 return -ENODEV; 5216 5217 if (kstrtoul(page, 10, &new)) 5218 return -EINVAL; 5219 5220 if (new == conf->worker_cnt_per_group) 5221 return len; 5222 5223 mddev_suspend(mddev); 5224 5225 old_groups = conf->worker_groups; 5226 old_group_cnt = conf->worker_cnt_per_group; 5227 5228 conf->worker_groups = NULL; 5229 err = alloc_thread_groups(conf, new); 5230 if (err) { 5231 conf->worker_groups = old_groups; 5232 conf->worker_cnt_per_group = old_group_cnt; 5233 } else { 5234 if (old_groups) 5235 kfree(old_groups[0].workers); 5236 kfree(old_groups); 5237 } 5238 5239 mddev_resume(mddev); 5240 5241 if (err) 5242 return err; 5243 return len; 5244 } 5245 5246 static struct md_sysfs_entry 5247 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 5248 raid5_show_group_thread_cnt, 5249 raid5_store_group_thread_cnt); 5250 5251 static struct attribute *raid5_attrs[] = { 5252 &raid5_stripecache_size.attr, 5253 &raid5_stripecache_active.attr, 5254 &raid5_preread_bypass_threshold.attr, 5255 &raid5_group_thread_cnt.attr, 5256 NULL, 5257 }; 5258 static struct attribute_group raid5_attrs_group = { 5259 .name = NULL, 5260 .attrs = raid5_attrs, 5261 }; 5262 5263 static int alloc_thread_groups(struct r5conf *conf, int cnt) 5264 { 5265 int i, j; 5266 ssize_t size; 5267 struct r5worker *workers; 5268 5269 conf->worker_cnt_per_group = cnt; 5270 if (cnt == 0) { 5271 conf->worker_groups = NULL; 5272 return 0; 5273 } 5274 conf->group_cnt = num_possible_nodes(); 5275 size = sizeof(struct r5worker) * cnt; 5276 workers = kzalloc(size * conf->group_cnt, GFP_NOIO); 5277 conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * 5278 conf->group_cnt, GFP_NOIO); 5279 if (!conf->worker_groups || !workers) { 5280 kfree(workers); 5281 kfree(conf->worker_groups); 5282 conf->worker_groups = NULL; 5283 return -ENOMEM; 5284 } 5285 5286 for (i = 0; i < conf->group_cnt; i++) { 5287 struct r5worker_group *group; 5288 5289 group = &conf->worker_groups[i]; 5290 INIT_LIST_HEAD(&group->handle_list); 5291 group->conf = conf; 5292 group->workers = workers + i * cnt; 5293 5294 for (j = 0; 
j < cnt; j++) { 5295 group->workers[j].group = group; 5296 INIT_WORK(&group->workers[j].work, raid5_do_work); 5297 } 5298 } 5299 5300 return 0; 5301 } 5302 5303 static void free_thread_groups(struct r5conf *conf) 5304 { 5305 if (conf->worker_groups) 5306 kfree(conf->worker_groups[0].workers); 5307 kfree(conf->worker_groups); 5308 conf->worker_groups = NULL; 5309 } 5310 5311 static sector_t 5312 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 5313 { 5314 struct r5conf *conf = mddev->private; 5315 5316 if (!sectors) 5317 sectors = mddev->dev_sectors; 5318 if (!raid_disks) 5319 /* size is defined by the smallest of previous and new size */ 5320 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 5321 5322 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5323 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 5324 return sectors * (raid_disks - conf->max_degraded); 5325 } 5326 5327 static void raid5_free_percpu(struct r5conf *conf) 5328 { 5329 struct raid5_percpu *percpu; 5330 unsigned long cpu; 5331 5332 if (!conf->percpu) 5333 return; 5334 5335 get_online_cpus(); 5336 for_each_possible_cpu(cpu) { 5337 percpu = per_cpu_ptr(conf->percpu, cpu); 5338 safe_put_page(percpu->spare_page); 5339 kfree(percpu->scribble); 5340 } 5341 #ifdef CONFIG_HOTPLUG_CPU 5342 unregister_cpu_notifier(&conf->cpu_notify); 5343 #endif 5344 put_online_cpus(); 5345 5346 free_percpu(conf->percpu); 5347 } 5348 5349 static void free_conf(struct r5conf *conf) 5350 { 5351 free_thread_groups(conf); 5352 shrink_stripes(conf); 5353 raid5_free_percpu(conf); 5354 kfree(conf->disks); 5355 kfree(conf->stripe_hashtbl); 5356 kfree(conf); 5357 } 5358 5359 #ifdef CONFIG_HOTPLUG_CPU 5360 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 5361 void *hcpu) 5362 { 5363 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 5364 long cpu = (long)hcpu; 5365 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 5366 5367 switch (action) { 5368 case CPU_UP_PREPARE: 5369 case CPU_UP_PREPARE_FROZEN: 5370 if (conf->level == 6 && !percpu->spare_page) 5371 percpu->spare_page = alloc_page(GFP_KERNEL); 5372 if (!percpu->scribble) 5373 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5374 5375 if (!percpu->scribble || 5376 (conf->level == 6 && !percpu->spare_page)) { 5377 safe_put_page(percpu->spare_page); 5378 kfree(percpu->scribble); 5379 pr_err("%s: failed memory allocation for cpu%ld\n", 5380 __func__, cpu); 5381 return notifier_from_errno(-ENOMEM); 5382 } 5383 break; 5384 case CPU_DEAD: 5385 case CPU_DEAD_FROZEN: 5386 safe_put_page(percpu->spare_page); 5387 kfree(percpu->scribble); 5388 percpu->spare_page = NULL; 5389 percpu->scribble = NULL; 5390 break; 5391 default: 5392 break; 5393 } 5394 return NOTIFY_OK; 5395 } 5396 #endif 5397 5398 static int raid5_alloc_percpu(struct r5conf *conf) 5399 { 5400 unsigned long cpu; 5401 struct page *spare_page; 5402 struct raid5_percpu __percpu *allcpus; 5403 void *scribble; 5404 int err; 5405 5406 allcpus = alloc_percpu(struct raid5_percpu); 5407 if (!allcpus) 5408 return -ENOMEM; 5409 conf->percpu = allcpus; 5410 5411 get_online_cpus(); 5412 err = 0; 5413 for_each_present_cpu(cpu) { 5414 if (conf->level == 6) { 5415 spare_page = alloc_page(GFP_KERNEL); 5416 if (!spare_page) { 5417 err = -ENOMEM; 5418 break; 5419 } 5420 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 5421 } 5422 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5423 if (!scribble) { 5424 err = -ENOMEM; 5425 break; 5426 } 5427 
per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 5428 } 5429 #ifdef CONFIG_HOTPLUG_CPU 5430 conf->cpu_notify.notifier_call = raid456_cpu_notify; 5431 conf->cpu_notify.priority = 0; 5432 if (err == 0) 5433 err = register_cpu_notifier(&conf->cpu_notify); 5434 #endif 5435 put_online_cpus(); 5436 5437 return err; 5438 } 5439 5440 static struct r5conf *setup_conf(struct mddev *mddev) 5441 { 5442 struct r5conf *conf; 5443 int raid_disk, memory, max_disks; 5444 struct md_rdev *rdev; 5445 struct disk_info *disk; 5446 char pers_name[6]; 5447 5448 if (mddev->new_level != 5 5449 && mddev->new_level != 4 5450 && mddev->new_level != 6) { 5451 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 5452 mdname(mddev), mddev->new_level); 5453 return ERR_PTR(-EIO); 5454 } 5455 if ((mddev->new_level == 5 5456 && !algorithm_valid_raid5(mddev->new_layout)) || 5457 (mddev->new_level == 6 5458 && !algorithm_valid_raid6(mddev->new_layout))) { 5459 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 5460 mdname(mddev), mddev->new_layout); 5461 return ERR_PTR(-EIO); 5462 } 5463 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 5464 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 5465 mdname(mddev), mddev->raid_disks); 5466 return ERR_PTR(-EINVAL); 5467 } 5468 5469 if (!mddev->new_chunk_sectors || 5470 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 5471 !is_power_of_2(mddev->new_chunk_sectors)) { 5472 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 5473 mdname(mddev), mddev->new_chunk_sectors << 9); 5474 return ERR_PTR(-EINVAL); 5475 } 5476 5477 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 5478 if (conf == NULL) 5479 goto abort; 5480 /* Don't enable multi-threading by default*/ 5481 if (alloc_thread_groups(conf, 0)) 5482 goto abort; 5483 spin_lock_init(&conf->device_lock); 5484 seqcount_init(&conf->gen_lock); 5485 init_waitqueue_head(&conf->wait_for_stripe); 5486 init_waitqueue_head(&conf->wait_for_overlap); 5487 INIT_LIST_HEAD(&conf->handle_list); 5488 INIT_LIST_HEAD(&conf->hold_list); 5489 INIT_LIST_HEAD(&conf->delayed_list); 5490 INIT_LIST_HEAD(&conf->bitmap_list); 5491 INIT_LIST_HEAD(&conf->inactive_list); 5492 init_llist_head(&conf->released_stripes); 5493 atomic_set(&conf->active_stripes, 0); 5494 atomic_set(&conf->preread_active_stripes, 0); 5495 atomic_set(&conf->active_aligned_reads, 0); 5496 conf->bypass_threshold = BYPASS_THRESHOLD; 5497 conf->recovery_disabled = mddev->recovery_disabled - 1; 5498 5499 conf->raid_disks = mddev->raid_disks; 5500 if (mddev->reshape_position == MaxSector) 5501 conf->previous_raid_disks = mddev->raid_disks; 5502 else 5503 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 5504 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 5505 conf->scribble_len = scribble_len(max_disks); 5506 5507 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 5508 GFP_KERNEL); 5509 if (!conf->disks) 5510 goto abort; 5511 5512 conf->mddev = mddev; 5513 5514 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5515 goto abort; 5516 5517 conf->level = mddev->new_level; 5518 if (raid5_alloc_percpu(conf) != 0) 5519 goto abort; 5520 5521 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 5522 5523 rdev_for_each(rdev, mddev) { 5524 raid_disk = rdev->raid_disk; 5525 if (raid_disk >= max_disks 5526 || raid_disk < 0) 5527 continue; 5528 disk = conf->disks + raid_disk; 5529 5530 if (test_bit(Replacement, &rdev->flags)) { 5531 if (disk->replacement) 5532 goto abort; 5533 
disk->replacement = rdev; 5534 } else { 5535 if (disk->rdev) 5536 goto abort; 5537 disk->rdev = rdev; 5538 } 5539 5540 if (test_bit(In_sync, &rdev->flags)) { 5541 char b[BDEVNAME_SIZE]; 5542 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 5543 " disk %d\n", 5544 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 5545 } else if (rdev->saved_raid_disk != raid_disk) 5546 /* Cannot rely on bitmap to complete recovery */ 5547 conf->fullsync = 1; 5548 } 5549 5550 conf->chunk_sectors = mddev->new_chunk_sectors; 5551 conf->level = mddev->new_level; 5552 if (conf->level == 6) 5553 conf->max_degraded = 2; 5554 else 5555 conf->max_degraded = 1; 5556 conf->algorithm = mddev->new_layout; 5557 conf->max_nr_stripes = NR_STRIPES; 5558 conf->reshape_progress = mddev->reshape_position; 5559 if (conf->reshape_progress != MaxSector) { 5560 conf->prev_chunk_sectors = mddev->chunk_sectors; 5561 conf->prev_algo = mddev->layout; 5562 } 5563 5564 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5565 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5566 if (grow_stripes(conf, conf->max_nr_stripes)) { 5567 printk(KERN_ERR 5568 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5569 mdname(mddev), memory); 5570 goto abort; 5571 } else 5572 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 5573 mdname(mddev), memory); 5574 5575 sprintf(pers_name, "raid%d", mddev->new_level); 5576 conf->thread = md_register_thread(raid5d, mddev, pers_name); 5577 if (!conf->thread) { 5578 printk(KERN_ERR 5579 "md/raid:%s: couldn't allocate thread.\n", 5580 mdname(mddev)); 5581 goto abort; 5582 } 5583 5584 return conf; 5585 5586 abort: 5587 if (conf) { 5588 free_conf(conf); 5589 return ERR_PTR(-EIO); 5590 } else 5591 return ERR_PTR(-ENOMEM); 5592 } 5593 5594 5595 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 5596 { 5597 switch (algo) { 5598 case ALGORITHM_PARITY_0: 5599 if (raid_disk < max_degraded) 5600 return 1; 5601 break; 5602 case ALGORITHM_PARITY_N: 5603 if (raid_disk >= raid_disks - max_degraded) 5604 return 1; 5605 break; 5606 case ALGORITHM_PARITY_0_6: 5607 if (raid_disk == 0 || 5608 raid_disk == raid_disks - 1) 5609 return 1; 5610 break; 5611 case ALGORITHM_LEFT_ASYMMETRIC_6: 5612 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5613 case ALGORITHM_LEFT_SYMMETRIC_6: 5614 case ALGORITHM_RIGHT_SYMMETRIC_6: 5615 if (raid_disk == raid_disks - 1) 5616 return 1; 5617 } 5618 return 0; 5619 } 5620 5621 static int run(struct mddev *mddev) 5622 { 5623 struct r5conf *conf; 5624 int working_disks = 0; 5625 int dirty_parity_disks = 0; 5626 struct md_rdev *rdev; 5627 sector_t reshape_offset = 0; 5628 int i; 5629 long long min_offset_diff = 0; 5630 int first = 1; 5631 5632 if (mddev->recovery_cp != MaxSector) 5633 printk(KERN_NOTICE "md/raid:%s: not clean" 5634 " -- starting background reconstruction\n", 5635 mdname(mddev)); 5636 5637 rdev_for_each(rdev, mddev) { 5638 long long diff; 5639 if (rdev->raid_disk < 0) 5640 continue; 5641 diff = (rdev->new_data_offset - rdev->data_offset); 5642 if (first) { 5643 min_offset_diff = diff; 5644 first = 0; 5645 } else if (mddev->reshape_backwards && 5646 diff < min_offset_diff) 5647 min_offset_diff = diff; 5648 else if (!mddev->reshape_backwards && 5649 diff > min_offset_diff) 5650 min_offset_diff = diff; 5651 } 5652 5653 if (mddev->reshape_position != MaxSector) { 5654 /* Check that we can continue the reshape. 5655 * Difficulties arise if the stripe we would write to 5656 * next is at or after the stripe we would read from next. 
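 * (here_new and here_old, computed below, are exactly those two stripe numbers.)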
5657 * For a reshape that changes the number of devices, this 5658 * is only possible for a very short time, and mdadm makes 5659 * sure that time appears to have passed before assembling 5660 * the array. So we fail if that time hasn't passed. 5661 * For a reshape that keeps the number of devices the same 5662 * mdadm must be monitoring the reshape and keeping the 5663 * critical areas read-only and backed up. It will start 5664 * the array in read-only mode, so we check for that. 5665 */ 5666 sector_t here_new, here_old; 5667 int old_disks; 5668 int max_degraded = (mddev->level == 6 ? 2 : 1); 5669 5670 if (mddev->new_level != mddev->level) { 5671 printk(KERN_ERR "md/raid:%s: unsupported reshape " 5672 "required - aborting.\n", 5673 mdname(mddev)); 5674 return -EINVAL; 5675 } 5676 old_disks = mddev->raid_disks - mddev->delta_disks; 5677 /* reshape_position must be on a new-stripe boundary, and one 5678 * further up in new geometry must map after here in old 5679 * geometry. 5680 */ 5681 here_new = mddev->reshape_position; 5682 if (sector_div(here_new, mddev->new_chunk_sectors * 5683 (mddev->raid_disks - max_degraded))) { 5684 printk(KERN_ERR "md/raid:%s: reshape_position not " 5685 "on a stripe boundary\n", mdname(mddev)); 5686 return -EINVAL; 5687 } 5688 reshape_offset = here_new * mddev->new_chunk_sectors; 5689 /* here_new is the stripe we will write to */ 5690 here_old = mddev->reshape_position; 5691 sector_div(here_old, mddev->chunk_sectors * 5692 (old_disks-max_degraded)); 5693 /* here_old is the first stripe that we might need to read 5694 * from */ 5695 if (mddev->delta_disks == 0) { 5696 if ((here_new * mddev->new_chunk_sectors != 5697 here_old * mddev->chunk_sectors)) { 5698 printk(KERN_ERR "md/raid:%s: reshape position is" 5699 " confused - aborting\n", mdname(mddev)); 5700 return -EINVAL; 5701 } 5702 /* We cannot be sure it is safe to start an in-place 5703 * reshape. It is only safe if user-space is monitoring 5704 * and taking constant backups. 5705 * mdadm always starts a situation like this in 5706 * readonly mode so it can take control before 5707 * allowing any writes. So just check for that. 5708 */ 5709 if (abs(min_offset_diff) >= mddev->chunk_sectors && 5710 abs(min_offset_diff) >= mddev->new_chunk_sectors) 5711 /* not really in-place - so OK */; 5712 else if (mddev->ro == 0) { 5713 printk(KERN_ERR "md/raid:%s: in-place reshape " 5714 "must be started in read-only mode " 5715 "- aborting\n", 5716 mdname(mddev)); 5717 return -EINVAL; 5718 } 5719 } else if (mddev->reshape_backwards 5720 ?
(here_new * mddev->new_chunk_sectors + min_offset_diff <= 5721 here_old * mddev->chunk_sectors) 5722 : (here_new * mddev->new_chunk_sectors >= 5723 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5724 /* Reading from the same stripe as writing to - bad */ 5725 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5726 "auto-recovery - aborting.\n", 5727 mdname(mddev)); 5728 return -EINVAL; 5729 } 5730 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5731 mdname(mddev)); 5732 /* OK, we should be able to continue; */ 5733 } else { 5734 BUG_ON(mddev->level != mddev->new_level); 5735 BUG_ON(mddev->layout != mddev->new_layout); 5736 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5737 BUG_ON(mddev->delta_disks != 0); 5738 } 5739 5740 if (mddev->private == NULL) 5741 conf = setup_conf(mddev); 5742 else 5743 conf = mddev->private; 5744 5745 if (IS_ERR(conf)) 5746 return PTR_ERR(conf); 5747 5748 conf->min_offset_diff = min_offset_diff; 5749 mddev->thread = conf->thread; 5750 conf->thread = NULL; 5751 mddev->private = conf; 5752 5753 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5754 i++) { 5755 rdev = conf->disks[i].rdev; 5756 if (!rdev && conf->disks[i].replacement) { 5757 /* The replacement is all we have yet */ 5758 rdev = conf->disks[i].replacement; 5759 conf->disks[i].replacement = NULL; 5760 clear_bit(Replacement, &rdev->flags); 5761 conf->disks[i].rdev = rdev; 5762 } 5763 if (!rdev) 5764 continue; 5765 if (conf->disks[i].replacement && 5766 conf->reshape_progress != MaxSector) { 5767 /* replacements and reshape simply do not mix. */ 5768 printk(KERN_ERR "md: cannot handle concurrent " 5769 "replacement and reshape.\n"); 5770 goto abort; 5771 } 5772 if (test_bit(In_sync, &rdev->flags)) { 5773 working_disks++; 5774 continue; 5775 } 5776 /* This disc is not fully in-sync. However if it 5777 * just stored parity (beyond the recovery_offset), 5778 * then we don't need to be concerned about the 5779 * array being dirty. 5780 * When reshape goes 'backwards', we never have 5781 * partially completed devices, so we only need 5782 * to worry about reshape going forwards. 5783 */ 5784 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5785 if (mddev->major_version == 0 && 5786 mddev->minor_version > 90) 5787 rdev->recovery_offset = reshape_offset; 5788 5789 if (rdev->recovery_offset < reshape_offset) { 5790 /* We need to check old and new layout */ 5791 if (!only_parity(rdev->raid_disk, 5792 conf->algorithm, 5793 conf->raid_disks, 5794 conf->max_degraded)) 5795 continue; 5796 } 5797 if (!only_parity(rdev->raid_disk, 5798 conf->prev_algo, 5799 conf->previous_raid_disks, 5800 conf->max_degraded)) 5801 continue; 5802 dirty_parity_disks++; 5803 } 5804 5805 /* 5806 * 0 for a fully functional array, 1 or 2 for a degraded array.
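 * (For example, a 6-device raid6 running with five In_sync devices has
 * degraded == 1; with only four it has 2, the most that has_failed()
 * will tolerate for raid6.)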
5807 */ 5808 mddev->degraded = calc_degraded(conf); 5809 5810 if (has_failed(conf)) { 5811 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5812 " (%d/%d failed)\n", 5813 mdname(mddev), mddev->degraded, conf->raid_disks); 5814 goto abort; 5815 } 5816 5817 /* device size must be a multiple of chunk size */ 5818 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5819 mddev->resync_max_sectors = mddev->dev_sectors; 5820 5821 if (mddev->degraded > dirty_parity_disks && 5822 mddev->recovery_cp != MaxSector) { 5823 if (mddev->ok_start_degraded) 5824 printk(KERN_WARNING 5825 "md/raid:%s: starting dirty degraded array" 5826 " - data corruption possible.\n", 5827 mdname(mddev)); 5828 else { 5829 printk(KERN_ERR 5830 "md/raid:%s: cannot start dirty degraded array.\n", 5831 mdname(mddev)); 5832 goto abort; 5833 } 5834 } 5835 5836 if (mddev->degraded == 0) 5837 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5838 " devices, algorithm %d\n", mdname(mddev), conf->level, 5839 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5840 mddev->new_layout); 5841 else 5842 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5843 " out of %d devices, algorithm %d\n", 5844 mdname(mddev), conf->level, 5845 mddev->raid_disks - mddev->degraded, 5846 mddev->raid_disks, mddev->new_layout); 5847 5848 print_raid5_conf(conf); 5849 5850 if (conf->reshape_progress != MaxSector) { 5851 conf->reshape_safe = conf->reshape_progress; 5852 atomic_set(&conf->reshape_stripes, 0); 5853 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5854 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5855 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5856 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5857 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5858 "reshape"); 5859 } 5860 5861 5862 /* Ok, everything is just fine now */ 5863 if (mddev->to_remove == &raid5_attrs_group) 5864 mddev->to_remove = NULL; 5865 else if (mddev->kobj.sd && 5866 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5867 printk(KERN_WARNING 5868 "raid5: failed to create sysfs attributes for %s\n", 5869 mdname(mddev)); 5870 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5871 5872 if (mddev->queue) { 5873 int chunk_size; 5874 bool discard_supported = true; 5875 /* read-ahead size must cover two whole stripes, which 5876 * is 2 * (datadisks) * chunksize where 'n' is the 5877 * number of raid devices 5878 */ 5879 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5880 int stripe = data_disks * 5881 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5882 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5883 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5884 5885 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5886 5887 mddev->queue->backing_dev_info.congested_data = mddev; 5888 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5889 5890 chunk_size = mddev->chunk_sectors << 9; 5891 blk_queue_io_min(mddev->queue, chunk_size); 5892 blk_queue_io_opt(mddev->queue, chunk_size * 5893 (conf->raid_disks - conf->max_degraded)); 5894 /* 5895 * We can only discard a whole stripe. 
It doesn't make sense to 5896 * discard a data disk but write the parity disk 5897 */ 5898 stripe = stripe * PAGE_SIZE; 5899 /* Round up to power of 2, as discard handling 5900 * currently assumes that */ 5901 while ((stripe-1) & stripe) 5902 stripe = (stripe | (stripe-1)) + 1; 5903 mddev->queue->limits.discard_alignment = stripe; 5904 mddev->queue->limits.discard_granularity = stripe; 5905 /* 5906 * unaligned part of discard request will be ignored, so can't 5907 * guarantee discard_zeroes_data 5908 */ 5909 mddev->queue->limits.discard_zeroes_data = 0; 5910 5911 blk_queue_max_write_same_sectors(mddev->queue, 0); 5912 5913 rdev_for_each(rdev, mddev) { 5914 disk_stack_limits(mddev->gendisk, rdev->bdev, 5915 rdev->data_offset << 9); 5916 disk_stack_limits(mddev->gendisk, rdev->bdev, 5917 rdev->new_data_offset << 9); 5918 /* 5919 * discard_zeroes_data is required, otherwise data 5920 * could be lost. Consider a scenario: discard a stripe 5921 * (the stripe could be inconsistent if 5922 * discard_zeroes_data is 0); write one disk of the 5923 * stripe (the stripe could be inconsistent again 5924 * depending on which disks are used to calculate 5925 * parity); the disk is broken; the stripe data of this 5926 * disk is lost. 5927 */ 5928 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 5929 !bdev_get_queue(rdev->bdev)-> 5930 limits.discard_zeroes_data) 5931 discard_supported = false; 5932 } 5933 5934 if (discard_supported && 5935 mddev->queue->limits.max_discard_sectors >= stripe && 5936 mddev->queue->limits.discard_granularity >= stripe) 5937 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 5938 mddev->queue); 5939 else 5940 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 5941 mddev->queue); 5942 } 5943 5944 return 0; 5945 abort: 5946 md_unregister_thread(&mddev->thread); 5947 print_raid5_conf(conf); 5948 free_conf(conf); 5949 mddev->private = NULL; 5950 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5951 return -EIO; 5952 } 5953 5954 static int stop(struct mddev *mddev) 5955 { 5956 struct r5conf *conf = mddev->private; 5957 5958 md_unregister_thread(&mddev->thread); 5959 if (mddev->queue) 5960 mddev->queue->backing_dev_info.congested_fn = NULL; 5961 free_conf(conf); 5962 mddev->private = NULL; 5963 mddev->to_remove = &raid5_attrs_group; 5964 return 0; 5965 } 5966 5967 static void status(struct seq_file *seq, struct mddev *mddev) 5968 { 5969 struct r5conf *conf = mddev->private; 5970 int i; 5971 5972 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5973 mddev->chunk_sectors / 2, mddev->layout); 5974 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5975 for (i = 0; i < conf->raid_disks; i++) 5976 seq_printf (seq, "%s", 5977 conf->disks[i].rdev && 5978 test_bit(In_sync, &conf->disks[i].rdev->flags) ?
"U" : "_"); 5979 seq_printf (seq, "]"); 5980 } 5981 5982 static void print_raid5_conf (struct r5conf *conf) 5983 { 5984 int i; 5985 struct disk_info *tmp; 5986 5987 printk(KERN_DEBUG "RAID conf printout:\n"); 5988 if (!conf) { 5989 printk("(conf==NULL)\n"); 5990 return; 5991 } 5992 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5993 conf->raid_disks, 5994 conf->raid_disks - conf->mddev->degraded); 5995 5996 for (i = 0; i < conf->raid_disks; i++) { 5997 char b[BDEVNAME_SIZE]; 5998 tmp = conf->disks + i; 5999 if (tmp->rdev) 6000 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 6001 i, !test_bit(Faulty, &tmp->rdev->flags), 6002 bdevname(tmp->rdev->bdev, b)); 6003 } 6004 } 6005 6006 static int raid5_spare_active(struct mddev *mddev) 6007 { 6008 int i; 6009 struct r5conf *conf = mddev->private; 6010 struct disk_info *tmp; 6011 int count = 0; 6012 unsigned long flags; 6013 6014 for (i = 0; i < conf->raid_disks; i++) { 6015 tmp = conf->disks + i; 6016 if (tmp->replacement 6017 && tmp->replacement->recovery_offset == MaxSector 6018 && !test_bit(Faulty, &tmp->replacement->flags) 6019 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 6020 /* Replacement has just become active. */ 6021 if (!tmp->rdev 6022 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 6023 count++; 6024 if (tmp->rdev) { 6025 /* Replaced device not technically faulty, 6026 * but we need to be sure it gets removed 6027 * and never re-added. 6028 */ 6029 set_bit(Faulty, &tmp->rdev->flags); 6030 sysfs_notify_dirent_safe( 6031 tmp->rdev->sysfs_state); 6032 } 6033 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 6034 } else if (tmp->rdev 6035 && tmp->rdev->recovery_offset == MaxSector 6036 && !test_bit(Faulty, &tmp->rdev->flags) 6037 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 6038 count++; 6039 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 6040 } 6041 } 6042 spin_lock_irqsave(&conf->device_lock, flags); 6043 mddev->degraded = calc_degraded(conf); 6044 spin_unlock_irqrestore(&conf->device_lock, flags); 6045 print_raid5_conf(conf); 6046 return count; 6047 } 6048 6049 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 6050 { 6051 struct r5conf *conf = mddev->private; 6052 int err = 0; 6053 int number = rdev->raid_disk; 6054 struct md_rdev **rdevp; 6055 struct disk_info *p = conf->disks + number; 6056 6057 print_raid5_conf(conf); 6058 if (rdev == p->rdev) 6059 rdevp = &p->rdev; 6060 else if (rdev == p->replacement) 6061 rdevp = &p->replacement; 6062 else 6063 return 0; 6064 6065 if (number >= conf->raid_disks && 6066 conf->reshape_progress == MaxSector) 6067 clear_bit(In_sync, &rdev->flags); 6068 6069 if (test_bit(In_sync, &rdev->flags) || 6070 atomic_read(&rdev->nr_pending)) { 6071 err = -EBUSY; 6072 goto abort; 6073 } 6074 /* Only remove non-faulty devices if recovery 6075 * isn't possible. 
6076 */ 6077 if (!test_bit(Faulty, &rdev->flags) && 6078 mddev->recovery_disabled != conf->recovery_disabled && 6079 !has_failed(conf) && 6080 (!p->replacement || p->replacement == rdev) && 6081 number < conf->raid_disks) { 6082 err = -EBUSY; 6083 goto abort; 6084 } 6085 *rdevp = NULL; 6086 synchronize_rcu(); 6087 if (atomic_read(&rdev->nr_pending)) { 6088 /* lost the race, try later */ 6089 err = -EBUSY; 6090 *rdevp = rdev; 6091 } else if (p->replacement) { 6092 /* We must have just cleared 'rdev' */ 6093 p->rdev = p->replacement; 6094 clear_bit(Replacement, &p->replacement->flags); 6095 smp_mb(); /* Make sure other CPUs may see both as identical 6096 * but will never see neither - if they are careful 6097 */ 6098 p->replacement = NULL; 6099 clear_bit(WantReplacement, &rdev->flags); 6100 } else 6101 /* We might have just removed the Replacement as faulty- 6102 * clear the bit just in case 6103 */ 6104 clear_bit(WantReplacement, &rdev->flags); 6105 abort: 6106 6107 print_raid5_conf(conf); 6108 return err; 6109 } 6110 6111 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 6112 { 6113 struct r5conf *conf = mddev->private; 6114 int err = -EEXIST; 6115 int disk; 6116 struct disk_info *p; 6117 int first = 0; 6118 int last = conf->raid_disks - 1; 6119 6120 if (mddev->recovery_disabled == conf->recovery_disabled) 6121 return -EBUSY; 6122 6123 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 6124 /* no point adding a device */ 6125 return -EINVAL; 6126 6127 if (rdev->raid_disk >= 0) 6128 first = last = rdev->raid_disk; 6129 6130 /* 6131 * find the disk ... but prefer rdev->saved_raid_disk 6132 * if possible. 6133 */ 6134 if (rdev->saved_raid_disk >= 0 && 6135 rdev->saved_raid_disk >= first && 6136 conf->disks[rdev->saved_raid_disk].rdev == NULL) 6137 first = rdev->saved_raid_disk; 6138 6139 for (disk = first; disk <= last; disk++) { 6140 p = conf->disks + disk; 6141 if (p->rdev == NULL) { 6142 clear_bit(In_sync, &rdev->flags); 6143 rdev->raid_disk = disk; 6144 err = 0; 6145 if (rdev->saved_raid_disk != disk) 6146 conf->fullsync = 1; 6147 rcu_assign_pointer(p->rdev, rdev); 6148 goto out; 6149 } 6150 } 6151 for (disk = first; disk <= last; disk++) { 6152 p = conf->disks + disk; 6153 if (test_bit(WantReplacement, &p->rdev->flags) && 6154 p->replacement == NULL) { 6155 clear_bit(In_sync, &rdev->flags); 6156 set_bit(Replacement, &rdev->flags); 6157 rdev->raid_disk = disk; 6158 err = 0; 6159 conf->fullsync = 1; 6160 rcu_assign_pointer(p->replacement, rdev); 6161 break; 6162 } 6163 } 6164 out: 6165 print_raid5_conf(conf); 6166 return err; 6167 } 6168 6169 static int raid5_resize(struct mddev *mddev, sector_t sectors) 6170 { 6171 /* no resync is happening, and there is enough space 6172 * on all devices, so we can resize. 6173 * We need to make sure resync covers any new space. 6174 * If the array is shrinking we should possibly wait until 6175 * any io in the removed space completes, but it hardly seems 6176 * worth it. 
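 * (Worked example with hypothetical numbers: on an array with 64KiB
 * chunks, chunk_sectors is 128, so a requested component size of
 * 1000100 sectors is first rounded down to 1000064; raid5_size() then
 * scales the per-device size by the number of data disks.)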
6177 */ 6178 sector_t newsize; 6179 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 6180 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 6181 if (mddev->external_size && 6182 mddev->array_sectors > newsize) 6183 return -EINVAL; 6184 if (mddev->bitmap) { 6185 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 6186 if (ret) 6187 return ret; 6188 } 6189 md_set_array_sectors(mddev, newsize); 6190 set_capacity(mddev->gendisk, mddev->array_sectors); 6191 revalidate_disk(mddev->gendisk); 6192 if (sectors > mddev->dev_sectors && 6193 mddev->recovery_cp > mddev->dev_sectors) { 6194 mddev->recovery_cp = mddev->dev_sectors; 6195 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6196 } 6197 mddev->dev_sectors = sectors; 6198 mddev->resync_max_sectors = sectors; 6199 return 0; 6200 } 6201 6202 static int check_stripe_cache(struct mddev *mddev) 6203 { 6204 /* Can only proceed if there are plenty of stripe_heads. 6205 * We need a minimum of one full stripe, and for sensible progress 6206 * it is best to have about 4 times that. 6207 * If we require 4 times, then the default 256 4K stripe_heads will 6208 * allow for chunk sizes up to 256K, which is probably OK. 6209 * If the chunk size is greater, user-space should request more 6210 * stripe_heads first. 6211 */ 6212 struct r5conf *conf = mddev->private; 6213 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 6214 > conf->max_nr_stripes || 6215 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 6216 > conf->max_nr_stripes) { 6217 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 6218 mdname(mddev), 6219 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 6220 / STRIPE_SIZE)*4); 6221 return 0; 6222 } 6223 return 1; 6224 } 6225 6226 static int check_reshape(struct mddev *mddev) 6227 { 6228 struct r5conf *conf = mddev->private; 6229 6230 if (mddev->delta_disks == 0 && 6231 mddev->new_layout == mddev->layout && 6232 mddev->new_chunk_sectors == mddev->chunk_sectors) 6233 return 0; /* nothing to do */ 6234 if (has_failed(conf)) 6235 return -EINVAL; 6236 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 6237 /* We might be able to shrink, but the devices must 6238 * be made bigger first. 6239 * For raid6, 4 is the minimum size. 6240 * Otherwise 2 is the minimum. 6241 */ 6242 int min = 2; 6243 if (mddev->level == 6) 6244 min = 4; 6245 if (mddev->raid_disks + mddev->delta_disks < min) 6246 return -EINVAL; 6247 } 6248 6249 if (!check_stripe_cache(mddev)) 6250 return -ENOSPC; 6251 6252 return resize_stripes(conf, (conf->previous_raid_disks 6253 + mddev->delta_disks)); 6254 } 6255 6256 static int raid5_start_reshape(struct mddev *mddev) 6257 { 6258 struct r5conf *conf = mddev->private; 6259 struct md_rdev *rdev; 6260 int spares = 0; 6261 unsigned long flags; 6262 6263 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6264 return -EBUSY; 6265 6266 if (!check_stripe_cache(mddev)) 6267 return -ENOSPC; 6268 6269 if (has_failed(conf)) 6270 return -EINVAL; 6271 6272 rdev_for_each(rdev, mddev) { 6273 if (!test_bit(In_sync, &rdev->flags) 6274 && !test_bit(Faulty, &rdev->flags)) 6275 spares++; 6276 } 6277 6278 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 6279 /* Not enough devices even to make a degraded array 6280 * of that size 6281 */ 6282 return -EINVAL; 6283 6284 /* Refuse to reduce size of the array. Any reductions in 6285 * array size must be through explicit setting of array_size 6286 * attribute.
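 * (For example, to go from 5 devices to 4 the administrator is
 * expected to shrink the array first - e.g. via the md 'array_size'
 * sysfs attribute or mdadm's --grow --array-size option - and only
 * then lower the number of raid_disks.)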
6287 */ 6288 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 6289 < mddev->array_sectors) { 6290 printk(KERN_ERR "md/raid:%s: array size must be reduced " 6291 "before number of disks\n", mdname(mddev)); 6292 return -EINVAL; 6293 } 6294 6295 atomic_set(&conf->reshape_stripes, 0); 6296 spin_lock_irq(&conf->device_lock); 6297 write_seqcount_begin(&conf->gen_lock); 6298 conf->previous_raid_disks = conf->raid_disks; 6299 conf->raid_disks += mddev->delta_disks; 6300 conf->prev_chunk_sectors = conf->chunk_sectors; 6301 conf->chunk_sectors = mddev->new_chunk_sectors; 6302 conf->prev_algo = conf->algorithm; 6303 conf->algorithm = mddev->new_layout; 6304 conf->generation++; 6305 /* Code that selects data_offset needs to see the generation update 6306 * if reshape_progress has been set - so a memory barrier needed. 6307 */ 6308 smp_mb(); 6309 if (mddev->reshape_backwards) 6310 conf->reshape_progress = raid5_size(mddev, 0, 0); 6311 else 6312 conf->reshape_progress = 0; 6313 conf->reshape_safe = conf->reshape_progress; 6314 write_seqcount_end(&conf->gen_lock); 6315 spin_unlock_irq(&conf->device_lock); 6316 6317 /* Now make sure any requests that proceeded on the assumption 6318 * the reshape wasn't running - like Discard or Read - have 6319 * completed. 6320 */ 6321 mddev_suspend(mddev); 6322 mddev_resume(mddev); 6323 6324 /* Add some new drives, as many as will fit. 6325 * We know there are enough to make the newly sized array work. 6326 * Don't add devices if we are reducing the number of 6327 * devices in the array. This is because it is not possible 6328 * to correctly record the "partially reconstructed" state of 6329 * such devices during the reshape and confusion could result. 6330 */ 6331 if (mddev->delta_disks >= 0) { 6332 rdev_for_each(rdev, mddev) 6333 if (rdev->raid_disk < 0 && 6334 !test_bit(Faulty, &rdev->flags)) { 6335 if (raid5_add_disk(mddev, rdev) == 0) { 6336 if (rdev->raid_disk 6337 >= conf->previous_raid_disks) 6338 set_bit(In_sync, &rdev->flags); 6339 else 6340 rdev->recovery_offset = 0; 6341 6342 if (sysfs_link_rdev(mddev, rdev)) 6343 /* Failure here is OK */; 6344 } 6345 } else if (rdev->raid_disk >= conf->previous_raid_disks 6346 && !test_bit(Faulty, &rdev->flags)) { 6347 /* This is a spare that was manually added */ 6348 set_bit(In_sync, &rdev->flags); 6349 } 6350 6351 /* When a reshape changes the number of devices, 6352 * ->degraded is measured against the larger of the 6353 * pre and post number of devices. 
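 * (For example, growing a 4-device raid5 to 6 with only one spare
 * available leaves the sixth slot empty, so measured against the new
 * width degraded becomes 1 even though every original device is
 * healthy.)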
6354 */ 6355 spin_lock_irqsave(&conf->device_lock, flags); 6356 mddev->degraded = calc_degraded(conf); 6357 spin_unlock_irqrestore(&conf->device_lock, flags); 6358 } 6359 mddev->raid_disks = conf->raid_disks; 6360 mddev->reshape_position = conf->reshape_progress; 6361 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6362 6363 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6364 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6365 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6366 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6367 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 6368 "reshape"); 6369 if (!mddev->sync_thread) { 6370 mddev->recovery = 0; 6371 spin_lock_irq(&conf->device_lock); 6372 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6373 rdev_for_each(rdev, mddev) 6374 rdev->new_data_offset = rdev->data_offset; 6375 smp_wmb(); 6376 conf->reshape_progress = MaxSector; 6377 mddev->reshape_position = MaxSector; 6378 spin_unlock_irq(&conf->device_lock); 6379 return -EAGAIN; 6380 } 6381 conf->reshape_checkpoint = jiffies; 6382 md_wakeup_thread(mddev->sync_thread); 6383 md_new_event(mddev); 6384 return 0; 6385 } 6386 6387 /* This is called from the reshape thread and should make any 6388 * changes needed in 'conf' 6389 */ 6390 static void end_reshape(struct r5conf *conf) 6391 { 6392 6393 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 6394 struct md_rdev *rdev; 6395 6396 spin_lock_irq(&conf->device_lock); 6397 conf->previous_raid_disks = conf->raid_disks; 6398 rdev_for_each(rdev, conf->mddev) 6399 rdev->data_offset = rdev->new_data_offset; 6400 smp_wmb(); 6401 conf->reshape_progress = MaxSector; 6402 spin_unlock_irq(&conf->device_lock); 6403 wake_up(&conf->wait_for_overlap); 6404 6405 /* read-ahead size must cover two whole stripes, which is 6406 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 6407 */ 6408 if (conf->mddev->queue) { 6409 int data_disks = conf->raid_disks - conf->max_degraded; 6410 int stripe = data_disks * ((conf->chunk_sectors << 9) 6411 / PAGE_SIZE); 6412 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 6413 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 6414 } 6415 } 6416 } 6417 6418 /* This is called from the raid5d thread with mddev_lock held. 6419 * It makes config changes to the device. 
6420 */ 6421 static void raid5_finish_reshape(struct mddev *mddev) 6422 { 6423 struct r5conf *conf = mddev->private; 6424 6425 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6426 6427 if (mddev->delta_disks > 0) { 6428 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 6429 set_capacity(mddev->gendisk, mddev->array_sectors); 6430 revalidate_disk(mddev->gendisk); 6431 } else { 6432 int d; 6433 spin_lock_irq(&conf->device_lock); 6434 mddev->degraded = calc_degraded(conf); 6435 spin_unlock_irq(&conf->device_lock); 6436 for (d = conf->raid_disks ; 6437 d < conf->raid_disks - mddev->delta_disks; 6438 d++) { 6439 struct md_rdev *rdev = conf->disks[d].rdev; 6440 if (rdev) 6441 clear_bit(In_sync, &rdev->flags); 6442 rdev = conf->disks[d].replacement; 6443 if (rdev) 6444 clear_bit(In_sync, &rdev->flags); 6445 } 6446 } 6447 mddev->layout = conf->algorithm; 6448 mddev->chunk_sectors = conf->chunk_sectors; 6449 mddev->reshape_position = MaxSector; 6450 mddev->delta_disks = 0; 6451 mddev->reshape_backwards = 0; 6452 } 6453 } 6454 6455 static void raid5_quiesce(struct mddev *mddev, int state) 6456 { 6457 struct r5conf *conf = mddev->private; 6458 6459 switch(state) { 6460 case 2: /* resume for a suspend */ 6461 wake_up(&conf->wait_for_overlap); 6462 break; 6463 6464 case 1: /* stop all writes */ 6465 spin_lock_irq(&conf->device_lock); 6466 /* '2' tells resync/reshape to pause so that all 6467 * active stripes can drain 6468 */ 6469 conf->quiesce = 2; 6470 wait_event_lock_irq(conf->wait_for_stripe, 6471 atomic_read(&conf->active_stripes) == 0 && 6472 atomic_read(&conf->active_aligned_reads) == 0, 6473 conf->device_lock); 6474 conf->quiesce = 1; 6475 spin_unlock_irq(&conf->device_lock); 6476 /* allow reshape to continue */ 6477 wake_up(&conf->wait_for_overlap); 6478 break; 6479 6480 case 0: /* re-enable writes */ 6481 spin_lock_irq(&conf->device_lock); 6482 conf->quiesce = 0; 6483 wake_up(&conf->wait_for_stripe); 6484 wake_up(&conf->wait_for_overlap); 6485 spin_unlock_irq(&conf->device_lock); 6486 break; 6487 } 6488 } 6489 6490 6491 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 6492 { 6493 struct r0conf *raid0_conf = mddev->private; 6494 sector_t sectors; 6495 6496 /* for raid0 takeover only one zone is supported */ 6497 if (raid0_conf->nr_strip_zones > 1) { 6498 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 6499 mdname(mddev)); 6500 return ERR_PTR(-EINVAL); 6501 } 6502 6503 sectors = raid0_conf->strip_zone[0].zone_end; 6504 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 6505 mddev->dev_sectors = sectors; 6506 mddev->new_level = level; 6507 mddev->new_layout = ALGORITHM_PARITY_N; 6508 mddev->new_chunk_sectors = mddev->chunk_sectors; 6509 mddev->raid_disks += 1; 6510 mddev->delta_disks = 1; 6511 /* make sure it will be not marked as dirty */ 6512 mddev->recovery_cp = MaxSector; 6513 6514 return setup_conf(mddev); 6515 } 6516 6517 6518 static void *raid5_takeover_raid1(struct mddev *mddev) 6519 { 6520 int chunksect; 6521 6522 if (mddev->raid_disks != 2 || 6523 mddev->degraded > 1) 6524 return ERR_PTR(-EINVAL); 6525 6526 /* Should check if there are write-behind devices? 
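 *
 * (Worked example with a hypothetical size: if array_sectors is
 * 409616 then 128, 64 and 32 all leave a remainder, but 16 divides it
 * exactly, so the takeover below picks a 16-sector (8KiB) chunk -
 * still at least the 8-sector minimum that STRIPE_SIZE implies on
 * 4KiB pages.)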
*/ 6527 6528 chunksect = 64*2; /* 64K by default */ 6529 6530 /* The array must be an exact multiple of chunksize */ 6531 while (chunksect && (mddev->array_sectors & (chunksect-1))) 6532 chunksect >>= 1; 6533 6534 if ((chunksect<<9) < STRIPE_SIZE) 6535 /* array size does not allow a suitable chunk size */ 6536 return ERR_PTR(-EINVAL); 6537 6538 mddev->new_level = 5; 6539 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 6540 mddev->new_chunk_sectors = chunksect; 6541 6542 return setup_conf(mddev); 6543 } 6544 6545 static void *raid5_takeover_raid6(struct mddev *mddev) 6546 { 6547 int new_layout; 6548 6549 switch (mddev->layout) { 6550 case ALGORITHM_LEFT_ASYMMETRIC_6: 6551 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 6552 break; 6553 case ALGORITHM_RIGHT_ASYMMETRIC_6: 6554 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 6555 break; 6556 case ALGORITHM_LEFT_SYMMETRIC_6: 6557 new_layout = ALGORITHM_LEFT_SYMMETRIC; 6558 break; 6559 case ALGORITHM_RIGHT_SYMMETRIC_6: 6560 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 6561 break; 6562 case ALGORITHM_PARITY_0_6: 6563 new_layout = ALGORITHM_PARITY_0; 6564 break; 6565 case ALGORITHM_PARITY_N: 6566 new_layout = ALGORITHM_PARITY_N; 6567 break; 6568 default: 6569 return ERR_PTR(-EINVAL); 6570 } 6571 mddev->new_level = 5; 6572 mddev->new_layout = new_layout; 6573 mddev->delta_disks = -1; 6574 mddev->raid_disks -= 1; 6575 return setup_conf(mddev); 6576 } 6577 6578 6579 static int raid5_check_reshape(struct mddev *mddev) 6580 { 6581 /* For a 2-drive array, the layout and chunk size can be changed 6582 * immediately as not restriping is needed. 6583 * For larger arrays we record the new value - after validation 6584 * to be used by a reshape pass. 6585 */ 6586 struct r5conf *conf = mddev->private; 6587 int new_chunk = mddev->new_chunk_sectors; 6588 6589 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 6590 return -EINVAL; 6591 if (new_chunk > 0) { 6592 if (!is_power_of_2(new_chunk)) 6593 return -EINVAL; 6594 if (new_chunk < (PAGE_SIZE>>9)) 6595 return -EINVAL; 6596 if (mddev->array_sectors & (new_chunk-1)) 6597 /* not factor of array size */ 6598 return -EINVAL; 6599 } 6600 6601 /* They look valid */ 6602 6603 if (mddev->raid_disks == 2) { 6604 /* can make the change immediately */ 6605 if (mddev->new_layout >= 0) { 6606 conf->algorithm = mddev->new_layout; 6607 mddev->layout = mddev->new_layout; 6608 } 6609 if (new_chunk > 0) { 6610 conf->chunk_sectors = new_chunk ; 6611 mddev->chunk_sectors = new_chunk; 6612 } 6613 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6614 md_wakeup_thread(mddev->thread); 6615 } 6616 return check_reshape(mddev); 6617 } 6618 6619 static int raid6_check_reshape(struct mddev *mddev) 6620 { 6621 int new_chunk = mddev->new_chunk_sectors; 6622 6623 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 6624 return -EINVAL; 6625 if (new_chunk > 0) { 6626 if (!is_power_of_2(new_chunk)) 6627 return -EINVAL; 6628 if (new_chunk < (PAGE_SIZE >> 9)) 6629 return -EINVAL; 6630 if (mddev->array_sectors & (new_chunk-1)) 6631 /* not factor of array size */ 6632 return -EINVAL; 6633 } 6634 6635 /* They look valid */ 6636 return check_reshape(mddev); 6637 } 6638 6639 static void *raid5_takeover(struct mddev *mddev) 6640 { 6641 /* raid5 can take over: 6642 * raid0 - if there is only one strip zone - make it a raid4 layout 6643 * raid1 - if there are two drives. We need to know the chunk size 6644 * raid4 - trivial - just use a raid4 layout. 
6645 * raid6 - Providing it is a *_6 layout 6646 */ 6647 if (mddev->level == 0) 6648 return raid45_takeover_raid0(mddev, 5); 6649 if (mddev->level == 1) 6650 return raid5_takeover_raid1(mddev); 6651 if (mddev->level == 4) { 6652 mddev->new_layout = ALGORITHM_PARITY_N; 6653 mddev->new_level = 5; 6654 return setup_conf(mddev); 6655 } 6656 if (mddev->level == 6) 6657 return raid5_takeover_raid6(mddev); 6658 6659 return ERR_PTR(-EINVAL); 6660 } 6661 6662 static void *raid4_takeover(struct mddev *mddev) 6663 { 6664 /* raid4 can take over: 6665 * raid0 - if there is only one strip zone 6666 * raid5 - if layout is right 6667 */ 6668 if (mddev->level == 0) 6669 return raid45_takeover_raid0(mddev, 4); 6670 if (mddev->level == 5 && 6671 mddev->layout == ALGORITHM_PARITY_N) { 6672 mddev->new_layout = 0; 6673 mddev->new_level = 4; 6674 return setup_conf(mddev); 6675 } 6676 return ERR_PTR(-EINVAL); 6677 } 6678 6679 static struct md_personality raid5_personality; 6680 6681 static void *raid6_takeover(struct mddev *mddev) 6682 { 6683 /* Currently can only take over a raid5. We map the 6684 * personality to an equivalent raid6 personality 6685 * with the Q block at the end. 6686 */ 6687 int new_layout; 6688 6689 if (mddev->pers != &raid5_personality) 6690 return ERR_PTR(-EINVAL); 6691 if (mddev->degraded > 1) 6692 return ERR_PTR(-EINVAL); 6693 if (mddev->raid_disks > 253) 6694 return ERR_PTR(-EINVAL); 6695 if (mddev->raid_disks < 3) 6696 return ERR_PTR(-EINVAL); 6697 6698 switch (mddev->layout) { 6699 case ALGORITHM_LEFT_ASYMMETRIC: 6700 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 6701 break; 6702 case ALGORITHM_RIGHT_ASYMMETRIC: 6703 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 6704 break; 6705 case ALGORITHM_LEFT_SYMMETRIC: 6706 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 6707 break; 6708 case ALGORITHM_RIGHT_SYMMETRIC: 6709 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 6710 break; 6711 case ALGORITHM_PARITY_0: 6712 new_layout = ALGORITHM_PARITY_0_6; 6713 break; 6714 case ALGORITHM_PARITY_N: 6715 new_layout = ALGORITHM_PARITY_N; 6716 break; 6717 default: 6718 return ERR_PTR(-EINVAL); 6719 } 6720 mddev->new_level = 6; 6721 mddev->new_layout = new_layout; 6722 mddev->delta_disks = 1; 6723 mddev->raid_disks += 1; 6724 return setup_conf(mddev); 6725 } 6726 6727 6728 static struct md_personality raid6_personality = 6729 { 6730 .name = "raid6", 6731 .level = 6, 6732 .owner = THIS_MODULE, 6733 .make_request = make_request, 6734 .run = run, 6735 .stop = stop, 6736 .status = status, 6737 .error_handler = error, 6738 .hot_add_disk = raid5_add_disk, 6739 .hot_remove_disk= raid5_remove_disk, 6740 .spare_active = raid5_spare_active, 6741 .sync_request = sync_request, 6742 .resize = raid5_resize, 6743 .size = raid5_size, 6744 .check_reshape = raid6_check_reshape, 6745 .start_reshape = raid5_start_reshape, 6746 .finish_reshape = raid5_finish_reshape, 6747 .quiesce = raid5_quiesce, 6748 .takeover = raid6_takeover, 6749 }; 6750 static struct md_personality raid5_personality = 6751 { 6752 .name = "raid5", 6753 .level = 5, 6754 .owner = THIS_MODULE, 6755 .make_request = make_request, 6756 .run = run, 6757 .stop = stop, 6758 .status = status, 6759 .error_handler = error, 6760 .hot_add_disk = raid5_add_disk, 6761 .hot_remove_disk= raid5_remove_disk, 6762 .spare_active = raid5_spare_active, 6763 .sync_request = sync_request, 6764 .resize = raid5_resize, 6765 .size = raid5_size, 6766 .check_reshape = raid5_check_reshape, 6767 .start_reshape = raid5_start_reshape, 6768 .finish_reshape = raid5_finish_reshape, 6769 .quiesce = 
raid5_quiesce, 6770 .takeover = raid5_takeover, 6771 }; 6772 6773 static struct md_personality raid4_personality = 6774 { 6775 .name = "raid4", 6776 .level = 4, 6777 .owner = THIS_MODULE, 6778 .make_request = make_request, 6779 .run = run, 6780 .stop = stop, 6781 .status = status, 6782 .error_handler = error, 6783 .hot_add_disk = raid5_add_disk, 6784 .hot_remove_disk= raid5_remove_disk, 6785 .spare_active = raid5_spare_active, 6786 .sync_request = sync_request, 6787 .resize = raid5_resize, 6788 .size = raid5_size, 6789 .check_reshape = raid5_check_reshape, 6790 .start_reshape = raid5_start_reshape, 6791 .finish_reshape = raid5_finish_reshape, 6792 .quiesce = raid5_quiesce, 6793 .takeover = raid4_takeover, 6794 }; 6795 6796 static int __init raid5_init(void) 6797 { 6798 raid5_wq = alloc_workqueue("raid5wq", 6799 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 6800 if (!raid5_wq) 6801 return -ENOMEM; 6802 register_md_personality(&raid6_personality); 6803 register_md_personality(&raid5_personality); 6804 register_md_personality(&raid4_personality); 6805 return 0; 6806 } 6807 6808 static void raid5_exit(void) 6809 { 6810 unregister_md_personality(&raid6_personality); 6811 unregister_md_personality(&raid5_personality); 6812 unregister_md_personality(&raid4_personality); 6813 destroy_workqueue(raid5_wq); 6814 } 6815 6816 module_init(raid5_init); 6817 module_exit(raid5_exit); 6818 MODULE_LICENSE("GPL"); 6819 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 6820 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 6821 MODULE_ALIAS("md-raid5"); 6822 MODULE_ALIAS("md-raid4"); 6823 MODULE_ALIAS("md-level-5"); 6824 MODULE_ALIAS("md-level-4"); 6825 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 6826 MODULE_ALIAS("md-raid6"); 6827 MODULE_ALIAS("md-level-6"); 6828 6829 /* This used to be two separate modules, they were: */ 6830 MODULE_ALIAS("raid5"); 6831 MODULE_ALIAS("raid6"); 6832