/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 * we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 * batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static struct workqueue_struct *raid5_wq;
/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)
#define MAX_STRIPE_BATCH	8

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

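/*
 * Illustrative sketch only (not part of the driver, never compiled): a
 * minimal model of the seq_flush/seq_write batching rules described in the
 * BITMAP UNPLUGGING comment above.  All names below are hypothetical and
 * exist only to make the sequencing concrete.
 */
#if 0
struct batch_model {
	int seq_flush;		/* last batch closed to new additions  */
	int seq_write;		/* last batch successfully written out */
};

/* a stripe that dirties the bitmap joins the currently open batch */
static int model_add_stripe(struct batch_model *m)
{
	return m->seq_flush + 1;	/* becomes the stripe's bm_seq */
}

/* an unplug closes the current batch */
static void model_unplug(struct batch_model *m)
{
	m->seq_flush++;
}

/* when seq_flush has moved ahead, flush the bitmap and catch seq_write up */
static void model_flush_bitmap(struct batch_model *m)
{
	if (m->seq_flush > m->seq_write)
		m->seq_write = m->seq_flush;
}

/* a stripe's write may proceed only once its batch has been written */
static bool model_may_write(const struct batch_model *m, int bm_seq)
{
	return bm_seq - m->seq_write <= 0;
}
#endif
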
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the
 * sector of the current stripe+device
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio_sectors(bio);
	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}

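/*
 * Illustrative sketch only (never compiled): the packing used by the
 * bi_phys_segments helpers above.  The low 16 bits count stripes that still
 * reference the bio ("active"), the high 16 bits count stripes already
 * processed, and the atomic decrement returns the low half so the caller
 * can tell when the last active reference goes away.
 */
#if 0
	u32 v = 0;

	v = 5;				/* raid5_set_bi_stripes(bio, 5)           */
	v = (v & 0xffff) | (2 << 16);	/* raid5_set_bi_processed_stripes(bio, 2) */
	v += 1;				/* raid5_inc_bi_active_stripes(bio)       */

	/* raid5_bi_processed_stripes(bio)  == (v >> 16) & 0xffff == 2 */
	/* raid5_dec_bi_active_stripes(bio) == (--v) & 0xffff     == 5 */
#endif
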
/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always start from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid6, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);
	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				list_add_tail(&sh->lru, &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			list_add_tail(&sh->lru, &conf->inactive_list);
			wake_up(&conf->wait_for_stripe);
			if (conf->retry_read_aligned)
				md_wakeup_thread(conf->mddev->thread);
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh);
}

static struct llist_node *llist_reverse_order(struct llist_node *head)
{
	struct llist_node *new_head = NULL;

	while (head) {
		struct llist_node *tmp = head;
		head = head->next;
		tmp->next = new_head;
		new_head = tmp;
	}

	return new_head;
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf)
{
	struct stripe_head *sh;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	while (head) {
		sh = llist_entry(head, struct stripe_head, release_list);
		head = llist_next(head);
		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry the bit is set here, because if the bit is set
		 * again, the count is always > 1. This is true for
		 * STRIPE_ON_UNPLUG_LIST bit too.
		 */
		__release_stripe(conf, sh);
		count++;
	}

	return count;
}

static void release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	bool wakeup;

	if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	local_irq_save(flags);
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		do_release_stripe(conf, sh);
		spin_unlock(&conf->device_lock);
	}
	local_irq_restore(flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}

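/*
 * Illustrative sketch only (never compiled): the lock-free hand-off used by
 * release_stripe()/release_stripe_list() above, written with C11 atomics so
 * it can be read outside the kernel.  Producers push onto a singly linked
 * list with a compare-and-swap loop (as llist_add() does); the consumer
 * detaches the whole list in one exchange (as llist_del_all() does) and then
 * reverses it to recover submission order.  All names are hypothetical.
 */
#if 0
#include <stdatomic.h>
#include <stddef.h>

struct rel_node { struct rel_node *next; };

static _Atomic(struct rel_node *) released_head;

static void rel_push(struct rel_node *n)
{
	struct rel_node *old = atomic_load(&released_head);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&released_head, &old, n));
}

static struct rel_node *rel_take_all_reversed(void)
{
	struct rel_node *head = atomic_exchange(&released_head, NULL);
	struct rel_node *rev = NULL;

	while (head) {
		struct rel_node *tmp = head;

		head = head->next;
		tmp->next = rev;
		rev = tmp;
	}
	return rev;
}
#endif
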
/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}

static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			    struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;


	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	insert_hash(conf, sh);
	sh->cpu = smp_processor_id();
}

static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
static int calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

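/*
 * Illustrative sketch only (never compiled): the rule that calc_degraded()
 * and has_failed() above implement.  During a reshape both the old and the
 * new geometry are evaluated and the larger degraded count is compared
 * against max_degraded (1 for raid4/5, 2 for raid6).  The function and
 * parameter names below are hypothetical.
 */
#if 0
static int model_has_failed(int degraded_old, int degraded_new,
			    int max_degraded)
{
	int degraded = degraded_old > degraded_new ? degraded_old
						   : degraded_new;

	return degraded > max_degraded;
}
#endif
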
static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    conf->device_lock);
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			if (atomic_read(&sh->count)) {
				BUG_ON(!list_empty(&sh->lru)
				    && !test_bit(STRIPE_EXPANDING, &sh->state)
				    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
				    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
				if (sh->group) {
					sh->group->stripes_cnt--;
					sh->group = NULL;
				}
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				rw |= REQ_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			rw = WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			rw |= REQ_SYNC;

		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (rw & WRITE) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while ((rw & WRITE) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_reset(bi);
			bi->bi_bdev = rdev->bdev;
			bi->bi_rw = rw;
			bi->bi_end_io = (rw & WRITE)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
				bi->bi_rw |= REQ_FLUSH;

			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
						      bi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bio_reset(rbi);
			rbi->bi_bdev = rrdev->bdev;
			rbi->bi_rw = rw;
			BUG_ON(!(rw & WRITE));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %ld on "
				 "replacement disc %d\n",
				__func__, (unsigned long long)sh->sector,
				rbi->bi_rw, i);
			atomic_inc(&sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_sector = (sh->sector
						  + rrdev->data_offset);
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_size = STRIPE_SIZE;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
						      rbi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				 bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}

static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_active_stripes(rbi)) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, dev->page,
						     dev->sector, tx);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome. The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = percpu->scribble;
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}


static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);
}

static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		struct bio *chosen;

		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);

			while (wbi && wbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_rw & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				if (wbi->bi_rw & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
				if (wbi->bi_rw & REQ_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else
					tx = async_copy_data(1, wbi, dev->page,
						dev->sector, tx);
				wbi = r5_next_bio(wbi, dev->sector);
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false, sync = false, discard = false;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
	}

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			if (!discard)
				set_bit(R5_UPTODATE, &dev->flags);
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
			if (sync)
				set_bit(R5_SyncIO, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (pd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}
	/* check if prexor is active which means only process blocks
	 * that are part of a read-modify-write (written)
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (dev->written)
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	flags = ASYNC_TX_ACK |
		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

	atomic_inc(&sh->count);

	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
			  to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
}

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks = percpu->scribble;
	int count, i;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (sh->pd_idx == i || sh->qd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}

	count = set_syndrome_sources(blocks, sh);

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
			  sh, to_addr_conv(sh, percpu));
	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
}

static void ops_complete_check(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	sh->check_state = check_state_check_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct page *xor_dest;
	struct page **xor_srcs = percpu->scribble;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int count;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	count = 0;
	xor_dest = sh->dev[pd_idx].page;
	xor_srcs[count++] = xor_dest;
	for (i = disks; i--; ) {
		if (i == pd_idx || i == qd_idx)
			continue;
		xor_srcs[count++] = sh->dev[i].page;
	}

	init_async_submit(&submit, 0, NULL, NULL, NULL,
			  to_addr_conv(sh, percpu));
	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, &submit);

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
	tx = async_trigger_callback(&submit);
}

static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
	struct page **srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count;

	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
		(unsigned long long)sh->sector, checkp);

	count = set_syndrome_sources(srcs, sh);
	if (!checkp)
		srcs[count] = NULL;

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
			  sh, to_addr_conv(sh, percpu));
	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
}

static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	int overlap_clear = 0, i, disks = sh->disks;
	struct dma_async_tx_descriptor *tx = NULL;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;
	struct raid5_percpu *percpu;
	unsigned long cpu;

	cpu = get_cpu();
	percpu = per_cpu_ptr(conf->percpu, cpu);
	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
		ops_run_biofill(sh);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
		if (level < 6)
			tx = ops_run_compute5(sh, percpu);
		else {
			if (sh->ops.target2 < 0 || sh->ops.target < 0)
				tx = ops_run_compute6_1(sh, percpu);
			else
				tx = ops_run_compute6_2(sh, percpu);
		}
		/* terminate the chain if reconstruct is not set to be run */
		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
			async_tx_ack(tx);
	}

	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
		tx = ops_run_prexor(sh, percpu, tx);

	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
		tx = ops_run_biodrain(sh, tx);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
		if (level < 6)
			ops_run_reconstruct5(sh, percpu, tx);
		else
			ops_run_reconstruct6(sh, percpu, tx);
	}

	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
		if (sh->check_state == check_state_run)
			ops_run_check_p(sh, percpu);
		else if (sh->check_state == check_state_run_q)
			ops_run_check_pq(sh, percpu, 0);
		else if (sh->check_state == check_state_run_pq)
			ops_run_check_pq(sh, percpu, 1);
		else
			BUG();
	}

	if (overlap_clear)
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_Overlap, &dev->flags))
				wake_up(&sh->raid_conf->wait_for_overlap);
		}
	put_cpu();
}

static int grow_one_stripe(struct r5conf *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;

	sh->raid_conf = conf;

	spin_lock_init(&sh->stripe_lock);

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

static int grow_stripes(struct r5conf *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}

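/*
 * Illustrative sketch only (never compiled): the layout of the per-cpu
 * scribble region sized by scribble_len() above.  The first (num + 2) slots
 * hold struct page pointers (the sources plus the P and Q destinations);
 * the remaining (num + 2) slots are the addr_conv_t scratch area that
 * to_addr_conv() returns a pointer into.  The helper name below is
 * hypothetical.
 *
 *   scribble: | struct page * x (num+2) | addr_conv_t x (num+2) |
 *             ^                         ^
 *             percpu->scribble          to_addr_conv(sh, percpu)
 */
#if 0
static addr_conv_t *model_addr_conv(void *scribble, int num)
{
	struct page **pages = scribble;		/* (num + 2) page pointers   */

	return (addr_conv_t *)(pages + num + 2);	/* then the addr_conv_t area */
}
#endif
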
static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	unsigned long cpu;
	int err;
	struct kmem_cache *sc;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		nsh->raid_conf = conf;
		spin_lock_init(&nsh->stripe_lock);

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(nsh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock);
		osh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);
		atomic_set(&nsh->count, 1);
		for(i=0; i<conf->pool_size; i++)
			nsh->dev[i].page = osh->dev[i].page;
		for( ; i<newsize; i++)
			nsh->dev[i].page = NULL;
		kmem_cache_free(conf->slab_cache, osh);
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks and the scribble region
	 */
	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
	if (ndisks) {
		for (i=0; i<conf->raid_disks; i++)
			ndisks[i] = conf->disks[i];
		kfree(conf->disks);
		conf->disks = ndisks;
	} else
		err = -ENOMEM;

	get_online_cpus();
	conf->scribble_len = scribble_len(newsize);
	for_each_present_cpu(cpu) {
		struct raid5_percpu *percpu;
		void *scribble;

		percpu = per_cpu_ptr(conf->percpu, cpu);
		scribble = kmalloc(conf->scribble_len, GFP_NOIO);

		if (scribble) {
			kfree(percpu->scribble);
			percpu->scribble = scribble;
		} else {
			err = -ENOMEM;
			break;
		}
	}
	put_online_cpus();

	/* Step 4, return new stripes to service */
	while(!list_empty(&newstripes)) {
		nsh = list_entry(newstripes.next, struct stripe_head, lru);
		list_del_init(&nsh->lru);

		for (i=conf->raid_disks; i < newsize; i++)
			if (nsh->dev[i].page == NULL) {
				struct page *p = alloc_page(GFP_NOIO);
				nsh->dev[i].page = p;
				if (!p)
					err = -ENOMEM;
			}
		release_stripe(nsh);
	}
	/* critical section passed, GFP_NOIO no longer needed */

	conf->slab_cache = sc;
	conf->active_name = 1-conf->active_name;
	conf->pool_size = newsize;
	return err;
}

static int drop_one_stripe(struct r5conf *conf)
{
	struct stripe_head *sh;

	spin_lock_irq(&conf->device_lock);
	sh = get_free_stripe(conf);
	spin_unlock_irq(&conf->device_lock);
	if (!sh)
		return 0;
	BUG_ON(atomic_read(&sh->count));
	shrink_buffers(sh);
	kmem_cache_free(conf->slab_cache, sh);
	atomic_dec(&conf->active_stripes);
	return 1;
}

static void shrink_stripes(struct r5conf *conf)
{
	while (drop_one_stripe(conf))
		;

	if (conf->slab_cache)
		kmem_cache_destroy(conf->slab_cache);
	conf->slab_cache = NULL;
}

static void raid5_end_read_request(struct bio * bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	char b[BDEVNAME_SIZE];
	struct md_rdev *rdev = NULL;
	sector_t s;

	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return;
	}
	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
		/* If replacement finished while this request was outstanding,
		 * 'replacement' might be NULL already.
		 * In that case it moved down to 'rdev'.
		 * rdev is not removed until all requests are finished.
		 */
		rdev = conf->disks[i].replacement;
	if (!rdev)
		rdev = conf->disks[i].rdev;

	if (use_new_offset(conf, sh))
		s = sh->sector + rdev->new_data_offset;
	else
		s = sh->sector + rdev->data_offset;
	if (uptodate) {
		set_bit(R5_UPTODATE, &sh->dev[i].flags);
		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/* Note that this cannot happen on a
			 * replacement device.  We just fail those on
			 * any error
			 */
			printk_ratelimited(
				KERN_INFO
				"md/raid:%s: read error corrected"
				" (%lu sectors at %llu on %s)\n",
				mdname(conf->mddev), STRIPE_SECTORS,
				(unsigned long long)s,
				bdevname(rdev->bdev, b));
			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);

		if (atomic_read(&rdev->read_errors))
			atomic_set(&rdev->read_errors, 0);
	} else {
		const char *bdn = bdevname(rdev->bdev, b);
		int retry = 0;
		int set_bad = 0;

		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
		atomic_inc(&rdev->read_errors);
		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error on replacement device "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		else if (conf->mddev->degraded >= conf->max_degraded) {
			set_bad = 1;
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error not correctable "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
			/* Oh, no!!! */
			set_bad = 1;
			printk_ratelimited(
				KERN_WARNING
				"md/raid:%s: read error NOT corrected!! "
				"(sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
		} else if (atomic_read(&rdev->read_errors)
			 > conf->max_nr_stripes)
			printk(KERN_WARNING
			       "md/raid:%s: Too many read errors, failing device %s.\n",
			       mdname(conf->mddev), bdn);
		else
			retry = 1;
		if (retry)
			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
				set_bit(R5_ReadError, &sh->dev[i].flags);
				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
			} else
				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
		else {
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
			if (!(set_bad
			      && test_bit(In_sync, &rdev->flags)
			      && rdev_set_badblocks(
				      rdev, sh->sector, STRIPE_SECTORS, 0)))
				md_error(conf->mddev, rdev);
		}
	}
	rdev_dec_pending(rdev, conf->mddev);
	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

static void raid5_end_write_request(struct bio *bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks, i;
	struct md_rdev *uninitialized_var(rdev);
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	sector_t first_bad;
	int bad_sectors;
	int replacement = 0;

	for (i = 0 ; i < disks; i++) {
		if (bi == &sh->dev[i].req) {
			rdev = conf->disks[i].rdev;
			break;
		}
		if (bi == &sh->dev[i].rreq) {
			rdev = conf->disks[i].replacement;
			if (rdev)
				replacement = 1;
			else
				/* rdev was removed and 'replacement'
				 * replaced it.  rdev is not removed
				 * until all requests are finished.
1972 */ 1973 rdev = conf->disks[i].rdev; 1974 break; 1975 } 1976 } 1977 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1978 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1979 uptodate); 1980 if (i == disks) { 1981 BUG(); 1982 return; 1983 } 1984 1985 if (replacement) { 1986 if (!uptodate) 1987 md_error(conf->mddev, rdev); 1988 else if (is_badblock(rdev, sh->sector, 1989 STRIPE_SECTORS, 1990 &first_bad, &bad_sectors)) 1991 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 1992 } else { 1993 if (!uptodate) { 1994 set_bit(WriteErrorSeen, &rdev->flags); 1995 set_bit(R5_WriteError, &sh->dev[i].flags); 1996 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1997 set_bit(MD_RECOVERY_NEEDED, 1998 &rdev->mddev->recovery); 1999 } else if (is_badblock(rdev, sh->sector, 2000 STRIPE_SECTORS, 2001 &first_bad, &bad_sectors)) { 2002 set_bit(R5_MadeGood, &sh->dev[i].flags); 2003 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2004 /* That was a successful write so make 2005 * sure it looks like we already did 2006 * a re-write. 2007 */ 2008 set_bit(R5_ReWrite, &sh->dev[i].flags); 2009 } 2010 } 2011 rdev_dec_pending(rdev, conf->mddev); 2012 2013 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2014 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2015 set_bit(STRIPE_HANDLE, &sh->state); 2016 release_stripe(sh); 2017 } 2018 2019 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 2020 2021 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2022 { 2023 struct r5dev *dev = &sh->dev[i]; 2024 2025 bio_init(&dev->req); 2026 dev->req.bi_io_vec = &dev->vec; 2027 dev->req.bi_vcnt++; 2028 dev->req.bi_max_vecs++; 2029 dev->req.bi_private = sh; 2030 dev->vec.bv_page = dev->page; 2031 2032 bio_init(&dev->rreq); 2033 dev->rreq.bi_io_vec = &dev->rvec; 2034 dev->rreq.bi_vcnt++; 2035 dev->rreq.bi_max_vecs++; 2036 dev->rreq.bi_private = sh; 2037 dev->rvec.bv_page = dev->page; 2038 2039 dev->flags = 0; 2040 dev->sector = compute_blocknr(sh, i, previous); 2041 } 2042 2043 static void error(struct mddev *mddev, struct md_rdev *rdev) 2044 { 2045 char b[BDEVNAME_SIZE]; 2046 struct r5conf *conf = mddev->private; 2047 unsigned long flags; 2048 pr_debug("raid456: error called\n"); 2049 2050 spin_lock_irqsave(&conf->device_lock, flags); 2051 clear_bit(In_sync, &rdev->flags); 2052 mddev->degraded = calc_degraded(conf); 2053 spin_unlock_irqrestore(&conf->device_lock, flags); 2054 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2055 2056 set_bit(Blocked, &rdev->flags); 2057 set_bit(Faulty, &rdev->flags); 2058 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2059 printk(KERN_ALERT 2060 "md/raid:%s: Disk failure on %s, disabling device.\n" 2061 "md/raid:%s: Operation continuing on %d devices.\n", 2062 mdname(mddev), 2063 bdevname(rdev->bdev, b), 2064 mdname(mddev), 2065 conf->raid_disks - mddev->degraded); 2066 } 2067 2068 /* 2069 * Input: a 'big' sector number, 2070 * Output: index of the data and parity disk, and the sector # in them. 2071 */ 2072 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2073 int previous, int *dd_idx, 2074 struct stripe_head *sh) 2075 { 2076 sector_t stripe, stripe2; 2077 sector_t chunk_number; 2078 unsigned int chunk_offset; 2079 int pd_idx, qd_idx; 2080 int ddf_layout = 0; 2081 sector_t new_sector; 2082 int algorithm = previous ? conf->prev_algo 2083 : conf->algorithm; 2084 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2085 : conf->chunk_sectors; 2086 int raid_disks = previous ? 
conf->previous_raid_disks 2087 : conf->raid_disks; 2088 int data_disks = raid_disks - conf->max_degraded; 2089 2090 /* First compute the information on this sector */ 2091 2092 /* 2093 * Compute the chunk number and the sector offset inside the chunk 2094 */ 2095 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2096 chunk_number = r_sector; 2097 2098 /* 2099 * Compute the stripe number 2100 */ 2101 stripe = chunk_number; 2102 *dd_idx = sector_div(stripe, data_disks); 2103 stripe2 = stripe; 2104 /* 2105 * Select the parity disk based on the user selected algorithm. 2106 */ 2107 pd_idx = qd_idx = -1; 2108 switch(conf->level) { 2109 case 4: 2110 pd_idx = data_disks; 2111 break; 2112 case 5: 2113 switch (algorithm) { 2114 case ALGORITHM_LEFT_ASYMMETRIC: 2115 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2116 if (*dd_idx >= pd_idx) 2117 (*dd_idx)++; 2118 break; 2119 case ALGORITHM_RIGHT_ASYMMETRIC: 2120 pd_idx = sector_div(stripe2, raid_disks); 2121 if (*dd_idx >= pd_idx) 2122 (*dd_idx)++; 2123 break; 2124 case ALGORITHM_LEFT_SYMMETRIC: 2125 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2126 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2127 break; 2128 case ALGORITHM_RIGHT_SYMMETRIC: 2129 pd_idx = sector_div(stripe2, raid_disks); 2130 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2131 break; 2132 case ALGORITHM_PARITY_0: 2133 pd_idx = 0; 2134 (*dd_idx)++; 2135 break; 2136 case ALGORITHM_PARITY_N: 2137 pd_idx = data_disks; 2138 break; 2139 default: 2140 BUG(); 2141 } 2142 break; 2143 case 6: 2144 2145 switch (algorithm) { 2146 case ALGORITHM_LEFT_ASYMMETRIC: 2147 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2148 qd_idx = pd_idx + 1; 2149 if (pd_idx == raid_disks-1) { 2150 (*dd_idx)++; /* Q D D D P */ 2151 qd_idx = 0; 2152 } else if (*dd_idx >= pd_idx) 2153 (*dd_idx) += 2; /* D D P Q D */ 2154 break; 2155 case ALGORITHM_RIGHT_ASYMMETRIC: 2156 pd_idx = sector_div(stripe2, raid_disks); 2157 qd_idx = pd_idx + 1; 2158 if (pd_idx == raid_disks-1) { 2159 (*dd_idx)++; /* Q D D D P */ 2160 qd_idx = 0; 2161 } else if (*dd_idx >= pd_idx) 2162 (*dd_idx) += 2; /* D D P Q D */ 2163 break; 2164 case ALGORITHM_LEFT_SYMMETRIC: 2165 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2166 qd_idx = (pd_idx + 1) % raid_disks; 2167 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2168 break; 2169 case ALGORITHM_RIGHT_SYMMETRIC: 2170 pd_idx = sector_div(stripe2, raid_disks); 2171 qd_idx = (pd_idx + 1) % raid_disks; 2172 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2173 break; 2174 2175 case ALGORITHM_PARITY_0: 2176 pd_idx = 0; 2177 qd_idx = 1; 2178 (*dd_idx) += 2; 2179 break; 2180 case ALGORITHM_PARITY_N: 2181 pd_idx = data_disks; 2182 qd_idx = data_disks + 1; 2183 break; 2184 2185 case ALGORITHM_ROTATING_ZERO_RESTART: 2186 /* Exactly the same as RIGHT_ASYMMETRIC, but the order 2187 * of blocks for computing Q is different.
2188 */ 2189 pd_idx = sector_div(stripe2, raid_disks); 2190 qd_idx = pd_idx + 1; 2191 if (pd_idx == raid_disks-1) { 2192 (*dd_idx)++; /* Q D D D P */ 2193 qd_idx = 0; 2194 } else if (*dd_idx >= pd_idx) 2195 (*dd_idx) += 2; /* D D P Q D */ 2196 ddf_layout = 1; 2197 break; 2198 2199 case ALGORITHM_ROTATING_N_RESTART: 2200 /* Same as left_asymmetric, but first stripe is 2201 * D D D P Q rather than 2202 * Q D D D P 2203 */ 2204 stripe2 += 1; 2205 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2206 qd_idx = pd_idx + 1; 2207 if (pd_idx == raid_disks-1) { 2208 (*dd_idx)++; /* Q D D D P */ 2209 qd_idx = 0; 2210 } else if (*dd_idx >= pd_idx) 2211 (*dd_idx) += 2; /* D D P Q D */ 2212 ddf_layout = 1; 2213 break; 2214 2215 case ALGORITHM_ROTATING_N_CONTINUE: 2216 /* Same as left_symmetric but Q is before P */ 2217 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2218 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2219 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2220 ddf_layout = 1; 2221 break; 2222 2223 case ALGORITHM_LEFT_ASYMMETRIC_6: 2224 /* RAID5 left_asymmetric, with Q on last device */ 2225 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2226 if (*dd_idx >= pd_idx) 2227 (*dd_idx)++; 2228 qd_idx = raid_disks - 1; 2229 break; 2230 2231 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2232 pd_idx = sector_div(stripe2, raid_disks-1); 2233 if (*dd_idx >= pd_idx) 2234 (*dd_idx)++; 2235 qd_idx = raid_disks - 1; 2236 break; 2237 2238 case ALGORITHM_LEFT_SYMMETRIC_6: 2239 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2240 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2241 qd_idx = raid_disks - 1; 2242 break; 2243 2244 case ALGORITHM_RIGHT_SYMMETRIC_6: 2245 pd_idx = sector_div(stripe2, raid_disks-1); 2246 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2247 qd_idx = raid_disks - 1; 2248 break; 2249 2250 case ALGORITHM_PARITY_0_6: 2251 pd_idx = 0; 2252 (*dd_idx)++; 2253 qd_idx = raid_disks - 1; 2254 break; 2255 2256 default: 2257 BUG(); 2258 } 2259 break; 2260 } 2261 2262 if (sh) { 2263 sh->pd_idx = pd_idx; 2264 sh->qd_idx = qd_idx; 2265 sh->ddf_layout = ddf_layout; 2266 } 2267 /* 2268 * Finally, compute the new sector number 2269 */ 2270 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2271 return new_sector; 2272 } 2273 2274 2275 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2276 { 2277 struct r5conf *conf = sh->raid_conf; 2278 int raid_disks = sh->disks; 2279 int data_disks = raid_disks - conf->max_degraded; 2280 sector_t new_sector = sh->sector, check; 2281 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2282 : conf->chunk_sectors; 2283 int algorithm = previous ?
conf->prev_algo 2284 : conf->algorithm; 2285 sector_t stripe; 2286 int chunk_offset; 2287 sector_t chunk_number; 2288 int dummy1, dd_idx = i; 2289 sector_t r_sector; 2290 struct stripe_head sh2; 2291 2292 2293 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2294 stripe = new_sector; 2295 2296 if (i == sh->pd_idx) 2297 return 0; 2298 switch(conf->level) { 2299 case 4: break; 2300 case 5: 2301 switch (algorithm) { 2302 case ALGORITHM_LEFT_ASYMMETRIC: 2303 case ALGORITHM_RIGHT_ASYMMETRIC: 2304 if (i > sh->pd_idx) 2305 i--; 2306 break; 2307 case ALGORITHM_LEFT_SYMMETRIC: 2308 case ALGORITHM_RIGHT_SYMMETRIC: 2309 if (i < sh->pd_idx) 2310 i += raid_disks; 2311 i -= (sh->pd_idx + 1); 2312 break; 2313 case ALGORITHM_PARITY_0: 2314 i -= 1; 2315 break; 2316 case ALGORITHM_PARITY_N: 2317 break; 2318 default: 2319 BUG(); 2320 } 2321 break; 2322 case 6: 2323 if (i == sh->qd_idx) 2324 return 0; /* It is the Q disk */ 2325 switch (algorithm) { 2326 case ALGORITHM_LEFT_ASYMMETRIC: 2327 case ALGORITHM_RIGHT_ASYMMETRIC: 2328 case ALGORITHM_ROTATING_ZERO_RESTART: 2329 case ALGORITHM_ROTATING_N_RESTART: 2330 if (sh->pd_idx == raid_disks-1) 2331 i--; /* Q D D D P */ 2332 else if (i > sh->pd_idx) 2333 i -= 2; /* D D P Q D */ 2334 break; 2335 case ALGORITHM_LEFT_SYMMETRIC: 2336 case ALGORITHM_RIGHT_SYMMETRIC: 2337 if (sh->pd_idx == raid_disks-1) 2338 i--; /* Q D D D P */ 2339 else { 2340 /* D D P Q D */ 2341 if (i < sh->pd_idx) 2342 i += raid_disks; 2343 i -= (sh->pd_idx + 2); 2344 } 2345 break; 2346 case ALGORITHM_PARITY_0: 2347 i -= 2; 2348 break; 2349 case ALGORITHM_PARITY_N: 2350 break; 2351 case ALGORITHM_ROTATING_N_CONTINUE: 2352 /* Like left_symmetric, but P is before Q */ 2353 if (sh->pd_idx == 0) 2354 i--; /* P D D D Q */ 2355 else { 2356 /* D D Q P D */ 2357 if (i < sh->pd_idx) 2358 i += raid_disks; 2359 i -= (sh->pd_idx + 1); 2360 } 2361 break; 2362 case ALGORITHM_LEFT_ASYMMETRIC_6: 2363 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2364 if (i > sh->pd_idx) 2365 i--; 2366 break; 2367 case ALGORITHM_LEFT_SYMMETRIC_6: 2368 case ALGORITHM_RIGHT_SYMMETRIC_6: 2369 if (i < sh->pd_idx) 2370 i += data_disks + 1; 2371 i -= (sh->pd_idx + 1); 2372 break; 2373 case ALGORITHM_PARITY_0_6: 2374 i -= 1; 2375 break; 2376 default: 2377 BUG(); 2378 } 2379 break; 2380 } 2381 2382 chunk_number = stripe * data_disks + i; 2383 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2384 2385 check = raid5_compute_sector(conf, r_sector, 2386 previous, &dummy1, &sh2); 2387 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2388 || sh2.qd_idx != sh->qd_idx) { 2389 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2390 mdname(conf->mddev)); 2391 return 0; 2392 } 2393 return r_sector; 2394 } 2395 2396 2397 static void 2398 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2399 int rcw, int expand) 2400 { 2401 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2402 struct r5conf *conf = sh->raid_conf; 2403 int level = conf->level; 2404 2405 if (rcw) { 2406 2407 for (i = disks; i--; ) { 2408 struct r5dev *dev = &sh->dev[i]; 2409 2410 if (dev->towrite) { 2411 set_bit(R5_LOCKED, &dev->flags); 2412 set_bit(R5_Wantdrain, &dev->flags); 2413 if (!expand) 2414 clear_bit(R5_UPTODATE, &dev->flags); 2415 s->locked++; 2416 } 2417 } 2418 /* if we are not expanding this is a proper write request, and 2419 * there will be bios with new data to be drained into the 2420 * stripe cache 2421 */ 2422 if (!expand) { 2423 if (!s->locked) 2424 /* False alarm, nothing to do */ 2425 return; 
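		/* Illustrative summary only (the authoritative sequencing is in
		 * raid_run_ops()): the drain op scheduled here copies the new
		 * data out of the queued bios into the stripe cache pages, and
		 * the reconstruct op scheduled just below recomputes the parity
		 * block(s) over the whole stripe.  The written blocks and the
		 * parity block(s) stay R5_LOCKED until the result is on disk.
		 */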
2426 sh->reconstruct_state = reconstruct_state_drain_run; 2427 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2428 } else 2429 sh->reconstruct_state = reconstruct_state_run; 2430 2431 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2432 2433 if (s->locked + conf->max_degraded == disks) 2434 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2435 atomic_inc(&conf->pending_full_writes); 2436 } else { 2437 BUG_ON(level == 6); 2438 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2439 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2440 2441 for (i = disks; i--; ) { 2442 struct r5dev *dev = &sh->dev[i]; 2443 if (i == pd_idx) 2444 continue; 2445 2446 if (dev->towrite && 2447 (test_bit(R5_UPTODATE, &dev->flags) || 2448 test_bit(R5_Wantcompute, &dev->flags))) { 2449 set_bit(R5_Wantdrain, &dev->flags); 2450 set_bit(R5_LOCKED, &dev->flags); 2451 clear_bit(R5_UPTODATE, &dev->flags); 2452 s->locked++; 2453 } 2454 } 2455 if (!s->locked) 2456 /* False alarm - nothing to do */ 2457 return; 2458 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2459 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2460 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2461 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2462 } 2463 2464 /* keep the parity disk(s) locked while asynchronous operations 2465 * are in flight 2466 */ 2467 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2468 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2469 s->locked++; 2470 2471 if (level == 6) { 2472 int qd_idx = sh->qd_idx; 2473 struct r5dev *dev = &sh->dev[qd_idx]; 2474 2475 set_bit(R5_LOCKED, &dev->flags); 2476 clear_bit(R5_UPTODATE, &dev->flags); 2477 s->locked++; 2478 } 2479 2480 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2481 __func__, (unsigned long long)sh->sector, 2482 s->locked, s->ops_request); 2483 } 2484 2485 /* 2486 * Each stripe/dev can have one or more bion attached. 2487 * toread/towrite point to the first in a chain. 2488 * The bi_next chain must be in order. 2489 */ 2490 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2491 { 2492 struct bio **bip; 2493 struct r5conf *conf = sh->raid_conf; 2494 int firstwrite=0; 2495 2496 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2497 (unsigned long long)bi->bi_sector, 2498 (unsigned long long)sh->sector); 2499 2500 /* 2501 * If several bio share a stripe. The bio bi_phys_segments acts as a 2502 * reference count to avoid race. The reference count should already be 2503 * increased before this function is called (for example, in 2504 * make_request()), so other bio sharing this stripe will not free the 2505 * stripe. If a stripe is owned by one stripe, the stripe lock will 2506 * protect it. 
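 *
 * A minimal sketch of the intended reference-count pattern, illustrative
 * only (the real callers are make_request() and the end_io paths):
 *
 *	bi->bi_phys_segments = 1;                   - submitter's reference
 *	for each stripe+device covered by bi:
 *		add_stripe_bio(sh, bi, dd_idx, forwrite);  - one ref per stripe
 *	if (raid5_dec_bi_active_stripes(bi) == 0)   - drop submitter's reference
 *		bio_endio(bi, 0);                   - every stripe has finished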
2507 */ 2508 spin_lock_irq(&sh->stripe_lock); 2509 if (forwrite) { 2510 bip = &sh->dev[dd_idx].towrite; 2511 if (*bip == NULL) 2512 firstwrite = 1; 2513 } else 2514 bip = &sh->dev[dd_idx].toread; 2515 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2516 if (bio_end_sector(*bip) > bi->bi_sector) 2517 goto overlap; 2518 bip = & (*bip)->bi_next; 2519 } 2520 if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) 2521 goto overlap; 2522 2523 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2524 if (*bip) 2525 bi->bi_next = *bip; 2526 *bip = bi; 2527 raid5_inc_bi_active_stripes(bi); 2528 2529 if (forwrite) { 2530 /* check if page is covered */ 2531 sector_t sector = sh->dev[dd_idx].sector; 2532 for (bi=sh->dev[dd_idx].towrite; 2533 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2534 bi && bi->bi_sector <= sector; 2535 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2536 if (bio_end_sector(bi) >= sector) 2537 sector = bio_end_sector(bi); 2538 } 2539 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2540 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2541 } 2542 2543 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2544 (unsigned long long)(*bip)->bi_sector, 2545 (unsigned long long)sh->sector, dd_idx); 2546 spin_unlock_irq(&sh->stripe_lock); 2547 2548 if (conf->mddev->bitmap && firstwrite) { 2549 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2550 STRIPE_SECTORS, 0); 2551 sh->bm_seq = conf->seq_flush+1; 2552 set_bit(STRIPE_BIT_DELAY, &sh->state); 2553 } 2554 return 1; 2555 2556 overlap: 2557 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2558 spin_unlock_irq(&sh->stripe_lock); 2559 return 0; 2560 } 2561 2562 static void end_reshape(struct r5conf *conf); 2563 2564 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2565 struct stripe_head *sh) 2566 { 2567 int sectors_per_chunk = 2568 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2569 int dd_idx; 2570 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2571 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2572 2573 raid5_compute_sector(conf, 2574 stripe * (disks - conf->max_degraded) 2575 *sectors_per_chunk + chunk_offset, 2576 previous, 2577 &dd_idx, sh); 2578 } 2579 2580 static void 2581 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2582 struct stripe_head_state *s, int disks, 2583 struct bio **return_bi) 2584 { 2585 int i; 2586 for (i = disks; i--; ) { 2587 struct bio *bi; 2588 int bitmap_end = 0; 2589 2590 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2591 struct md_rdev *rdev; 2592 rcu_read_lock(); 2593 rdev = rcu_dereference(conf->disks[i].rdev); 2594 if (rdev && test_bit(In_sync, &rdev->flags)) 2595 atomic_inc(&rdev->nr_pending); 2596 else 2597 rdev = NULL; 2598 rcu_read_unlock(); 2599 if (rdev) { 2600 if (!rdev_set_badblocks( 2601 rdev, 2602 sh->sector, 2603 STRIPE_SECTORS, 0)) 2604 md_error(conf->mddev, rdev); 2605 rdev_dec_pending(rdev, conf->mddev); 2606 } 2607 } 2608 spin_lock_irq(&sh->stripe_lock); 2609 /* fail all writes first */ 2610 bi = sh->dev[i].towrite; 2611 sh->dev[i].towrite = NULL; 2612 spin_unlock_irq(&sh->stripe_lock); 2613 if (bi) 2614 bitmap_end = 1; 2615 2616 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2617 wake_up(&conf->wait_for_overlap); 2618 2619 while (bi && bi->bi_sector < 2620 sh->dev[i].sector + STRIPE_SECTORS) { 2621 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2622 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2623 if (!raid5_dec_bi_active_stripes(bi)) { 2624 md_write_end(conf->mddev); 2625 bi->bi_next = *return_bi; 2626 *return_bi = bi; 2627 } 2628 bi = nextbi; 2629 } 2630 if (bitmap_end) 2631 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2632 STRIPE_SECTORS, 0, 0); 2633 bitmap_end = 0; 2634 /* and fail all 'written' */ 2635 bi = sh->dev[i].written; 2636 sh->dev[i].written = NULL; 2637 if (bi) bitmap_end = 1; 2638 while (bi && bi->bi_sector < 2639 sh->dev[i].sector + STRIPE_SECTORS) { 2640 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2641 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2642 if (!raid5_dec_bi_active_stripes(bi)) { 2643 md_write_end(conf->mddev); 2644 bi->bi_next = *return_bi; 2645 *return_bi = bi; 2646 } 2647 bi = bi2; 2648 } 2649 2650 /* fail any reads if this device is non-operational and 2651 * the data has not reached the cache yet. 
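 * Reads that were queued against a device we can no longer trust are
 * completed with an error and handed back through *return_bi below.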
2652 */ 2653 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2654 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2655 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2656 spin_lock_irq(&sh->stripe_lock); 2657 bi = sh->dev[i].toread; 2658 sh->dev[i].toread = NULL; 2659 spin_unlock_irq(&sh->stripe_lock); 2660 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2661 wake_up(&conf->wait_for_overlap); 2662 while (bi && bi->bi_sector < 2663 sh->dev[i].sector + STRIPE_SECTORS) { 2664 struct bio *nextbi = 2665 r5_next_bio(bi, sh->dev[i].sector); 2666 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2667 if (!raid5_dec_bi_active_stripes(bi)) { 2668 bi->bi_next = *return_bi; 2669 *return_bi = bi; 2670 } 2671 bi = nextbi; 2672 } 2673 } 2674 if (bitmap_end) 2675 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2676 STRIPE_SECTORS, 0, 0); 2677 /* If we were in the middle of a write the parity block might 2678 * still be locked - so just clear all R5_LOCKED flags 2679 */ 2680 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2681 } 2682 2683 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2684 if (atomic_dec_and_test(&conf->pending_full_writes)) 2685 md_wakeup_thread(conf->mddev->thread); 2686 } 2687 2688 static void 2689 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2690 struct stripe_head_state *s) 2691 { 2692 int abort = 0; 2693 int i; 2694 2695 clear_bit(STRIPE_SYNCING, &sh->state); 2696 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 2697 wake_up(&conf->wait_for_overlap); 2698 s->syncing = 0; 2699 s->replacing = 0; 2700 /* There is nothing more to do for sync/check/repair. 2701 * Don't even need to abort as that is handled elsewhere 2702 * if needed, and not always wanted e.g. if there is a known 2703 * bad block here. 2704 * For recover/replace we need to record a bad block on all 2705 * non-sync devices, or abort the recovery 2706 */ 2707 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2708 /* During recovery devices cannot be removed, so 2709 * locking and refcounting of rdevs is not needed 2710 */ 2711 for (i = 0; i < conf->raid_disks; i++) { 2712 struct md_rdev *rdev = conf->disks[i].rdev; 2713 if (rdev 2714 && !test_bit(Faulty, &rdev->flags) 2715 && !test_bit(In_sync, &rdev->flags) 2716 && !rdev_set_badblocks(rdev, sh->sector, 2717 STRIPE_SECTORS, 0)) 2718 abort = 1; 2719 rdev = conf->disks[i].replacement; 2720 if (rdev 2721 && !test_bit(Faulty, &rdev->flags) 2722 && !test_bit(In_sync, &rdev->flags) 2723 && !rdev_set_badblocks(rdev, sh->sector, 2724 STRIPE_SECTORS, 0)) 2725 abort = 1; 2726 } 2727 if (abort) 2728 conf->recovery_disabled = 2729 conf->mddev->recovery_disabled; 2730 } 2731 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2732 } 2733 2734 static int want_replace(struct stripe_head *sh, int disk_idx) 2735 { 2736 struct md_rdev *rdev; 2737 int rv = 0; 2738 /* Doing recovery so rcu locking not required */ 2739 rdev = sh->raid_conf->disks[disk_idx].replacement; 2740 if (rdev 2741 && !test_bit(Faulty, &rdev->flags) 2742 && !test_bit(In_sync, &rdev->flags) 2743 && (rdev->recovery_offset <= sh->sector 2744 || rdev->mddev->recovery_cp <= sh->sector)) 2745 rv = 1; 2746 2747 return rv; 2748 } 2749 2750 /* fetch_block - checks the given member device to see if its data needs 2751 * to be read or computed to satisfy a request. 
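 * Roughly speaking (the precise rule is the condition below), a block is
 * wanted when there is a pending read for it, when it is needed to
 * complete a partial (non-overwrite) write, when the stripe is being
 * synced, expanded or replaced, or when it is needed to rebuild data for
 * a failed member.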
2752 * 2753 * Returns 1 when no more member devices need to be checked, otherwise returns 2754 * 0 to tell the loop in handle_stripe_fill to continue 2755 */ 2756 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2757 int disk_idx, int disks) 2758 { 2759 struct r5dev *dev = &sh->dev[disk_idx]; 2760 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2761 &sh->dev[s->failed_num[1]] }; 2762 2763 /* is the data in this block needed, and can we get it? */ 2764 if (!test_bit(R5_LOCKED, &dev->flags) && 2765 !test_bit(R5_UPTODATE, &dev->flags) && 2766 (dev->toread || 2767 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2768 s->syncing || s->expanding || 2769 (s->replacing && want_replace(sh, disk_idx)) || 2770 (s->failed >= 1 && fdev[0]->toread) || 2771 (s->failed >= 2 && fdev[1]->toread) || 2772 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2773 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2774 (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2775 /* we would like to get this block, possibly by computing it, 2776 * otherwise read it if the backing disk is insync 2777 */ 2778 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2779 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2780 if ((s->uptodate == disks - 1) && 2781 (s->failed && (disk_idx == s->failed_num[0] || 2782 disk_idx == s->failed_num[1]))) { 2783 /* have disk failed, and we're requested to fetch it; 2784 * do compute it 2785 */ 2786 pr_debug("Computing stripe %llu block %d\n", 2787 (unsigned long long)sh->sector, disk_idx); 2788 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2789 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2790 set_bit(R5_Wantcompute, &dev->flags); 2791 sh->ops.target = disk_idx; 2792 sh->ops.target2 = -1; /* no 2nd target */ 2793 s->req_compute = 1; 2794 /* Careful: from this point on 'uptodate' is in the eye 2795 * of raid_run_ops which services 'compute' operations 2796 * before writes. R5_Wantcompute flags a block that will 2797 * be R5_UPTODATE by the time it is needed for a 2798 * subsequent operation. 2799 */ 2800 s->uptodate++; 2801 return 1; 2802 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2803 /* Computing 2-failure is *very* expensive; only 2804 * do it if failed >= 2 2805 */ 2806 int other; 2807 for (other = disks; other--; ) { 2808 if (other == disk_idx) 2809 continue; 2810 if (!test_bit(R5_UPTODATE, 2811 &sh->dev[other].flags)) 2812 break; 2813 } 2814 BUG_ON(other < 0); 2815 pr_debug("Computing stripe %llu blocks %d,%d\n", 2816 (unsigned long long)sh->sector, 2817 disk_idx, other); 2818 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2819 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2820 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2821 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2822 sh->ops.target = disk_idx; 2823 sh->ops.target2 = other; 2824 s->uptodate += 2; 2825 s->req_compute = 1; 2826 return 1; 2827 } else if (test_bit(R5_Insync, &dev->flags)) { 2828 set_bit(R5_LOCKED, &dev->flags); 2829 set_bit(R5_Wantread, &dev->flags); 2830 s->locked++; 2831 pr_debug("Reading block %d (sync=%d)\n", 2832 disk_idx, s->syncing); 2833 } 2834 } 2835 2836 return 0; 2837 } 2838 2839 /** 2840 * handle_stripe_fill - read or compute data to satisfy pending requests. 
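 * @sh:    stripe being handled
 * @s:     stripe state as gathered by analyse_stripe()
 * @disks: number of devices in the stripe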
2841 */ 2842 static void handle_stripe_fill(struct stripe_head *sh, 2843 struct stripe_head_state *s, 2844 int disks) 2845 { 2846 int i; 2847 2848 /* look for blocks to read/compute, skip this if a compute 2849 * is already in flight, or if the stripe contents are in the 2850 * midst of changing due to a write 2851 */ 2852 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2853 !sh->reconstruct_state) 2854 for (i = disks; i--; ) 2855 if (fetch_block(sh, s, i, disks)) 2856 break; 2857 set_bit(STRIPE_HANDLE, &sh->state); 2858 } 2859 2860 2861 /* handle_stripe_clean_event 2862 * any written block on an uptodate or failed drive can be returned. 2863 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2864 * never LOCKED, so we don't need to test 'failed' directly. 2865 */ 2866 static void handle_stripe_clean_event(struct r5conf *conf, 2867 struct stripe_head *sh, int disks, struct bio **return_bi) 2868 { 2869 int i; 2870 struct r5dev *dev; 2871 int discard_pending = 0; 2872 2873 for (i = disks; i--; ) 2874 if (sh->dev[i].written) { 2875 dev = &sh->dev[i]; 2876 if (!test_bit(R5_LOCKED, &dev->flags) && 2877 (test_bit(R5_UPTODATE, &dev->flags) || 2878 test_bit(R5_Discard, &dev->flags))) { 2879 /* We can return any write requests */ 2880 struct bio *wbi, *wbi2; 2881 pr_debug("Return write for disc %d\n", i); 2882 if (test_and_clear_bit(R5_Discard, &dev->flags)) 2883 clear_bit(R5_UPTODATE, &dev->flags); 2884 wbi = dev->written; 2885 dev->written = NULL; 2886 while (wbi && wbi->bi_sector < 2887 dev->sector + STRIPE_SECTORS) { 2888 wbi2 = r5_next_bio(wbi, dev->sector); 2889 if (!raid5_dec_bi_active_stripes(wbi)) { 2890 md_write_end(conf->mddev); 2891 wbi->bi_next = *return_bi; 2892 *return_bi = wbi; 2893 } 2894 wbi = wbi2; 2895 } 2896 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2897 STRIPE_SECTORS, 2898 !test_bit(STRIPE_DEGRADED, &sh->state), 2899 0); 2900 } else if (test_bit(R5_Discard, &dev->flags)) 2901 discard_pending = 1; 2902 } 2903 if (!discard_pending && 2904 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 2905 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 2906 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2907 if (sh->qd_idx >= 0) { 2908 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 2909 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 2910 } 2911 /* now that discard is done we can proceed with any sync */ 2912 clear_bit(STRIPE_DISCARD, &sh->state); 2913 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 2914 set_bit(STRIPE_HANDLE, &sh->state); 2915 2916 } 2917 2918 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2919 if (atomic_dec_and_test(&conf->pending_full_writes)) 2920 md_wakeup_thread(conf->mddev->thread); 2921 } 2922 2923 static void handle_stripe_dirtying(struct r5conf *conf, 2924 struct stripe_head *sh, 2925 struct stripe_head_state *s, 2926 int disks) 2927 { 2928 int rmw = 0, rcw = 0, i; 2929 sector_t recovery_cp = conf->mddev->recovery_cp; 2930 2931 /* RAID6 requires 'rcw' in current implementation. 2932 * Otherwise, check whether resync is now happening or should start. 2933 * If yes, then the array is dirty (after unclean shutdown or 2934 * initial creation), so parity in some stripes might be inconsistent. 2935 * In this case, we need to always do reconstruct-write, to ensure 2936 * that in case of drive failure or read-error correction, we 2937 * generate correct data from the parity. 
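 *
 * Otherwise the cheaper scheme is chosen below by counting how many
 * blocks each one would first have to read.  A rough worked example for
 * a 5-drive RAID5 (4 data + parity) with nothing cached: fully
 * overwriting one data block costs 2 reads for read-modify-write (old
 * data + old parity) but 3 reads for reconstruct-write (the untouched
 * data blocks), so rmw wins; overwriting three data blocks makes it
 * 4 reads versus 1, so rcw wins.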
2938 */ 2939 if (conf->max_degraded == 2 || 2940 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { 2941 /* Calculate the real rcw later - for now make it 2942 * look like rcw is cheaper 2943 */ 2944 rcw = 1; rmw = 2; 2945 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 2946 conf->max_degraded, (unsigned long long)recovery_cp, 2947 (unsigned long long)sh->sector); 2948 } else for (i = disks; i--; ) { 2949 /* would I have to read this buffer for read_modify_write */ 2950 struct r5dev *dev = &sh->dev[i]; 2951 if ((dev->towrite || i == sh->pd_idx) && 2952 !test_bit(R5_LOCKED, &dev->flags) && 2953 !(test_bit(R5_UPTODATE, &dev->flags) || 2954 test_bit(R5_Wantcompute, &dev->flags))) { 2955 if (test_bit(R5_Insync, &dev->flags)) 2956 rmw++; 2957 else 2958 rmw += 2*disks; /* cannot read it */ 2959 } 2960 /* Would I have to read this buffer for reconstruct_write */ 2961 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2962 !test_bit(R5_LOCKED, &dev->flags) && 2963 !(test_bit(R5_UPTODATE, &dev->flags) || 2964 test_bit(R5_Wantcompute, &dev->flags))) { 2965 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2966 else 2967 rcw += 2*disks; 2968 } 2969 } 2970 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2971 (unsigned long long)sh->sector, rmw, rcw); 2972 set_bit(STRIPE_HANDLE, &sh->state); 2973 if (rmw < rcw && rmw > 0) { 2974 /* prefer read-modify-write, but need to get some data */ 2975 if (conf->mddev->queue) 2976 blk_add_trace_msg(conf->mddev->queue, 2977 "raid5 rmw %llu %d", 2978 (unsigned long long)sh->sector, rmw); 2979 for (i = disks; i--; ) { 2980 struct r5dev *dev = &sh->dev[i]; 2981 if ((dev->towrite || i == sh->pd_idx) && 2982 !test_bit(R5_LOCKED, &dev->flags) && 2983 !(test_bit(R5_UPTODATE, &dev->flags) || 2984 test_bit(R5_Wantcompute, &dev->flags)) && 2985 test_bit(R5_Insync, &dev->flags)) { 2986 if ( 2987 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2988 pr_debug("Read_old block " 2989 "%d for r-m-w\n", i); 2990 set_bit(R5_LOCKED, &dev->flags); 2991 set_bit(R5_Wantread, &dev->flags); 2992 s->locked++; 2993 } else { 2994 set_bit(STRIPE_DELAYED, &sh->state); 2995 set_bit(STRIPE_HANDLE, &sh->state); 2996 } 2997 } 2998 } 2999 } 3000 if (rcw <= rmw && rcw > 0) { 3001 /* want reconstruct write, but need to get some data */ 3002 int qread =0; 3003 rcw = 0; 3004 for (i = disks; i--; ) { 3005 struct r5dev *dev = &sh->dev[i]; 3006 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3007 i != sh->pd_idx && i != sh->qd_idx && 3008 !test_bit(R5_LOCKED, &dev->flags) && 3009 !(test_bit(R5_UPTODATE, &dev->flags) || 3010 test_bit(R5_Wantcompute, &dev->flags))) { 3011 rcw++; 3012 if (!test_bit(R5_Insync, &dev->flags)) 3013 continue; /* it's a failed drive */ 3014 if ( 3015 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3016 pr_debug("Read_old block " 3017 "%d for Reconstruct\n", i); 3018 set_bit(R5_LOCKED, &dev->flags); 3019 set_bit(R5_Wantread, &dev->flags); 3020 s->locked++; 3021 qread++; 3022 } else { 3023 set_bit(STRIPE_DELAYED, &sh->state); 3024 set_bit(STRIPE_HANDLE, &sh->state); 3025 } 3026 } 3027 } 3028 if (rcw && conf->mddev->queue) 3029 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3030 (unsigned long long)sh->sector, 3031 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3032 } 3033 /* now if nothing is locked, and if we have enough data, 3034 * we can start a write request 3035 */ 3036 /* since handle_stripe can be called at any time we need to handle the 3037 * case where a compute block operation has been submitted and then a 3038 * 
subsequent call wants to start a write request. raid_run_ops only 3039 * handles the case where compute block and reconstruct are requested 3040 * simultaneously. If this is not the case then new writes need to be 3041 * held off until the compute completes. 3042 */ 3043 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 3044 (s->locked == 0 && (rcw == 0 || rmw == 0) && 3045 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 3046 schedule_reconstruction(sh, s, rcw == 0, 0); 3047 } 3048 3049 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 3050 struct stripe_head_state *s, int disks) 3051 { 3052 struct r5dev *dev = NULL; 3053 3054 set_bit(STRIPE_HANDLE, &sh->state); 3055 3056 switch (sh->check_state) { 3057 case check_state_idle: 3058 /* start a new check operation if there are no failures */ 3059 if (s->failed == 0) { 3060 BUG_ON(s->uptodate != disks); 3061 sh->check_state = check_state_run; 3062 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3063 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3064 s->uptodate--; 3065 break; 3066 } 3067 dev = &sh->dev[s->failed_num[0]]; 3068 /* fall through */ 3069 case check_state_compute_result: 3070 sh->check_state = check_state_idle; 3071 if (!dev) 3072 dev = &sh->dev[sh->pd_idx]; 3073 3074 /* check that a write has not made the stripe insync */ 3075 if (test_bit(STRIPE_INSYNC, &sh->state)) 3076 break; 3077 3078 /* either failed parity check, or recovery is happening */ 3079 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3080 BUG_ON(s->uptodate != disks); 3081 3082 set_bit(R5_LOCKED, &dev->flags); 3083 s->locked++; 3084 set_bit(R5_Wantwrite, &dev->flags); 3085 3086 clear_bit(STRIPE_DEGRADED, &sh->state); 3087 set_bit(STRIPE_INSYNC, &sh->state); 3088 break; 3089 case check_state_run: 3090 break; /* we will be called again upon completion */ 3091 case check_state_check_result: 3092 sh->check_state = check_state_idle; 3093 3094 /* if a failure occurred during the check operation, leave 3095 * STRIPE_INSYNC not set and let the stripe be handled again 3096 */ 3097 if (s->failed) 3098 break; 3099 3100 /* handle a successful check operation, if parity is correct 3101 * we are done. Otherwise update the mismatch count and repair 3102 * parity if !MD_RECOVERY_CHECK 3103 */ 3104 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 3105 /* parity is correct (on disc, 3106 * not in buffer any more) 3107 */ 3108 set_bit(STRIPE_INSYNC, &sh->state); 3109 else { 3110 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3111 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3112 /* don't try to repair!! 
*/ 3113 set_bit(STRIPE_INSYNC, &sh->state); 3114 else { 3115 sh->check_state = check_state_compute_run; 3116 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3117 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3118 set_bit(R5_Wantcompute, 3119 &sh->dev[sh->pd_idx].flags); 3120 sh->ops.target = sh->pd_idx; 3121 sh->ops.target2 = -1; 3122 s->uptodate++; 3123 } 3124 } 3125 break; 3126 case check_state_compute_run: 3127 break; 3128 default: 3129 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3130 __func__, sh->check_state, 3131 (unsigned long long) sh->sector); 3132 BUG(); 3133 } 3134 } 3135 3136 3137 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3138 struct stripe_head_state *s, 3139 int disks) 3140 { 3141 int pd_idx = sh->pd_idx; 3142 int qd_idx = sh->qd_idx; 3143 struct r5dev *dev; 3144 3145 set_bit(STRIPE_HANDLE, &sh->state); 3146 3147 BUG_ON(s->failed > 2); 3148 3149 /* Want to check and possibly repair P and Q. 3150 * However there could be one 'failed' device, in which 3151 * case we can only check one of them, possibly using the 3152 * other to generate missing data 3153 */ 3154 3155 switch (sh->check_state) { 3156 case check_state_idle: 3157 /* start a new check operation if there are < 2 failures */ 3158 if (s->failed == s->q_failed) { 3159 /* The only possible failed device holds Q, so it 3160 * makes sense to check P (If anything else were failed, 3161 * we would have used P to recreate it). 3162 */ 3163 sh->check_state = check_state_run; 3164 } 3165 if (!s->q_failed && s->failed < 2) { 3166 /* Q is not failed, and we didn't use it to generate 3167 * anything, so it makes sense to check it 3168 */ 3169 if (sh->check_state == check_state_run) 3170 sh->check_state = check_state_run_pq; 3171 else 3172 sh->check_state = check_state_run_q; 3173 } 3174 3175 /* discard potentially stale zero_sum_result */ 3176 sh->ops.zero_sum_result = 0; 3177 3178 if (sh->check_state == check_state_run) { 3179 /* async_xor_zero_sum destroys the contents of P */ 3180 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3181 s->uptodate--; 3182 } 3183 if (sh->check_state >= check_state_run && 3184 sh->check_state <= check_state_run_pq) { 3185 /* async_syndrome_zero_sum preserves P and Q, so 3186 * no need to mark them !uptodate here 3187 */ 3188 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3189 break; 3190 } 3191 3192 /* we have 2-disk failure */ 3193 BUG_ON(s->failed != 2); 3194 /* fall through */ 3195 case check_state_compute_result: 3196 sh->check_state = check_state_idle; 3197 3198 /* check that a write has not made the stripe insync */ 3199 if (test_bit(STRIPE_INSYNC, &sh->state)) 3200 break; 3201 3202 /* now write out any block on a failed drive, 3203 * or P or Q if they were recomputed 3204 */ 3205 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3206 if (s->failed == 2) { 3207 dev = &sh->dev[s->failed_num[1]]; 3208 s->locked++; 3209 set_bit(R5_LOCKED, &dev->flags); 3210 set_bit(R5_Wantwrite, &dev->flags); 3211 } 3212 if (s->failed >= 1) { 3213 dev = &sh->dev[s->failed_num[0]]; 3214 s->locked++; 3215 set_bit(R5_LOCKED, &dev->flags); 3216 set_bit(R5_Wantwrite, &dev->flags); 3217 } 3218 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3219 dev = &sh->dev[pd_idx]; 3220 s->locked++; 3221 set_bit(R5_LOCKED, &dev->flags); 3222 set_bit(R5_Wantwrite, &dev->flags); 3223 } 3224 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3225 dev = &sh->dev[qd_idx]; 3226 s->locked++; 3227 set_bit(R5_LOCKED, &dev->flags); 3228 set_bit(R5_Wantwrite, &dev->flags); 
3229 } 3230 clear_bit(STRIPE_DEGRADED, &sh->state); 3231 3232 set_bit(STRIPE_INSYNC, &sh->state); 3233 break; 3234 case check_state_run: 3235 case check_state_run_q: 3236 case check_state_run_pq: 3237 break; /* we will be called again upon completion */ 3238 case check_state_check_result: 3239 sh->check_state = check_state_idle; 3240 3241 /* handle a successful check operation, if parity is correct 3242 * we are done. Otherwise update the mismatch count and repair 3243 * parity if !MD_RECOVERY_CHECK 3244 */ 3245 if (sh->ops.zero_sum_result == 0) { 3246 /* both parities are correct */ 3247 if (!s->failed) 3248 set_bit(STRIPE_INSYNC, &sh->state); 3249 else { 3250 /* in contrast to the raid5 case we can validate 3251 * parity, but still have a failure to write 3252 * back 3253 */ 3254 sh->check_state = check_state_compute_result; 3255 /* Returning at this point means that we may go 3256 * off and bring p and/or q uptodate again so 3257 * we make sure to check zero_sum_result again 3258 * to verify if p or q need writeback 3259 */ 3260 } 3261 } else { 3262 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3263 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3264 /* don't try to repair!! */ 3265 set_bit(STRIPE_INSYNC, &sh->state); 3266 else { 3267 int *target = &sh->ops.target; 3268 3269 sh->ops.target = -1; 3270 sh->ops.target2 = -1; 3271 sh->check_state = check_state_compute_run; 3272 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3273 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3274 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3275 set_bit(R5_Wantcompute, 3276 &sh->dev[pd_idx].flags); 3277 *target = pd_idx; 3278 target = &sh->ops.target2; 3279 s->uptodate++; 3280 } 3281 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3282 set_bit(R5_Wantcompute, 3283 &sh->dev[qd_idx].flags); 3284 *target = qd_idx; 3285 s->uptodate++; 3286 } 3287 } 3288 } 3289 break; 3290 case check_state_compute_run: 3291 break; 3292 default: 3293 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3294 __func__, sh->check_state, 3295 (unsigned long long) sh->sector); 3296 BUG(); 3297 } 3298 } 3299 3300 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3301 { 3302 int i; 3303 3304 /* We have read all the blocks in this stripe and now we need to 3305 * copy some of them into a target stripe for expand. 3306 */ 3307 struct dma_async_tx_descriptor *tx = NULL; 3308 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3309 for (i = 0; i < sh->disks; i++) 3310 if (i != sh->pd_idx && i != sh->qd_idx) { 3311 int dd_idx, j; 3312 struct stripe_head *sh2; 3313 struct async_submit_ctl submit; 3314 3315 sector_t bn = compute_blocknr(sh, i, 1); 3316 sector_t s = raid5_compute_sector(conf, bn, 0, 3317 &dd_idx, NULL); 3318 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3319 if (sh2 == NULL) 3320 /* so far only the early blocks of this stripe 3321 * have been requested. 
When later blocks 3322 * get requested, we will try again 3323 */ 3324 continue; 3325 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3326 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3327 /* must have already done this block */ 3328 release_stripe(sh2); 3329 continue; 3330 } 3331 3332 /* place all the copies on one channel */ 3333 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3334 tx = async_memcpy(sh2->dev[dd_idx].page, 3335 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3336 &submit); 3337 3338 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3339 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3340 for (j = 0; j < conf->raid_disks; j++) 3341 if (j != sh2->pd_idx && 3342 j != sh2->qd_idx && 3343 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3344 break; 3345 if (j == conf->raid_disks) { 3346 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3347 set_bit(STRIPE_HANDLE, &sh2->state); 3348 } 3349 release_stripe(sh2); 3350 3351 } 3352 /* done submitting copies, wait for them to complete */ 3353 async_tx_quiesce(&tx); 3354 } 3355 3356 /* 3357 * handle_stripe - do things to a stripe. 3358 * 3359 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3360 * state of various bits to see what needs to be done. 3361 * Possible results: 3362 * return some read requests which now have data 3363 * return some write requests which are safely on storage 3364 * schedule a read on some buffers 3365 * schedule a write of some buffers 3366 * return confirmation of parity correctness 3367 * 3368 */ 3369 3370 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3371 { 3372 struct r5conf *conf = sh->raid_conf; 3373 int disks = sh->disks; 3374 struct r5dev *dev; 3375 int i; 3376 int do_recovery = 0; 3377 3378 memset(s, 0, sizeof(*s)); 3379 3380 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3381 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3382 s->failed_num[0] = -1; 3383 s->failed_num[1] = -1; 3384 3385 /* Now to look around and see what can be done */ 3386 rcu_read_lock(); 3387 for (i=disks; i--; ) { 3388 struct md_rdev *rdev; 3389 sector_t first_bad; 3390 int bad_sectors; 3391 int is_bad = 0; 3392 3393 dev = &sh->dev[i]; 3394 3395 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3396 i, dev->flags, 3397 dev->toread, dev->towrite, dev->written); 3398 /* maybe we can reply to a read 3399 * 3400 * new wantfill requests are only permitted while 3401 * ops_complete_biofill is guaranteed to be inactive 3402 */ 3403 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3404 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3405 set_bit(R5_Wantfill, &dev->flags); 3406 3407 /* now count some things */ 3408 if (test_bit(R5_LOCKED, &dev->flags)) 3409 s->locked++; 3410 if (test_bit(R5_UPTODATE, &dev->flags)) 3411 s->uptodate++; 3412 if (test_bit(R5_Wantcompute, &dev->flags)) { 3413 s->compute++; 3414 BUG_ON(s->compute > 2); 3415 } 3416 3417 if (test_bit(R5_Wantfill, &dev->flags)) 3418 s->to_fill++; 3419 else if (dev->toread) 3420 s->to_read++; 3421 if (dev->towrite) { 3422 s->to_write++; 3423 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3424 s->non_overwrite++; 3425 } 3426 if (dev->written) 3427 s->written++; 3428 /* Prefer to use the replacement for reads, but only 3429 * if it is recovered enough and has no bad blocks. 
3430 */ 3431 rdev = rcu_dereference(conf->disks[i].replacement); 3432 if (rdev && !test_bit(Faulty, &rdev->flags) && 3433 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3434 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3435 &first_bad, &bad_sectors)) 3436 set_bit(R5_ReadRepl, &dev->flags); 3437 else { 3438 if (rdev) 3439 set_bit(R5_NeedReplace, &dev->flags); 3440 rdev = rcu_dereference(conf->disks[i].rdev); 3441 clear_bit(R5_ReadRepl, &dev->flags); 3442 } 3443 if (rdev && test_bit(Faulty, &rdev->flags)) 3444 rdev = NULL; 3445 if (rdev) { 3446 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3447 &first_bad, &bad_sectors); 3448 if (s->blocked_rdev == NULL 3449 && (test_bit(Blocked, &rdev->flags) 3450 || is_bad < 0)) { 3451 if (is_bad < 0) 3452 set_bit(BlockedBadBlocks, 3453 &rdev->flags); 3454 s->blocked_rdev = rdev; 3455 atomic_inc(&rdev->nr_pending); 3456 } 3457 } 3458 clear_bit(R5_Insync, &dev->flags); 3459 if (!rdev) 3460 /* Not in-sync */; 3461 else if (is_bad) { 3462 /* also not in-sync */ 3463 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3464 test_bit(R5_UPTODATE, &dev->flags)) { 3465 /* treat as in-sync, but with a read error 3466 * which we can now try to correct 3467 */ 3468 set_bit(R5_Insync, &dev->flags); 3469 set_bit(R5_ReadError, &dev->flags); 3470 } 3471 } else if (test_bit(In_sync, &rdev->flags)) 3472 set_bit(R5_Insync, &dev->flags); 3473 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3474 /* in sync if before recovery_offset */ 3475 set_bit(R5_Insync, &dev->flags); 3476 else if (test_bit(R5_UPTODATE, &dev->flags) && 3477 test_bit(R5_Expanded, &dev->flags)) 3478 /* If we've reshaped into here, we assume it is Insync. 3479 * We will shortly update recovery_offset to make 3480 * it official. 3481 */ 3482 set_bit(R5_Insync, &dev->flags); 3483 3484 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3485 /* This flag does not apply to '.replacement' 3486 * only to .rdev, so make sure to check that*/ 3487 struct md_rdev *rdev2 = rcu_dereference( 3488 conf->disks[i].rdev); 3489 if (rdev2 == rdev) 3490 clear_bit(R5_Insync, &dev->flags); 3491 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3492 s->handle_bad_blocks = 1; 3493 atomic_inc(&rdev2->nr_pending); 3494 } else 3495 clear_bit(R5_WriteError, &dev->flags); 3496 } 3497 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3498 /* This flag does not apply to '.replacement' 3499 * only to .rdev, so make sure to check that*/ 3500 struct md_rdev *rdev2 = rcu_dereference( 3501 conf->disks[i].rdev); 3502 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3503 s->handle_bad_blocks = 1; 3504 atomic_inc(&rdev2->nr_pending); 3505 } else 3506 clear_bit(R5_MadeGood, &dev->flags); 3507 } 3508 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3509 struct md_rdev *rdev2 = rcu_dereference( 3510 conf->disks[i].replacement); 3511 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3512 s->handle_bad_blocks = 1; 3513 atomic_inc(&rdev2->nr_pending); 3514 } else 3515 clear_bit(R5_MadeGoodRepl, &dev->flags); 3516 } 3517 if (!test_bit(R5_Insync, &dev->flags)) { 3518 /* The ReadError flag will just be confusing now */ 3519 clear_bit(R5_ReadError, &dev->flags); 3520 clear_bit(R5_ReWrite, &dev->flags); 3521 } 3522 if (test_bit(R5_ReadError, &dev->flags)) 3523 clear_bit(R5_Insync, &dev->flags); 3524 if (!test_bit(R5_Insync, &dev->flags)) { 3525 if (s->failed < 2) 3526 s->failed_num[s->failed] = i; 3527 s->failed++; 3528 if (rdev && !test_bit(Faulty, &rdev->flags)) 3529 do_recovery = 1; 3530 } 3531 } 3532 if (test_bit(STRIPE_SYNCING, 
&sh->state)) { 3533 /* If there is a failed device being replaced, 3534 * we must be recovering. 3535 * else if we are after recovery_cp, we must be syncing 3536 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 3537 * else we can only be replacing 3538 * sync and recovery both need to read all devices, and so 3539 * use the same flag. 3540 */ 3541 if (do_recovery || 3542 sh->sector >= conf->mddev->recovery_cp || 3543 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 3544 s->syncing = 1; 3545 else 3546 s->replacing = 1; 3547 } 3548 rcu_read_unlock(); 3549 } 3550 3551 static void handle_stripe(struct stripe_head *sh) 3552 { 3553 struct stripe_head_state s; 3554 struct r5conf *conf = sh->raid_conf; 3555 int i; 3556 int prexor; 3557 int disks = sh->disks; 3558 struct r5dev *pdev, *qdev; 3559 3560 clear_bit(STRIPE_HANDLE, &sh->state); 3561 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3562 /* already being handled, ensure it gets handled 3563 * again when current action finishes */ 3564 set_bit(STRIPE_HANDLE, &sh->state); 3565 return; 3566 } 3567 3568 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3569 spin_lock(&sh->stripe_lock); 3570 /* Cannot process 'sync' concurrently with 'discard' */ 3571 if (!test_bit(STRIPE_DISCARD, &sh->state) && 3572 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3573 set_bit(STRIPE_SYNCING, &sh->state); 3574 clear_bit(STRIPE_INSYNC, &sh->state); 3575 clear_bit(STRIPE_REPLACED, &sh->state); 3576 } 3577 spin_unlock(&sh->stripe_lock); 3578 } 3579 clear_bit(STRIPE_DELAYED, &sh->state); 3580 3581 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3582 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n", 3583 (unsigned long long)sh->sector, sh->state, 3584 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3585 sh->check_state, sh->reconstruct_state); 3586 3587 analyse_stripe(sh, &s); 3588 3589 if (s.handle_bad_blocks) { 3590 set_bit(STRIPE_HANDLE, &sh->state); 3591 goto finish; 3592 } 3593 3594 if (unlikely(s.blocked_rdev)) { 3595 if (s.syncing || s.expanding || s.expanded || 3596 s.replacing || s.to_write || s.written) { 3597 set_bit(STRIPE_HANDLE, &sh->state); 3598 goto finish; 3599 } 3600 /* There is nothing for the blocked_rdev to block */ 3601 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3602 s.blocked_rdev = NULL; 3603 } 3604 3605 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3606 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3607 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3608 } 3609 3610 pr_debug("locked=%d uptodate=%d to_read=%d" 3611 " to_write=%d failed=%d failed_num=%d,%d\n", 3612 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3613 s.failed_num[0], s.failed_num[1]); 3614 /* check if the array has lost more than max_degraded devices and, 3615 * if so, some requests might need to be failed.
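 * (max_degraded is 1 for RAID4/5 and 2 for RAID6; with more failures than
 * that the stripe cannot be reconstructed, so its pending reads and writes
 * are completed with an error)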
3616 */ 3617 if (s.failed > conf->max_degraded) { 3618 sh->check_state = 0; 3619 sh->reconstruct_state = 0; 3620 if (s.to_read+s.to_write+s.written) 3621 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3622 if (s.syncing + s.replacing) 3623 handle_failed_sync(conf, sh, &s); 3624 } 3625 3626 /* Now we check to see if any write operations have recently 3627 * completed 3628 */ 3629 prexor = 0; 3630 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3631 prexor = 1; 3632 if (sh->reconstruct_state == reconstruct_state_drain_result || 3633 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3634 sh->reconstruct_state = reconstruct_state_idle; 3635 3636 /* All the 'written' buffers and the parity block are ready to 3637 * be written back to disk 3638 */ 3639 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3640 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 3641 BUG_ON(sh->qd_idx >= 0 && 3642 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3643 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 3644 for (i = disks; i--; ) { 3645 struct r5dev *dev = &sh->dev[i]; 3646 if (test_bit(R5_LOCKED, &dev->flags) && 3647 (i == sh->pd_idx || i == sh->qd_idx || 3648 dev->written)) { 3649 pr_debug("Writing block %d\n", i); 3650 set_bit(R5_Wantwrite, &dev->flags); 3651 if (prexor) 3652 continue; 3653 if (!test_bit(R5_Insync, &dev->flags) || 3654 ((i == sh->pd_idx || i == sh->qd_idx) && 3655 s.failed == 0)) 3656 set_bit(STRIPE_INSYNC, &sh->state); 3657 } 3658 } 3659 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3660 s.dec_preread_active = 1; 3661 } 3662 3663 /* 3664 * might be able to return some write requests if the parity blocks 3665 * are safe, or on a failed drive 3666 */ 3667 pdev = &sh->dev[sh->pd_idx]; 3668 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3669 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3670 qdev = &sh->dev[sh->qd_idx]; 3671 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3672 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3673 || conf->level < 6; 3674 3675 if (s.written && 3676 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3677 && !test_bit(R5_LOCKED, &pdev->flags) 3678 && (test_bit(R5_UPTODATE, &pdev->flags) || 3679 test_bit(R5_Discard, &pdev->flags))))) && 3680 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3681 && !test_bit(R5_LOCKED, &qdev->flags) 3682 && (test_bit(R5_UPTODATE, &qdev->flags) || 3683 test_bit(R5_Discard, &qdev->flags)))))) 3684 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3685 3686 /* Now we might consider reading some blocks, either to check/generate 3687 * parity, or to satisfy requests 3688 * or to load a block that is being partially written. 3689 */ 3690 if (s.to_read || s.non_overwrite 3691 || (conf->level == 6 && s.to_write && s.failed) 3692 || (s.syncing && (s.uptodate + s.compute < disks)) 3693 || s.replacing 3694 || s.expanding) 3695 handle_stripe_fill(sh, &s, disks); 3696 3697 /* Now to consider new write requests and what else, if anything 3698 * should be read. We do not handle new writes when: 3699 * 1/ A 'write' operation (copy+xor) is already in flight. 3700 * 2/ A 'check' operation is in flight, as it may clobber the parity 3701 * block. 
3702 */ 3703 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3704 handle_stripe_dirtying(conf, sh, &s, disks); 3705 3706 /* maybe we need to check and possibly fix the parity for this stripe 3707 * Any reads will already have been scheduled, so we just see if enough 3708 * data is available. The parity check is held off while parity 3709 * dependent operations are in flight. 3710 */ 3711 if (sh->check_state || 3712 (s.syncing && s.locked == 0 && 3713 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3714 !test_bit(STRIPE_INSYNC, &sh->state))) { 3715 if (conf->level == 6) 3716 handle_parity_checks6(conf, sh, &s, disks); 3717 else 3718 handle_parity_checks5(conf, sh, &s, disks); 3719 } 3720 3721 if ((s.replacing || s.syncing) && s.locked == 0 3722 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 3723 && !test_bit(STRIPE_REPLACED, &sh->state)) { 3724 /* Write out to replacement devices where possible */ 3725 for (i = 0; i < conf->raid_disks; i++) 3726 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3727 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3728 set_bit(R5_WantReplace, &sh->dev[i].flags); 3729 set_bit(R5_LOCKED, &sh->dev[i].flags); 3730 s.locked++; 3731 } 3732 if (s.replacing) 3733 set_bit(STRIPE_INSYNC, &sh->state); 3734 set_bit(STRIPE_REPLACED, &sh->state); 3735 } 3736 if ((s.syncing || s.replacing) && s.locked == 0 && 3737 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3738 test_bit(STRIPE_INSYNC, &sh->state)) { 3739 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3740 clear_bit(STRIPE_SYNCING, &sh->state); 3741 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3742 wake_up(&conf->wait_for_overlap); 3743 } 3744 3745 /* If the failed drives are just a ReadError, then we might need 3746 * to progress the repair/check process 3747 */ 3748 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3749 for (i = 0; i < s.failed; i++) { 3750 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3751 if (test_bit(R5_ReadError, &dev->flags) 3752 && !test_bit(R5_LOCKED, &dev->flags) 3753 && test_bit(R5_UPTODATE, &dev->flags) 3754 ) { 3755 if (!test_bit(R5_ReWrite, &dev->flags)) { 3756 set_bit(R5_Wantwrite, &dev->flags); 3757 set_bit(R5_ReWrite, &dev->flags); 3758 set_bit(R5_LOCKED, &dev->flags); 3759 s.locked++; 3760 } else { 3761 /* let's read it back */ 3762 set_bit(R5_Wantread, &dev->flags); 3763 set_bit(R5_LOCKED, &dev->flags); 3764 s.locked++; 3765 } 3766 } 3767 } 3768 3769 3770 /* Finish reconstruct operations initiated by the expansion process */ 3771 if (sh->reconstruct_state == reconstruct_state_result) { 3772 struct stripe_head *sh_src 3773 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3774 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3775 /* sh cannot be written until sh_src has been read. 
3776 * so arrange for sh to be delayed a little 3777 */ 3778 set_bit(STRIPE_DELAYED, &sh->state); 3779 set_bit(STRIPE_HANDLE, &sh->state); 3780 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3781 &sh_src->state)) 3782 atomic_inc(&conf->preread_active_stripes); 3783 release_stripe(sh_src); 3784 goto finish; 3785 } 3786 if (sh_src) 3787 release_stripe(sh_src); 3788 3789 sh->reconstruct_state = reconstruct_state_idle; 3790 clear_bit(STRIPE_EXPANDING, &sh->state); 3791 for (i = conf->raid_disks; i--; ) { 3792 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3793 set_bit(R5_LOCKED, &sh->dev[i].flags); 3794 s.locked++; 3795 } 3796 } 3797 3798 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3799 !sh->reconstruct_state) { 3800 /* Need to write out all blocks after computing parity */ 3801 sh->disks = conf->raid_disks; 3802 stripe_set_idx(sh->sector, conf, 0, sh); 3803 schedule_reconstruction(sh, &s, 1, 1); 3804 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3805 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3806 atomic_dec(&conf->reshape_stripes); 3807 wake_up(&conf->wait_for_overlap); 3808 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3809 } 3810 3811 if (s.expanding && s.locked == 0 && 3812 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3813 handle_stripe_expansion(conf, sh); 3814 3815 finish: 3816 /* wait for this device to become unblocked */ 3817 if (unlikely(s.blocked_rdev)) { 3818 if (conf->mddev->external) 3819 md_wait_for_blocked_rdev(s.blocked_rdev, 3820 conf->mddev); 3821 else 3822 /* Internal metadata will immediately 3823 * be written by raid5d, so we don't 3824 * need to wait here. 3825 */ 3826 rdev_dec_pending(s.blocked_rdev, 3827 conf->mddev); 3828 } 3829 3830 if (s.handle_bad_blocks) 3831 for (i = disks; i--; ) { 3832 struct md_rdev *rdev; 3833 struct r5dev *dev = &sh->dev[i]; 3834 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 3835 /* We own a safe reference to the rdev */ 3836 rdev = conf->disks[i].rdev; 3837 if (!rdev_set_badblocks(rdev, sh->sector, 3838 STRIPE_SECTORS, 0)) 3839 md_error(conf->mddev, rdev); 3840 rdev_dec_pending(rdev, conf->mddev); 3841 } 3842 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 3843 rdev = conf->disks[i].rdev; 3844 rdev_clear_badblocks(rdev, sh->sector, 3845 STRIPE_SECTORS, 0); 3846 rdev_dec_pending(rdev, conf->mddev); 3847 } 3848 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 3849 rdev = conf->disks[i].replacement; 3850 if (!rdev) 3851 /* rdev have been moved down */ 3852 rdev = conf->disks[i].rdev; 3853 rdev_clear_badblocks(rdev, sh->sector, 3854 STRIPE_SECTORS, 0); 3855 rdev_dec_pending(rdev, conf->mddev); 3856 } 3857 } 3858 3859 if (s.ops_request) 3860 raid_run_ops(sh, s.ops_request); 3861 3862 ops_run_io(sh, &s); 3863 3864 if (s.dec_preread_active) { 3865 /* We delay this until after ops_run_io so that if make_request 3866 * is waiting on a flush, it won't continue until the writes 3867 * have actually been submitted. 
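 * (The decrement below may drop preread_active_stripes under IO_THRESHOLD
 * and wake the array thread, so doing it before ops_run_io could release
 * that wait before the I/O had actually been issued.)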
3868 */ 3869 atomic_dec(&conf->preread_active_stripes); 3870 if (atomic_read(&conf->preread_active_stripes) < 3871 IO_THRESHOLD) 3872 md_wakeup_thread(conf->mddev->thread); 3873 } 3874 3875 return_io(s.return_bi); 3876 3877 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 3878 } 3879 3880 static void raid5_activate_delayed(struct r5conf *conf) 3881 { 3882 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3883 while (!list_empty(&conf->delayed_list)) { 3884 struct list_head *l = conf->delayed_list.next; 3885 struct stripe_head *sh; 3886 sh = list_entry(l, struct stripe_head, lru); 3887 list_del_init(l); 3888 clear_bit(STRIPE_DELAYED, &sh->state); 3889 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3890 atomic_inc(&conf->preread_active_stripes); 3891 list_add_tail(&sh->lru, &conf->hold_list); 3892 raid5_wakeup_stripe_thread(sh); 3893 } 3894 } 3895 } 3896 3897 static void activate_bit_delay(struct r5conf *conf) 3898 { 3899 /* device_lock is held */ 3900 struct list_head head; 3901 list_add(&head, &conf->bitmap_list); 3902 list_del_init(&conf->bitmap_list); 3903 while (!list_empty(&head)) { 3904 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3905 list_del_init(&sh->lru); 3906 atomic_inc(&sh->count); 3907 __release_stripe(conf, sh); 3908 } 3909 } 3910 3911 int md_raid5_congested(struct mddev *mddev, int bits) 3912 { 3913 struct r5conf *conf = mddev->private; 3914 3915 /* No difference between reads and writes. Just check 3916 * how busy the stripe_cache is 3917 */ 3918 3919 if (conf->inactive_blocked) 3920 return 1; 3921 if (conf->quiesce) 3922 return 1; 3923 if (list_empty_careful(&conf->inactive_list)) 3924 return 1; 3925 3926 return 0; 3927 } 3928 EXPORT_SYMBOL_GPL(md_raid5_congested); 3929 3930 static int raid5_congested(void *data, int bits) 3931 { 3932 struct mddev *mddev = data; 3933 3934 return mddev_congested(mddev, bits) || 3935 md_raid5_congested(mddev, bits); 3936 } 3937 3938 /* We want read requests to align with chunks where possible, 3939 * but write requests don't need to. 3940 */ 3941 static int raid5_mergeable_bvec(struct request_queue *q, 3942 struct bvec_merge_data *bvm, 3943 struct bio_vec *biovec) 3944 { 3945 struct mddev *mddev = q->queuedata; 3946 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3947 int max; 3948 unsigned int chunk_sectors = mddev->chunk_sectors; 3949 unsigned int bio_sectors = bvm->bi_size >> 9; 3950 3951 if ((bvm->bi_rw & 1) == WRITE) 3952 return biovec->bv_len; /* always allow writes to be mergeable */ 3953 3954 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3955 chunk_sectors = mddev->new_chunk_sectors; 3956 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3957 if (max < 0) max = 0; 3958 if (max <= biovec->bv_len && bio_sectors == 0) 3959 return biovec->bv_len; 3960 else 3961 return max; 3962 } 3963 3964 3965 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 3966 { 3967 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3968 unsigned int chunk_sectors = mddev->chunk_sectors; 3969 unsigned int bio_sectors = bio_sectors(bio); 3970 3971 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3972 chunk_sectors = mddev->new_chunk_sectors; 3973 return chunk_sectors >= 3974 ((sector & (chunk_sectors - 1)) + bio_sectors); 3975 } 3976 3977 /* 3978 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3979 * later sampled by raid5d. 
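 * The list is a singly-linked LIFO threaded through bi_next and protected
 * by device_lock; raid5d later drains it with remove_bio_from_retry().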
3980 */ 3981 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 3982 { 3983 unsigned long flags; 3984 3985 spin_lock_irqsave(&conf->device_lock, flags); 3986 3987 bi->bi_next = conf->retry_read_aligned_list; 3988 conf->retry_read_aligned_list = bi; 3989 3990 spin_unlock_irqrestore(&conf->device_lock, flags); 3991 md_wakeup_thread(conf->mddev->thread); 3992 } 3993 3994 3995 static struct bio *remove_bio_from_retry(struct r5conf *conf) 3996 { 3997 struct bio *bi; 3998 3999 bi = conf->retry_read_aligned; 4000 if (bi) { 4001 conf->retry_read_aligned = NULL; 4002 return bi; 4003 } 4004 bi = conf->retry_read_aligned_list; 4005 if(bi) { 4006 conf->retry_read_aligned_list = bi->bi_next; 4007 bi->bi_next = NULL; 4008 /* 4009 * this sets the active stripe count to 1 and the processed 4010 * stripe count to zero (upper 16 bits) 4011 */ 4012 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 4013 } 4014 4015 return bi; 4016 } 4017 4018 4019 /* 4020 * The "raid5_align_endio" should check if the read succeeded and if it 4021 * did, call bio_endio on the original bio (having bio_put the new bio 4022 * first). 4023 * If the read failed, hand the original bio back for a retry via add_bio_to_retry(). 4024 */ 4025 static void raid5_align_endio(struct bio *bi, int error) 4026 { 4027 struct bio* raid_bi = bi->bi_private; 4028 struct mddev *mddev; 4029 struct r5conf *conf; 4030 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 4031 struct md_rdev *rdev; 4032 4033 bio_put(bi); 4034 4035 rdev = (void*)raid_bi->bi_next; 4036 raid_bi->bi_next = NULL; 4037 mddev = rdev->mddev; 4038 conf = mddev->private; 4039 4040 rdev_dec_pending(rdev, conf->mddev); 4041 4042 if (!error && uptodate) { 4043 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 4044 raid_bi, 0); 4045 bio_endio(raid_bi, 0); 4046 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4047 wake_up(&conf->wait_for_stripe); 4048 return; 4049 } 4050 4051 4052 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 4053 4054 add_bio_to_retry(raid_bi, conf); 4055 } 4056 4057 static int bio_fits_rdev(struct bio *bi) 4058 { 4059 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 4060 4061 if (bio_sectors(bi) > queue_max_sectors(q)) 4062 return 0; 4063 blk_recount_segments(q, bi); 4064 if (bi->bi_phys_segments > queue_max_segments(q)) 4065 return 0; 4066 4067 if (q->merge_bvec_fn) 4068 /* it's too hard to apply the merge_bvec_fn at this stage, 4069 * just give up 4070 */ 4071 return 0; 4072 4073 return 1; 4074 } 4075 4076 4077 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 4078 { 4079 struct r5conf *conf = mddev->private; 4080 int dd_idx; 4081 struct bio* align_bi; 4082 struct md_rdev *rdev; 4083 sector_t end_sector; 4084 4085 if (!in_chunk_boundary(mddev, raid_bio)) { 4086 pr_debug("chunk_aligned_read : non aligned\n"); 4087 return 0; 4088 } 4089 /* 4090 * use bio_clone_mddev to make a copy of the bio 4091 */ 4092 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 4093 if (!align_bi) 4094 return 0; 4095 /* 4096 * set bi_end_io to a new function, and set bi_private to the 4097 * original bio.
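 * The original bio's bi_next is also borrowed to stash the chosen rdev so
 * that raid5_align_endio() can retrieve it and drop the pending reference.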
4098 */ 4099 align_bi->bi_end_io = raid5_align_endio; 4100 align_bi->bi_private = raid_bio; 4101 /* 4102 * compute position 4103 */ 4104 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 4105 0, 4106 &dd_idx, NULL); 4107 4108 end_sector = bio_end_sector(align_bi); 4109 rcu_read_lock(); 4110 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 4111 if (!rdev || test_bit(Faulty, &rdev->flags) || 4112 rdev->recovery_offset < end_sector) { 4113 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 4114 if (rdev && 4115 (test_bit(Faulty, &rdev->flags) || 4116 !(test_bit(In_sync, &rdev->flags) || 4117 rdev->recovery_offset >= end_sector))) 4118 rdev = NULL; 4119 } 4120 if (rdev) { 4121 sector_t first_bad; 4122 int bad_sectors; 4123 4124 atomic_inc(&rdev->nr_pending); 4125 rcu_read_unlock(); 4126 raid_bio->bi_next = (void*)rdev; 4127 align_bi->bi_bdev = rdev->bdev; 4128 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4129 4130 if (!bio_fits_rdev(align_bi) || 4131 is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), 4132 &first_bad, &bad_sectors)) { 4133 /* too big in some way, or has a known bad block */ 4134 bio_put(align_bi); 4135 rdev_dec_pending(rdev, mddev); 4136 return 0; 4137 } 4138 4139 /* No reshape active, so we can trust rdev->data_offset */ 4140 align_bi->bi_sector += rdev->data_offset; 4141 4142 spin_lock_irq(&conf->device_lock); 4143 wait_event_lock_irq(conf->wait_for_stripe, 4144 conf->quiesce == 0, 4145 conf->device_lock); 4146 atomic_inc(&conf->active_aligned_reads); 4147 spin_unlock_irq(&conf->device_lock); 4148 4149 if (mddev->gendisk) 4150 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4151 align_bi, disk_devt(mddev->gendisk), 4152 raid_bio->bi_sector); 4153 generic_make_request(align_bi); 4154 return 1; 4155 } else { 4156 rcu_read_unlock(); 4157 bio_put(align_bi); 4158 return 0; 4159 } 4160 } 4161 4162 /* __get_priority_stripe - get the next stripe to process 4163 * 4164 * Full stripe writes are allowed to pass preread active stripes up until 4165 * the bypass_threshold is exceeded. In general the bypass_count 4166 * increments when the handle_list is handled before the hold_list; however, it 4167 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 4168 * stripe with in flight i/o. The bypass_count will be reset when the 4169 * head of the hold_list has changed, i.e. the head was promoted to the 4170 * handle_list. 4171 */ 4172 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 4173 { 4174 struct stripe_head *sh = NULL, *tmp; 4175 struct list_head *handle_list = NULL; 4176 struct r5worker_group *wg = NULL; 4177 4178 if (conf->worker_cnt_per_group == 0) { 4179 handle_list = &conf->handle_list; 4180 } else if (group != ANY_GROUP) { 4181 handle_list = &conf->worker_groups[group].handle_list; 4182 wg = &conf->worker_groups[group]; 4183 } else { 4184 int i; 4185 for (i = 0; i < conf->group_cnt; i++) { 4186 handle_list = &conf->worker_groups[i].handle_list; 4187 wg = &conf->worker_groups[i]; 4188 if (!list_empty(handle_list)) 4189 break; 4190 } 4191 } 4192 4193 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4194 __func__, 4195 list_empty(handle_list) ? "empty" : "busy", 4196 list_empty(&conf->hold_list) ? 
"empty" : "busy", 4197 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4198 4199 if (!list_empty(handle_list)) { 4200 sh = list_entry(handle_list->next, typeof(*sh), lru); 4201 4202 if (list_empty(&conf->hold_list)) 4203 conf->bypass_count = 0; 4204 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 4205 if (conf->hold_list.next == conf->last_hold) 4206 conf->bypass_count++; 4207 else { 4208 conf->last_hold = conf->hold_list.next; 4209 conf->bypass_count -= conf->bypass_threshold; 4210 if (conf->bypass_count < 0) 4211 conf->bypass_count = 0; 4212 } 4213 } 4214 } else if (!list_empty(&conf->hold_list) && 4215 ((conf->bypass_threshold && 4216 conf->bypass_count > conf->bypass_threshold) || 4217 atomic_read(&conf->pending_full_writes) == 0)) { 4218 4219 list_for_each_entry(tmp, &conf->hold_list, lru) { 4220 if (conf->worker_cnt_per_group == 0 || 4221 group == ANY_GROUP || 4222 !cpu_online(tmp->cpu) || 4223 cpu_to_group(tmp->cpu) == group) { 4224 sh = tmp; 4225 break; 4226 } 4227 } 4228 4229 if (sh) { 4230 conf->bypass_count -= conf->bypass_threshold; 4231 if (conf->bypass_count < 0) 4232 conf->bypass_count = 0; 4233 } 4234 wg = NULL; 4235 } 4236 4237 if (!sh) 4238 return NULL; 4239 4240 if (wg) { 4241 wg->stripes_cnt--; 4242 sh->group = NULL; 4243 } 4244 list_del_init(&sh->lru); 4245 atomic_inc(&sh->count); 4246 BUG_ON(atomic_read(&sh->count) != 1); 4247 return sh; 4248 } 4249 4250 struct raid5_plug_cb { 4251 struct blk_plug_cb cb; 4252 struct list_head list; 4253 }; 4254 4255 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4256 { 4257 struct raid5_plug_cb *cb = container_of( 4258 blk_cb, struct raid5_plug_cb, cb); 4259 struct stripe_head *sh; 4260 struct mddev *mddev = cb->cb.data; 4261 struct r5conf *conf = mddev->private; 4262 int cnt = 0; 4263 4264 if (cb->list.next && !list_empty(&cb->list)) { 4265 spin_lock_irq(&conf->device_lock); 4266 while (!list_empty(&cb->list)) { 4267 sh = list_first_entry(&cb->list, struct stripe_head, lru); 4268 list_del_init(&sh->lru); 4269 /* 4270 * avoid race release_stripe_plug() sees 4271 * STRIPE_ON_UNPLUG_LIST clear but the stripe 4272 * is still in our list 4273 */ 4274 smp_mb__before_clear_bit(); 4275 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4276 /* 4277 * STRIPE_ON_RELEASE_LIST could be set here. 
In that 4278 * case, the count is always > 1 here 4279 */ 4280 __release_stripe(conf, sh); 4281 cnt++; 4282 } 4283 spin_unlock_irq(&conf->device_lock); 4284 } 4285 if (mddev->queue) 4286 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4287 kfree(cb); 4288 } 4289 4290 static void release_stripe_plug(struct mddev *mddev, 4291 struct stripe_head *sh) 4292 { 4293 struct blk_plug_cb *blk_cb = blk_check_plugged( 4294 raid5_unplug, mddev, 4295 sizeof(struct raid5_plug_cb)); 4296 struct raid5_plug_cb *cb; 4297 4298 if (!blk_cb) { 4299 release_stripe(sh); 4300 return; 4301 } 4302 4303 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4304 4305 if (cb->list.next == NULL) 4306 INIT_LIST_HEAD(&cb->list); 4307 4308 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4309 list_add_tail(&sh->lru, &cb->list); 4310 else 4311 release_stripe(sh); 4312 } 4313 4314 static void make_discard_request(struct mddev *mddev, struct bio *bi) 4315 { 4316 struct r5conf *conf = mddev->private; 4317 sector_t logical_sector, last_sector; 4318 struct stripe_head *sh; 4319 int remaining; 4320 int stripe_sectors; 4321 4322 if (mddev->reshape_position != MaxSector) 4323 /* Skip discard while reshape is happening */ 4324 return; 4325 4326 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4327 last_sector = bi->bi_sector + (bi->bi_size>>9); 4328 4329 bi->bi_next = NULL; 4330 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4331 4332 stripe_sectors = conf->chunk_sectors * 4333 (conf->raid_disks - conf->max_degraded); 4334 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 4335 stripe_sectors); 4336 sector_div(last_sector, stripe_sectors); 4337 4338 logical_sector *= conf->chunk_sectors; 4339 last_sector *= conf->chunk_sectors; 4340 4341 for (; logical_sector < last_sector; 4342 logical_sector += STRIPE_SECTORS) { 4343 DEFINE_WAIT(w); 4344 int d; 4345 again: 4346 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4347 prepare_to_wait(&conf->wait_for_overlap, &w, 4348 TASK_UNINTERRUPTIBLE); 4349 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4350 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4351 release_stripe(sh); 4352 schedule(); 4353 goto again; 4354 } 4355 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4356 spin_lock_irq(&sh->stripe_lock); 4357 for (d = 0; d < conf->raid_disks; d++) { 4358 if (d == sh->pd_idx || d == sh->qd_idx) 4359 continue; 4360 if (sh->dev[d].towrite || sh->dev[d].toread) { 4361 set_bit(R5_Overlap, &sh->dev[d].flags); 4362 spin_unlock_irq(&sh->stripe_lock); 4363 release_stripe(sh); 4364 schedule(); 4365 goto again; 4366 } 4367 } 4368 set_bit(STRIPE_DISCARD, &sh->state); 4369 finish_wait(&conf->wait_for_overlap, &w); 4370 for (d = 0; d < conf->raid_disks; d++) { 4371 if (d == sh->pd_idx || d == sh->qd_idx) 4372 continue; 4373 sh->dev[d].towrite = bi; 4374 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 4375 raid5_inc_bi_active_stripes(bi); 4376 } 4377 spin_unlock_irq(&sh->stripe_lock); 4378 if (conf->mddev->bitmap) { 4379 for (d = 0; 4380 d < conf->raid_disks - conf->max_degraded; 4381 d++) 4382 bitmap_startwrite(mddev->bitmap, 4383 sh->sector, 4384 STRIPE_SECTORS, 4385 0); 4386 sh->bm_seq = conf->seq_flush + 1; 4387 set_bit(STRIPE_BIT_DELAY, &sh->state); 4388 } 4389 4390 set_bit(STRIPE_HANDLE, &sh->state); 4391 clear_bit(STRIPE_DELAYED, &sh->state); 4392 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4393 atomic_inc(&conf->preread_active_stripes); 4394 release_stripe_plug(mddev, sh); 4395 } 4396 4397 remaining = 
raid5_dec_bi_active_stripes(bi); 4398 if (remaining == 0) { 4399 md_write_end(mddev); 4400 bio_endio(bi, 0); 4401 } 4402 } 4403 4404 static void make_request(struct mddev *mddev, struct bio * bi) 4405 { 4406 struct r5conf *conf = mddev->private; 4407 int dd_idx; 4408 sector_t new_sector; 4409 sector_t logical_sector, last_sector; 4410 struct stripe_head *sh; 4411 const int rw = bio_data_dir(bi); 4412 int remaining; 4413 4414 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4415 md_flush_request(mddev, bi); 4416 return; 4417 } 4418 4419 md_write_start(mddev, bi); 4420 4421 if (rw == READ && 4422 mddev->reshape_position == MaxSector && 4423 chunk_aligned_read(mddev,bi)) 4424 return; 4425 4426 if (unlikely(bi->bi_rw & REQ_DISCARD)) { 4427 make_discard_request(mddev, bi); 4428 return; 4429 } 4430 4431 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4432 last_sector = bio_end_sector(bi); 4433 bi->bi_next = NULL; 4434 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4435 4436 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4437 DEFINE_WAIT(w); 4438 int previous; 4439 int seq; 4440 4441 retry: 4442 seq = read_seqcount_begin(&conf->gen_lock); 4443 previous = 0; 4444 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4445 if (unlikely(conf->reshape_progress != MaxSector)) { 4446 /* spinlock is needed as reshape_progress may be 4447 * 64bit on a 32bit platform, and so it might be 4448 * possible to see a half-updated value 4449 * Of course reshape_progress could change after 4450 * the lock is dropped, so once we get a reference 4451 * to the stripe that we think it is, we will have 4452 * to check again. 4453 */ 4454 spin_lock_irq(&conf->device_lock); 4455 if (mddev->reshape_backwards 4456 ? logical_sector < conf->reshape_progress 4457 : logical_sector >= conf->reshape_progress) { 4458 previous = 1; 4459 } else { 4460 if (mddev->reshape_backwards 4461 ? logical_sector < conf->reshape_safe 4462 : logical_sector >= conf->reshape_safe) { 4463 spin_unlock_irq(&conf->device_lock); 4464 schedule(); 4465 goto retry; 4466 } 4467 } 4468 spin_unlock_irq(&conf->device_lock); 4469 } 4470 4471 new_sector = raid5_compute_sector(conf, logical_sector, 4472 previous, 4473 &dd_idx, NULL); 4474 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4475 (unsigned long long)new_sector, 4476 (unsigned long long)logical_sector); 4477 4478 sh = get_active_stripe(conf, new_sector, previous, 4479 (bi->bi_rw&RWA_MASK), 0); 4480 if (sh) { 4481 if (unlikely(previous)) { 4482 /* expansion might have moved on while waiting for a 4483 * stripe, so we must do the range check again. 4484 * Expansion could still move past after this 4485 * test, but as we are holding a reference to 4486 * 'sh', we know that if that happens, 4487 * STRIPE_EXPANDING will get set and the expansion 4488 * won't proceed until we finish with the stripe. 4489 */ 4490 int must_retry = 0; 4491 spin_lock_irq(&conf->device_lock); 4492 if (mddev->reshape_backwards 4493 ? 
logical_sector >= conf->reshape_progress 4494 : logical_sector < conf->reshape_progress) 4495 /* mismatch, need to try again */ 4496 must_retry = 1; 4497 spin_unlock_irq(&conf->device_lock); 4498 if (must_retry) { 4499 release_stripe(sh); 4500 schedule(); 4501 goto retry; 4502 } 4503 } 4504 if (read_seqcount_retry(&conf->gen_lock, seq)) { 4505 /* Might have got the wrong stripe_head 4506 * by accident 4507 */ 4508 release_stripe(sh); 4509 goto retry; 4510 } 4511 4512 if (rw == WRITE && 4513 logical_sector >= mddev->suspend_lo && 4514 logical_sector < mddev->suspend_hi) { 4515 release_stripe(sh); 4516 /* As the suspend_* range is controlled by 4517 * userspace, we want an interruptible 4518 * wait. 4519 */ 4520 flush_signals(current); 4521 prepare_to_wait(&conf->wait_for_overlap, 4522 &w, TASK_INTERRUPTIBLE); 4523 if (logical_sector >= mddev->suspend_lo && 4524 logical_sector < mddev->suspend_hi) 4525 schedule(); 4526 goto retry; 4527 } 4528 4529 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4530 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4531 /* Stripe is busy expanding or 4532 * add failed due to overlap. Flush everything 4533 * and wait a while 4534 */ 4535 md_wakeup_thread(mddev->thread); 4536 release_stripe(sh); 4537 schedule(); 4538 goto retry; 4539 } 4540 finish_wait(&conf->wait_for_overlap, &w); 4541 set_bit(STRIPE_HANDLE, &sh->state); 4542 clear_bit(STRIPE_DELAYED, &sh->state); 4543 if ((bi->bi_rw & REQ_SYNC) && 4544 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4545 atomic_inc(&conf->preread_active_stripes); 4546 release_stripe_plug(mddev, sh); 4547 } else { 4548 /* cannot get stripe for read-ahead, just give-up */ 4549 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4550 finish_wait(&conf->wait_for_overlap, &w); 4551 break; 4552 } 4553 } 4554 4555 remaining = raid5_dec_bi_active_stripes(bi); 4556 if (remaining == 0) { 4557 4558 if ( rw == WRITE ) 4559 md_write_end(mddev); 4560 4561 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 4562 bi, 0); 4563 bio_endio(bi, 0); 4564 } 4565 } 4566 4567 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4568 4569 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4570 { 4571 /* reshaping is quite different to recovery/resync so it is 4572 * handled quite separately ... here. 4573 * 4574 * On each call to sync_request, we gather one chunk worth of 4575 * destination stripes and flag them as expanding. 4576 * Then we find all the source stripes and request reads. 4577 * As the reads complete, handle_stripe will copy the data 4578 * into the destination stripe and release that stripe. 
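 * A full pass of this function therefore advances reshape_progress by
 * reshape_sectors * new_data_disks sectors of array space (e.g. with
 * 1024-sector chunks and 4 data disks, that is 4096 sectors per pass).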
4579 */ 4580 struct r5conf *conf = mddev->private; 4581 struct stripe_head *sh; 4582 sector_t first_sector, last_sector; 4583 int raid_disks = conf->previous_raid_disks; 4584 int data_disks = raid_disks - conf->max_degraded; 4585 int new_data_disks = conf->raid_disks - conf->max_degraded; 4586 int i; 4587 int dd_idx; 4588 sector_t writepos, readpos, safepos; 4589 sector_t stripe_addr; 4590 int reshape_sectors; 4591 struct list_head stripes; 4592 4593 if (sector_nr == 0) { 4594 /* If restarting in the middle, skip the initial sectors */ 4595 if (mddev->reshape_backwards && 4596 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4597 sector_nr = raid5_size(mddev, 0, 0) 4598 - conf->reshape_progress; 4599 } else if (!mddev->reshape_backwards && 4600 conf->reshape_progress > 0) 4601 sector_nr = conf->reshape_progress; 4602 sector_div(sector_nr, new_data_disks); 4603 if (sector_nr) { 4604 mddev->curr_resync_completed = sector_nr; 4605 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4606 *skipped = 1; 4607 return sector_nr; 4608 } 4609 } 4610 4611 /* We need to process a full chunk at a time. 4612 * If old and new chunk sizes differ, we need to process the 4613 * largest of these 4614 */ 4615 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4616 reshape_sectors = mddev->new_chunk_sectors; 4617 else 4618 reshape_sectors = mddev->chunk_sectors; 4619 4620 /* We update the metadata at least every 10 seconds, or when 4621 * the data about to be copied would over-write the source of 4622 * the data at the front of the range. i.e. one new_stripe 4623 * along from reshape_progress new_maps to after where 4624 * reshape_safe old_maps to 4625 */ 4626 writepos = conf->reshape_progress; 4627 sector_div(writepos, new_data_disks); 4628 readpos = conf->reshape_progress; 4629 sector_div(readpos, data_disks); 4630 safepos = conf->reshape_safe; 4631 sector_div(safepos, data_disks); 4632 if (mddev->reshape_backwards) { 4633 writepos -= min_t(sector_t, reshape_sectors, writepos); 4634 readpos += reshape_sectors; 4635 safepos += reshape_sectors; 4636 } else { 4637 writepos += reshape_sectors; 4638 readpos -= min_t(sector_t, reshape_sectors, readpos); 4639 safepos -= min_t(sector_t, reshape_sectors, safepos); 4640 } 4641 4642 /* Having calculated the 'writepos' possibly use it 4643 * to set 'stripe_addr' which is where we will write to. 4644 */ 4645 if (mddev->reshape_backwards) { 4646 BUG_ON(conf->reshape_progress == 0); 4647 stripe_addr = writepos; 4648 BUG_ON((mddev->dev_sectors & 4649 ~((sector_t)reshape_sectors - 1)) 4650 - reshape_sectors - stripe_addr 4651 != sector_nr); 4652 } else { 4653 BUG_ON(writepos != sector_nr + reshape_sectors); 4654 stripe_addr = sector_nr; 4655 } 4656 4657 /* 'writepos' is the most advanced device address we might write. 4658 * 'readpos' is the least advanced device address we might read. 4659 * 'safepos' is the least address recorded in the metadata as having 4660 * been reshaped. 4661 * If there is a min_offset_diff, these are adjusted either by 4662 * increasing the safepos/readpos if diff is negative, or 4663 * increasing writepos if diff is positive. 4664 * If 'readpos' is then behind 'writepos', there is no way that we can 4665 * ensure safety in the face of a crash - that must be done by userspace 4666 * making a backup of the data. So in that case there is no particular 4667 * rush to update metadata. 
4668 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4669 * update the metadata to advance 'safepos' to match 'readpos' so that 4670 * we can be safe in the event of a crash. 4671 * So we insist on updating metadata if safepos is behind writepos and 4672 * readpos is beyond writepos. 4673 * In any case, update the metadata every 10 seconds. 4674 * Maybe that number should be configurable, but I'm not sure it is 4675 * worth it.... maybe it could be a multiple of safemode_delay??? 4676 */ 4677 if (conf->min_offset_diff < 0) { 4678 safepos += -conf->min_offset_diff; 4679 readpos += -conf->min_offset_diff; 4680 } else 4681 writepos += conf->min_offset_diff; 4682 4683 if ((mddev->reshape_backwards 4684 ? (safepos > writepos && readpos < writepos) 4685 : (safepos < writepos && readpos > writepos)) || 4686 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4687 /* Cannot proceed until we've updated the superblock... */ 4688 wait_event(conf->wait_for_overlap, 4689 atomic_read(&conf->reshape_stripes)==0); 4690 mddev->reshape_position = conf->reshape_progress; 4691 mddev->curr_resync_completed = sector_nr; 4692 conf->reshape_checkpoint = jiffies; 4693 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4694 md_wakeup_thread(mddev->thread); 4695 wait_event(mddev->sb_wait, mddev->flags == 0 || 4696 kthread_should_stop()); 4697 spin_lock_irq(&conf->device_lock); 4698 conf->reshape_safe = mddev->reshape_position; 4699 spin_unlock_irq(&conf->device_lock); 4700 wake_up(&conf->wait_for_overlap); 4701 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4702 } 4703 4704 INIT_LIST_HEAD(&stripes); 4705 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4706 int j; 4707 int skipped_disk = 0; 4708 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4709 set_bit(STRIPE_EXPANDING, &sh->state); 4710 atomic_inc(&conf->reshape_stripes); 4711 /* If any of this stripe is beyond the end of the old 4712 * array, then we need to zero those blocks 4713 */ 4714 for (j=sh->disks; j--;) { 4715 sector_t s; 4716 if (j == sh->pd_idx) 4717 continue; 4718 if (conf->level == 6 && 4719 j == sh->qd_idx) 4720 continue; 4721 s = compute_blocknr(sh, j, 0); 4722 if (s < raid5_size(mddev, 0, 0)) { 4723 skipped_disk = 1; 4724 continue; 4725 } 4726 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4727 set_bit(R5_Expanded, &sh->dev[j].flags); 4728 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4729 } 4730 if (!skipped_disk) { 4731 set_bit(STRIPE_EXPAND_READY, &sh->state); 4732 set_bit(STRIPE_HANDLE, &sh->state); 4733 } 4734 list_add(&sh->lru, &stripes); 4735 } 4736 spin_lock_irq(&conf->device_lock); 4737 if (mddev->reshape_backwards) 4738 conf->reshape_progress -= reshape_sectors * new_data_disks; 4739 else 4740 conf->reshape_progress += reshape_sectors * new_data_disks; 4741 spin_unlock_irq(&conf->device_lock); 4742 /* Ok, those stripe are ready. We can start scheduling 4743 * reads on the source stripes. 4744 * The source stripes are determined by mapping the first and last 4745 * block on the destination stripes. 
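 * (raid5_compute_sector() is called with 'previous' == 1 below, so the
 * first and last source sectors are resolved against the old geometry.)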
4746 */ 4747 first_sector = 4748 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4749 1, &dd_idx, NULL); 4750 last_sector = 4751 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4752 * new_data_disks - 1), 4753 1, &dd_idx, NULL); 4754 if (last_sector >= mddev->dev_sectors) 4755 last_sector = mddev->dev_sectors - 1; 4756 while (first_sector <= last_sector) { 4757 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4758 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4759 set_bit(STRIPE_HANDLE, &sh->state); 4760 release_stripe(sh); 4761 first_sector += STRIPE_SECTORS; 4762 } 4763 /* Now that the sources are clearly marked, we can release 4764 * the destination stripes 4765 */ 4766 while (!list_empty(&stripes)) { 4767 sh = list_entry(stripes.next, struct stripe_head, lru); 4768 list_del_init(&sh->lru); 4769 release_stripe(sh); 4770 } 4771 /* If this takes us to the resync_max point where we have to pause, 4772 * then we need to write out the superblock. 4773 */ 4774 sector_nr += reshape_sectors; 4775 if ((sector_nr - mddev->curr_resync_completed) * 2 4776 >= mddev->resync_max - mddev->curr_resync_completed) { 4777 /* Cannot proceed until we've updated the superblock... */ 4778 wait_event(conf->wait_for_overlap, 4779 atomic_read(&conf->reshape_stripes) == 0); 4780 mddev->reshape_position = conf->reshape_progress; 4781 mddev->curr_resync_completed = sector_nr; 4782 conf->reshape_checkpoint = jiffies; 4783 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4784 md_wakeup_thread(mddev->thread); 4785 wait_event(mddev->sb_wait, 4786 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4787 || kthread_should_stop()); 4788 spin_lock_irq(&conf->device_lock); 4789 conf->reshape_safe = mddev->reshape_position; 4790 spin_unlock_irq(&conf->device_lock); 4791 wake_up(&conf->wait_for_overlap); 4792 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4793 } 4794 return reshape_sectors; 4795 } 4796 4797 /* FIXME go_faster isn't used */ 4798 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 4799 { 4800 struct r5conf *conf = mddev->private; 4801 struct stripe_head *sh; 4802 sector_t max_sector = mddev->dev_sectors; 4803 sector_t sync_blocks; 4804 int still_degraded = 0; 4805 int i; 4806 4807 if (sector_nr >= max_sector) { 4808 /* just being told to finish up .. nothing much to do */ 4809 4810 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4811 end_reshape(conf); 4812 return 0; 4813 } 4814 4815 if (mddev->curr_resync < max_sector) /* aborted */ 4816 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4817 &sync_blocks, 1); 4818 else /* completed sync */ 4819 conf->fullsync = 0; 4820 bitmap_close_sync(mddev->bitmap); 4821 4822 return 0; 4823 } 4824 4825 /* Allow raid5_quiesce to complete */ 4826 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4827 4828 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4829 return reshape_request(mddev, sector_nr, skipped); 4830 4831 /* No need to check resync_max as we never do more than one 4832 * stripe, and as resync_max will always be on a chunk boundary, 4833 * if the check in md_do_sync didn't fire, there is no chance 4834 * of overstepping resync_max here 4835 */ 4836 4837 /* if there is too many failed drives and we are trying 4838 * to resync, then assert that we are finished, because there is 4839 * nothing we can do. 
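 * Returning the remaining range with *skipped set lets md_do_sync treat
 * that range as handled instead of retrying stripes we cannot rebuild.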
4840 */ 4841 if (mddev->degraded >= conf->max_degraded && 4842 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4843 sector_t rv = mddev->dev_sectors - sector_nr; 4844 *skipped = 1; 4845 return rv; 4846 } 4847 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4848 !conf->fullsync && 4849 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4850 sync_blocks >= STRIPE_SECTORS) { 4851 /* we can skip this block, and probably more */ 4852 sync_blocks /= STRIPE_SECTORS; 4853 *skipped = 1; 4854 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4855 } 4856 4857 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4858 4859 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4860 if (sh == NULL) { 4861 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4862 /* make sure we don't swamp the stripe cache if someone else 4863 * is trying to get access 4864 */ 4865 schedule_timeout_uninterruptible(1); 4866 } 4867 /* Need to check if array will still be degraded after recovery/resync 4868 * We don't need to check the 'failed' flag as when that gets set, 4869 * recovery aborts. 4870 */ 4871 for (i = 0; i < conf->raid_disks; i++) 4872 if (conf->disks[i].rdev == NULL) 4873 still_degraded = 1; 4874 4875 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4876 4877 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 4878 4879 handle_stripe(sh); 4880 release_stripe(sh); 4881 4882 return STRIPE_SECTORS; 4883 } 4884 4885 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 4886 { 4887 /* We may not be able to submit a whole bio at once as there 4888 * may not be enough stripe_heads available. 4889 * We cannot pre-allocate enough stripe_heads as we may need 4890 * more than exist in the cache (if we ever allow large chunks). 4891 * So we do one stripe head at a time and record in 4892 * ->bi_phys_segments how many have been done. 4893 * 4894 * We *know* that this entire raid_bio is in one chunk, so 4895 * there will be only one 'dd_idx' and only need one call to raid5_compute_sector.
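 * If we run out of stripe_heads part way through, the number already
 * handled is saved with raid5_set_bi_processed_stripes() so that a later
 * retry can skip straight past them.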
4896 */ 4897 struct stripe_head *sh; 4898 int dd_idx; 4899 sector_t sector, logical_sector, last_sector; 4900 int scnt = 0; 4901 int remaining; 4902 int handled = 0; 4903 4904 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4905 sector = raid5_compute_sector(conf, logical_sector, 4906 0, &dd_idx, NULL); 4907 last_sector = bio_end_sector(raid_bio); 4908 4909 for (; logical_sector < last_sector; 4910 logical_sector += STRIPE_SECTORS, 4911 sector += STRIPE_SECTORS, 4912 scnt++) { 4913 4914 if (scnt < raid5_bi_processed_stripes(raid_bio)) 4915 /* already done this stripe */ 4916 continue; 4917 4918 sh = get_active_stripe(conf, sector, 0, 1, 0); 4919 4920 if (!sh) { 4921 /* failed to get a stripe - must wait */ 4922 raid5_set_bi_processed_stripes(raid_bio, scnt); 4923 conf->retry_read_aligned = raid_bio; 4924 return handled; 4925 } 4926 4927 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4928 release_stripe(sh); 4929 raid5_set_bi_processed_stripes(raid_bio, scnt); 4930 conf->retry_read_aligned = raid_bio; 4931 return handled; 4932 } 4933 4934 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 4935 handle_stripe(sh); 4936 release_stripe(sh); 4937 handled++; 4938 } 4939 remaining = raid5_dec_bi_active_stripes(raid_bio); 4940 if (remaining == 0) { 4941 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 4942 raid_bio, 0); 4943 bio_endio(raid_bio, 0); 4944 } 4945 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4946 wake_up(&conf->wait_for_stripe); 4947 return handled; 4948 } 4949 4950 static int handle_active_stripes(struct r5conf *conf, int group, 4951 struct r5worker *worker) 4952 { 4953 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 4954 int i, batch_size = 0; 4955 4956 while (batch_size < MAX_STRIPE_BATCH && 4957 (sh = __get_priority_stripe(conf, group)) != NULL) 4958 batch[batch_size++] = sh; 4959 4960 if (batch_size == 0) 4961 return batch_size; 4962 spin_unlock_irq(&conf->device_lock); 4963 4964 for (i = 0; i < batch_size; i++) 4965 handle_stripe(batch[i]); 4966 4967 cond_resched(); 4968 4969 spin_lock_irq(&conf->device_lock); 4970 for (i = 0; i < batch_size; i++) 4971 __release_stripe(conf, batch[i]); 4972 return batch_size; 4973 } 4974 4975 static void raid5_do_work(struct work_struct *work) 4976 { 4977 struct r5worker *worker = container_of(work, struct r5worker, work); 4978 struct r5worker_group *group = worker->group; 4979 struct r5conf *conf = group->conf; 4980 int group_id = group - conf->worker_groups; 4981 int handled; 4982 struct blk_plug plug; 4983 4984 pr_debug("+++ raid5worker active\n"); 4985 4986 blk_start_plug(&plug); 4987 handled = 0; 4988 spin_lock_irq(&conf->device_lock); 4989 while (1) { 4990 int batch_size, released; 4991 4992 released = release_stripe_list(conf); 4993 4994 batch_size = handle_active_stripes(conf, group_id, worker); 4995 worker->working = false; 4996 if (!batch_size && !released) 4997 break; 4998 handled += batch_size; 4999 } 5000 pr_debug("%d stripes handled\n", handled); 5001 5002 spin_unlock_irq(&conf->device_lock); 5003 blk_finish_plug(&plug); 5004 5005 pr_debug("--- raid5worker inactive\n"); 5006 } 5007 5008 /* 5009 * This is our raid5 kernel thread. 5010 * 5011 * We scan the hash table for stripes which can be handled now. 5012 * During the scan, completed stripes are saved for us by the interrupt 5013 * handler, so that they will not have to wait for our next wakeup. 
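 * Each pass of the loop below flushes any batched bitmap updates, moves
 * delayed stripes back onto the hold list, retries queued aligned reads,
 * and then handles active stripes in batches of up to MAX_STRIPE_BATCH.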
5014 */ 5015 static void raid5d(struct md_thread *thread) 5016 { 5017 struct mddev *mddev = thread->mddev; 5018 struct r5conf *conf = mddev->private; 5019 int handled; 5020 struct blk_plug plug; 5021 5022 pr_debug("+++ raid5d active\n"); 5023 5024 md_check_recovery(mddev); 5025 5026 blk_start_plug(&plug); 5027 handled = 0; 5028 spin_lock_irq(&conf->device_lock); 5029 while (1) { 5030 struct bio *bio; 5031 int batch_size, released; 5032 5033 released = release_stripe_list(conf); 5034 5035 if ( 5036 !list_empty(&conf->bitmap_list)) { 5037 /* Now is a good time to flush some bitmap updates */ 5038 conf->seq_flush++; 5039 spin_unlock_irq(&conf->device_lock); 5040 bitmap_unplug(mddev->bitmap); 5041 spin_lock_irq(&conf->device_lock); 5042 conf->seq_write = conf->seq_flush; 5043 activate_bit_delay(conf); 5044 } 5045 raid5_activate_delayed(conf); 5046 5047 while ((bio = remove_bio_from_retry(conf))) { 5048 int ok; 5049 spin_unlock_irq(&conf->device_lock); 5050 ok = retry_aligned_read(conf, bio); 5051 spin_lock_irq(&conf->device_lock); 5052 if (!ok) 5053 break; 5054 handled++; 5055 } 5056 5057 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); 5058 if (!batch_size && !released) 5059 break; 5060 handled += batch_size; 5061 5062 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 5063 spin_unlock_irq(&conf->device_lock); 5064 md_check_recovery(mddev); 5065 spin_lock_irq(&conf->device_lock); 5066 } 5067 } 5068 pr_debug("%d stripes handled\n", handled); 5069 5070 spin_unlock_irq(&conf->device_lock); 5071 5072 async_tx_issue_pending_all(); 5073 blk_finish_plug(&plug); 5074 5075 pr_debug("--- raid5d inactive\n"); 5076 } 5077 5078 static ssize_t 5079 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 5080 { 5081 struct r5conf *conf = mddev->private; 5082 if (conf) 5083 return sprintf(page, "%d\n", conf->max_nr_stripes); 5084 else 5085 return 0; 5086 } 5087 5088 int 5089 raid5_set_cache_size(struct mddev *mddev, int size) 5090 { 5091 struct r5conf *conf = mddev->private; 5092 int err; 5093 5094 if (size <= 16 || size > 32768) 5095 return -EINVAL; 5096 while (size < conf->max_nr_stripes) { 5097 if (drop_one_stripe(conf)) 5098 conf->max_nr_stripes--; 5099 else 5100 break; 5101 } 5102 err = md_allow_write(mddev); 5103 if (err) 5104 return err; 5105 while (size > conf->max_nr_stripes) { 5106 if (grow_one_stripe(conf)) 5107 conf->max_nr_stripes++; 5108 else break; 5109 } 5110 return 0; 5111 } 5112 EXPORT_SYMBOL(raid5_set_cache_size); 5113 5114 static ssize_t 5115 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 5116 { 5117 struct r5conf *conf = mddev->private; 5118 unsigned long new; 5119 int err; 5120 5121 if (len >= PAGE_SIZE) 5122 return -EINVAL; 5123 if (!conf) 5124 return -ENODEV; 5125 5126 if (kstrtoul(page, 10, &new)) 5127 return -EINVAL; 5128 err = raid5_set_cache_size(mddev, new); 5129 if (err) 5130 return err; 5131 return len; 5132 } 5133 5134 static struct md_sysfs_entry 5135 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 5136 raid5_show_stripe_cache_size, 5137 raid5_store_stripe_cache_size); 5138 5139 static ssize_t 5140 raid5_show_preread_threshold(struct mddev *mddev, char *page) 5141 { 5142 struct r5conf *conf = mddev->private; 5143 if (conf) 5144 return sprintf(page, "%d\n", conf->bypass_threshold); 5145 else 5146 return 0; 5147 } 5148 5149 static ssize_t 5150 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 5151 { 5152 struct r5conf *conf = mddev->private; 5153 unsigned long new; 5154 if (len >= 
PAGE_SIZE) 5155 return -EINVAL; 5156 if (!conf) 5157 return -ENODEV; 5158 5159 if (kstrtoul(page, 10, &new)) 5160 return -EINVAL; 5161 if (new > conf->max_nr_stripes) 5162 return -EINVAL; 5163 conf->bypass_threshold = new; 5164 return len; 5165 } 5166 5167 static struct md_sysfs_entry 5168 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 5169 S_IRUGO | S_IWUSR, 5170 raid5_show_preread_threshold, 5171 raid5_store_preread_threshold); 5172 5173 static ssize_t 5174 stripe_cache_active_show(struct mddev *mddev, char *page) 5175 { 5176 struct r5conf *conf = mddev->private; 5177 if (conf) 5178 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 5179 else 5180 return 0; 5181 } 5182 5183 static struct md_sysfs_entry 5184 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 5185 5186 static ssize_t 5187 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 5188 { 5189 struct r5conf *conf = mddev->private; 5190 if (conf) 5191 return sprintf(page, "%d\n", conf->worker_cnt_per_group); 5192 else 5193 return 0; 5194 } 5195 5196 static int alloc_thread_groups(struct r5conf *conf, int cnt); 5197 static ssize_t 5198 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5199 { 5200 struct r5conf *conf = mddev->private; 5201 unsigned long new; 5202 int err; 5203 struct r5worker_group *old_groups; 5204 int old_group_cnt; 5205 5206 if (len >= PAGE_SIZE) 5207 return -EINVAL; 5208 if (!conf) 5209 return -ENODEV; 5210 5211 if (kstrtoul(page, 10, &new)) 5212 return -EINVAL; 5213 5214 if (new == conf->worker_cnt_per_group) 5215 return len; 5216 5217 mddev_suspend(mddev); 5218 5219 old_groups = conf->worker_groups; 5220 old_group_cnt = conf->worker_cnt_per_group; 5221 5222 conf->worker_groups = NULL; 5223 err = alloc_thread_groups(conf, new); 5224 if (err) { 5225 conf->worker_groups = old_groups; 5226 conf->worker_cnt_per_group = old_group_cnt; 5227 } else { 5228 if (old_groups) 5229 kfree(old_groups[0].workers); 5230 kfree(old_groups); 5231 } 5232 5233 mddev_resume(mddev); 5234 5235 if (err) 5236 return err; 5237 return len; 5238 } 5239 5240 static struct md_sysfs_entry 5241 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 5242 raid5_show_group_thread_cnt, 5243 raid5_store_group_thread_cnt); 5244 5245 static struct attribute *raid5_attrs[] = { 5246 &raid5_stripecache_size.attr, 5247 &raid5_stripecache_active.attr, 5248 &raid5_preread_bypass_threshold.attr, 5249 &raid5_group_thread_cnt.attr, 5250 NULL, 5251 }; 5252 static struct attribute_group raid5_attrs_group = { 5253 .name = NULL, 5254 .attrs = raid5_attrs, 5255 }; 5256 5257 static int alloc_thread_groups(struct r5conf *conf, int cnt) 5258 { 5259 int i, j; 5260 ssize_t size; 5261 struct r5worker *workers; 5262 5263 conf->worker_cnt_per_group = cnt; 5264 if (cnt == 0) { 5265 conf->worker_groups = NULL; 5266 return 0; 5267 } 5268 conf->group_cnt = num_possible_nodes(); 5269 size = sizeof(struct r5worker) * cnt; 5270 workers = kzalloc(size * conf->group_cnt, GFP_NOIO); 5271 conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * 5272 conf->group_cnt, GFP_NOIO); 5273 if (!conf->worker_groups || !workers) { 5274 kfree(workers); 5275 kfree(conf->worker_groups); 5276 conf->worker_groups = NULL; 5277 return -ENOMEM; 5278 } 5279 5280 for (i = 0; i < conf->group_cnt; i++) { 5281 struct r5worker_group *group; 5282 5283 group = &conf->worker_groups[i]; 5284 INIT_LIST_HEAD(&group->handle_list); 5285 group->conf = conf; 5286 group->workers = workers + i * cnt; 5287 5288 for (j = 0; 
j < cnt; j++) { 5289 group->workers[j].group = group; 5290 INIT_WORK(&group->workers[j].work, raid5_do_work); 5291 } 5292 } 5293 5294 return 0; 5295 } 5296 5297 static void free_thread_groups(struct r5conf *conf) 5298 { 5299 if (conf->worker_groups) 5300 kfree(conf->worker_groups[0].workers); 5301 kfree(conf->worker_groups); 5302 conf->worker_groups = NULL; 5303 } 5304 5305 static sector_t 5306 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 5307 { 5308 struct r5conf *conf = mddev->private; 5309 5310 if (!sectors) 5311 sectors = mddev->dev_sectors; 5312 if (!raid_disks) 5313 /* size is defined by the smallest of previous and new size */ 5314 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 5315 5316 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5317 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 5318 return sectors * (raid_disks - conf->max_degraded); 5319 } 5320 5321 static void raid5_free_percpu(struct r5conf *conf) 5322 { 5323 struct raid5_percpu *percpu; 5324 unsigned long cpu; 5325 5326 if (!conf->percpu) 5327 return; 5328 5329 get_online_cpus(); 5330 for_each_possible_cpu(cpu) { 5331 percpu = per_cpu_ptr(conf->percpu, cpu); 5332 safe_put_page(percpu->spare_page); 5333 kfree(percpu->scribble); 5334 } 5335 #ifdef CONFIG_HOTPLUG_CPU 5336 unregister_cpu_notifier(&conf->cpu_notify); 5337 #endif 5338 put_online_cpus(); 5339 5340 free_percpu(conf->percpu); 5341 } 5342 5343 static void free_conf(struct r5conf *conf) 5344 { 5345 free_thread_groups(conf); 5346 shrink_stripes(conf); 5347 raid5_free_percpu(conf); 5348 kfree(conf->disks); 5349 kfree(conf->stripe_hashtbl); 5350 kfree(conf); 5351 } 5352 5353 #ifdef CONFIG_HOTPLUG_CPU 5354 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 5355 void *hcpu) 5356 { 5357 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 5358 long cpu = (long)hcpu; 5359 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 5360 5361 switch (action) { 5362 case CPU_UP_PREPARE: 5363 case CPU_UP_PREPARE_FROZEN: 5364 if (conf->level == 6 && !percpu->spare_page) 5365 percpu->spare_page = alloc_page(GFP_KERNEL); 5366 if (!percpu->scribble) 5367 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5368 5369 if (!percpu->scribble || 5370 (conf->level == 6 && !percpu->spare_page)) { 5371 safe_put_page(percpu->spare_page); 5372 kfree(percpu->scribble); 5373 pr_err("%s: failed memory allocation for cpu%ld\n", 5374 __func__, cpu); 5375 return notifier_from_errno(-ENOMEM); 5376 } 5377 break; 5378 case CPU_DEAD: 5379 case CPU_DEAD_FROZEN: 5380 safe_put_page(percpu->spare_page); 5381 kfree(percpu->scribble); 5382 percpu->spare_page = NULL; 5383 percpu->scribble = NULL; 5384 break; 5385 default: 5386 break; 5387 } 5388 return NOTIFY_OK; 5389 } 5390 #endif 5391 5392 static int raid5_alloc_percpu(struct r5conf *conf) 5393 { 5394 unsigned long cpu; 5395 struct page *spare_page; 5396 struct raid5_percpu __percpu *allcpus; 5397 void *scribble; 5398 int err; 5399 5400 allcpus = alloc_percpu(struct raid5_percpu); 5401 if (!allcpus) 5402 return -ENOMEM; 5403 conf->percpu = allcpus; 5404 5405 get_online_cpus(); 5406 err = 0; 5407 for_each_present_cpu(cpu) { 5408 if (conf->level == 6) { 5409 spare_page = alloc_page(GFP_KERNEL); 5410 if (!spare_page) { 5411 err = -ENOMEM; 5412 break; 5413 } 5414 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 5415 } 5416 scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5417 if (!scribble) { 5418 err = -ENOMEM; 5419 break; 5420 } 5421 
per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 5422 } 5423 #ifdef CONFIG_HOTPLUG_CPU 5424 conf->cpu_notify.notifier_call = raid456_cpu_notify; 5425 conf->cpu_notify.priority = 0; 5426 if (err == 0) 5427 err = register_cpu_notifier(&conf->cpu_notify); 5428 #endif 5429 put_online_cpus(); 5430 5431 return err; 5432 } 5433 5434 static struct r5conf *setup_conf(struct mddev *mddev) 5435 { 5436 struct r5conf *conf; 5437 int raid_disk, memory, max_disks; 5438 struct md_rdev *rdev; 5439 struct disk_info *disk; 5440 char pers_name[6]; 5441 5442 if (mddev->new_level != 5 5443 && mddev->new_level != 4 5444 && mddev->new_level != 6) { 5445 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 5446 mdname(mddev), mddev->new_level); 5447 return ERR_PTR(-EIO); 5448 } 5449 if ((mddev->new_level == 5 5450 && !algorithm_valid_raid5(mddev->new_layout)) || 5451 (mddev->new_level == 6 5452 && !algorithm_valid_raid6(mddev->new_layout))) { 5453 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 5454 mdname(mddev), mddev->new_layout); 5455 return ERR_PTR(-EIO); 5456 } 5457 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 5458 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 5459 mdname(mddev), mddev->raid_disks); 5460 return ERR_PTR(-EINVAL); 5461 } 5462 5463 if (!mddev->new_chunk_sectors || 5464 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 5465 !is_power_of_2(mddev->new_chunk_sectors)) { 5466 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 5467 mdname(mddev), mddev->new_chunk_sectors << 9); 5468 return ERR_PTR(-EINVAL); 5469 } 5470 5471 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 5472 if (conf == NULL) 5473 goto abort; 5474 /* Don't enable multi-threading by default*/ 5475 if (alloc_thread_groups(conf, 0)) 5476 goto abort; 5477 spin_lock_init(&conf->device_lock); 5478 seqcount_init(&conf->gen_lock); 5479 init_waitqueue_head(&conf->wait_for_stripe); 5480 init_waitqueue_head(&conf->wait_for_overlap); 5481 INIT_LIST_HEAD(&conf->handle_list); 5482 INIT_LIST_HEAD(&conf->hold_list); 5483 INIT_LIST_HEAD(&conf->delayed_list); 5484 INIT_LIST_HEAD(&conf->bitmap_list); 5485 INIT_LIST_HEAD(&conf->inactive_list); 5486 init_llist_head(&conf->released_stripes); 5487 atomic_set(&conf->active_stripes, 0); 5488 atomic_set(&conf->preread_active_stripes, 0); 5489 atomic_set(&conf->active_aligned_reads, 0); 5490 conf->bypass_threshold = BYPASS_THRESHOLD; 5491 conf->recovery_disabled = mddev->recovery_disabled - 1; 5492 5493 conf->raid_disks = mddev->raid_disks; 5494 if (mddev->reshape_position == MaxSector) 5495 conf->previous_raid_disks = mddev->raid_disks; 5496 else 5497 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 5498 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 5499 conf->scribble_len = scribble_len(max_disks); 5500 5501 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 5502 GFP_KERNEL); 5503 if (!conf->disks) 5504 goto abort; 5505 5506 conf->mddev = mddev; 5507 5508 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5509 goto abort; 5510 5511 conf->level = mddev->new_level; 5512 if (raid5_alloc_percpu(conf) != 0) 5513 goto abort; 5514 5515 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 5516 5517 rdev_for_each(rdev, mddev) { 5518 raid_disk = rdev->raid_disk; 5519 if (raid_disk >= max_disks 5520 || raid_disk < 0) 5521 continue; 5522 disk = conf->disks + raid_disk; 5523 5524 if (test_bit(Replacement, &rdev->flags)) { 5525 if (disk->replacement) 5526 goto abort; 5527 
disk->replacement = rdev; 5528 } else { 5529 if (disk->rdev) 5530 goto abort; 5531 disk->rdev = rdev; 5532 } 5533 5534 if (test_bit(In_sync, &rdev->flags)) { 5535 char b[BDEVNAME_SIZE]; 5536 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 5537 " disk %d\n", 5538 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 5539 } else if (rdev->saved_raid_disk != raid_disk) 5540 /* Cannot rely on bitmap to complete recovery */ 5541 conf->fullsync = 1; 5542 } 5543 5544 conf->chunk_sectors = mddev->new_chunk_sectors; 5545 conf->level = mddev->new_level; 5546 if (conf->level == 6) 5547 conf->max_degraded = 2; 5548 else 5549 conf->max_degraded = 1; 5550 conf->algorithm = mddev->new_layout; 5551 conf->max_nr_stripes = NR_STRIPES; 5552 conf->reshape_progress = mddev->reshape_position; 5553 if (conf->reshape_progress != MaxSector) { 5554 conf->prev_chunk_sectors = mddev->chunk_sectors; 5555 conf->prev_algo = mddev->layout; 5556 } 5557 5558 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5559 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5560 if (grow_stripes(conf, conf->max_nr_stripes)) { 5561 printk(KERN_ERR 5562 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5563 mdname(mddev), memory); 5564 goto abort; 5565 } else 5566 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 5567 mdname(mddev), memory); 5568 5569 sprintf(pers_name, "raid%d", mddev->new_level); 5570 conf->thread = md_register_thread(raid5d, mddev, pers_name); 5571 if (!conf->thread) { 5572 printk(KERN_ERR 5573 "md/raid:%s: couldn't allocate thread.\n", 5574 mdname(mddev)); 5575 goto abort; 5576 } 5577 5578 return conf; 5579 5580 abort: 5581 if (conf) { 5582 free_conf(conf); 5583 return ERR_PTR(-EIO); 5584 } else 5585 return ERR_PTR(-ENOMEM); 5586 } 5587 5588 5589 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 5590 { 5591 switch (algo) { 5592 case ALGORITHM_PARITY_0: 5593 if (raid_disk < max_degraded) 5594 return 1; 5595 break; 5596 case ALGORITHM_PARITY_N: 5597 if (raid_disk >= raid_disks - max_degraded) 5598 return 1; 5599 break; 5600 case ALGORITHM_PARITY_0_6: 5601 if (raid_disk == 0 || 5602 raid_disk == raid_disks - 1) 5603 return 1; 5604 break; 5605 case ALGORITHM_LEFT_ASYMMETRIC_6: 5606 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5607 case ALGORITHM_LEFT_SYMMETRIC_6: 5608 case ALGORITHM_RIGHT_SYMMETRIC_6: 5609 if (raid_disk == raid_disks - 1) 5610 return 1; 5611 } 5612 return 0; 5613 } 5614 5615 static int run(struct mddev *mddev) 5616 { 5617 struct r5conf *conf; 5618 int working_disks = 0; 5619 int dirty_parity_disks = 0; 5620 struct md_rdev *rdev; 5621 sector_t reshape_offset = 0; 5622 int i; 5623 long long min_offset_diff = 0; 5624 int first = 1; 5625 5626 if (mddev->recovery_cp != MaxSector) 5627 printk(KERN_NOTICE "md/raid:%s: not clean" 5628 " -- starting background reconstruction\n", 5629 mdname(mddev)); 5630 5631 rdev_for_each(rdev, mddev) { 5632 long long diff; 5633 if (rdev->raid_disk < 0) 5634 continue; 5635 diff = (rdev->new_data_offset - rdev->data_offset); 5636 if (first) { 5637 min_offset_diff = diff; 5638 first = 0; 5639 } else if (mddev->reshape_backwards && 5640 diff < min_offset_diff) 5641 min_offset_diff = diff; 5642 else if (!mddev->reshape_backwards && 5643 diff > min_offset_diff) 5644 min_offset_diff = diff; 5645 } 5646 5647 if (mddev->reshape_position != MaxSector) { 5648 /* Check that we can continue the reshape. 5649 * Difficulties arise if the stripe we would write to 5650 * next is at or after the stripe we would read from next. 
5651 * For a reshape that changes the number of devices, this 5652 * is only possible for a very short time, and mdadm makes 5653 * sure that time appears to have passed before assembling 5654 * the array. So we fail if that time hasn't passed. 5655 * For a reshape that keeps the number of devices the same 5656 * mdadm must be monitoring the reshape and keeping the 5657 * critical areas read-only and backed up. It will start 5658 * the array in read-only mode, so we check for that. 5659 */ 5660 sector_t here_new, here_old; 5661 int old_disks; 5662 int max_degraded = (mddev->level == 6 ? 2 : 1); 5663 5664 if (mddev->new_level != mddev->level) { 5665 printk(KERN_ERR "md/raid:%s: unsupported reshape " 5666 "required - aborting.\n", 5667 mdname(mddev)); 5668 return -EINVAL; 5669 } 5670 old_disks = mddev->raid_disks - mddev->delta_disks; 5671 /* reshape_position must be on a new-stripe boundary, and one 5672 * further up in new geometry must map after here in old 5673 * geometry. 5674 */ 5675 here_new = mddev->reshape_position; 5676 if (sector_div(here_new, mddev->new_chunk_sectors * 5677 (mddev->raid_disks - max_degraded))) { 5678 printk(KERN_ERR "md/raid:%s: reshape_position not " 5679 "on a stripe boundary\n", mdname(mddev)); 5680 return -EINVAL; 5681 } 5682 reshape_offset = here_new * mddev->new_chunk_sectors; 5683 /* here_new is the stripe we will write to */ 5684 here_old = mddev->reshape_position; 5685 sector_div(here_old, mddev->chunk_sectors * 5686 (old_disks-max_degraded)); 5687 /* here_old is the first stripe that we might need to read 5688 * from */ 5689 if (mddev->delta_disks == 0) { 5690 if ((here_new * mddev->new_chunk_sectors != 5691 here_old * mddev->chunk_sectors)) { 5692 printk(KERN_ERR "md/raid:%s: reshape position is" 5693 " confused - aborting\n", mdname(mddev)); 5694 return -EINVAL; 5695 } 5696 /* We cannot be sure it is safe to start an in-place 5697 * reshape. It is only safe if user-space is monitoring 5698 * and taking constant backups. 5699 * mdadm always starts a situation like this in 5700 * readonly mode so it can take control before 5701 * allowing any writes. So just check for that. 5702 */ 5703 if (abs(min_offset_diff) >= mddev->chunk_sectors && 5704 abs(min_offset_diff) >= mddev->new_chunk_sectors) 5705 /* not really in-place - so OK */; 5706 else if (mddev->ro == 0) { 5707 printk(KERN_ERR "md/raid:%s: in-place reshape " 5708 "must be started in read-only mode " 5709 "- aborting\n", 5710 mdname(mddev)); 5711 return -EINVAL; 5712 } 5713 } else if (mddev->reshape_backwards 5714 ? 
(here_new * mddev->new_chunk_sectors + min_offset_diff <= 5715 here_old * mddev->chunk_sectors) 5716 : (here_new * mddev->new_chunk_sectors >= 5717 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 5718 /* Reading from the same stripe as writing to - bad */ 5719 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 5720 "auto-recovery - aborting.\n", 5721 mdname(mddev)); 5722 return -EINVAL; 5723 } 5724 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 5725 mdname(mddev)); 5726 /* OK, we should be able to continue; */ 5727 } else { 5728 BUG_ON(mddev->level != mddev->new_level); 5729 BUG_ON(mddev->layout != mddev->new_layout); 5730 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 5731 BUG_ON(mddev->delta_disks != 0); 5732 } 5733 5734 if (mddev->private == NULL) 5735 conf = setup_conf(mddev); 5736 else 5737 conf = mddev->private; 5738 5739 if (IS_ERR(conf)) 5740 return PTR_ERR(conf); 5741 5742 conf->min_offset_diff = min_offset_diff; 5743 mddev->thread = conf->thread; 5744 conf->thread = NULL; 5745 mddev->private = conf; 5746 5747 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 5748 i++) { 5749 rdev = conf->disks[i].rdev; 5750 if (!rdev && conf->disks[i].replacement) { 5751 /* The replacement is all we have yet */ 5752 rdev = conf->disks[i].replacement; 5753 conf->disks[i].replacement = NULL; 5754 clear_bit(Replacement, &rdev->flags); 5755 conf->disks[i].rdev = rdev; 5756 } 5757 if (!rdev) 5758 continue; 5759 if (conf->disks[i].replacement && 5760 conf->reshape_progress != MaxSector) { 5761 /* replacements and reshape simply do not mix. */ 5762 printk(KERN_ERR "md: cannot handle concurrent " 5763 "replacement and reshape.\n"); 5764 goto abort; 5765 } 5766 if (test_bit(In_sync, &rdev->flags)) { 5767 working_disks++; 5768 continue; 5769 } 5770 /* This disc is not fully in-sync. However if it 5771 * just stored parity (beyond the recovery_offset), 5772 * when we don't need to be concerned about the 5773 * array being dirty. 5774 * When reshape goes 'backwards', we never have 5775 * partially completed devices, so we only need 5776 * to worry about reshape going forwards. 5777 */ 5778 /* Hack because v0.91 doesn't store recovery_offset properly. */ 5779 if (mddev->major_version == 0 && 5780 mddev->minor_version > 90) 5781 rdev->recovery_offset = reshape_offset; 5782 5783 if (rdev->recovery_offset < reshape_offset) { 5784 /* We need to check old and new layout */ 5785 if (!only_parity(rdev->raid_disk, 5786 conf->algorithm, 5787 conf->raid_disks, 5788 conf->max_degraded)) 5789 continue; 5790 } 5791 if (!only_parity(rdev->raid_disk, 5792 conf->prev_algo, 5793 conf->previous_raid_disks, 5794 conf->max_degraded)) 5795 continue; 5796 dirty_parity_disks++; 5797 } 5798 5799 /* 5800 * 0 for a fully functional array, 1 or 2 for a degraded array. 
5801 */ 5802 mddev->degraded = calc_degraded(conf); 5803 5804 if (has_failed(conf)) { 5805 printk(KERN_ERR "md/raid:%s: not enough operational devices" 5806 " (%d/%d failed)\n", 5807 mdname(mddev), mddev->degraded, conf->raid_disks); 5808 goto abort; 5809 } 5810 5811 /* device size must be a multiple of chunk size */ 5812 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 5813 mddev->resync_max_sectors = mddev->dev_sectors; 5814 5815 if (mddev->degraded > dirty_parity_disks && 5816 mddev->recovery_cp != MaxSector) { 5817 if (mddev->ok_start_degraded) 5818 printk(KERN_WARNING 5819 "md/raid:%s: starting dirty degraded array" 5820 " - data corruption possible.\n", 5821 mdname(mddev)); 5822 else { 5823 printk(KERN_ERR 5824 "md/raid:%s: cannot start dirty degraded array.\n", 5825 mdname(mddev)); 5826 goto abort; 5827 } 5828 } 5829 5830 if (mddev->degraded == 0) 5831 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5832 " devices, algorithm %d\n", mdname(mddev), conf->level, 5833 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5834 mddev->new_layout); 5835 else 5836 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5837 " out of %d devices, algorithm %d\n", 5838 mdname(mddev), conf->level, 5839 mddev->raid_disks - mddev->degraded, 5840 mddev->raid_disks, mddev->new_layout); 5841 5842 print_raid5_conf(conf); 5843 5844 if (conf->reshape_progress != MaxSector) { 5845 conf->reshape_safe = conf->reshape_progress; 5846 atomic_set(&conf->reshape_stripes, 0); 5847 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5848 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5849 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5850 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5851 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5852 "reshape"); 5853 } 5854 5855 5856 /* Ok, everything is just fine now */ 5857 if (mddev->to_remove == &raid5_attrs_group) 5858 mddev->to_remove = NULL; 5859 else if (mddev->kobj.sd && 5860 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5861 printk(KERN_WARNING 5862 "raid5: failed to create sysfs attributes for %s\n", 5863 mdname(mddev)); 5864 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5865 5866 if (mddev->queue) { 5867 int chunk_size; 5868 bool discard_supported = true; 5869 /* read-ahead size must cover two whole stripes, which 5870 * is 2 * (datadisks) * chunksize where 'n' is the 5871 * number of raid devices 5872 */ 5873 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5874 int stripe = data_disks * 5875 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 5876 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5877 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5878 5879 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5880 5881 mddev->queue->backing_dev_info.congested_data = mddev; 5882 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5883 5884 chunk_size = mddev->chunk_sectors << 9; 5885 blk_queue_io_min(mddev->queue, chunk_size); 5886 blk_queue_io_opt(mddev->queue, chunk_size * 5887 (conf->raid_disks - conf->max_degraded)); 5888 /* 5889 * We can only discard a whole stripe. 
It doesn't make sense to 5890 * discard a data disk but write the parity disk 5891 */ 5892 stripe = stripe * PAGE_SIZE; 5893 /* Round up to power of 2, as discard handling 5894 * currently assumes that */ 5895 while ((stripe-1) & stripe) 5896 stripe = (stripe | (stripe-1)) + 1; 5897 mddev->queue->limits.discard_alignment = stripe; 5898 mddev->queue->limits.discard_granularity = stripe; 5899 /* 5900 * the unaligned part of a discard request will be ignored, so we can't 5901 * guarantee discard_zeroes_data 5902 */ 5903 mddev->queue->limits.discard_zeroes_data = 0; 5904 5905 blk_queue_max_write_same_sectors(mddev->queue, 0); 5906 5907 rdev_for_each(rdev, mddev) { 5908 disk_stack_limits(mddev->gendisk, rdev->bdev, 5909 rdev->data_offset << 9); 5910 disk_stack_limits(mddev->gendisk, rdev->bdev, 5911 rdev->new_data_offset << 9); 5912 /* 5913 * discard_zeroes_data is required, otherwise data 5914 * could be lost. Consider a scenario: discard a stripe 5915 * (the stripe could be inconsistent if 5916 * discard_zeroes_data is 0); write one disk of the 5917 * stripe (the stripe could be inconsistent again 5918 * depending on which disks are used to calculate 5919 * parity); the disk is broken; the stripe data of this 5920 * disk is lost. 5921 */ 5922 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 5923 !bdev_get_queue(rdev->bdev)-> 5924 limits.discard_zeroes_data) 5925 discard_supported = false; 5926 } 5927 5928 if (discard_supported && 5929 mddev->queue->limits.max_discard_sectors >= stripe && 5930 mddev->queue->limits.discard_granularity >= stripe) 5931 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 5932 mddev->queue); 5933 else 5934 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 5935 mddev->queue); 5936 } 5937 5938 return 0; 5939 abort: 5940 md_unregister_thread(&mddev->thread); 5941 print_raid5_conf(conf); 5942 free_conf(conf); 5943 mddev->private = NULL; 5944 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5945 return -EIO; 5946 } 5947 5948 static int stop(struct mddev *mddev) 5949 { 5950 struct r5conf *conf = mddev->private; 5951 5952 md_unregister_thread(&mddev->thread); 5953 if (mddev->queue) 5954 mddev->queue->backing_dev_info.congested_fn = NULL; 5955 free_conf(conf); 5956 mddev->private = NULL; 5957 mddev->to_remove = &raid5_attrs_group; 5958 return 0; 5959 } 5960 5961 static void status(struct seq_file *seq, struct mddev *mddev) 5962 { 5963 struct r5conf *conf = mddev->private; 5964 int i; 5965 5966 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5967 mddev->chunk_sectors / 2, mddev->layout); 5968 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5969 for (i = 0; i < conf->raid_disks; i++) 5970 seq_printf (seq, "%s", 5971 conf->disks[i].rdev && 5972 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 5973 seq_printf (seq, "]"); 5974 } 5975 5976 static void print_raid5_conf (struct r5conf *conf) 5977 { 5978 int i; 5979 struct disk_info *tmp; 5980 5981 printk(KERN_DEBUG "RAID conf printout:\n"); 5982 if (!conf) { 5983 printk("(conf==NULL)\n"); 5984 return; 5985 } 5986 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5987 conf->raid_disks, 5988 conf->raid_disks - conf->mddev->degraded); 5989 5990 for (i = 0; i < conf->raid_disks; i++) { 5991 char b[BDEVNAME_SIZE]; 5992 tmp = conf->disks + i; 5993 if (tmp->rdev) 5994 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5995 i, !test_bit(Faulty, &tmp->rdev->flags), 5996 bdevname(tmp->rdev->bdev, b)); 5997 } 5998 } 5999 6000 static int raid5_spare_active(struct mddev *mddev) 6001 { 6002 int i; 6003 struct r5conf *conf = mddev->private; 6004 struct disk_info *tmp; 6005 int count = 0; 6006 unsigned long flags; 6007 6008 for (i = 0; i < conf->raid_disks; i++) { 6009 tmp = conf->disks + i; 6010 if (tmp->replacement 6011 && tmp->replacement->recovery_offset == MaxSector 6012 && !test_bit(Faulty, &tmp->replacement->flags) 6013 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 6014 /* Replacement has just become active. */ 6015 if (!tmp->rdev 6016 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 6017 count++; 6018 if (tmp->rdev) { 6019 /* Replaced device not technically faulty, 6020 * but we need to be sure it gets removed 6021 * and never re-added. 6022 */ 6023 set_bit(Faulty, &tmp->rdev->flags); 6024 sysfs_notify_dirent_safe( 6025 tmp->rdev->sysfs_state); 6026 } 6027 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 6028 } else if (tmp->rdev 6029 && tmp->rdev->recovery_offset == MaxSector 6030 && !test_bit(Faulty, &tmp->rdev->flags) 6031 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 6032 count++; 6033 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 6034 } 6035 } 6036 spin_lock_irqsave(&conf->device_lock, flags); 6037 mddev->degraded = calc_degraded(conf); 6038 spin_unlock_irqrestore(&conf->device_lock, flags); 6039 print_raid5_conf(conf); 6040 return count; 6041 } 6042 6043 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 6044 { 6045 struct r5conf *conf = mddev->private; 6046 int err = 0; 6047 int number = rdev->raid_disk; 6048 struct md_rdev **rdevp; 6049 struct disk_info *p = conf->disks + number; 6050 6051 print_raid5_conf(conf); 6052 if (rdev == p->rdev) 6053 rdevp = &p->rdev; 6054 else if (rdev == p->replacement) 6055 rdevp = &p->replacement; 6056 else 6057 return 0; 6058 6059 if (number >= conf->raid_disks && 6060 conf->reshape_progress == MaxSector) 6061 clear_bit(In_sync, &rdev->flags); 6062 6063 if (test_bit(In_sync, &rdev->flags) || 6064 atomic_read(&rdev->nr_pending)) { 6065 err = -EBUSY; 6066 goto abort; 6067 } 6068 /* Only remove non-faulty devices if recovery 6069 * isn't possible. 
6070 */ 6071 if (!test_bit(Faulty, &rdev->flags) && 6072 mddev->recovery_disabled != conf->recovery_disabled && 6073 !has_failed(conf) && 6074 (!p->replacement || p->replacement == rdev) && 6075 number < conf->raid_disks) { 6076 err = -EBUSY; 6077 goto abort; 6078 } 6079 *rdevp = NULL; 6080 synchronize_rcu(); 6081 if (atomic_read(&rdev->nr_pending)) { 6082 /* lost the race, try later */ 6083 err = -EBUSY; 6084 *rdevp = rdev; 6085 } else if (p->replacement) { 6086 /* We must have just cleared 'rdev' */ 6087 p->rdev = p->replacement; 6088 clear_bit(Replacement, &p->replacement->flags); 6089 smp_mb(); /* Make sure other CPUs may see both as identical 6090 * but will never see neither - if they are careful 6091 */ 6092 p->replacement = NULL; 6093 clear_bit(WantReplacement, &rdev->flags); 6094 } else 6095 /* We might have just removed the Replacement as faulty- 6096 * clear the bit just in case 6097 */ 6098 clear_bit(WantReplacement, &rdev->flags); 6099 abort: 6100 6101 print_raid5_conf(conf); 6102 return err; 6103 } 6104 6105 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 6106 { 6107 struct r5conf *conf = mddev->private; 6108 int err = -EEXIST; 6109 int disk; 6110 struct disk_info *p; 6111 int first = 0; 6112 int last = conf->raid_disks - 1; 6113 6114 if (mddev->recovery_disabled == conf->recovery_disabled) 6115 return -EBUSY; 6116 6117 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 6118 /* no point adding a device */ 6119 return -EINVAL; 6120 6121 if (rdev->raid_disk >= 0) 6122 first = last = rdev->raid_disk; 6123 6124 /* 6125 * find the disk ... but prefer rdev->saved_raid_disk 6126 * if possible. 6127 */ 6128 if (rdev->saved_raid_disk >= 0 && 6129 rdev->saved_raid_disk >= first && 6130 conf->disks[rdev->saved_raid_disk].rdev == NULL) 6131 first = rdev->saved_raid_disk; 6132 6133 for (disk = first; disk <= last; disk++) { 6134 p = conf->disks + disk; 6135 if (p->rdev == NULL) { 6136 clear_bit(In_sync, &rdev->flags); 6137 rdev->raid_disk = disk; 6138 err = 0; 6139 if (rdev->saved_raid_disk != disk) 6140 conf->fullsync = 1; 6141 rcu_assign_pointer(p->rdev, rdev); 6142 goto out; 6143 } 6144 } 6145 for (disk = first; disk <= last; disk++) { 6146 p = conf->disks + disk; 6147 if (test_bit(WantReplacement, &p->rdev->flags) && 6148 p->replacement == NULL) { 6149 clear_bit(In_sync, &rdev->flags); 6150 set_bit(Replacement, &rdev->flags); 6151 rdev->raid_disk = disk; 6152 err = 0; 6153 conf->fullsync = 1; 6154 rcu_assign_pointer(p->replacement, rdev); 6155 break; 6156 } 6157 } 6158 out: 6159 print_raid5_conf(conf); 6160 return err; 6161 } 6162 6163 static int raid5_resize(struct mddev *mddev, sector_t sectors) 6164 { 6165 /* no resync is happening, and there is enough space 6166 * on all devices, so we can resize. 6167 * We need to make sure resync covers any new space. 6168 * If the array is shrinking we should possibly wait until 6169 * any io in the removed space completes, but it hardly seems 6170 * worth it. 
6171 */ 6172 sector_t newsize; 6173 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 6174 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 6175 if (mddev->external_size && 6176 mddev->array_sectors > newsize) 6177 return -EINVAL; 6178 if (mddev->bitmap) { 6179 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 6180 if (ret) 6181 return ret; 6182 } 6183 md_set_array_sectors(mddev, newsize); 6184 set_capacity(mddev->gendisk, mddev->array_sectors); 6185 revalidate_disk(mddev->gendisk); 6186 if (sectors > mddev->dev_sectors && 6187 mddev->recovery_cp > mddev->dev_sectors) { 6188 mddev->recovery_cp = mddev->dev_sectors; 6189 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6190 } 6191 mddev->dev_sectors = sectors; 6192 mddev->resync_max_sectors = sectors; 6193 return 0; 6194 } 6195 6196 static int check_stripe_cache(struct mddev *mddev) 6197 { 6198 /* Can only proceed if there are plenty of stripe_heads. 6199 * We need a minimum of one full stripe,, and for sensible progress 6200 * it is best to have about 4 times that. 6201 * If we require 4 times, then the default 256 4K stripe_heads will 6202 * allow for chunk sizes up to 256K, which is probably OK. 6203 * If the chunk size is greater, user-space should request more 6204 * stripe_heads first. 6205 */ 6206 struct r5conf *conf = mddev->private; 6207 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 6208 > conf->max_nr_stripes || 6209 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 6210 > conf->max_nr_stripes) { 6211 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 6212 mdname(mddev), 6213 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 6214 / STRIPE_SIZE)*4); 6215 return 0; 6216 } 6217 return 1; 6218 } 6219 6220 static int check_reshape(struct mddev *mddev) 6221 { 6222 struct r5conf *conf = mddev->private; 6223 6224 if (mddev->delta_disks == 0 && 6225 mddev->new_layout == mddev->layout && 6226 mddev->new_chunk_sectors == mddev->chunk_sectors) 6227 return 0; /* nothing to do */ 6228 if (has_failed(conf)) 6229 return -EINVAL; 6230 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 6231 /* We might be able to shrink, but the devices must 6232 * be made bigger first. 6233 * For raid6, 4 is the minimum size. 6234 * Otherwise 2 is the minimum 6235 */ 6236 int min = 2; 6237 if (mddev->level == 6) 6238 min = 4; 6239 if (mddev->raid_disks + mddev->delta_disks < min) 6240 return -EINVAL; 6241 } 6242 6243 if (!check_stripe_cache(mddev)) 6244 return -ENOSPC; 6245 6246 return resize_stripes(conf, (conf->previous_raid_disks 6247 + mddev->delta_disks)); 6248 } 6249 6250 static int raid5_start_reshape(struct mddev *mddev) 6251 { 6252 struct r5conf *conf = mddev->private; 6253 struct md_rdev *rdev; 6254 int spares = 0; 6255 unsigned long flags; 6256 6257 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6258 return -EBUSY; 6259 6260 if (!check_stripe_cache(mddev)) 6261 return -ENOSPC; 6262 6263 if (has_failed(conf)) 6264 return -EINVAL; 6265 6266 rdev_for_each(rdev, mddev) { 6267 if (!test_bit(In_sync, &rdev->flags) 6268 && !test_bit(Faulty, &rdev->flags)) 6269 spares++; 6270 } 6271 6272 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 6273 /* Not enough devices even to make a degraded array 6274 * of that size 6275 */ 6276 return -EINVAL; 6277 6278 /* Refuse to reduce size of the array. Any reductions in 6279 * array size must be through explicit setting of array_size 6280 * attribute. 
6281 */ 6282 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 6283 < mddev->array_sectors) { 6284 printk(KERN_ERR "md/raid:%s: array size must be reduced " 6285 "before number of disks\n", mdname(mddev)); 6286 return -EINVAL; 6287 } 6288 6289 atomic_set(&conf->reshape_stripes, 0); 6290 spin_lock_irq(&conf->device_lock); 6291 write_seqcount_begin(&conf->gen_lock); 6292 conf->previous_raid_disks = conf->raid_disks; 6293 conf->raid_disks += mddev->delta_disks; 6294 conf->prev_chunk_sectors = conf->chunk_sectors; 6295 conf->chunk_sectors = mddev->new_chunk_sectors; 6296 conf->prev_algo = conf->algorithm; 6297 conf->algorithm = mddev->new_layout; 6298 conf->generation++; 6299 /* Code that selects data_offset needs to see the generation update 6300 * if reshape_progress has been set - so a memory barrier needed. 6301 */ 6302 smp_mb(); 6303 if (mddev->reshape_backwards) 6304 conf->reshape_progress = raid5_size(mddev, 0, 0); 6305 else 6306 conf->reshape_progress = 0; 6307 conf->reshape_safe = conf->reshape_progress; 6308 write_seqcount_end(&conf->gen_lock); 6309 spin_unlock_irq(&conf->device_lock); 6310 6311 /* Now make sure any requests that proceeded on the assumption 6312 * the reshape wasn't running - like Discard or Read - have 6313 * completed. 6314 */ 6315 mddev_suspend(mddev); 6316 mddev_resume(mddev); 6317 6318 /* Add some new drives, as many as will fit. 6319 * We know there are enough to make the newly sized array work. 6320 * Don't add devices if we are reducing the number of 6321 * devices in the array. This is because it is not possible 6322 * to correctly record the "partially reconstructed" state of 6323 * such devices during the reshape and confusion could result. 6324 */ 6325 if (mddev->delta_disks >= 0) { 6326 rdev_for_each(rdev, mddev) 6327 if (rdev->raid_disk < 0 && 6328 !test_bit(Faulty, &rdev->flags)) { 6329 if (raid5_add_disk(mddev, rdev) == 0) { 6330 if (rdev->raid_disk 6331 >= conf->previous_raid_disks) 6332 set_bit(In_sync, &rdev->flags); 6333 else 6334 rdev->recovery_offset = 0; 6335 6336 if (sysfs_link_rdev(mddev, rdev)) 6337 /* Failure here is OK */; 6338 } 6339 } else if (rdev->raid_disk >= conf->previous_raid_disks 6340 && !test_bit(Faulty, &rdev->flags)) { 6341 /* This is a spare that was manually added */ 6342 set_bit(In_sync, &rdev->flags); 6343 } 6344 6345 /* When a reshape changes the number of devices, 6346 * ->degraded is measured against the larger of the 6347 * pre and post number of devices. 
6348 */ 6349 spin_lock_irqsave(&conf->device_lock, flags); 6350 mddev->degraded = calc_degraded(conf); 6351 spin_unlock_irqrestore(&conf->device_lock, flags); 6352 } 6353 mddev->raid_disks = conf->raid_disks; 6354 mddev->reshape_position = conf->reshape_progress; 6355 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6356 6357 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6358 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6359 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6360 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6361 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 6362 "reshape"); 6363 if (!mddev->sync_thread) { 6364 mddev->recovery = 0; 6365 spin_lock_irq(&conf->device_lock); 6366 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6367 rdev_for_each(rdev, mddev) 6368 rdev->new_data_offset = rdev->data_offset; 6369 smp_wmb(); 6370 conf->reshape_progress = MaxSector; 6371 mddev->reshape_position = MaxSector; 6372 spin_unlock_irq(&conf->device_lock); 6373 return -EAGAIN; 6374 } 6375 conf->reshape_checkpoint = jiffies; 6376 md_wakeup_thread(mddev->sync_thread); 6377 md_new_event(mddev); 6378 return 0; 6379 } 6380 6381 /* This is called from the reshape thread and should make any 6382 * changes needed in 'conf' 6383 */ 6384 static void end_reshape(struct r5conf *conf) 6385 { 6386 6387 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 6388 struct md_rdev *rdev; 6389 6390 spin_lock_irq(&conf->device_lock); 6391 conf->previous_raid_disks = conf->raid_disks; 6392 rdev_for_each(rdev, conf->mddev) 6393 rdev->data_offset = rdev->new_data_offset; 6394 smp_wmb(); 6395 conf->reshape_progress = MaxSector; 6396 spin_unlock_irq(&conf->device_lock); 6397 wake_up(&conf->wait_for_overlap); 6398 6399 /* read-ahead size must cover two whole stripes, which is 6400 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 6401 */ 6402 if (conf->mddev->queue) { 6403 int data_disks = conf->raid_disks - conf->max_degraded; 6404 int stripe = data_disks * ((conf->chunk_sectors << 9) 6405 / PAGE_SIZE); 6406 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 6407 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 6408 } 6409 } 6410 } 6411 6412 /* This is called from the raid5d thread with mddev_lock held. 6413 * It makes config changes to the device. 
6414 */ 6415 static void raid5_finish_reshape(struct mddev *mddev) 6416 { 6417 struct r5conf *conf = mddev->private; 6418 6419 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6420 6421 if (mddev->delta_disks > 0) { 6422 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 6423 set_capacity(mddev->gendisk, mddev->array_sectors); 6424 revalidate_disk(mddev->gendisk); 6425 } else { 6426 int d; 6427 spin_lock_irq(&conf->device_lock); 6428 mddev->degraded = calc_degraded(conf); 6429 spin_unlock_irq(&conf->device_lock); 6430 for (d = conf->raid_disks ; 6431 d < conf->raid_disks - mddev->delta_disks; 6432 d++) { 6433 struct md_rdev *rdev = conf->disks[d].rdev; 6434 if (rdev) 6435 clear_bit(In_sync, &rdev->flags); 6436 rdev = conf->disks[d].replacement; 6437 if (rdev) 6438 clear_bit(In_sync, &rdev->flags); 6439 } 6440 } 6441 mddev->layout = conf->algorithm; 6442 mddev->chunk_sectors = conf->chunk_sectors; 6443 mddev->reshape_position = MaxSector; 6444 mddev->delta_disks = 0; 6445 mddev->reshape_backwards = 0; 6446 } 6447 } 6448 6449 static void raid5_quiesce(struct mddev *mddev, int state) 6450 { 6451 struct r5conf *conf = mddev->private; 6452 6453 switch(state) { 6454 case 2: /* resume for a suspend */ 6455 wake_up(&conf->wait_for_overlap); 6456 break; 6457 6458 case 1: /* stop all writes */ 6459 spin_lock_irq(&conf->device_lock); 6460 /* '2' tells resync/reshape to pause so that all 6461 * active stripes can drain 6462 */ 6463 conf->quiesce = 2; 6464 wait_event_lock_irq(conf->wait_for_stripe, 6465 atomic_read(&conf->active_stripes) == 0 && 6466 atomic_read(&conf->active_aligned_reads) == 0, 6467 conf->device_lock); 6468 conf->quiesce = 1; 6469 spin_unlock_irq(&conf->device_lock); 6470 /* allow reshape to continue */ 6471 wake_up(&conf->wait_for_overlap); 6472 break; 6473 6474 case 0: /* re-enable writes */ 6475 spin_lock_irq(&conf->device_lock); 6476 conf->quiesce = 0; 6477 wake_up(&conf->wait_for_stripe); 6478 wake_up(&conf->wait_for_overlap); 6479 spin_unlock_irq(&conf->device_lock); 6480 break; 6481 } 6482 } 6483 6484 6485 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 6486 { 6487 struct r0conf *raid0_conf = mddev->private; 6488 sector_t sectors; 6489 6490 /* for raid0 takeover only one zone is supported */ 6491 if (raid0_conf->nr_strip_zones > 1) { 6492 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 6493 mdname(mddev)); 6494 return ERR_PTR(-EINVAL); 6495 } 6496 6497 sectors = raid0_conf->strip_zone[0].zone_end; 6498 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 6499 mddev->dev_sectors = sectors; 6500 mddev->new_level = level; 6501 mddev->new_layout = ALGORITHM_PARITY_N; 6502 mddev->new_chunk_sectors = mddev->chunk_sectors; 6503 mddev->raid_disks += 1; 6504 mddev->delta_disks = 1; 6505 /* make sure it will be not marked as dirty */ 6506 mddev->recovery_cp = MaxSector; 6507 6508 return setup_conf(mddev); 6509 } 6510 6511 6512 static void *raid5_takeover_raid1(struct mddev *mddev) 6513 { 6514 int chunksect; 6515 6516 if (mddev->raid_disks != 2 || 6517 mddev->degraded > 1) 6518 return ERR_PTR(-EINVAL); 6519 6520 /* Should check if there are write-behind devices? 
*/ 6521 6522 chunksect = 64*2; /* 64K by default */ 6523 6524 /* The array must be an exact multiple of chunksize */ 6525 while (chunksect && (mddev->array_sectors & (chunksect-1))) 6526 chunksect >>= 1; 6527 6528 if ((chunksect<<9) < STRIPE_SIZE) 6529 /* array size does not allow a suitable chunk size */ 6530 return ERR_PTR(-EINVAL); 6531 6532 mddev->new_level = 5; 6533 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 6534 mddev->new_chunk_sectors = chunksect; 6535 6536 return setup_conf(mddev); 6537 } 6538 6539 static void *raid5_takeover_raid6(struct mddev *mddev) 6540 { 6541 int new_layout; 6542 6543 switch (mddev->layout) { 6544 case ALGORITHM_LEFT_ASYMMETRIC_6: 6545 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 6546 break; 6547 case ALGORITHM_RIGHT_ASYMMETRIC_6: 6548 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 6549 break; 6550 case ALGORITHM_LEFT_SYMMETRIC_6: 6551 new_layout = ALGORITHM_LEFT_SYMMETRIC; 6552 break; 6553 case ALGORITHM_RIGHT_SYMMETRIC_6: 6554 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 6555 break; 6556 case ALGORITHM_PARITY_0_6: 6557 new_layout = ALGORITHM_PARITY_0; 6558 break; 6559 case ALGORITHM_PARITY_N: 6560 new_layout = ALGORITHM_PARITY_N; 6561 break; 6562 default: 6563 return ERR_PTR(-EINVAL); 6564 } 6565 mddev->new_level = 5; 6566 mddev->new_layout = new_layout; 6567 mddev->delta_disks = -1; 6568 mddev->raid_disks -= 1; 6569 return setup_conf(mddev); 6570 } 6571 6572 6573 static int raid5_check_reshape(struct mddev *mddev) 6574 { 6575 /* For a 2-drive array, the layout and chunk size can be changed 6576 * immediately as not restriping is needed. 6577 * For larger arrays we record the new value - after validation 6578 * to be used by a reshape pass. 6579 */ 6580 struct r5conf *conf = mddev->private; 6581 int new_chunk = mddev->new_chunk_sectors; 6582 6583 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 6584 return -EINVAL; 6585 if (new_chunk > 0) { 6586 if (!is_power_of_2(new_chunk)) 6587 return -EINVAL; 6588 if (new_chunk < (PAGE_SIZE>>9)) 6589 return -EINVAL; 6590 if (mddev->array_sectors & (new_chunk-1)) 6591 /* not factor of array size */ 6592 return -EINVAL; 6593 } 6594 6595 /* They look valid */ 6596 6597 if (mddev->raid_disks == 2) { 6598 /* can make the change immediately */ 6599 if (mddev->new_layout >= 0) { 6600 conf->algorithm = mddev->new_layout; 6601 mddev->layout = mddev->new_layout; 6602 } 6603 if (new_chunk > 0) { 6604 conf->chunk_sectors = new_chunk ; 6605 mddev->chunk_sectors = new_chunk; 6606 } 6607 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6608 md_wakeup_thread(mddev->thread); 6609 } 6610 return check_reshape(mddev); 6611 } 6612 6613 static int raid6_check_reshape(struct mddev *mddev) 6614 { 6615 int new_chunk = mddev->new_chunk_sectors; 6616 6617 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 6618 return -EINVAL; 6619 if (new_chunk > 0) { 6620 if (!is_power_of_2(new_chunk)) 6621 return -EINVAL; 6622 if (new_chunk < (PAGE_SIZE >> 9)) 6623 return -EINVAL; 6624 if (mddev->array_sectors & (new_chunk-1)) 6625 /* not factor of array size */ 6626 return -EINVAL; 6627 } 6628 6629 /* They look valid */ 6630 return check_reshape(mddev); 6631 } 6632 6633 static void *raid5_takeover(struct mddev *mddev) 6634 { 6635 /* raid5 can take over: 6636 * raid0 - if there is only one strip zone - make it a raid4 layout 6637 * raid1 - if there are two drives. We need to know the chunk size 6638 * raid4 - trivial - just use a raid4 layout. 
6639 * raid6 - Providing it is a *_6 layout 6640 */ 6641 if (mddev->level == 0) 6642 return raid45_takeover_raid0(mddev, 5); 6643 if (mddev->level == 1) 6644 return raid5_takeover_raid1(mddev); 6645 if (mddev->level == 4) { 6646 mddev->new_layout = ALGORITHM_PARITY_N; 6647 mddev->new_level = 5; 6648 return setup_conf(mddev); 6649 } 6650 if (mddev->level == 6) 6651 return raid5_takeover_raid6(mddev); 6652 6653 return ERR_PTR(-EINVAL); 6654 } 6655 6656 static void *raid4_takeover(struct mddev *mddev) 6657 { 6658 /* raid4 can take over: 6659 * raid0 - if there is only one strip zone 6660 * raid5 - if layout is right 6661 */ 6662 if (mddev->level == 0) 6663 return raid45_takeover_raid0(mddev, 4); 6664 if (mddev->level == 5 && 6665 mddev->layout == ALGORITHM_PARITY_N) { 6666 mddev->new_layout = 0; 6667 mddev->new_level = 4; 6668 return setup_conf(mddev); 6669 } 6670 return ERR_PTR(-EINVAL); 6671 } 6672 6673 static struct md_personality raid5_personality; 6674 6675 static void *raid6_takeover(struct mddev *mddev) 6676 { 6677 /* Currently can only take over a raid5. We map the 6678 * personality to an equivalent raid6 personality 6679 * with the Q block at the end. 6680 */ 6681 int new_layout; 6682 6683 if (mddev->pers != &raid5_personality) 6684 return ERR_PTR(-EINVAL); 6685 if (mddev->degraded > 1) 6686 return ERR_PTR(-EINVAL); 6687 if (mddev->raid_disks > 253) 6688 return ERR_PTR(-EINVAL); 6689 if (mddev->raid_disks < 3) 6690 return ERR_PTR(-EINVAL); 6691 6692 switch (mddev->layout) { 6693 case ALGORITHM_LEFT_ASYMMETRIC: 6694 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 6695 break; 6696 case ALGORITHM_RIGHT_ASYMMETRIC: 6697 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 6698 break; 6699 case ALGORITHM_LEFT_SYMMETRIC: 6700 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 6701 break; 6702 case ALGORITHM_RIGHT_SYMMETRIC: 6703 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 6704 break; 6705 case ALGORITHM_PARITY_0: 6706 new_layout = ALGORITHM_PARITY_0_6; 6707 break; 6708 case ALGORITHM_PARITY_N: 6709 new_layout = ALGORITHM_PARITY_N; 6710 break; 6711 default: 6712 return ERR_PTR(-EINVAL); 6713 } 6714 mddev->new_level = 6; 6715 mddev->new_layout = new_layout; 6716 mddev->delta_disks = 1; 6717 mddev->raid_disks += 1; 6718 return setup_conf(mddev); 6719 } 6720 6721 6722 static struct md_personality raid6_personality = 6723 { 6724 .name = "raid6", 6725 .level = 6, 6726 .owner = THIS_MODULE, 6727 .make_request = make_request, 6728 .run = run, 6729 .stop = stop, 6730 .status = status, 6731 .error_handler = error, 6732 .hot_add_disk = raid5_add_disk, 6733 .hot_remove_disk= raid5_remove_disk, 6734 .spare_active = raid5_spare_active, 6735 .sync_request = sync_request, 6736 .resize = raid5_resize, 6737 .size = raid5_size, 6738 .check_reshape = raid6_check_reshape, 6739 .start_reshape = raid5_start_reshape, 6740 .finish_reshape = raid5_finish_reshape, 6741 .quiesce = raid5_quiesce, 6742 .takeover = raid6_takeover, 6743 }; 6744 static struct md_personality raid5_personality = 6745 { 6746 .name = "raid5", 6747 .level = 5, 6748 .owner = THIS_MODULE, 6749 .make_request = make_request, 6750 .run = run, 6751 .stop = stop, 6752 .status = status, 6753 .error_handler = error, 6754 .hot_add_disk = raid5_add_disk, 6755 .hot_remove_disk= raid5_remove_disk, 6756 .spare_active = raid5_spare_active, 6757 .sync_request = sync_request, 6758 .resize = raid5_resize, 6759 .size = raid5_size, 6760 .check_reshape = raid5_check_reshape, 6761 .start_reshape = raid5_start_reshape, 6762 .finish_reshape = raid5_finish_reshape, 6763 .quiesce = 
raid5_quiesce, 6764 .takeover = raid5_takeover, 6765 }; 6766 6767 static struct md_personality raid4_personality = 6768 { 6769 .name = "raid4", 6770 .level = 4, 6771 .owner = THIS_MODULE, 6772 .make_request = make_request, 6773 .run = run, 6774 .stop = stop, 6775 .status = status, 6776 .error_handler = error, 6777 .hot_add_disk = raid5_add_disk, 6778 .hot_remove_disk= raid5_remove_disk, 6779 .spare_active = raid5_spare_active, 6780 .sync_request = sync_request, 6781 .resize = raid5_resize, 6782 .size = raid5_size, 6783 .check_reshape = raid5_check_reshape, 6784 .start_reshape = raid5_start_reshape, 6785 .finish_reshape = raid5_finish_reshape, 6786 .quiesce = raid5_quiesce, 6787 .takeover = raid4_takeover, 6788 }; 6789 6790 static int __init raid5_init(void) 6791 { 6792 raid5_wq = alloc_workqueue("raid5wq", 6793 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 6794 if (!raid5_wq) 6795 return -ENOMEM; 6796 register_md_personality(&raid6_personality); 6797 register_md_personality(&raid5_personality); 6798 register_md_personality(&raid4_personality); 6799 return 0; 6800 } 6801 6802 static void raid5_exit(void) 6803 { 6804 unregister_md_personality(&raid6_personality); 6805 unregister_md_personality(&raid5_personality); 6806 unregister_md_personality(&raid4_personality); 6807 destroy_workqueue(raid5_wq); 6808 } 6809 6810 module_init(raid5_init); 6811 module_exit(raid5_exit); 6812 MODULE_LICENSE("GPL"); 6813 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 6814 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 6815 MODULE_ALIAS("md-raid5"); 6816 MODULE_ALIAS("md-raid4"); 6817 MODULE_ALIAS("md-level-5"); 6818 MODULE_ALIAS("md-level-4"); 6819 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 6820 MODULE_ALIAS("md-raid6"); 6821 MODULE_ALIAS("md-level-6"); 6822 6823 /* This used to be two separate modules, they were: */ 6824 MODULE_ALIAS("raid5"); 6825 MODULE_ALIAS("raid6"); 6826
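
/*
 * The blocks below are standalone userspace sketches added for illustration
 * of some of the arithmetic used above; they are not part of the driver and
 * are kept under "#if 0" so they cannot affect a build.  This first one
 * mirrors the chunk-size rule enforced in setup_conf(): the chunk, given in
 * 512-byte sectors, must be non-zero, describe a whole number of pages, and
 * be a power of two.  The ex_* names, the 4096-byte page size and the sample
 * values are assumptions made only for the example.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

#define EX_PAGE_SIZE 4096UL	/* assumed page size for the example */

static bool ex_is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

/* same three tests as the chunk check in setup_conf() */
static bool ex_chunk_ok(unsigned int chunk_sectors)
{
	if (!chunk_sectors)
		return false;
	if (((unsigned long)chunk_sectors << 9) % EX_PAGE_SIZE)
		return false;
	return ex_is_power_of_2(chunk_sectors);
}

int main(void)
{
	unsigned int samples[] = { 0, 7, 8, 96, 128, 1024 };	/* sectors */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%4u sectors (%7lu bytes): %s\n", samples[i],
		       (unsigned long)samples[i] << 9,
		       ex_chunk_ok(samples[i]) ? "valid" : "rejected");
	return 0;
}
#endif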
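
/*
 * A simplified model of the reshape_position checks made in run(): the
 * recorded position must sit on a stripe boundary of the new geometry, and
 * (when the device count changes) the stripe we would write next must still
 * be behind the stripe we would read next.  This sketch ignores
 * min_offset_diff, reshape_backwards and the in-place (delta_disks == 0)
 * case; the ex_* names and the sample geometry are assumptions for the
 * example only.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

struct ex_geom {
	uint64_t reshape_position;	/* sectors */
	unsigned int chunk_sectors;	/* old chunk */
	unsigned int new_chunk_sectors;	/* new chunk */
	int raid_disks;			/* new disk count */
	int delta_disks;		/* new - old */
	int max_degraded;		/* 1 for raid4/5, 2 for raid6 */
};

static int ex_check_reshape_pos(const struct ex_geom *g)
{
	int old_disks = g->raid_disks - g->delta_disks;
	uint64_t new_stripe_sectors =
		(uint64_t)g->new_chunk_sectors * (g->raid_disks - g->max_degraded);
	uint64_t old_stripe_sectors =
		(uint64_t)g->chunk_sectors * (old_disks - g->max_degraded);
	uint64_t here_new, here_old;

	if (g->reshape_position % new_stripe_sectors)
		return -1;	/* not on a stripe boundary of the new layout */
	here_new = g->reshape_position / new_stripe_sectors;	/* stripe we write next */
	here_old = g->reshape_position / old_stripe_sectors;	/* stripe we read next */

	if (g->delta_disks != 0 &&
	    here_new * g->new_chunk_sectors >= here_old * g->chunk_sectors)
		return -2;	/* writes would catch up with reads */
	return 0;
}

int main(void)
{
	/* grow a 4-disk raid5 to 5 disks, 512K chunks (1024 sectors) */
	struct ex_geom grow = {
		.reshape_position	= 12288,
		.chunk_sectors		= 1024,
		.new_chunk_sectors	= 1024,
		.raid_disks		= 5,
		.delta_disks		= 1,
		.max_degraded		= 1,
	};
	struct ex_geom bad = grow;

	bad.reshape_position = 10240;	/* not a multiple of 4 * 1024 sectors */

	printf("position 12288: %d\n", ex_check_reshape_pos(&grow));	/* 0: usable */
	printf("position 10240: %d\n", ex_check_reshape_pos(&bad));	/* -1: rejected */
	return 0;
}
#endif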
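
/*
 * The read-ahead sizing done in run() and end_reshape(): the window is made
 * large enough to cover two whole data stripes, i.e. 2 * data_disks *
 * chunk_size, expressed in pages.  Sketch only; the ex_* names, the 4096-byte
 * page size and the sample array are assumptions for the example.
 */
#if 0
#include <stdio.h>

#define EX_PAGE_SIZE 4096UL	/* assumed page size for the example */

static unsigned long ex_ra_pages(int raid_disks, int max_degraded,
				 unsigned int chunk_sectors)
{
	int data_disks = raid_disks - max_degraded;
	unsigned long chunk_pages = ((unsigned long)chunk_sectors << 9) / EX_PAGE_SIZE;

	return 2 * data_disks * chunk_pages;	/* two full data stripes */
}

int main(void)
{
	/* 6-disk raid6 (4 data disks), 512K chunk -> 2 * 4 * 128 = 1024 pages (4 MiB) */
	printf("ra_pages = %lu\n", ex_ra_pages(6, 2, 1024));
	return 0;
}
#endif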
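
/*
 * The power-of-two rounding used when setting discard_alignment and
 * discard_granularity in run(): setting every bit below the top bit and
 * adding one yields the next power of two, so the loop finishes in a single
 * pass unless the value is already a power of two.  The ex_* names and the
 * sample stripe sizes are assumptions for the example.
 */
#if 0
#include <stdio.h>

static unsigned long ex_roundup_pow2(unsigned long stripe)
{
	/* same loop as the discard setup above */
	while ((stripe - 1) & stripe)
		stripe = (stripe | (stripe - 1)) + 1;
	return stripe;
}

int main(void)
{
	unsigned long samples[] = { 4096, 196608, 393216, 1048576 };	/* bytes */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%8lu -> %8lu\n", samples[i], ex_roundup_pow2(samples[i]));
	return 0;
}
#endif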
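
/*
 * The shape of the /proc/mdstat summary produced by status(): chunk size in
 * KiB (sectors / 2), "[raid_disks/working]" and one 'U' (in sync) or '_'
 * (missing or out of sync) per slot.  Sketch from plain arrays; the ex_*
 * names and the sample values (including the algorithm number) are
 * assumptions for the example.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static void ex_print_status(int level, unsigned int chunk_sectors, int layout,
			    int raid_disks, int degraded, const bool *in_sync)
{
	printf(" level %d, %dk chunk, algorithm %d", level, chunk_sectors / 2, layout);
	printf(" [%d/%d] [", raid_disks, raid_disks - degraded);
	for (int i = 0; i < raid_disks; i++)
		putchar(in_sync[i] ? 'U' : '_');
	printf("]\n");
}

int main(void)
{
	const bool in_sync[5] = { true, true, false, true, true };

	/* prints: " level 5, 512k chunk, algorithm 2 [5/4] [UU_UU]" */
	ex_print_status(5, 1024, 2, 5, 1, in_sync);
	return 0;
}
#endif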
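
/*
 * The sizing rule applied by check_stripe_cache(): a reshape needs at least
 * four chunks' worth of stripe_heads for both the old and the new chunk
 * size, which is why the default cache of 256 4K stripe_heads covers chunks
 * up to 256K.  The ex_* names and the assumed 4096-byte STRIPE_SIZE are for
 * the example only.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

#define EX_STRIPE_SIZE 4096UL	/* assumed STRIPE_SIZE (one page) */

static bool ex_cache_big_enough(unsigned int chunk_sectors,
				unsigned int new_chunk_sectors,
				int max_nr_stripes)
{
	unsigned long need_old =
		((unsigned long)chunk_sectors << 9) / EX_STRIPE_SIZE * 4;
	unsigned long need_new =
		((unsigned long)new_chunk_sectors << 9) / EX_STRIPE_SIZE * 4;

	return need_old <= (unsigned long)max_nr_stripes &&
	       need_new <= (unsigned long)max_nr_stripes;
}

int main(void)
{
	/* 256K chunk needs 64 * 4 = 256 stripe_heads: exactly the default */
	printf("256K chunk, 256 stripes: %d\n", ex_cache_big_enough(512, 512, 256));
	/* 512K chunk needs 512 stripe_heads: the cache must be grown first */
	printf("512K chunk, 256 stripes: %d\n", ex_cache_big_enough(1024, 1024, 256));
	return 0;
}
#endif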
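
/*
 * The chunk selection made by raid5_takeover_raid1(): start at 64K (128
 * sectors) and halve until the chunk size divides the array size exactly,
 * giving up if that drops below one stripe.  The ex_* names, the assumed
 * 4096-byte STRIPE_SIZE and the sample array sizes are assumptions for the
 * example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_STRIPE_SIZE 4096UL	/* assumed STRIPE_SIZE (one page) */

static int ex_takeover_chunksect(uint64_t array_sectors)
{
	int chunksect = 64 * 2;		/* 64K expressed in 512-byte sectors */

	/* the array must be an exact multiple of the chunk size */
	while (chunksect && (array_sectors & (chunksect - 1)))
		chunksect >>= 1;

	if (((uint64_t)chunksect << 9) < EX_STRIPE_SIZE)
		return 0;		/* array size does not allow a suitable chunk */
	return chunksect;
}

int main(void)
{
	printf("1 GiB array       -> %d sectors\n", ex_takeover_chunksect(2097152));
	printf("1 GiB + 16K array -> %d sectors\n", ex_takeover_chunksect(2097184));
	printf("odd-sized array   -> %d sectors\n", ex_takeover_chunksect(1000001));
	return 0;
}
#endif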