1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 
44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include <linux/nodemask.h> 57 #include <trace/events/block.h> 58 59 #include "md.h" 60 #include "raid5.h" 61 #include "raid0.h" 62 #include "bitmap.h" 63 64 #define cpu_to_group(cpu) cpu_to_node(cpu) 65 #define ANY_GROUP NUMA_NO_NODE 66 67 static bool devices_handle_discard_safely = false; 68 module_param(devices_handle_discard_safely, bool, 0644); 69 MODULE_PARM_DESC(devices_handle_discard_safely, 70 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 71 static struct workqueue_struct *raid5_wq; 72 /* 73 * Stripe cache 74 */ 75 76 #define NR_STRIPES 256 77 #define STRIPE_SIZE PAGE_SIZE 78 #define STRIPE_SHIFT (PAGE_SHIFT - 9) 79 #define STRIPE_SECTORS (STRIPE_SIZE>>9) 80 #define IO_THRESHOLD 1 81 #define BYPASS_THRESHOLD 1 82 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 83 #define HASH_MASK (NR_HASH - 1) 84 #define MAX_STRIPE_BATCH 8 85 86 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 87 { 88 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 89 return &conf->stripe_hashtbl[hash]; 90 } 91 92 static inline int stripe_hash_locks_hash(sector_t sect) 93 { 94 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; 95 } 96 97 static inline void lock_device_hash_lock(struct r5conf *conf, int hash) 98 { 99 spin_lock_irq(conf->hash_locks + hash); 100 spin_lock(&conf->device_lock); 101 } 102 103 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) 104 { 105 spin_unlock(&conf->device_lock); 106 spin_unlock_irq(conf->hash_locks + hash); 107 } 108 109 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) 110 { 111 int i; 112 local_irq_disable(); 113 spin_lock(conf->hash_locks); 114 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 115 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); 116 spin_lock(&conf->device_lock); 117 } 118 119 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) 120 { 121 int i; 122 spin_unlock(&conf->device_lock); 123 for (i = NR_STRIPE_HASH_LOCKS; i; i--) 124 spin_unlock(conf->hash_locks + i - 1); 125 local_irq_enable(); 126 } 127 128 /* bio's attached to a stripe+device for I/O are linked together in bi_sector 129 * order without overlap. There may be several bio's per stripe+device, and 130 * a bio could span several devices. 131 * When walking this list for a particular stripe+device, we must never proceed 132 * beyond a bio that extends past this device, as the next bio might no longer 133 * be valid. 
134 * This function is used to determine the 'next' bio in the list, given the sector 135 * of the current stripe+device 136 */ 137 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 138 { 139 int sectors = bio_sectors(bio); 140 if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) 141 return bio->bi_next; 142 else 143 return NULL; 144 } 145 146 /* 147 * We maintain a biased count of active stripes in the bottom 16 bits of 148 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 149 */ 150 static inline int raid5_bi_processed_stripes(struct bio *bio) 151 { 152 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 153 return (atomic_read(segments) >> 16) & 0xffff; 154 } 155 156 static inline int raid5_dec_bi_active_stripes(struct bio *bio) 157 { 158 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 159 return atomic_sub_return(1, segments) & 0xffff; 160 } 161 162 static inline void raid5_inc_bi_active_stripes(struct bio *bio) 163 { 164 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 165 atomic_inc(segments); 166 } 167 168 static inline void raid5_set_bi_processed_stripes(struct bio *bio, 169 unsigned int cnt) 170 { 171 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 172 int old, new; 173 174 do { 175 old = atomic_read(segments); 176 new = (old & 0xffff) | (cnt << 16); 177 } while (atomic_cmpxchg(segments, old, new) != old); 178 } 179 180 static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) 181 { 182 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 183 atomic_set(segments, cnt); 184 } 185 186 /* Find first data disk in a raid6 stripe */ 187 static inline int raid6_d0(struct stripe_head *sh) 188 { 189 if (sh->ddf_layout) 190 /* ddf always start from first device */ 191 return 0; 192 /* md starts just after Q block */ 193 if (sh->qd_idx == sh->disks - 1) 194 return 0; 195 else 196 return sh->qd_idx + 1; 197 } 198 static inline int raid6_next_disk(int disk, int raid_disks) 199 { 200 disk++; 201 return (disk < raid_disks) ? disk : 0; 202 } 203 204 /* When walking through the disks in a raid5, starting at raid6_d0, 205 * We need to map each disk to a 'slot', where the data disks are slot 206 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 207 * is raid_disks-1. This help does that mapping. 
208 */ 209 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 210 int *count, int syndrome_disks) 211 { 212 int slot = *count; 213 214 if (sh->ddf_layout) 215 (*count)++; 216 if (idx == sh->pd_idx) 217 return syndrome_disks; 218 if (idx == sh->qd_idx) 219 return syndrome_disks + 1; 220 if (!sh->ddf_layout) 221 (*count)++; 222 return slot; 223 } 224 225 static void return_io(struct bio *return_bi) 226 { 227 struct bio *bi = return_bi; 228 while (bi) { 229 230 return_bi = bi->bi_next; 231 bi->bi_next = NULL; 232 bi->bi_iter.bi_size = 0; 233 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 234 bi, 0); 235 bio_endio(bi, 0); 236 bi = return_bi; 237 } 238 } 239 240 static void print_raid5_conf (struct r5conf *conf); 241 242 static int stripe_operations_active(struct stripe_head *sh) 243 { 244 return sh->check_state || sh->reconstruct_state || 245 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 246 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 247 } 248 249 static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 250 { 251 struct r5conf *conf = sh->raid_conf; 252 struct r5worker_group *group; 253 int thread_cnt; 254 int i, cpu = sh->cpu; 255 256 if (!cpu_online(cpu)) { 257 cpu = cpumask_any(cpu_online_mask); 258 sh->cpu = cpu; 259 } 260 261 if (list_empty(&sh->lru)) { 262 struct r5worker_group *group; 263 group = conf->worker_groups + cpu_to_group(cpu); 264 list_add_tail(&sh->lru, &group->handle_list); 265 group->stripes_cnt++; 266 sh->group = group; 267 } 268 269 if (conf->worker_cnt_per_group == 0) { 270 md_wakeup_thread(conf->mddev->thread); 271 return; 272 } 273 274 group = conf->worker_groups + cpu_to_group(sh->cpu); 275 276 group->workers[0].working = true; 277 /* at least one worker should run to avoid race */ 278 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 279 280 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 281 /* wakeup more workers */ 282 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 283 if (group->workers[i].working == false) { 284 group->workers[i].working = true; 285 queue_work_on(sh->cpu, raid5_wq, 286 &group->workers[i].work); 287 thread_cnt--; 288 } 289 } 290 } 291 292 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, 293 struct list_head *temp_inactive_list) 294 { 295 BUG_ON(!list_empty(&sh->lru)); 296 BUG_ON(atomic_read(&conf->active_stripes)==0); 297 if (test_bit(STRIPE_HANDLE, &sh->state)) { 298 if (test_bit(STRIPE_DELAYED, &sh->state) && 299 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 300 list_add_tail(&sh->lru, &conf->delayed_list); 301 if (atomic_read(&conf->preread_active_stripes) 302 < IO_THRESHOLD) 303 md_wakeup_thread(conf->mddev->thread); 304 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 305 sh->bm_seq - conf->seq_write > 0) 306 list_add_tail(&sh->lru, &conf->bitmap_list); 307 else { 308 clear_bit(STRIPE_DELAYED, &sh->state); 309 clear_bit(STRIPE_BIT_DELAY, &sh->state); 310 if (conf->worker_cnt_per_group == 0) { 311 list_add_tail(&sh->lru, &conf->handle_list); 312 } else { 313 raid5_wakeup_stripe_thread(sh); 314 return; 315 } 316 } 317 md_wakeup_thread(conf->mddev->thread); 318 } else { 319 BUG_ON(stripe_operations_active(sh)); 320 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 321 if (atomic_dec_return(&conf->preread_active_stripes) 322 < IO_THRESHOLD) 323 md_wakeup_thread(conf->mddev->thread); 324 atomic_dec(&conf->active_stripes); 325 if (!test_bit(STRIPE_EXPANDING, &sh->state)) 326 list_add_tail(&sh->lru, temp_inactive_list); 327 } 328 } 329 330 static 
void __release_stripe(struct r5conf *conf, struct stripe_head *sh, 331 struct list_head *temp_inactive_list) 332 { 333 if (atomic_dec_and_test(&sh->count)) 334 do_release_stripe(conf, sh, temp_inactive_list); 335 } 336 337 /* 338 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list 339 * 340 * Be careful: Only one task can add/delete stripes from temp_inactive_list at 341 * given time. Adding stripes only takes device lock, while deleting stripes 342 * only takes hash lock. 343 */ 344 static void release_inactive_stripe_list(struct r5conf *conf, 345 struct list_head *temp_inactive_list, 346 int hash) 347 { 348 int size; 349 bool do_wakeup = false; 350 unsigned long flags; 351 352 if (hash == NR_STRIPE_HASH_LOCKS) { 353 size = NR_STRIPE_HASH_LOCKS; 354 hash = NR_STRIPE_HASH_LOCKS - 1; 355 } else 356 size = 1; 357 while (size) { 358 struct list_head *list = &temp_inactive_list[size - 1]; 359 360 /* 361 * We don't hold any lock here yet, get_active_stripe() might 362 * remove stripes from the list 363 */ 364 if (!list_empty_careful(list)) { 365 spin_lock_irqsave(conf->hash_locks + hash, flags); 366 if (list_empty(conf->inactive_list + hash) && 367 !list_empty(list)) 368 atomic_dec(&conf->empty_inactive_list_nr); 369 list_splice_tail_init(list, conf->inactive_list + hash); 370 do_wakeup = true; 371 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 372 } 373 size--; 374 hash--; 375 } 376 377 if (do_wakeup) { 378 wake_up(&conf->wait_for_stripe); 379 if (conf->retry_read_aligned) 380 md_wakeup_thread(conf->mddev->thread); 381 } 382 } 383 384 /* should hold conf->device_lock already */ 385 static int release_stripe_list(struct r5conf *conf, 386 struct list_head *temp_inactive_list) 387 { 388 struct stripe_head *sh; 389 int count = 0; 390 struct llist_node *head; 391 392 head = llist_del_all(&conf->released_stripes); 393 head = llist_reverse_order(head); 394 while (head) { 395 int hash; 396 397 sh = llist_entry(head, struct stripe_head, release_list); 398 head = llist_next(head); 399 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 400 smp_mb(); 401 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 402 /* 403 * Don't worry the bit is set here, because if the bit is set 404 * again, the count is always > 1. This is true for 405 * STRIPE_ON_UNPLUG_LIST bit too. 406 */ 407 hash = sh->hash_lock_index; 408 __release_stripe(conf, sh, &temp_inactive_list[hash]); 409 count++; 410 } 411 412 return count; 413 } 414 415 static void release_stripe(struct stripe_head *sh) 416 { 417 struct r5conf *conf = sh->raid_conf; 418 unsigned long flags; 419 struct list_head list; 420 int hash; 421 bool wakeup; 422 423 /* Avoid release_list until the last reference. 
424 */ 425 if (atomic_add_unless(&sh->count, -1, 1)) 426 return; 427 428 if (unlikely(!conf->mddev->thread) || 429 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 430 goto slow_path; 431 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 432 if (wakeup) 433 md_wakeup_thread(conf->mddev->thread); 434 return; 435 slow_path: 436 local_irq_save(flags); 437 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 438 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 439 INIT_LIST_HEAD(&list); 440 hash = sh->hash_lock_index; 441 do_release_stripe(conf, sh, &list); 442 spin_unlock(&conf->device_lock); 443 release_inactive_stripe_list(conf, &list, hash); 444 } 445 local_irq_restore(flags); 446 } 447 448 static inline void remove_hash(struct stripe_head *sh) 449 { 450 pr_debug("remove_hash(), stripe %llu\n", 451 (unsigned long long)sh->sector); 452 453 hlist_del_init(&sh->hash); 454 } 455 456 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 457 { 458 struct hlist_head *hp = stripe_hash(conf, sh->sector); 459 460 pr_debug("insert_hash(), stripe %llu\n", 461 (unsigned long long)sh->sector); 462 463 hlist_add_head(&sh->hash, hp); 464 } 465 466 467 /* find an idle stripe, make sure it is unhashed, and return it. */ 468 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 469 { 470 struct stripe_head *sh = NULL; 471 struct list_head *first; 472 473 if (list_empty(conf->inactive_list + hash)) 474 goto out; 475 first = (conf->inactive_list + hash)->next; 476 sh = list_entry(first, struct stripe_head, lru); 477 list_del_init(first); 478 remove_hash(sh); 479 atomic_inc(&conf->active_stripes); 480 BUG_ON(hash != sh->hash_lock_index); 481 if (list_empty(conf->inactive_list + hash)) 482 atomic_inc(&conf->empty_inactive_list_nr); 483 out: 484 return sh; 485 } 486 487 static void shrink_buffers(struct stripe_head *sh) 488 { 489 struct page *p; 490 int i; 491 int num = sh->raid_conf->pool_size; 492 493 for (i = 0; i < num ; i++) { 494 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 495 p = sh->dev[i].page; 496 if (!p) 497 continue; 498 sh->dev[i].page = NULL; 499 put_page(p); 500 } 501 } 502 503 static int grow_buffers(struct stripe_head *sh) 504 { 505 int i; 506 int num = sh->raid_conf->pool_size; 507 508 for (i = 0; i < num; i++) { 509 struct page *page; 510 511 if (!(page = alloc_page(GFP_KERNEL))) { 512 return 1; 513 } 514 sh->dev[i].page = page; 515 sh->dev[i].orig_page = page; 516 } 517 return 0; 518 } 519 520 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 521 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 522 struct stripe_head *sh); 523 524 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 525 { 526 struct r5conf *conf = sh->raid_conf; 527 int i, seq; 528 529 BUG_ON(atomic_read(&sh->count) != 0); 530 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 531 BUG_ON(stripe_operations_active(sh)); 532 533 pr_debug("init_stripe called, stripe %llu\n", 534 (unsigned long long)sh->sector); 535 536 remove_hash(sh); 537 retry: 538 seq = read_seqcount_begin(&conf->gen_lock); 539 sh->generation = conf->generation - previous; 540 sh->disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 541 sh->sector = sector; 542 stripe_set_idx(sector, conf, previous, sh); 543 sh->state = 0; 544 545 546 for (i = sh->disks; i--; ) { 547 struct r5dev *dev = &sh->dev[i]; 548 549 if (dev->toread || dev->read || dev->towrite || dev->written || 550 test_bit(R5_LOCKED, &dev->flags)) { 551 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 552 (unsigned long long)sh->sector, i, dev->toread, 553 dev->read, dev->towrite, dev->written, 554 test_bit(R5_LOCKED, &dev->flags)); 555 WARN_ON(1); 556 } 557 dev->flags = 0; 558 raid5_build_block(sh, i, previous); 559 } 560 if (read_seqcount_retry(&conf->gen_lock, seq)) 561 goto retry; 562 insert_hash(conf, sh); 563 sh->cpu = smp_processor_id(); 564 } 565 566 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 567 short generation) 568 { 569 struct stripe_head *sh; 570 571 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 572 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 573 if (sh->sector == sector && sh->generation == generation) 574 return sh; 575 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 576 return NULL; 577 } 578 579 /* 580 * Need to check if array has failed when deciding whether to: 581 * - start an array 582 * - remove non-faulty devices 583 * - add a spare 584 * - allow a reshape 585 * This determination is simple when no reshape is happening. 586 * However if there is a reshape, we need to carefully check 587 * both the before and after sections. 588 * This is because some failed devices may only affect one 589 * of the two sections, and some non-in_sync devices may 590 * be insync in the section most affected by failed devices. 591 */ 592 static int calc_degraded(struct r5conf *conf) 593 { 594 int degraded, degraded2; 595 int i; 596 597 rcu_read_lock(); 598 degraded = 0; 599 for (i = 0; i < conf->previous_raid_disks; i++) { 600 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 601 if (rdev && test_bit(Faulty, &rdev->flags)) 602 rdev = rcu_dereference(conf->disks[i].replacement); 603 if (!rdev || test_bit(Faulty, &rdev->flags)) 604 degraded++; 605 else if (test_bit(In_sync, &rdev->flags)) 606 ; 607 else 608 /* not in-sync or faulty. 609 * If the reshape increases the number of devices, 610 * this is being recovered by the reshape, so 611 * this 'previous' section is not in_sync. 612 * If the number of devices is being reduced however, 613 * the device can only be part of the array if 614 * we are reverting a reshape, so this section will 615 * be in-sync. 616 */ 617 if (conf->raid_disks >= conf->previous_raid_disks) 618 degraded++; 619 } 620 rcu_read_unlock(); 621 if (conf->raid_disks == conf->previous_raid_disks) 622 return degraded; 623 rcu_read_lock(); 624 degraded2 = 0; 625 for (i = 0; i < conf->raid_disks; i++) { 626 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 627 if (rdev && test_bit(Faulty, &rdev->flags)) 628 rdev = rcu_dereference(conf->disks[i].replacement); 629 if (!rdev || test_bit(Faulty, &rdev->flags)) 630 degraded2++; 631 else if (test_bit(In_sync, &rdev->flags)) 632 ; 633 else 634 /* not in-sync or faulty. 635 * If reshape increases the number of devices, this 636 * section has already been recovered, else it 637 * almost certainly hasn't. 
638 */ 639 if (conf->raid_disks <= conf->previous_raid_disks) 640 degraded2++; 641 } 642 rcu_read_unlock(); 643 if (degraded2 > degraded) 644 return degraded2; 645 return degraded; 646 } 647 648 static int has_failed(struct r5conf *conf) 649 { 650 int degraded; 651 652 if (conf->mddev->reshape_position == MaxSector) 653 return conf->mddev->degraded > conf->max_degraded; 654 655 degraded = calc_degraded(conf); 656 if (degraded > conf->max_degraded) 657 return 1; 658 return 0; 659 } 660 661 static struct stripe_head * 662 get_active_stripe(struct r5conf *conf, sector_t sector, 663 int previous, int noblock, int noquiesce) 664 { 665 struct stripe_head *sh; 666 int hash = stripe_hash_locks_hash(sector); 667 668 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 669 670 spin_lock_irq(conf->hash_locks + hash); 671 672 do { 673 wait_event_lock_irq(conf->wait_for_stripe, 674 conf->quiesce == 0 || noquiesce, 675 *(conf->hash_locks + hash)); 676 sh = __find_stripe(conf, sector, conf->generation - previous); 677 if (!sh) { 678 if (!conf->inactive_blocked) 679 sh = get_free_stripe(conf, hash); 680 if (noblock && sh == NULL) 681 break; 682 if (!sh) { 683 conf->inactive_blocked = 1; 684 wait_event_lock_irq( 685 conf->wait_for_stripe, 686 !list_empty(conf->inactive_list + hash) && 687 (atomic_read(&conf->active_stripes) 688 < (conf->max_nr_stripes * 3 / 4) 689 || !conf->inactive_blocked), 690 *(conf->hash_locks + hash)); 691 conf->inactive_blocked = 0; 692 } else { 693 init_stripe(sh, sector, previous); 694 atomic_inc(&sh->count); 695 } 696 } else if (!atomic_inc_not_zero(&sh->count)) { 697 spin_lock(&conf->device_lock); 698 if (!atomic_read(&sh->count)) { 699 if (!test_bit(STRIPE_HANDLE, &sh->state)) 700 atomic_inc(&conf->active_stripes); 701 BUG_ON(list_empty(&sh->lru) && 702 !test_bit(STRIPE_EXPANDING, &sh->state)); 703 list_del_init(&sh->lru); 704 if (sh->group) { 705 sh->group->stripes_cnt--; 706 sh->group = NULL; 707 } 708 } 709 atomic_inc(&sh->count); 710 spin_unlock(&conf->device_lock); 711 } 712 } while (sh == NULL); 713 714 spin_unlock_irq(conf->hash_locks + hash); 715 return sh; 716 } 717 718 /* Determine if 'data_offset' or 'new_data_offset' should be used 719 * in this stripe_head. 720 */ 721 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 722 { 723 sector_t progress = conf->reshape_progress; 724 /* Need a memory barrier to make sure we see the value 725 * of conf->generation, or ->data_offset that was set before 726 * reshape_progress was updated. 727 */ 728 smp_rmb(); 729 if (progress == MaxSector) 730 return 0; 731 if (sh->generation == conf->generation - 1) 732 return 0; 733 /* We are in a reshape, and this is a new-generation stripe, 734 * so use new_data_offset. 
735 */ 736 return 1; 737 } 738 739 static void 740 raid5_end_read_request(struct bio *bi, int error); 741 static void 742 raid5_end_write_request(struct bio *bi, int error); 743 744 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 745 { 746 struct r5conf *conf = sh->raid_conf; 747 int i, disks = sh->disks; 748 749 might_sleep(); 750 751 for (i = disks; i--; ) { 752 int rw; 753 int replace_only = 0; 754 struct bio *bi, *rbi; 755 struct md_rdev *rdev, *rrdev = NULL; 756 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 757 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 758 rw = WRITE_FUA; 759 else 760 rw = WRITE; 761 if (test_bit(R5_Discard, &sh->dev[i].flags)) 762 rw |= REQ_DISCARD; 763 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 764 rw = READ; 765 else if (test_and_clear_bit(R5_WantReplace, 766 &sh->dev[i].flags)) { 767 rw = WRITE; 768 replace_only = 1; 769 } else 770 continue; 771 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 772 rw |= REQ_SYNC; 773 774 bi = &sh->dev[i].req; 775 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 776 777 rcu_read_lock(); 778 rrdev = rcu_dereference(conf->disks[i].replacement); 779 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 780 rdev = rcu_dereference(conf->disks[i].rdev); 781 if (!rdev) { 782 rdev = rrdev; 783 rrdev = NULL; 784 } 785 if (rw & WRITE) { 786 if (replace_only) 787 rdev = NULL; 788 if (rdev == rrdev) 789 /* We raced and saw duplicates */ 790 rrdev = NULL; 791 } else { 792 if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) 793 rdev = rrdev; 794 rrdev = NULL; 795 } 796 797 if (rdev && test_bit(Faulty, &rdev->flags)) 798 rdev = NULL; 799 if (rdev) 800 atomic_inc(&rdev->nr_pending); 801 if (rrdev && test_bit(Faulty, &rrdev->flags)) 802 rrdev = NULL; 803 if (rrdev) 804 atomic_inc(&rrdev->nr_pending); 805 rcu_read_unlock(); 806 807 /* We have already checked bad blocks for reads. Now 808 * need to check for writes. We never accept write errors 809 * on the replacement, so we don't to check rrdev. 810 */ 811 while ((rw & WRITE) && rdev && 812 test_bit(WriteErrorSeen, &rdev->flags)) { 813 sector_t first_bad; 814 int bad_sectors; 815 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 816 &first_bad, &bad_sectors); 817 if (!bad) 818 break; 819 820 if (bad < 0) { 821 set_bit(BlockedBadBlocks, &rdev->flags); 822 if (!conf->mddev->external && 823 conf->mddev->flags) { 824 /* It is very unlikely, but we might 825 * still need to write out the 826 * bad block log - better give it 827 * a chance*/ 828 md_check_recovery(conf->mddev); 829 } 830 /* 831 * Because md_wait_for_blocked_rdev 832 * will dec nr_pending, we must 833 * increment it first. 834 */ 835 atomic_inc(&rdev->nr_pending); 836 md_wait_for_blocked_rdev(rdev, conf->mddev); 837 } else { 838 /* Acknowledged bad block - skip the write */ 839 rdev_dec_pending(rdev, conf->mddev); 840 rdev = NULL; 841 } 842 } 843 844 if (rdev) { 845 if (s->syncing || s->expanding || s->expanded 846 || s->replacing) 847 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 848 849 set_bit(STRIPE_IO_STARTED, &sh->state); 850 851 bio_reset(bi); 852 bi->bi_bdev = rdev->bdev; 853 bi->bi_rw = rw; 854 bi->bi_end_io = (rw & WRITE) 855 ? 
raid5_end_write_request 856 : raid5_end_read_request; 857 bi->bi_private = sh; 858 859 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 860 __func__, (unsigned long long)sh->sector, 861 bi->bi_rw, i); 862 atomic_inc(&sh->count); 863 if (use_new_offset(conf, sh)) 864 bi->bi_iter.bi_sector = (sh->sector 865 + rdev->new_data_offset); 866 else 867 bi->bi_iter.bi_sector = (sh->sector 868 + rdev->data_offset); 869 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 870 bi->bi_rw |= REQ_NOMERGE; 871 872 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 873 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 874 sh->dev[i].vec.bv_page = sh->dev[i].page; 875 bi->bi_vcnt = 1; 876 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 877 bi->bi_io_vec[0].bv_offset = 0; 878 bi->bi_iter.bi_size = STRIPE_SIZE; 879 /* 880 * If this is discard request, set bi_vcnt 0. We don't 881 * want to confuse SCSI because SCSI will replace payload 882 */ 883 if (rw & REQ_DISCARD) 884 bi->bi_vcnt = 0; 885 if (rrdev) 886 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 887 888 if (conf->mddev->gendisk) 889 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 890 bi, disk_devt(conf->mddev->gendisk), 891 sh->dev[i].sector); 892 generic_make_request(bi); 893 } 894 if (rrdev) { 895 if (s->syncing || s->expanding || s->expanded 896 || s->replacing) 897 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 898 899 set_bit(STRIPE_IO_STARTED, &sh->state); 900 901 bio_reset(rbi); 902 rbi->bi_bdev = rrdev->bdev; 903 rbi->bi_rw = rw; 904 BUG_ON(!(rw & WRITE)); 905 rbi->bi_end_io = raid5_end_write_request; 906 rbi->bi_private = sh; 907 908 pr_debug("%s: for %llu schedule op %ld on " 909 "replacement disc %d\n", 910 __func__, (unsigned long long)sh->sector, 911 rbi->bi_rw, i); 912 atomic_inc(&sh->count); 913 if (use_new_offset(conf, sh)) 914 rbi->bi_iter.bi_sector = (sh->sector 915 + rrdev->new_data_offset); 916 else 917 rbi->bi_iter.bi_sector = (sh->sector 918 + rrdev->data_offset); 919 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 920 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 921 sh->dev[i].rvec.bv_page = sh->dev[i].page; 922 rbi->bi_vcnt = 1; 923 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 924 rbi->bi_io_vec[0].bv_offset = 0; 925 rbi->bi_iter.bi_size = STRIPE_SIZE; 926 /* 927 * If this is discard request, set bi_vcnt 0. 
We don't 928 * want to confuse SCSI because SCSI will replace payload 929 */ 930 if (rw & REQ_DISCARD) 931 rbi->bi_vcnt = 0; 932 if (conf->mddev->gendisk) 933 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 934 rbi, disk_devt(conf->mddev->gendisk), 935 sh->dev[i].sector); 936 generic_make_request(rbi); 937 } 938 if (!rdev && !rrdev) { 939 if (rw & WRITE) 940 set_bit(STRIPE_DEGRADED, &sh->state); 941 pr_debug("skip op %ld on disc %d for sector %llu\n", 942 bi->bi_rw, i, (unsigned long long)sh->sector); 943 clear_bit(R5_LOCKED, &sh->dev[i].flags); 944 set_bit(STRIPE_HANDLE, &sh->state); 945 } 946 } 947 } 948 949 static struct dma_async_tx_descriptor * 950 async_copy_data(int frombio, struct bio *bio, struct page **page, 951 sector_t sector, struct dma_async_tx_descriptor *tx, 952 struct stripe_head *sh) 953 { 954 struct bio_vec bvl; 955 struct bvec_iter iter; 956 struct page *bio_page; 957 int page_offset; 958 struct async_submit_ctl submit; 959 enum async_tx_flags flags = 0; 960 961 if (bio->bi_iter.bi_sector >= sector) 962 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 963 else 964 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 965 966 if (frombio) 967 flags |= ASYNC_TX_FENCE; 968 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 969 970 bio_for_each_segment(bvl, bio, iter) { 971 int len = bvl.bv_len; 972 int clen; 973 int b_offset = 0; 974 975 if (page_offset < 0) { 976 b_offset = -page_offset; 977 page_offset += b_offset; 978 len -= b_offset; 979 } 980 981 if (len > 0 && page_offset + len > STRIPE_SIZE) 982 clen = STRIPE_SIZE - page_offset; 983 else 984 clen = len; 985 986 if (clen > 0) { 987 b_offset += bvl.bv_offset; 988 bio_page = bvl.bv_page; 989 if (frombio) { 990 if (sh->raid_conf->skip_copy && 991 b_offset == 0 && page_offset == 0 && 992 clen == STRIPE_SIZE) 993 *page = bio_page; 994 else 995 tx = async_memcpy(*page, bio_page, page_offset, 996 b_offset, clen, &submit); 997 } else 998 tx = async_memcpy(bio_page, *page, b_offset, 999 page_offset, clen, &submit); 1000 } 1001 /* chain the operations */ 1002 submit.depend_tx = tx; 1003 1004 if (clen < len) /* hit end of page */ 1005 break; 1006 page_offset += len; 1007 } 1008 1009 return tx; 1010 } 1011 1012 static void ops_complete_biofill(void *stripe_head_ref) 1013 { 1014 struct stripe_head *sh = stripe_head_ref; 1015 struct bio *return_bi = NULL; 1016 int i; 1017 1018 pr_debug("%s: stripe %llu\n", __func__, 1019 (unsigned long long)sh->sector); 1020 1021 /* clear completed biofills */ 1022 for (i = sh->disks; i--; ) { 1023 struct r5dev *dev = &sh->dev[i]; 1024 1025 /* acknowledge completion of a biofill operation */ 1026 /* and check if we need to reply to a read request, 1027 * new R5_Wantfill requests are held off until 1028 * !STRIPE_BIOFILL_RUN 1029 */ 1030 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1031 struct bio *rbi, *rbi2; 1032 1033 BUG_ON(!dev->read); 1034 rbi = dev->read; 1035 dev->read = NULL; 1036 while (rbi && rbi->bi_iter.bi_sector < 1037 dev->sector + STRIPE_SECTORS) { 1038 rbi2 = r5_next_bio(rbi, dev->sector); 1039 if (!raid5_dec_bi_active_stripes(rbi)) { 1040 rbi->bi_next = return_bi; 1041 return_bi = rbi; 1042 } 1043 rbi = rbi2; 1044 } 1045 } 1046 } 1047 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1048 1049 return_io(return_bi); 1050 1051 set_bit(STRIPE_HANDLE, &sh->state); 1052 release_stripe(sh); 1053 } 1054 1055 static void ops_run_biofill(struct stripe_head *sh) 1056 { 1057 struct dma_async_tx_descriptor *tx = NULL; 1058 struct async_submit_ctl submit; 1059 
int i; 1060 1061 pr_debug("%s: stripe %llu\n", __func__, 1062 (unsigned long long)sh->sector); 1063 1064 for (i = sh->disks; i--; ) { 1065 struct r5dev *dev = &sh->dev[i]; 1066 if (test_bit(R5_Wantfill, &dev->flags)) { 1067 struct bio *rbi; 1068 spin_lock_irq(&sh->stripe_lock); 1069 dev->read = rbi = dev->toread; 1070 dev->toread = NULL; 1071 spin_unlock_irq(&sh->stripe_lock); 1072 while (rbi && rbi->bi_iter.bi_sector < 1073 dev->sector + STRIPE_SECTORS) { 1074 tx = async_copy_data(0, rbi, &dev->page, 1075 dev->sector, tx, sh); 1076 rbi = r5_next_bio(rbi, dev->sector); 1077 } 1078 } 1079 } 1080 1081 atomic_inc(&sh->count); 1082 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1083 async_trigger_callback(&submit); 1084 } 1085 1086 static void mark_target_uptodate(struct stripe_head *sh, int target) 1087 { 1088 struct r5dev *tgt; 1089 1090 if (target < 0) 1091 return; 1092 1093 tgt = &sh->dev[target]; 1094 set_bit(R5_UPTODATE, &tgt->flags); 1095 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1096 clear_bit(R5_Wantcompute, &tgt->flags); 1097 } 1098 1099 static void ops_complete_compute(void *stripe_head_ref) 1100 { 1101 struct stripe_head *sh = stripe_head_ref; 1102 1103 pr_debug("%s: stripe %llu\n", __func__, 1104 (unsigned long long)sh->sector); 1105 1106 /* mark the computed target(s) as uptodate */ 1107 mark_target_uptodate(sh, sh->ops.target); 1108 mark_target_uptodate(sh, sh->ops.target2); 1109 1110 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1111 if (sh->check_state == check_state_compute_run) 1112 sh->check_state = check_state_compute_result; 1113 set_bit(STRIPE_HANDLE, &sh->state); 1114 release_stripe(sh); 1115 } 1116 1117 /* return a pointer to the address conversion region of the scribble buffer */ 1118 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1119 struct raid5_percpu *percpu) 1120 { 1121 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 1122 } 1123 1124 static struct dma_async_tx_descriptor * 1125 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1126 { 1127 int disks = sh->disks; 1128 struct page **xor_srcs = percpu->scribble; 1129 int target = sh->ops.target; 1130 struct r5dev *tgt = &sh->dev[target]; 1131 struct page *xor_dest = tgt->page; 1132 int count = 0; 1133 struct dma_async_tx_descriptor *tx; 1134 struct async_submit_ctl submit; 1135 int i; 1136 1137 pr_debug("%s: stripe %llu block: %d\n", 1138 __func__, (unsigned long long)sh->sector, target); 1139 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1140 1141 for (i = disks; i--; ) 1142 if (i != target) 1143 xor_srcs[count++] = sh->dev[i].page; 1144 1145 atomic_inc(&sh->count); 1146 1147 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1148 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 1149 if (unlikely(count == 1)) 1150 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1151 else 1152 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1153 1154 return tx; 1155 } 1156 1157 /* set_syndrome_sources - populate source buffers for gen_syndrome 1158 * @srcs - (struct page *) array of size sh->disks 1159 * @sh - stripe_head to parse 1160 * 1161 * Populates srcs in proper layout order for the stripe and returns the 1162 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1163 * destination buffer is recorded in srcs[count] and the Q destination 1164 * is recorded in srcs[count+1]]. 
1165 */ 1166 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 1167 { 1168 int disks = sh->disks; 1169 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1170 int d0_idx = raid6_d0(sh); 1171 int count; 1172 int i; 1173 1174 for (i = 0; i < disks; i++) 1175 srcs[i] = NULL; 1176 1177 count = 0; 1178 i = d0_idx; 1179 do { 1180 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1181 1182 srcs[slot] = sh->dev[i].page; 1183 i = raid6_next_disk(i, disks); 1184 } while (i != d0_idx); 1185 1186 return syndrome_disks; 1187 } 1188 1189 static struct dma_async_tx_descriptor * 1190 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1191 { 1192 int disks = sh->disks; 1193 struct page **blocks = percpu->scribble; 1194 int target; 1195 int qd_idx = sh->qd_idx; 1196 struct dma_async_tx_descriptor *tx; 1197 struct async_submit_ctl submit; 1198 struct r5dev *tgt; 1199 struct page *dest; 1200 int i; 1201 int count; 1202 1203 if (sh->ops.target < 0) 1204 target = sh->ops.target2; 1205 else if (sh->ops.target2 < 0) 1206 target = sh->ops.target; 1207 else 1208 /* we should only have one valid target */ 1209 BUG(); 1210 BUG_ON(target < 0); 1211 pr_debug("%s: stripe %llu block: %d\n", 1212 __func__, (unsigned long long)sh->sector, target); 1213 1214 tgt = &sh->dev[target]; 1215 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1216 dest = tgt->page; 1217 1218 atomic_inc(&sh->count); 1219 1220 if (target == qd_idx) { 1221 count = set_syndrome_sources(blocks, sh); 1222 blocks[count] = NULL; /* regenerating p is not necessary */ 1223 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1224 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1225 ops_complete_compute, sh, 1226 to_addr_conv(sh, percpu)); 1227 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1228 } else { 1229 /* Compute any data- or p-drive using XOR */ 1230 count = 0; 1231 for (i = disks; i-- ; ) { 1232 if (i == target || i == qd_idx) 1233 continue; 1234 blocks[count++] = sh->dev[i].page; 1235 } 1236 1237 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1238 NULL, ops_complete_compute, sh, 1239 to_addr_conv(sh, percpu)); 1240 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1241 } 1242 1243 return tx; 1244 } 1245 1246 static struct dma_async_tx_descriptor * 1247 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1248 { 1249 int i, count, disks = sh->disks; 1250 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 1251 int d0_idx = raid6_d0(sh); 1252 int faila = -1, failb = -1; 1253 int target = sh->ops.target; 1254 int target2 = sh->ops.target2; 1255 struct r5dev *tgt = &sh->dev[target]; 1256 struct r5dev *tgt2 = &sh->dev[target2]; 1257 struct dma_async_tx_descriptor *tx; 1258 struct page **blocks = percpu->scribble; 1259 struct async_submit_ctl submit; 1260 1261 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1262 __func__, (unsigned long long)sh->sector, target, target2); 1263 BUG_ON(target < 0 || target2 < 0); 1264 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1265 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1266 1267 /* we need to open-code set_syndrome_sources to handle the 1268 * slot number conversion for 'faila' and 'failb' 1269 */ 1270 for (i = 0; i < disks ; i++) 1271 blocks[i] = NULL; 1272 count = 0; 1273 i = d0_idx; 1274 do { 1275 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1276 1277 blocks[slot] = sh->dev[i].page; 1278 1279 if (i == target) 1280 faila = slot; 1281 if (i == target2) 1282 failb = slot; 1283 i = raid6_next_disk(i, disks); 1284 } while (i != d0_idx); 1285 1286 BUG_ON(faila == failb); 1287 if (failb < faila) 1288 swap(faila, failb); 1289 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1290 __func__, (unsigned long long)sh->sector, faila, failb); 1291 1292 atomic_inc(&sh->count); 1293 1294 if (failb == syndrome_disks+1) { 1295 /* Q disk is one of the missing disks */ 1296 if (faila == syndrome_disks) { 1297 /* Missing P+Q, just recompute */ 1298 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1299 ops_complete_compute, sh, 1300 to_addr_conv(sh, percpu)); 1301 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1302 STRIPE_SIZE, &submit); 1303 } else { 1304 struct page *dest; 1305 int data_target; 1306 int qd_idx = sh->qd_idx; 1307 1308 /* Missing D+Q: recompute D from P, then recompute Q */ 1309 if (target == qd_idx) 1310 data_target = target2; 1311 else 1312 data_target = target; 1313 1314 count = 0; 1315 for (i = disks; i-- ; ) { 1316 if (i == data_target || i == qd_idx) 1317 continue; 1318 blocks[count++] = sh->dev[i].page; 1319 } 1320 dest = sh->dev[data_target].page; 1321 init_async_submit(&submit, 1322 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1323 NULL, NULL, NULL, 1324 to_addr_conv(sh, percpu)); 1325 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1326 &submit); 1327 1328 count = set_syndrome_sources(blocks, sh); 1329 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1330 ops_complete_compute, sh, 1331 to_addr_conv(sh, percpu)); 1332 return async_gen_syndrome(blocks, 0, count+2, 1333 STRIPE_SIZE, &submit); 1334 } 1335 } else { 1336 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1337 ops_complete_compute, sh, 1338 to_addr_conv(sh, percpu)); 1339 if (failb == syndrome_disks) { 1340 /* We're missing D+P. */ 1341 return async_raid6_datap_recov(syndrome_disks+2, 1342 STRIPE_SIZE, faila, 1343 blocks, &submit); 1344 } else { 1345 /* We're missing D+D. 
*/ 1346 return async_raid6_2data_recov(syndrome_disks+2, 1347 STRIPE_SIZE, faila, failb, 1348 blocks, &submit); 1349 } 1350 } 1351 } 1352 1353 1354 static void ops_complete_prexor(void *stripe_head_ref) 1355 { 1356 struct stripe_head *sh = stripe_head_ref; 1357 1358 pr_debug("%s: stripe %llu\n", __func__, 1359 (unsigned long long)sh->sector); 1360 } 1361 1362 static struct dma_async_tx_descriptor * 1363 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 1364 struct dma_async_tx_descriptor *tx) 1365 { 1366 int disks = sh->disks; 1367 struct page **xor_srcs = percpu->scribble; 1368 int count = 0, pd_idx = sh->pd_idx, i; 1369 struct async_submit_ctl submit; 1370 1371 /* existing parity data subtracted */ 1372 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1373 1374 pr_debug("%s: stripe %llu\n", __func__, 1375 (unsigned long long)sh->sector); 1376 1377 for (i = disks; i--; ) { 1378 struct r5dev *dev = &sh->dev[i]; 1379 /* Only process blocks that are known to be uptodate */ 1380 if (test_bit(R5_Wantdrain, &dev->flags)) 1381 xor_srcs[count++] = dev->page; 1382 } 1383 1384 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1385 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 1386 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1387 1388 return tx; 1389 } 1390 1391 static struct dma_async_tx_descriptor * 1392 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1393 { 1394 int disks = sh->disks; 1395 int i; 1396 1397 pr_debug("%s: stripe %llu\n", __func__, 1398 (unsigned long long)sh->sector); 1399 1400 for (i = disks; i--; ) { 1401 struct r5dev *dev = &sh->dev[i]; 1402 struct bio *chosen; 1403 1404 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1405 struct bio *wbi; 1406 1407 spin_lock_irq(&sh->stripe_lock); 1408 chosen = dev->towrite; 1409 dev->towrite = NULL; 1410 BUG_ON(dev->written); 1411 wbi = dev->written = chosen; 1412 spin_unlock_irq(&sh->stripe_lock); 1413 WARN_ON(dev->page != dev->orig_page); 1414 1415 while (wbi && wbi->bi_iter.bi_sector < 1416 dev->sector + STRIPE_SECTORS) { 1417 if (wbi->bi_rw & REQ_FUA) 1418 set_bit(R5_WantFUA, &dev->flags); 1419 if (wbi->bi_rw & REQ_SYNC) 1420 set_bit(R5_SyncIO, &dev->flags); 1421 if (wbi->bi_rw & REQ_DISCARD) 1422 set_bit(R5_Discard, &dev->flags); 1423 else { 1424 tx = async_copy_data(1, wbi, &dev->page, 1425 dev->sector, tx, sh); 1426 if (dev->page != dev->orig_page) { 1427 set_bit(R5_SkipCopy, &dev->flags); 1428 clear_bit(R5_UPTODATE, &dev->flags); 1429 clear_bit(R5_OVERWRITE, &dev->flags); 1430 } 1431 } 1432 wbi = r5_next_bio(wbi, dev->sector); 1433 } 1434 } 1435 } 1436 1437 return tx; 1438 } 1439 1440 static void ops_complete_reconstruct(void *stripe_head_ref) 1441 { 1442 struct stripe_head *sh = stripe_head_ref; 1443 int disks = sh->disks; 1444 int pd_idx = sh->pd_idx; 1445 int qd_idx = sh->qd_idx; 1446 int i; 1447 bool fua = false, sync = false, discard = false; 1448 1449 pr_debug("%s: stripe %llu\n", __func__, 1450 (unsigned long long)sh->sector); 1451 1452 for (i = disks; i--; ) { 1453 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1454 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1455 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1456 } 1457 1458 for (i = disks; i--; ) { 1459 struct r5dev *dev = &sh->dev[i]; 1460 1461 if (dev->written || i == pd_idx || i == qd_idx) { 1462 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1463 set_bit(R5_UPTODATE, &dev->flags); 1464 if (fua) 1465 set_bit(R5_WantFUA, &dev->flags); 1466 
if (sync) 1467 set_bit(R5_SyncIO, &dev->flags); 1468 } 1469 } 1470 1471 if (sh->reconstruct_state == reconstruct_state_drain_run) 1472 sh->reconstruct_state = reconstruct_state_drain_result; 1473 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1474 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1475 else { 1476 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1477 sh->reconstruct_state = reconstruct_state_result; 1478 } 1479 1480 set_bit(STRIPE_HANDLE, &sh->state); 1481 release_stripe(sh); 1482 } 1483 1484 static void 1485 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1486 struct dma_async_tx_descriptor *tx) 1487 { 1488 int disks = sh->disks; 1489 struct page **xor_srcs = percpu->scribble; 1490 struct async_submit_ctl submit; 1491 int count = 0, pd_idx = sh->pd_idx, i; 1492 struct page *xor_dest; 1493 int prexor = 0; 1494 unsigned long flags; 1495 1496 pr_debug("%s: stripe %llu\n", __func__, 1497 (unsigned long long)sh->sector); 1498 1499 for (i = 0; i < sh->disks; i++) { 1500 if (pd_idx == i) 1501 continue; 1502 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1503 break; 1504 } 1505 if (i >= sh->disks) { 1506 atomic_inc(&sh->count); 1507 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1508 ops_complete_reconstruct(sh); 1509 return; 1510 } 1511 /* check if prexor is active which means only process blocks 1512 * that are part of a read-modify-write (written) 1513 */ 1514 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1515 prexor = 1; 1516 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1517 for (i = disks; i--; ) { 1518 struct r5dev *dev = &sh->dev[i]; 1519 if (dev->written) 1520 xor_srcs[count++] = dev->page; 1521 } 1522 } else { 1523 xor_dest = sh->dev[pd_idx].page; 1524 for (i = disks; i--; ) { 1525 struct r5dev *dev = &sh->dev[i]; 1526 if (i != pd_idx) 1527 xor_srcs[count++] = dev->page; 1528 } 1529 } 1530 1531 /* 1/ if we prexor'd then the dest is reused as a source 1532 * 2/ if we did not prexor then we are redoing the parity 1533 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1534 * for the synchronous xor case 1535 */ 1536 flags = ASYNC_TX_ACK | 1537 (prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1538 1539 atomic_inc(&sh->count); 1540 1541 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1542 to_addr_conv(sh, percpu)); 1543 if (unlikely(count == 1)) 1544 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1545 else 1546 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1547 } 1548 1549 static void 1550 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1551 struct dma_async_tx_descriptor *tx) 1552 { 1553 struct async_submit_ctl submit; 1554 struct page **blocks = percpu->scribble; 1555 int count, i; 1556 1557 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1558 1559 for (i = 0; i < sh->disks; i++) { 1560 if (sh->pd_idx == i || sh->qd_idx == i) 1561 continue; 1562 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1563 break; 1564 } 1565 if (i >= sh->disks) { 1566 atomic_inc(&sh->count); 1567 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1568 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1569 ops_complete_reconstruct(sh); 1570 return; 1571 } 1572 1573 count = set_syndrome_sources(blocks, sh); 1574 1575 atomic_inc(&sh->count); 1576 1577 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1578 sh, to_addr_conv(sh, percpu)); 1579 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1580 } 1581 1582 static void ops_complete_check(void *stripe_head_ref) 1583 { 1584 struct stripe_head *sh = stripe_head_ref; 1585 1586 pr_debug("%s: stripe %llu\n", __func__, 1587 (unsigned long long)sh->sector); 1588 1589 sh->check_state = check_state_check_result; 1590 set_bit(STRIPE_HANDLE, &sh->state); 1591 release_stripe(sh); 1592 } 1593 1594 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1595 { 1596 int disks = sh->disks; 1597 int pd_idx = sh->pd_idx; 1598 int qd_idx = sh->qd_idx; 1599 struct page *xor_dest; 1600 struct page **xor_srcs = percpu->scribble; 1601 struct dma_async_tx_descriptor *tx; 1602 struct async_submit_ctl submit; 1603 int count; 1604 int i; 1605 1606 pr_debug("%s: stripe %llu\n", __func__, 1607 (unsigned long long)sh->sector); 1608 1609 count = 0; 1610 xor_dest = sh->dev[pd_idx].page; 1611 xor_srcs[count++] = xor_dest; 1612 for (i = disks; i--; ) { 1613 if (i == pd_idx || i == qd_idx) 1614 continue; 1615 xor_srcs[count++] = sh->dev[i].page; 1616 } 1617 1618 init_async_submit(&submit, 0, NULL, NULL, NULL, 1619 to_addr_conv(sh, percpu)); 1620 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1621 &sh->ops.zero_sum_result, &submit); 1622 1623 atomic_inc(&sh->count); 1624 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1625 tx = async_trigger_callback(&submit); 1626 } 1627 1628 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1629 { 1630 struct page **srcs = percpu->scribble; 1631 struct async_submit_ctl submit; 1632 int count; 1633 1634 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1635 (unsigned long long)sh->sector, checkp); 1636 1637 count = set_syndrome_sources(srcs, sh); 1638 if (!checkp) 1639 srcs[count] = NULL; 1640 1641 atomic_inc(&sh->count); 1642 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1643 sh, to_addr_conv(sh, percpu)); 1644 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1645 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1646 } 1647 1648 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1649 { 1650 int 
overlap_clear = 0, i, disks = sh->disks; 1651 struct dma_async_tx_descriptor *tx = NULL; 1652 struct r5conf *conf = sh->raid_conf; 1653 int level = conf->level; 1654 struct raid5_percpu *percpu; 1655 unsigned long cpu; 1656 1657 cpu = get_cpu(); 1658 percpu = per_cpu_ptr(conf->percpu, cpu); 1659 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1660 ops_run_biofill(sh); 1661 overlap_clear++; 1662 } 1663 1664 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1665 if (level < 6) 1666 tx = ops_run_compute5(sh, percpu); 1667 else { 1668 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1669 tx = ops_run_compute6_1(sh, percpu); 1670 else 1671 tx = ops_run_compute6_2(sh, percpu); 1672 } 1673 /* terminate the chain if reconstruct is not set to be run */ 1674 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1675 async_tx_ack(tx); 1676 } 1677 1678 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1679 tx = ops_run_prexor(sh, percpu, tx); 1680 1681 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1682 tx = ops_run_biodrain(sh, tx); 1683 overlap_clear++; 1684 } 1685 1686 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1687 if (level < 6) 1688 ops_run_reconstruct5(sh, percpu, tx); 1689 else 1690 ops_run_reconstruct6(sh, percpu, tx); 1691 } 1692 1693 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1694 if (sh->check_state == check_state_run) 1695 ops_run_check_p(sh, percpu); 1696 else if (sh->check_state == check_state_run_q) 1697 ops_run_check_pq(sh, percpu, 0); 1698 else if (sh->check_state == check_state_run_pq) 1699 ops_run_check_pq(sh, percpu, 1); 1700 else 1701 BUG(); 1702 } 1703 1704 if (overlap_clear) 1705 for (i = disks; i--; ) { 1706 struct r5dev *dev = &sh->dev[i]; 1707 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1708 wake_up(&sh->raid_conf->wait_for_overlap); 1709 } 1710 put_cpu(); 1711 } 1712 1713 static int grow_one_stripe(struct r5conf *conf, int hash) 1714 { 1715 struct stripe_head *sh; 1716 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); 1717 if (!sh) 1718 return 0; 1719 1720 sh->raid_conf = conf; 1721 1722 spin_lock_init(&sh->stripe_lock); 1723 1724 if (grow_buffers(sh)) { 1725 shrink_buffers(sh); 1726 kmem_cache_free(conf->slab_cache, sh); 1727 return 0; 1728 } 1729 sh->hash_lock_index = hash; 1730 /* we just created an active stripe so... 
*/ 1731 atomic_set(&sh->count, 1); 1732 atomic_inc(&conf->active_stripes); 1733 INIT_LIST_HEAD(&sh->lru); 1734 release_stripe(sh); 1735 return 1; 1736 } 1737 1738 static int grow_stripes(struct r5conf *conf, int num) 1739 { 1740 struct kmem_cache *sc; 1741 int devs = max(conf->raid_disks, conf->previous_raid_disks); 1742 int hash; 1743 1744 if (conf->mddev->gendisk) 1745 sprintf(conf->cache_name[0], 1746 "raid%d-%s", conf->level, mdname(conf->mddev)); 1747 else 1748 sprintf(conf->cache_name[0], 1749 "raid%d-%p", conf->level, conf->mddev); 1750 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 1751 1752 conf->active_name = 0; 1753 sc = kmem_cache_create(conf->cache_name[conf->active_name], 1754 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 1755 0, 0, NULL); 1756 if (!sc) 1757 return 1; 1758 conf->slab_cache = sc; 1759 conf->pool_size = devs; 1760 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 1761 while (num--) { 1762 if (!grow_one_stripe(conf, hash)) 1763 return 1; 1764 conf->max_nr_stripes++; 1765 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; 1766 } 1767 return 0; 1768 } 1769 1770 /** 1771 * scribble_len - return the required size of the scribble region 1772 * @num - total number of disks in the array 1773 * 1774 * The size must be enough to contain: 1775 * 1/ a struct page pointer for each device in the array +2 1776 * 2/ room to convert each entry in (1) to its corresponding dma 1777 * (dma_map_page()) or page (page_address()) address. 1778 * 1779 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 1780 * calculate over all devices (not just the data blocks), using zeros in place 1781 * of the P and Q blocks. 1782 */ 1783 static size_t scribble_len(int num) 1784 { 1785 size_t len; 1786 1787 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 1788 1789 return len; 1790 } 1791 1792 static int resize_stripes(struct r5conf *conf, int newsize) 1793 { 1794 /* Make all the stripes able to hold 'newsize' devices. 1795 * New slots in each stripe get 'page' set to a new page. 1796 * 1797 * This happens in stages: 1798 * 1/ create a new kmem_cache and allocate the required number of 1799 * stripe_heads. 1800 * 2/ gather all the old stripe_heads and transfer the pages across 1801 * to the new stripe_heads. This will have the side effect of 1802 * freezing the array as once all stripe_heads have been collected, 1803 * no IO will be possible. Old stripe heads are freed once their 1804 * pages have been transferred over, and the old kmem_cache is 1805 * freed when all stripes are done. 1806 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 1807 * we simple return a failre status - no need to clean anything up. 1808 * 4/ allocate new pages for the new slots in the new stripe_heads. 1809 * If this fails, we don't bother trying the shrink the 1810 * stripe_heads down again, we just leave them as they are. 1811 * As each stripe_head is processed the new one is released into 1812 * active service. 1813 * 1814 * Once step2 is started, we cannot afford to wait for a write, 1815 * so we use GFP_NOIO allocations. 
1816 */ 1817 struct stripe_head *osh, *nsh; 1818 LIST_HEAD(newstripes); 1819 struct disk_info *ndisks; 1820 unsigned long cpu; 1821 int err; 1822 struct kmem_cache *sc; 1823 int i; 1824 int hash, cnt; 1825 1826 if (newsize <= conf->pool_size) 1827 return 0; /* never bother to shrink */ 1828 1829 err = md_allow_write(conf->mddev); 1830 if (err) 1831 return err; 1832 1833 /* Step 1 */ 1834 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1835 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1836 0, 0, NULL); 1837 if (!sc) 1838 return -ENOMEM; 1839 1840 for (i = conf->max_nr_stripes; i; i--) { 1841 nsh = kmem_cache_zalloc(sc, GFP_KERNEL); 1842 if (!nsh) 1843 break; 1844 1845 nsh->raid_conf = conf; 1846 spin_lock_init(&nsh->stripe_lock); 1847 1848 list_add(&nsh->lru, &newstripes); 1849 } 1850 if (i) { 1851 /* didn't get enough, give up */ 1852 while (!list_empty(&newstripes)) { 1853 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1854 list_del(&nsh->lru); 1855 kmem_cache_free(sc, nsh); 1856 } 1857 kmem_cache_destroy(sc); 1858 return -ENOMEM; 1859 } 1860 /* Step 2 - Must use GFP_NOIO now. 1861 * OK, we have enough stripes, start collecting inactive 1862 * stripes and copying them over 1863 */ 1864 hash = 0; 1865 cnt = 0; 1866 list_for_each_entry(nsh, &newstripes, lru) { 1867 lock_device_hash_lock(conf, hash); 1868 wait_event_cmd(conf->wait_for_stripe, 1869 !list_empty(conf->inactive_list + hash), 1870 unlock_device_hash_lock(conf, hash), 1871 lock_device_hash_lock(conf, hash)); 1872 osh = get_free_stripe(conf, hash); 1873 unlock_device_hash_lock(conf, hash); 1874 atomic_set(&nsh->count, 1); 1875 for(i=0; i<conf->pool_size; i++) { 1876 nsh->dev[i].page = osh->dev[i].page; 1877 nsh->dev[i].orig_page = osh->dev[i].page; 1878 } 1879 for( ; i<newsize; i++) 1880 nsh->dev[i].page = NULL; 1881 nsh->hash_lock_index = hash; 1882 kmem_cache_free(conf->slab_cache, osh); 1883 cnt++; 1884 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 1885 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 1886 hash++; 1887 cnt = 0; 1888 } 1889 } 1890 kmem_cache_destroy(conf->slab_cache); 1891 1892 /* Step 3. 
1893 * At this point, we are holding all the stripes so the array 1894 * is completely stalled, so now is a good time to resize 1895 * conf->disks and the scribble region 1896 */ 1897 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1898 if (ndisks) { 1899 for (i=0; i<conf->raid_disks; i++) 1900 ndisks[i] = conf->disks[i]; 1901 kfree(conf->disks); 1902 conf->disks = ndisks; 1903 } else 1904 err = -ENOMEM; 1905 1906 get_online_cpus(); 1907 conf->scribble_len = scribble_len(newsize); 1908 for_each_present_cpu(cpu) { 1909 struct raid5_percpu *percpu; 1910 void *scribble; 1911 1912 percpu = per_cpu_ptr(conf->percpu, cpu); 1913 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1914 1915 if (scribble) { 1916 kfree(percpu->scribble); 1917 percpu->scribble = scribble; 1918 } else { 1919 err = -ENOMEM; 1920 break; 1921 } 1922 } 1923 put_online_cpus(); 1924 1925 /* Step 4, return new stripes to service */ 1926 while(!list_empty(&newstripes)) { 1927 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1928 list_del_init(&nsh->lru); 1929 1930 for (i=conf->raid_disks; i < newsize; i++) 1931 if (nsh->dev[i].page == NULL) { 1932 struct page *p = alloc_page(GFP_NOIO); 1933 nsh->dev[i].page = p; 1934 nsh->dev[i].orig_page = p; 1935 if (!p) 1936 err = -ENOMEM; 1937 } 1938 release_stripe(nsh); 1939 } 1940 /* critical section pass, GFP_NOIO no longer needed */ 1941 1942 conf->slab_cache = sc; 1943 conf->active_name = 1-conf->active_name; 1944 conf->pool_size = newsize; 1945 return err; 1946 } 1947 1948 static int drop_one_stripe(struct r5conf *conf, int hash) 1949 { 1950 struct stripe_head *sh; 1951 1952 spin_lock_irq(conf->hash_locks + hash); 1953 sh = get_free_stripe(conf, hash); 1954 spin_unlock_irq(conf->hash_locks + hash); 1955 if (!sh) 1956 return 0; 1957 BUG_ON(atomic_read(&sh->count)); 1958 shrink_buffers(sh); 1959 kmem_cache_free(conf->slab_cache, sh); 1960 atomic_dec(&conf->active_stripes); 1961 return 1; 1962 } 1963 1964 static void shrink_stripes(struct r5conf *conf) 1965 { 1966 int hash; 1967 for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) 1968 while (drop_one_stripe(conf, hash)) 1969 ; 1970 1971 if (conf->slab_cache) 1972 kmem_cache_destroy(conf->slab_cache); 1973 conf->slab_cache = NULL; 1974 } 1975 1976 static void raid5_end_read_request(struct bio * bi, int error) 1977 { 1978 struct stripe_head *sh = bi->bi_private; 1979 struct r5conf *conf = sh->raid_conf; 1980 int disks = sh->disks, i; 1981 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1982 char b[BDEVNAME_SIZE]; 1983 struct md_rdev *rdev = NULL; 1984 sector_t s; 1985 1986 for (i=0 ; i<disks; i++) 1987 if (bi == &sh->dev[i].req) 1988 break; 1989 1990 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1991 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1992 uptodate); 1993 if (i == disks) { 1994 BUG(); 1995 return; 1996 } 1997 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 1998 /* If replacement finished while this request was outstanding, 1999 * 'replacement' might be NULL already. 2000 * In that case it moved down to 'rdev'. 2001 * rdev is not removed until all requests are finished. 
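 * (whichever device pointer is picked just below is also the one that
 * rdev_dec_pending() is called on at the end of this handler, which is
 * what keeps the nr_pending accounting balanced)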
2002 */ 2003 rdev = conf->disks[i].replacement; 2004 if (!rdev) 2005 rdev = conf->disks[i].rdev; 2006 2007 if (use_new_offset(conf, sh)) 2008 s = sh->sector + rdev->new_data_offset; 2009 else 2010 s = sh->sector + rdev->data_offset; 2011 if (uptodate) { 2012 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2013 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2014 /* Note that this cannot happen on a 2015 * replacement device. We just fail those on 2016 * any error 2017 */ 2018 printk_ratelimited( 2019 KERN_INFO 2020 "md/raid:%s: read error corrected" 2021 " (%lu sectors at %llu on %s)\n", 2022 mdname(conf->mddev), STRIPE_SECTORS, 2023 (unsigned long long)s, 2024 bdevname(rdev->bdev, b)); 2025 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2026 clear_bit(R5_ReadError, &sh->dev[i].flags); 2027 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2028 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2029 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2030 2031 if (atomic_read(&rdev->read_errors)) 2032 atomic_set(&rdev->read_errors, 0); 2033 } else { 2034 const char *bdn = bdevname(rdev->bdev, b); 2035 int retry = 0; 2036 int set_bad = 0; 2037 2038 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2039 atomic_inc(&rdev->read_errors); 2040 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2041 printk_ratelimited( 2042 KERN_WARNING 2043 "md/raid:%s: read error on replacement device " 2044 "(sector %llu on %s).\n", 2045 mdname(conf->mddev), 2046 (unsigned long long)s, 2047 bdn); 2048 else if (conf->mddev->degraded >= conf->max_degraded) { 2049 set_bad = 1; 2050 printk_ratelimited( 2051 KERN_WARNING 2052 "md/raid:%s: read error not correctable " 2053 "(sector %llu on %s).\n", 2054 mdname(conf->mddev), 2055 (unsigned long long)s, 2056 bdn); 2057 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2058 /* Oh, no!!! */ 2059 set_bad = 1; 2060 printk_ratelimited( 2061 KERN_WARNING 2062 "md/raid:%s: read error NOT corrected!! 
" 2063 "(sector %llu on %s).\n", 2064 mdname(conf->mddev), 2065 (unsigned long long)s, 2066 bdn); 2067 } else if (atomic_read(&rdev->read_errors) 2068 > conf->max_nr_stripes) 2069 printk(KERN_WARNING 2070 "md/raid:%s: Too many read errors, failing device %s.\n", 2071 mdname(conf->mddev), bdn); 2072 else 2073 retry = 1; 2074 if (set_bad && test_bit(In_sync, &rdev->flags) 2075 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2076 retry = 1; 2077 if (retry) 2078 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2079 set_bit(R5_ReadError, &sh->dev[i].flags); 2080 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2081 } else 2082 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2083 else { 2084 clear_bit(R5_ReadError, &sh->dev[i].flags); 2085 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2086 if (!(set_bad 2087 && test_bit(In_sync, &rdev->flags) 2088 && rdev_set_badblocks( 2089 rdev, sh->sector, STRIPE_SECTORS, 0))) 2090 md_error(conf->mddev, rdev); 2091 } 2092 } 2093 rdev_dec_pending(rdev, conf->mddev); 2094 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2095 set_bit(STRIPE_HANDLE, &sh->state); 2096 release_stripe(sh); 2097 } 2098 2099 static void raid5_end_write_request(struct bio *bi, int error) 2100 { 2101 struct stripe_head *sh = bi->bi_private; 2102 struct r5conf *conf = sh->raid_conf; 2103 int disks = sh->disks, i; 2104 struct md_rdev *uninitialized_var(rdev); 2105 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 2106 sector_t first_bad; 2107 int bad_sectors; 2108 int replacement = 0; 2109 2110 for (i = 0 ; i < disks; i++) { 2111 if (bi == &sh->dev[i].req) { 2112 rdev = conf->disks[i].rdev; 2113 break; 2114 } 2115 if (bi == &sh->dev[i].rreq) { 2116 rdev = conf->disks[i].replacement; 2117 if (rdev) 2118 replacement = 1; 2119 else 2120 /* rdev was removed and 'replacement' 2121 * replaced it. rdev is not removed 2122 * until all requests are finished. 2123 */ 2124 rdev = conf->disks[i].rdev; 2125 break; 2126 } 2127 } 2128 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 2129 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2130 uptodate); 2131 if (i == disks) { 2132 BUG(); 2133 return; 2134 } 2135 2136 if (replacement) { 2137 if (!uptodate) 2138 md_error(conf->mddev, rdev); 2139 else if (is_badblock(rdev, sh->sector, 2140 STRIPE_SECTORS, 2141 &first_bad, &bad_sectors)) 2142 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2143 } else { 2144 if (!uptodate) { 2145 set_bit(STRIPE_DEGRADED, &sh->state); 2146 set_bit(WriteErrorSeen, &rdev->flags); 2147 set_bit(R5_WriteError, &sh->dev[i].flags); 2148 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2149 set_bit(MD_RECOVERY_NEEDED, 2150 &rdev->mddev->recovery); 2151 } else if (is_badblock(rdev, sh->sector, 2152 STRIPE_SECTORS, 2153 &first_bad, &bad_sectors)) { 2154 set_bit(R5_MadeGood, &sh->dev[i].flags); 2155 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2156 /* That was a successful write so make 2157 * sure it looks like we already did 2158 * a re-write. 
2159 */ 2160 set_bit(R5_ReWrite, &sh->dev[i].flags); 2161 } 2162 } 2163 rdev_dec_pending(rdev, conf->mddev); 2164 2165 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2166 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2167 set_bit(STRIPE_HANDLE, &sh->state); 2168 release_stripe(sh); 2169 } 2170 2171 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 2172 2173 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2174 { 2175 struct r5dev *dev = &sh->dev[i]; 2176 2177 bio_init(&dev->req); 2178 dev->req.bi_io_vec = &dev->vec; 2179 dev->req.bi_max_vecs = 1; 2180 dev->req.bi_private = sh; 2181 2182 bio_init(&dev->rreq); 2183 dev->rreq.bi_io_vec = &dev->rvec; 2184 dev->rreq.bi_max_vecs = 1; 2185 dev->rreq.bi_private = sh; 2186 2187 dev->flags = 0; 2188 dev->sector = compute_blocknr(sh, i, previous); 2189 } 2190 2191 static void error(struct mddev *mddev, struct md_rdev *rdev) 2192 { 2193 char b[BDEVNAME_SIZE]; 2194 struct r5conf *conf = mddev->private; 2195 unsigned long flags; 2196 pr_debug("raid456: error called\n"); 2197 2198 spin_lock_irqsave(&conf->device_lock, flags); 2199 clear_bit(In_sync, &rdev->flags); 2200 mddev->degraded = calc_degraded(conf); 2201 spin_unlock_irqrestore(&conf->device_lock, flags); 2202 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2203 2204 set_bit(Blocked, &rdev->flags); 2205 set_bit(Faulty, &rdev->flags); 2206 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2207 printk(KERN_ALERT 2208 "md/raid:%s: Disk failure on %s, disabling device.\n" 2209 "md/raid:%s: Operation continuing on %d devices.\n", 2210 mdname(mddev), 2211 bdevname(rdev->bdev, b), 2212 mdname(mddev), 2213 conf->raid_disks - mddev->degraded); 2214 } 2215 2216 /* 2217 * Input: a 'big' sector number, 2218 * Output: index of the data and parity disk, and the sector # in them. 2219 */ 2220 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2221 int previous, int *dd_idx, 2222 struct stripe_head *sh) 2223 { 2224 sector_t stripe, stripe2; 2225 sector_t chunk_number; 2226 unsigned int chunk_offset; 2227 int pd_idx, qd_idx; 2228 int ddf_layout = 0; 2229 sector_t new_sector; 2230 int algorithm = previous ? conf->prev_algo 2231 : conf->algorithm; 2232 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2233 : conf->chunk_sectors; 2234 int raid_disks = previous ? conf->previous_raid_disks 2235 : conf->raid_disks; 2236 int data_disks = raid_disks - conf->max_degraded; 2237 2238 /* First compute the information on this sector */ 2239 2240 /* 2241 * Compute the chunk number and the sector offset inside the chunk 2242 */ 2243 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2244 chunk_number = r_sector; 2245 2246 /* 2247 * Compute the stripe number 2248 */ 2249 stripe = chunk_number; 2250 *dd_idx = sector_div(stripe, data_disks); 2251 stripe2 = stripe; 2252 /* 2253 * Select the parity disk based on the user selected algorithm. 
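 *
 * An illustrative example (values chosen here for illustration, not
 * taken from the code): with raid_disks = 4 and ALGORITHM_LEFT_SYMMETRIC,
 * the level-5 branch below gives pd_idx = data_disks - (stripe2 %
 * raid_disks) and rotates the data so that it starts just after the
 * parity disk.  Writing the stripe's three data blocks as d0 d1 d2:
 *
 *	stripe 0:   d0 d1 d2 P	(pd_idx = 3)
 *	stripe 1:   d1 d2 P  d0	(pd_idx = 2)
 *	stripe 2:   d2 P  d0 d1	(pd_idx = 1)
 *	stripe 3:   P  d0 d1 d2	(pd_idx = 0)
 *
 * via *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks.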
*/ 2255 pd_idx = qd_idx = -1; 2256 switch(conf->level) { 2257 case 4: 2258 pd_idx = data_disks; 2259 break; 2260 case 5: 2261 switch (algorithm) { 2262 case ALGORITHM_LEFT_ASYMMETRIC: 2263 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2264 if (*dd_idx >= pd_idx) 2265 (*dd_idx)++; 2266 break; 2267 case ALGORITHM_RIGHT_ASYMMETRIC: 2268 pd_idx = sector_div(stripe2, raid_disks); 2269 if (*dd_idx >= pd_idx) 2270 (*dd_idx)++; 2271 break; 2272 case ALGORITHM_LEFT_SYMMETRIC: 2273 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2274 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2275 break; 2276 case ALGORITHM_RIGHT_SYMMETRIC: 2277 pd_idx = sector_div(stripe2, raid_disks); 2278 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2279 break; 2280 case ALGORITHM_PARITY_0: 2281 pd_idx = 0; 2282 (*dd_idx)++; 2283 break; 2284 case ALGORITHM_PARITY_N: 2285 pd_idx = data_disks; 2286 break; 2287 default: 2288 BUG(); 2289 } 2290 break; 2291 case 6: 2292 2293 switch (algorithm) { 2294 case ALGORITHM_LEFT_ASYMMETRIC: 2295 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2296 qd_idx = pd_idx + 1; 2297 if (pd_idx == raid_disks-1) { 2298 (*dd_idx)++; /* Q D D D P */ 2299 qd_idx = 0; 2300 } else if (*dd_idx >= pd_idx) 2301 (*dd_idx) += 2; /* D D P Q D */ 2302 break; 2303 case ALGORITHM_RIGHT_ASYMMETRIC: 2304 pd_idx = sector_div(stripe2, raid_disks); 2305 qd_idx = pd_idx + 1; 2306 if (pd_idx == raid_disks-1) { 2307 (*dd_idx)++; /* Q D D D P */ 2308 qd_idx = 0; 2309 } else if (*dd_idx >= pd_idx) 2310 (*dd_idx) += 2; /* D D P Q D */ 2311 break; 2312 case ALGORITHM_LEFT_SYMMETRIC: 2313 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2314 qd_idx = (pd_idx + 1) % raid_disks; 2315 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2316 break; 2317 case ALGORITHM_RIGHT_SYMMETRIC: 2318 pd_idx = sector_div(stripe2, raid_disks); 2319 qd_idx = (pd_idx + 1) % raid_disks; 2320 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2321 break; 2322 2323 case ALGORITHM_PARITY_0: 2324 pd_idx = 0; 2325 qd_idx = 1; 2326 (*dd_idx) += 2; 2327 break; 2328 case ALGORITHM_PARITY_N: 2329 pd_idx = data_disks; 2330 qd_idx = data_disks + 1; 2331 break; 2332 2333 case ALGORITHM_ROTATING_ZERO_RESTART: 2334 /* Exactly the same as RIGHT_ASYMMETRIC, but the order 2335 * of blocks for computing Q is different.
*/ 2337 pd_idx = sector_div(stripe2, raid_disks); 2338 qd_idx = pd_idx + 1; 2339 if (pd_idx == raid_disks-1) { 2340 (*dd_idx)++; /* Q D D D P */ 2341 qd_idx = 0; 2342 } else if (*dd_idx >= pd_idx) 2343 (*dd_idx) += 2; /* D D P Q D */ 2344 ddf_layout = 1; 2345 break; 2346 2347 case ALGORITHM_ROTATING_N_RESTART: 2348 /* Same as left_asymmetric, but the first stripe is 2349 * D D D P Q rather than 2350 * Q D D D P 2351 */ 2352 stripe2 += 1; 2353 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2354 qd_idx = pd_idx + 1; 2355 if (pd_idx == raid_disks-1) { 2356 (*dd_idx)++; /* Q D D D P */ 2357 qd_idx = 0; 2358 } else if (*dd_idx >= pd_idx) 2359 (*dd_idx) += 2; /* D D P Q D */ 2360 ddf_layout = 1; 2361 break; 2362 2363 case ALGORITHM_ROTATING_N_CONTINUE: 2364 /* Same as left_symmetric but Q is before P */ 2365 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2366 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2367 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2368 ddf_layout = 1; 2369 break; 2370 2371 case ALGORITHM_LEFT_ASYMMETRIC_6: 2372 /* RAID5 left_asymmetric, with Q on last device */ 2373 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2374 if (*dd_idx >= pd_idx) 2375 (*dd_idx)++; 2376 qd_idx = raid_disks - 1; 2377 break; 2378 2379 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2380 pd_idx = sector_div(stripe2, raid_disks-1); 2381 if (*dd_idx >= pd_idx) 2382 (*dd_idx)++; 2383 qd_idx = raid_disks - 1; 2384 break; 2385 2386 case ALGORITHM_LEFT_SYMMETRIC_6: 2387 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2388 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2389 qd_idx = raid_disks - 1; 2390 break; 2391 2392 case ALGORITHM_RIGHT_SYMMETRIC_6: 2393 pd_idx = sector_div(stripe2, raid_disks-1); 2394 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2395 qd_idx = raid_disks - 1; 2396 break; 2397 2398 case ALGORITHM_PARITY_0_6: 2399 pd_idx = 0; 2400 (*dd_idx)++; 2401 qd_idx = raid_disks - 1; 2402 break; 2403 2404 default: 2405 BUG(); 2406 } 2407 break; 2408 } 2409 2410 if (sh) { 2411 sh->pd_idx = pd_idx; 2412 sh->qd_idx = qd_idx; 2413 sh->ddf_layout = ddf_layout; 2414 } 2415 /* 2416 * Finally, compute the new sector number 2417 */ 2418 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2419 return new_sector; 2420 } 2421 2422 2423 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2424 { 2425 struct r5conf *conf = sh->raid_conf; 2426 int raid_disks = sh->disks; 2427 int data_disks = raid_disks - conf->max_degraded; 2428 sector_t new_sector = sh->sector, check; 2429 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2430 : conf->chunk_sectors; 2431 int algorithm = previous ?
conf->prev_algo 2432 : conf->algorithm; 2433 sector_t stripe; 2434 int chunk_offset; 2435 sector_t chunk_number; 2436 int dummy1, dd_idx = i; 2437 sector_t r_sector; 2438 struct stripe_head sh2; 2439 2440 2441 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2442 stripe = new_sector; 2443 2444 if (i == sh->pd_idx) 2445 return 0; 2446 switch(conf->level) { 2447 case 4: break; 2448 case 5: 2449 switch (algorithm) { 2450 case ALGORITHM_LEFT_ASYMMETRIC: 2451 case ALGORITHM_RIGHT_ASYMMETRIC: 2452 if (i > sh->pd_idx) 2453 i--; 2454 break; 2455 case ALGORITHM_LEFT_SYMMETRIC: 2456 case ALGORITHM_RIGHT_SYMMETRIC: 2457 if (i < sh->pd_idx) 2458 i += raid_disks; 2459 i -= (sh->pd_idx + 1); 2460 break; 2461 case ALGORITHM_PARITY_0: 2462 i -= 1; 2463 break; 2464 case ALGORITHM_PARITY_N: 2465 break; 2466 default: 2467 BUG(); 2468 } 2469 break; 2470 case 6: 2471 if (i == sh->qd_idx) 2472 return 0; /* It is the Q disk */ 2473 switch (algorithm) { 2474 case ALGORITHM_LEFT_ASYMMETRIC: 2475 case ALGORITHM_RIGHT_ASYMMETRIC: 2476 case ALGORITHM_ROTATING_ZERO_RESTART: 2477 case ALGORITHM_ROTATING_N_RESTART: 2478 if (sh->pd_idx == raid_disks-1) 2479 i--; /* Q D D D P */ 2480 else if (i > sh->pd_idx) 2481 i -= 2; /* D D P Q D */ 2482 break; 2483 case ALGORITHM_LEFT_SYMMETRIC: 2484 case ALGORITHM_RIGHT_SYMMETRIC: 2485 if (sh->pd_idx == raid_disks-1) 2486 i--; /* Q D D D P */ 2487 else { 2488 /* D D P Q D */ 2489 if (i < sh->pd_idx) 2490 i += raid_disks; 2491 i -= (sh->pd_idx + 2); 2492 } 2493 break; 2494 case ALGORITHM_PARITY_0: 2495 i -= 2; 2496 break; 2497 case ALGORITHM_PARITY_N: 2498 break; 2499 case ALGORITHM_ROTATING_N_CONTINUE: 2500 /* Like left_symmetric, but P is before Q */ 2501 if (sh->pd_idx == 0) 2502 i--; /* P D D D Q */ 2503 else { 2504 /* D D Q P D */ 2505 if (i < sh->pd_idx) 2506 i += raid_disks; 2507 i -= (sh->pd_idx + 1); 2508 } 2509 break; 2510 case ALGORITHM_LEFT_ASYMMETRIC_6: 2511 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2512 if (i > sh->pd_idx) 2513 i--; 2514 break; 2515 case ALGORITHM_LEFT_SYMMETRIC_6: 2516 case ALGORITHM_RIGHT_SYMMETRIC_6: 2517 if (i < sh->pd_idx) 2518 i += data_disks + 1; 2519 i -= (sh->pd_idx + 1); 2520 break; 2521 case ALGORITHM_PARITY_0_6: 2522 i -= 1; 2523 break; 2524 default: 2525 BUG(); 2526 } 2527 break; 2528 } 2529 2530 chunk_number = stripe * data_disks + i; 2531 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2532 2533 check = raid5_compute_sector(conf, r_sector, 2534 previous, &dummy1, &sh2); 2535 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2536 || sh2.qd_idx != sh->qd_idx) { 2537 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2538 mdname(conf->mddev)); 2539 return 0; 2540 } 2541 return r_sector; 2542 } 2543 2544 2545 static void 2546 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2547 int rcw, int expand) 2548 { 2549 int i, pd_idx = sh->pd_idx, disks = sh->disks; 2550 struct r5conf *conf = sh->raid_conf; 2551 int level = conf->level; 2552 2553 if (rcw) { 2554 2555 for (i = disks; i--; ) { 2556 struct r5dev *dev = &sh->dev[i]; 2557 2558 if (dev->towrite) { 2559 set_bit(R5_LOCKED, &dev->flags); 2560 set_bit(R5_Wantdrain, &dev->flags); 2561 if (!expand) 2562 clear_bit(R5_UPTODATE, &dev->flags); 2563 s->locked++; 2564 } 2565 } 2566 /* if we are not expanding this is a proper write request, and 2567 * there will be bios with new data to be drained into the 2568 * stripe cache 2569 */ 2570 if (!expand) { 2571 if (!s->locked) 2572 /* False alarm, nothing to do */ 2573 return; 
2574 sh->reconstruct_state = reconstruct_state_drain_run; 2575 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2576 } else 2577 sh->reconstruct_state = reconstruct_state_run; 2578 2579 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2580 2581 if (s->locked + conf->max_degraded == disks) 2582 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2583 atomic_inc(&conf->pending_full_writes); 2584 } else { 2585 BUG_ON(level == 6); 2586 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2587 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2588 2589 for (i = disks; i--; ) { 2590 struct r5dev *dev = &sh->dev[i]; 2591 if (i == pd_idx) 2592 continue; 2593 2594 if (dev->towrite && 2595 (test_bit(R5_UPTODATE, &dev->flags) || 2596 test_bit(R5_Wantcompute, &dev->flags))) { 2597 set_bit(R5_Wantdrain, &dev->flags); 2598 set_bit(R5_LOCKED, &dev->flags); 2599 clear_bit(R5_UPTODATE, &dev->flags); 2600 s->locked++; 2601 } 2602 } 2603 if (!s->locked) 2604 /* False alarm - nothing to do */ 2605 return; 2606 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2607 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2608 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2609 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2610 } 2611 2612 /* keep the parity disk(s) locked while asynchronous operations 2613 * are in flight 2614 */ 2615 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2616 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2617 s->locked++; 2618 2619 if (level == 6) { 2620 int qd_idx = sh->qd_idx; 2621 struct r5dev *dev = &sh->dev[qd_idx]; 2622 2623 set_bit(R5_LOCKED, &dev->flags); 2624 clear_bit(R5_UPTODATE, &dev->flags); 2625 s->locked++; 2626 } 2627 2628 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2629 __func__, (unsigned long long)sh->sector, 2630 s->locked, s->ops_request); 2631 } 2632 2633 /* 2634 * Each stripe/dev can have one or more bion attached. 2635 * toread/towrite point to the first in a chain. 2636 * The bi_next chain must be in order. 2637 */ 2638 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2639 { 2640 struct bio **bip; 2641 struct r5conf *conf = sh->raid_conf; 2642 int firstwrite=0; 2643 2644 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2645 (unsigned long long)bi->bi_iter.bi_sector, 2646 (unsigned long long)sh->sector); 2647 2648 /* 2649 * If several bio share a stripe. The bio bi_phys_segments acts as a 2650 * reference count to avoid race. The reference count should already be 2651 * increased before this function is called (for example, in 2652 * make_request()), so other bio sharing this stripe will not free the 2653 * stripe. If a stripe is owned by one stripe, the stripe lock will 2654 * protect it. 
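 *
 * Roughly, in terms of the helpers used elsewhere in this file (an
 * illustrative summary, not a new interface):
 *
 *	submitter:	takes the initial reference before calling here
 *	add_stripe_bio:	raid5_inc_bi_active_stripes(bi) once the bio has
 *			been chained onto the stripe
 *	completion:	raid5_dec_bi_active_stripes(bi); the bio is only
 *			ended/returned once the count drops to zero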
2655 */ 2656 spin_lock_irq(&sh->stripe_lock); 2657 if (forwrite) { 2658 bip = &sh->dev[dd_idx].towrite; 2659 if (*bip == NULL) 2660 firstwrite = 1; 2661 } else 2662 bip = &sh->dev[dd_idx].toread; 2663 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 2664 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 2665 goto overlap; 2666 bip = & (*bip)->bi_next; 2667 } 2668 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 2669 goto overlap; 2670 2671 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2672 if (*bip) 2673 bi->bi_next = *bip; 2674 *bip = bi; 2675 raid5_inc_bi_active_stripes(bi); 2676 2677 if (forwrite) { 2678 /* check if page is covered */ 2679 sector_t sector = sh->dev[dd_idx].sector; 2680 for (bi=sh->dev[dd_idx].towrite; 2681 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2682 bi && bi->bi_iter.bi_sector <= sector; 2683 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2684 if (bio_end_sector(bi) >= sector) 2685 sector = bio_end_sector(bi); 2686 } 2687 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2688 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2689 } 2690 2691 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2692 (unsigned long long)(*bip)->bi_iter.bi_sector, 2693 (unsigned long long)sh->sector, dd_idx); 2694 spin_unlock_irq(&sh->stripe_lock); 2695 2696 if (conf->mddev->bitmap && firstwrite) { 2697 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2698 STRIPE_SECTORS, 0); 2699 sh->bm_seq = conf->seq_flush+1; 2700 set_bit(STRIPE_BIT_DELAY, &sh->state); 2701 } 2702 return 1; 2703 2704 overlap: 2705 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2706 spin_unlock_irq(&sh->stripe_lock); 2707 return 0; 2708 } 2709 2710 static void end_reshape(struct r5conf *conf); 2711 2712 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 2713 struct stripe_head *sh) 2714 { 2715 int sectors_per_chunk = 2716 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2717 int dd_idx; 2718 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2719 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 2720 2721 raid5_compute_sector(conf, 2722 stripe * (disks - conf->max_degraded) 2723 *sectors_per_chunk + chunk_offset, 2724 previous, 2725 &dd_idx, sh); 2726 } 2727 2728 static void 2729 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 2730 struct stripe_head_state *s, int disks, 2731 struct bio **return_bi) 2732 { 2733 int i; 2734 for (i = disks; i--; ) { 2735 struct bio *bi; 2736 int bitmap_end = 0; 2737 2738 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2739 struct md_rdev *rdev; 2740 rcu_read_lock(); 2741 rdev = rcu_dereference(conf->disks[i].rdev); 2742 if (rdev && test_bit(In_sync, &rdev->flags)) 2743 atomic_inc(&rdev->nr_pending); 2744 else 2745 rdev = NULL; 2746 rcu_read_unlock(); 2747 if (rdev) { 2748 if (!rdev_set_badblocks( 2749 rdev, 2750 sh->sector, 2751 STRIPE_SECTORS, 0)) 2752 md_error(conf->mddev, rdev); 2753 rdev_dec_pending(rdev, conf->mddev); 2754 } 2755 } 2756 spin_lock_irq(&sh->stripe_lock); 2757 /* fail all writes first */ 2758 bi = sh->dev[i].towrite; 2759 sh->dev[i].towrite = NULL; 2760 spin_unlock_irq(&sh->stripe_lock); 2761 if (bi) 2762 bitmap_end = 1; 2763 2764 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2765 wake_up(&conf->wait_for_overlap); 2766 2767 while (bi && bi->bi_iter.bi_sector < 2768 sh->dev[i].sector + STRIPE_SECTORS) { 2769 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2770 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2771 if (!raid5_dec_bi_active_stripes(bi)) { 2772 md_write_end(conf->mddev); 2773 bi->bi_next = *return_bi; 2774 *return_bi = bi; 2775 } 2776 bi = nextbi; 2777 } 2778 if (bitmap_end) 2779 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2780 STRIPE_SECTORS, 0, 0); 2781 bitmap_end = 0; 2782 /* and fail all 'written' */ 2783 bi = sh->dev[i].written; 2784 sh->dev[i].written = NULL; 2785 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 2786 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 2787 sh->dev[i].page = sh->dev[i].orig_page; 2788 } 2789 2790 if (bi) bitmap_end = 1; 2791 while (bi && bi->bi_iter.bi_sector < 2792 sh->dev[i].sector + STRIPE_SECTORS) { 2793 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2794 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2795 if (!raid5_dec_bi_active_stripes(bi)) { 2796 md_write_end(conf->mddev); 2797 bi->bi_next = *return_bi; 2798 *return_bi = bi; 2799 } 2800 bi = bi2; 2801 } 2802 2803 /* fail any reads if this device is non-operational and 2804 * the data has not reached the cache yet. 
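 * (reads whose data has already been copied into the stripe cache,
 * i.e. devices with R5_Wantfill set, are left to complete normally;
 * only reads that would still need this failed or unreadable device
 * are errored and returned here)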
2805 */ 2806 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2807 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2808 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2809 spin_lock_irq(&sh->stripe_lock); 2810 bi = sh->dev[i].toread; 2811 sh->dev[i].toread = NULL; 2812 spin_unlock_irq(&sh->stripe_lock); 2813 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2814 wake_up(&conf->wait_for_overlap); 2815 while (bi && bi->bi_iter.bi_sector < 2816 sh->dev[i].sector + STRIPE_SECTORS) { 2817 struct bio *nextbi = 2818 r5_next_bio(bi, sh->dev[i].sector); 2819 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2820 if (!raid5_dec_bi_active_stripes(bi)) { 2821 bi->bi_next = *return_bi; 2822 *return_bi = bi; 2823 } 2824 bi = nextbi; 2825 } 2826 } 2827 if (bitmap_end) 2828 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2829 STRIPE_SECTORS, 0, 0); 2830 /* If we were in the middle of a write the parity block might 2831 * still be locked - so just clear all R5_LOCKED flags 2832 */ 2833 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2834 } 2835 2836 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2837 if (atomic_dec_and_test(&conf->pending_full_writes)) 2838 md_wakeup_thread(conf->mddev->thread); 2839 } 2840 2841 static void 2842 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 2843 struct stripe_head_state *s) 2844 { 2845 int abort = 0; 2846 int i; 2847 2848 clear_bit(STRIPE_SYNCING, &sh->state); 2849 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 2850 wake_up(&conf->wait_for_overlap); 2851 s->syncing = 0; 2852 s->replacing = 0; 2853 /* There is nothing more to do for sync/check/repair. 2854 * Don't even need to abort as that is handled elsewhere 2855 * if needed, and not always wanted e.g. if there is a known 2856 * bad block here. 2857 * For recover/replace we need to record a bad block on all 2858 * non-sync devices, or abort the recovery 2859 */ 2860 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 2861 /* During recovery devices cannot be removed, so 2862 * locking and refcounting of rdevs is not needed 2863 */ 2864 for (i = 0; i < conf->raid_disks; i++) { 2865 struct md_rdev *rdev = conf->disks[i].rdev; 2866 if (rdev 2867 && !test_bit(Faulty, &rdev->flags) 2868 && !test_bit(In_sync, &rdev->flags) 2869 && !rdev_set_badblocks(rdev, sh->sector, 2870 STRIPE_SECTORS, 0)) 2871 abort = 1; 2872 rdev = conf->disks[i].replacement; 2873 if (rdev 2874 && !test_bit(Faulty, &rdev->flags) 2875 && !test_bit(In_sync, &rdev->flags) 2876 && !rdev_set_badblocks(rdev, sh->sector, 2877 STRIPE_SECTORS, 0)) 2878 abort = 1; 2879 } 2880 if (abort) 2881 conf->recovery_disabled = 2882 conf->mddev->recovery_disabled; 2883 } 2884 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 2885 } 2886 2887 static int want_replace(struct stripe_head *sh, int disk_idx) 2888 { 2889 struct md_rdev *rdev; 2890 int rv = 0; 2891 /* Doing recovery so rcu locking not required */ 2892 rdev = sh->raid_conf->disks[disk_idx].replacement; 2893 if (rdev 2894 && !test_bit(Faulty, &rdev->flags) 2895 && !test_bit(In_sync, &rdev->flags) 2896 && (rdev->recovery_offset <= sh->sector 2897 || rdev->mddev->recovery_cp <= sh->sector)) 2898 rv = 1; 2899 2900 return rv; 2901 } 2902 2903 /* fetch_block - checks the given member device to see if its data needs 2904 * to be read or computed to satisfy a request. 
2905 * 2906 * Returns 1 when no more member devices need to be checked, otherwise returns 2907 * 0 to tell the loop in handle_stripe_fill to continue 2908 */ 2909 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 2910 int disk_idx, int disks) 2911 { 2912 struct r5dev *dev = &sh->dev[disk_idx]; 2913 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 2914 &sh->dev[s->failed_num[1]] }; 2915 2916 /* is the data in this block needed, and can we get it? */ 2917 if (!test_bit(R5_LOCKED, &dev->flags) && 2918 !test_bit(R5_UPTODATE, &dev->flags) && 2919 (dev->toread || 2920 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2921 s->syncing || s->expanding || 2922 (s->replacing && want_replace(sh, disk_idx)) || 2923 (s->failed >= 1 && fdev[0]->toread) || 2924 (s->failed >= 2 && fdev[1]->toread) || 2925 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2926 (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && 2927 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2928 (sh->raid_conf->level == 6 && s->failed && s->to_write && 2929 s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - 2 && 2930 (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { 2931 /* we would like to get this block, possibly by computing it, 2932 * otherwise read it if the backing disk is insync 2933 */ 2934 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2935 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2936 if ((s->uptodate == disks - 1) && 2937 (s->failed && (disk_idx == s->failed_num[0] || 2938 disk_idx == s->failed_num[1]))) { 2939 /* have disk failed, and we're requested to fetch it; 2940 * do compute it 2941 */ 2942 pr_debug("Computing stripe %llu block %d\n", 2943 (unsigned long long)sh->sector, disk_idx); 2944 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2945 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2946 set_bit(R5_Wantcompute, &dev->flags); 2947 sh->ops.target = disk_idx; 2948 sh->ops.target2 = -1; /* no 2nd target */ 2949 s->req_compute = 1; 2950 /* Careful: from this point on 'uptodate' is in the eye 2951 * of raid_run_ops which services 'compute' operations 2952 * before writes. R5_Wantcompute flags a block that will 2953 * be R5_UPTODATE by the time it is needed for a 2954 * subsequent operation. 2955 */ 2956 s->uptodate++; 2957 return 1; 2958 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2959 /* Computing 2-failure is *very* expensive; only 2960 * do it if failed >= 2 2961 */ 2962 int other; 2963 for (other = disks; other--; ) { 2964 if (other == disk_idx) 2965 continue; 2966 if (!test_bit(R5_UPTODATE, 2967 &sh->dev[other].flags)) 2968 break; 2969 } 2970 BUG_ON(other < 0); 2971 pr_debug("Computing stripe %llu blocks %d,%d\n", 2972 (unsigned long long)sh->sector, 2973 disk_idx, other); 2974 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2975 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2976 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2977 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2978 sh->ops.target = disk_idx; 2979 sh->ops.target2 = other; 2980 s->uptodate += 2; 2981 s->req_compute = 1; 2982 return 1; 2983 } else if (test_bit(R5_Insync, &dev->flags)) { 2984 set_bit(R5_LOCKED, &dev->flags); 2985 set_bit(R5_Wantread, &dev->flags); 2986 s->locked++; 2987 pr_debug("Reading block %d (sync=%d)\n", 2988 disk_idx, s->syncing); 2989 } 2990 } 2991 2992 return 0; 2993 } 2994 2995 /** 2996 * handle_stripe_fill - read or compute data to satisfy pending requests. 
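 *
 * For the single-failure RAID5 case, the "compute" alternative chosen by
 * fetch_block() amounts to xor-ing the surviving blocks together.  A
 * hedged, purely illustrative sketch (the real work is queued through
 * raid_run_ops and the async_tx machinery; "missing" is just a name
 * chosen here for the failed disk index):
 *
 *	int d, off;
 *	u8 *out = page_address(sh->dev[missing].page);
 *
 *	memset(out, 0, STRIPE_SIZE);
 *	for (d = 0; d < disks; d++) {
 *		u8 *in = page_address(sh->dev[d].page);
 *
 *		if (d == missing)
 *			continue;
 *		for (off = 0; off < STRIPE_SIZE; off++)
 *			out[off] ^= in[off];
 *	}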
2997 */ 2998 static void handle_stripe_fill(struct stripe_head *sh, 2999 struct stripe_head_state *s, 3000 int disks) 3001 { 3002 int i; 3003 3004 /* look for blocks to read/compute, skip this if a compute 3005 * is already in flight, or if the stripe contents are in the 3006 * midst of changing due to a write 3007 */ 3008 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3009 !sh->reconstruct_state) 3010 for (i = disks; i--; ) 3011 if (fetch_block(sh, s, i, disks)) 3012 break; 3013 set_bit(STRIPE_HANDLE, &sh->state); 3014 } 3015 3016 3017 /* handle_stripe_clean_event 3018 * any written block on an uptodate or failed drive can be returned. 3019 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3020 * never LOCKED, so we don't need to test 'failed' directly. 3021 */ 3022 static void handle_stripe_clean_event(struct r5conf *conf, 3023 struct stripe_head *sh, int disks, struct bio **return_bi) 3024 { 3025 int i; 3026 struct r5dev *dev; 3027 int discard_pending = 0; 3028 3029 for (i = disks; i--; ) 3030 if (sh->dev[i].written) { 3031 dev = &sh->dev[i]; 3032 if (!test_bit(R5_LOCKED, &dev->flags) && 3033 (test_bit(R5_UPTODATE, &dev->flags) || 3034 test_bit(R5_Discard, &dev->flags) || 3035 test_bit(R5_SkipCopy, &dev->flags))) { 3036 /* We can return any write requests */ 3037 struct bio *wbi, *wbi2; 3038 pr_debug("Return write for disc %d\n", i); 3039 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3040 clear_bit(R5_UPTODATE, &dev->flags); 3041 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3042 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3043 dev->page = dev->orig_page; 3044 } 3045 wbi = dev->written; 3046 dev->written = NULL; 3047 while (wbi && wbi->bi_iter.bi_sector < 3048 dev->sector + STRIPE_SECTORS) { 3049 wbi2 = r5_next_bio(wbi, dev->sector); 3050 if (!raid5_dec_bi_active_stripes(wbi)) { 3051 md_write_end(conf->mddev); 3052 wbi->bi_next = *return_bi; 3053 *return_bi = wbi; 3054 } 3055 wbi = wbi2; 3056 } 3057 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3058 STRIPE_SECTORS, 3059 !test_bit(STRIPE_DEGRADED, &sh->state), 3060 0); 3061 } else if (test_bit(R5_Discard, &dev->flags)) 3062 discard_pending = 1; 3063 WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); 3064 WARN_ON(dev->page != dev->orig_page); 3065 } 3066 if (!discard_pending && 3067 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3068 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3069 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3070 if (sh->qd_idx >= 0) { 3071 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3072 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3073 } 3074 /* now that discard is done we can proceed with any sync */ 3075 clear_bit(STRIPE_DISCARD, &sh->state); 3076 /* 3077 * SCSI discard will change some bio fields and the stripe has 3078 * no updated data, so remove it from hash list and the stripe 3079 * will be reinitialized 3080 */ 3081 spin_lock_irq(&conf->device_lock); 3082 remove_hash(sh); 3083 spin_unlock_irq(&conf->device_lock); 3084 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3085 set_bit(STRIPE_HANDLE, &sh->state); 3086 3087 } 3088 3089 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3090 if (atomic_dec_and_test(&conf->pending_full_writes)) 3091 md_wakeup_thread(conf->mddev->thread); 3092 } 3093 3094 static void handle_stripe_dirtying(struct r5conf *conf, 3095 struct stripe_head *sh, 3096 struct stripe_head_state *s, 3097 int disks) 3098 { 3099 int rmw = 0, rcw = 0, i; 3100 sector_t recovery_cp = conf->mddev->recovery_cp; 3101 3102 
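/* A hedged worked example of the weighing done below (the numbers are
 * illustrative and not taken from the code): on a 5-drive RAID5,
 * overwriting one whole data block would need
 *	read-modify-write:  read old data + old parity	-> rmw = 2
 *	reconstruct-write:  read the 3 untouched blocks	-> rcw = 3
 * so read-modify-write wins, while a full-stripe write leaves nothing
 * to read for reconstruct-write (rcw = 0), so that path wins.  The
 * loops below count exactly these "blocks we would still have to read".
 */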
/* RAID6 requires 'rcw' in current implementation. 3103 * Otherwise, check whether resync is now happening or should start. 3104 * If yes, then the array is dirty (after unclean shutdown or 3105 * initial creation), so parity in some stripes might be inconsistent. 3106 * In this case, we need to always do reconstruct-write, to ensure 3107 * that in case of drive failure or read-error correction, we 3108 * generate correct data from the parity. 3109 */ 3110 if (conf->max_degraded == 2 || 3111 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { 3112 /* Calculate the real rcw later - for now make it 3113 * look like rcw is cheaper 3114 */ 3115 rcw = 1; rmw = 2; 3116 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 3117 conf->max_degraded, (unsigned long long)recovery_cp, 3118 (unsigned long long)sh->sector); 3119 } else for (i = disks; i--; ) { 3120 /* would I have to read this buffer for read_modify_write */ 3121 struct r5dev *dev = &sh->dev[i]; 3122 if ((dev->towrite || i == sh->pd_idx) && 3123 !test_bit(R5_LOCKED, &dev->flags) && 3124 !(test_bit(R5_UPTODATE, &dev->flags) || 3125 test_bit(R5_Wantcompute, &dev->flags))) { 3126 if (test_bit(R5_Insync, &dev->flags)) 3127 rmw++; 3128 else 3129 rmw += 2*disks; /* cannot read it */ 3130 } 3131 /* Would I have to read this buffer for reconstruct_write */ 3132 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 3133 !test_bit(R5_LOCKED, &dev->flags) && 3134 !(test_bit(R5_UPTODATE, &dev->flags) || 3135 test_bit(R5_Wantcompute, &dev->flags))) { 3136 if (test_bit(R5_Insync, &dev->flags)) 3137 rcw++; 3138 else 3139 rcw += 2*disks; 3140 } 3141 } 3142 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 3143 (unsigned long long)sh->sector, rmw, rcw); 3144 set_bit(STRIPE_HANDLE, &sh->state); 3145 if (rmw < rcw && rmw > 0) { 3146 /* prefer read-modify-write, but need to get some data */ 3147 if (conf->mddev->queue) 3148 blk_add_trace_msg(conf->mddev->queue, 3149 "raid5 rmw %llu %d", 3150 (unsigned long long)sh->sector, rmw); 3151 for (i = disks; i--; ) { 3152 struct r5dev *dev = &sh->dev[i]; 3153 if ((dev->towrite || i == sh->pd_idx) && 3154 !test_bit(R5_LOCKED, &dev->flags) && 3155 !(test_bit(R5_UPTODATE, &dev->flags) || 3156 test_bit(R5_Wantcompute, &dev->flags)) && 3157 test_bit(R5_Insync, &dev->flags)) { 3158 if (test_bit(STRIPE_PREREAD_ACTIVE, 3159 &sh->state)) { 3160 pr_debug("Read_old block %d for r-m-w\n", 3161 i); 3162 set_bit(R5_LOCKED, &dev->flags); 3163 set_bit(R5_Wantread, &dev->flags); 3164 s->locked++; 3165 } else { 3166 set_bit(STRIPE_DELAYED, &sh->state); 3167 set_bit(STRIPE_HANDLE, &sh->state); 3168 } 3169 } 3170 } 3171 } 3172 if (rcw <= rmw && rcw > 0) { 3173 /* want reconstruct write, but need to get some data */ 3174 int qread =0; 3175 rcw = 0; 3176 for (i = disks; i--; ) { 3177 struct r5dev *dev = &sh->dev[i]; 3178 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3179 i != sh->pd_idx && i != sh->qd_idx && 3180 !test_bit(R5_LOCKED, &dev->flags) && 3181 !(test_bit(R5_UPTODATE, &dev->flags) || 3182 test_bit(R5_Wantcompute, &dev->flags))) { 3183 rcw++; 3184 if (test_bit(R5_Insync, &dev->flags) && 3185 test_bit(STRIPE_PREREAD_ACTIVE, 3186 &sh->state)) { 3187 pr_debug("Read_old block " 3188 "%d for Reconstruct\n", i); 3189 set_bit(R5_LOCKED, &dev->flags); 3190 set_bit(R5_Wantread, &dev->flags); 3191 s->locked++; 3192 qread++; 3193 } else { 3194 set_bit(STRIPE_DELAYED, &sh->state); 3195 set_bit(STRIPE_HANDLE, &sh->state); 3196 } 3197 } 3198 } 3199 if (rcw && conf->mddev->queue) 3200 
blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3201 (unsigned long long)sh->sector, 3202 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3203 } 3204 /* now if nothing is locked, and if we have enough data, 3205 * we can start a write request 3206 */ 3207 /* since handle_stripe can be called at any time we need to handle the 3208 * case where a compute block operation has been submitted and then a 3209 * subsequent call wants to start a write request. raid_run_ops only 3210 * handles the case where compute block and reconstruct are requested 3211 * simultaneously. If this is not the case then new writes need to be 3212 * held off until the compute completes. 3213 */ 3214 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 3215 (s->locked == 0 && (rcw == 0 || rmw == 0) && 3216 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 3217 schedule_reconstruction(sh, s, rcw == 0, 0); 3218 } 3219 3220 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 3221 struct stripe_head_state *s, int disks) 3222 { 3223 struct r5dev *dev = NULL; 3224 3225 set_bit(STRIPE_HANDLE, &sh->state); 3226 3227 switch (sh->check_state) { 3228 case check_state_idle: 3229 /* start a new check operation if there are no failures */ 3230 if (s->failed == 0) { 3231 BUG_ON(s->uptodate != disks); 3232 sh->check_state = check_state_run; 3233 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3234 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3235 s->uptodate--; 3236 break; 3237 } 3238 dev = &sh->dev[s->failed_num[0]]; 3239 /* fall through */ 3240 case check_state_compute_result: 3241 sh->check_state = check_state_idle; 3242 if (!dev) 3243 dev = &sh->dev[sh->pd_idx]; 3244 3245 /* check that a write has not made the stripe insync */ 3246 if (test_bit(STRIPE_INSYNC, &sh->state)) 3247 break; 3248 3249 /* either failed parity check, or recovery is happening */ 3250 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3251 BUG_ON(s->uptodate != disks); 3252 3253 set_bit(R5_LOCKED, &dev->flags); 3254 s->locked++; 3255 set_bit(R5_Wantwrite, &dev->flags); 3256 3257 clear_bit(STRIPE_DEGRADED, &sh->state); 3258 set_bit(STRIPE_INSYNC, &sh->state); 3259 break; 3260 case check_state_run: 3261 break; /* we will be called again upon completion */ 3262 case check_state_check_result: 3263 sh->check_state = check_state_idle; 3264 3265 /* if a failure occurred during the check operation, leave 3266 * STRIPE_INSYNC not set and let the stripe be handled again 3267 */ 3268 if (s->failed) 3269 break; 3270 3271 /* handle a successful check operation, if parity is correct 3272 * we are done. Otherwise update the mismatch count and repair 3273 * parity if !MD_RECOVERY_CHECK 3274 */ 3275 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 3276 /* parity is correct (on disc, 3277 * not in buffer any more) 3278 */ 3279 set_bit(STRIPE_INSYNC, &sh->state); 3280 else { 3281 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3282 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3283 /* don't try to repair!! 
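 * (a user-requested "check" pass, i.e. MD_RECOVERY_CHECK set, only
 * accounts the mismatch in resync_mismatches above; a "repair" pass
 * instead falls through to the compute-and-rewrite branch below)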
*/ 3284 set_bit(STRIPE_INSYNC, &sh->state); 3285 else { 3286 sh->check_state = check_state_compute_run; 3287 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3288 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3289 set_bit(R5_Wantcompute, 3290 &sh->dev[sh->pd_idx].flags); 3291 sh->ops.target = sh->pd_idx; 3292 sh->ops.target2 = -1; 3293 s->uptodate++; 3294 } 3295 } 3296 break; 3297 case check_state_compute_run: 3298 break; 3299 default: 3300 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3301 __func__, sh->check_state, 3302 (unsigned long long) sh->sector); 3303 BUG(); 3304 } 3305 } 3306 3307 3308 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3309 struct stripe_head_state *s, 3310 int disks) 3311 { 3312 int pd_idx = sh->pd_idx; 3313 int qd_idx = sh->qd_idx; 3314 struct r5dev *dev; 3315 3316 set_bit(STRIPE_HANDLE, &sh->state); 3317 3318 BUG_ON(s->failed > 2); 3319 3320 /* Want to check and possibly repair P and Q. 3321 * However there could be one 'failed' device, in which 3322 * case we can only check one of them, possibly using the 3323 * other to generate missing data 3324 */ 3325 3326 switch (sh->check_state) { 3327 case check_state_idle: 3328 /* start a new check operation if there are < 2 failures */ 3329 if (s->failed == s->q_failed) { 3330 /* The only possible failed device holds Q, so it 3331 * makes sense to check P (If anything else were failed, 3332 * we would have used P to recreate it). 3333 */ 3334 sh->check_state = check_state_run; 3335 } 3336 if (!s->q_failed && s->failed < 2) { 3337 /* Q is not failed, and we didn't use it to generate 3338 * anything, so it makes sense to check it 3339 */ 3340 if (sh->check_state == check_state_run) 3341 sh->check_state = check_state_run_pq; 3342 else 3343 sh->check_state = check_state_run_q; 3344 } 3345 3346 /* discard potentially stale zero_sum_result */ 3347 sh->ops.zero_sum_result = 0; 3348 3349 if (sh->check_state == check_state_run) { 3350 /* async_xor_zero_sum destroys the contents of P */ 3351 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3352 s->uptodate--; 3353 } 3354 if (sh->check_state >= check_state_run && 3355 sh->check_state <= check_state_run_pq) { 3356 /* async_syndrome_zero_sum preserves P and Q, so 3357 * no need to mark them !uptodate here 3358 */ 3359 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3360 break; 3361 } 3362 3363 /* we have 2-disk failure */ 3364 BUG_ON(s->failed != 2); 3365 /* fall through */ 3366 case check_state_compute_result: 3367 sh->check_state = check_state_idle; 3368 3369 /* check that a write has not made the stripe insync */ 3370 if (test_bit(STRIPE_INSYNC, &sh->state)) 3371 break; 3372 3373 /* now write out any block on a failed drive, 3374 * or P or Q if they were recomputed 3375 */ 3376 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3377 if (s->failed == 2) { 3378 dev = &sh->dev[s->failed_num[1]]; 3379 s->locked++; 3380 set_bit(R5_LOCKED, &dev->flags); 3381 set_bit(R5_Wantwrite, &dev->flags); 3382 } 3383 if (s->failed >= 1) { 3384 dev = &sh->dev[s->failed_num[0]]; 3385 s->locked++; 3386 set_bit(R5_LOCKED, &dev->flags); 3387 set_bit(R5_Wantwrite, &dev->flags); 3388 } 3389 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3390 dev = &sh->dev[pd_idx]; 3391 s->locked++; 3392 set_bit(R5_LOCKED, &dev->flags); 3393 set_bit(R5_Wantwrite, &dev->flags); 3394 } 3395 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3396 dev = &sh->dev[qd_idx]; 3397 s->locked++; 3398 set_bit(R5_LOCKED, &dev->flags); 3399 set_bit(R5_Wantwrite, &dev->flags); 
3400 } 3401 clear_bit(STRIPE_DEGRADED, &sh->state); 3402 3403 set_bit(STRIPE_INSYNC, &sh->state); 3404 break; 3405 case check_state_run: 3406 case check_state_run_q: 3407 case check_state_run_pq: 3408 break; /* we will be called again upon completion */ 3409 case check_state_check_result: 3410 sh->check_state = check_state_idle; 3411 3412 /* handle a successful check operation, if parity is correct 3413 * we are done. Otherwise update the mismatch count and repair 3414 * parity if !MD_RECOVERY_CHECK 3415 */ 3416 if (sh->ops.zero_sum_result == 0) { 3417 /* both parities are correct */ 3418 if (!s->failed) 3419 set_bit(STRIPE_INSYNC, &sh->state); 3420 else { 3421 /* in contrast to the raid5 case we can validate 3422 * parity, but still have a failure to write 3423 * back 3424 */ 3425 sh->check_state = check_state_compute_result; 3426 /* Returning at this point means that we may go 3427 * off and bring p and/or q uptodate again so 3428 * we make sure to check zero_sum_result again 3429 * to verify if p or q need writeback 3430 */ 3431 } 3432 } else { 3433 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3434 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3435 /* don't try to repair!! */ 3436 set_bit(STRIPE_INSYNC, &sh->state); 3437 else { 3438 int *target = &sh->ops.target; 3439 3440 sh->ops.target = -1; 3441 sh->ops.target2 = -1; 3442 sh->check_state = check_state_compute_run; 3443 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3444 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3445 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3446 set_bit(R5_Wantcompute, 3447 &sh->dev[pd_idx].flags); 3448 *target = pd_idx; 3449 target = &sh->ops.target2; 3450 s->uptodate++; 3451 } 3452 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3453 set_bit(R5_Wantcompute, 3454 &sh->dev[qd_idx].flags); 3455 *target = qd_idx; 3456 s->uptodate++; 3457 } 3458 } 3459 } 3460 break; 3461 case check_state_compute_run: 3462 break; 3463 default: 3464 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3465 __func__, sh->check_state, 3466 (unsigned long long) sh->sector); 3467 BUG(); 3468 } 3469 } 3470 3471 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3472 { 3473 int i; 3474 3475 /* We have read all the blocks in this stripe and now we need to 3476 * copy some of them into a target stripe for expand. 3477 */ 3478 struct dma_async_tx_descriptor *tx = NULL; 3479 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3480 for (i = 0; i < sh->disks; i++) 3481 if (i != sh->pd_idx && i != sh->qd_idx) { 3482 int dd_idx, j; 3483 struct stripe_head *sh2; 3484 struct async_submit_ctl submit; 3485 3486 sector_t bn = compute_blocknr(sh, i, 1); 3487 sector_t s = raid5_compute_sector(conf, bn, 0, 3488 &dd_idx, NULL); 3489 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3490 if (sh2 == NULL) 3491 /* so far only the early blocks of this stripe 3492 * have been requested. 
When later blocks 3493 * get requested, we will try again 3494 */ 3495 continue; 3496 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3497 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3498 /* must have already done this block */ 3499 release_stripe(sh2); 3500 continue; 3501 } 3502 3503 /* place all the copies on one channel */ 3504 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3505 tx = async_memcpy(sh2->dev[dd_idx].page, 3506 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3507 &submit); 3508 3509 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3510 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3511 for (j = 0; j < conf->raid_disks; j++) 3512 if (j != sh2->pd_idx && 3513 j != sh2->qd_idx && 3514 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3515 break; 3516 if (j == conf->raid_disks) { 3517 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3518 set_bit(STRIPE_HANDLE, &sh2->state); 3519 } 3520 release_stripe(sh2); 3521 3522 } 3523 /* done submitting copies, wait for them to complete */ 3524 async_tx_quiesce(&tx); 3525 } 3526 3527 /* 3528 * handle_stripe - do things to a stripe. 3529 * 3530 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3531 * state of various bits to see what needs to be done. 3532 * Possible results: 3533 * return some read requests which now have data 3534 * return some write requests which are safely on storage 3535 * schedule a read on some buffers 3536 * schedule a write of some buffers 3537 * return confirmation of parity correctness 3538 * 3539 */ 3540 3541 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 3542 { 3543 struct r5conf *conf = sh->raid_conf; 3544 int disks = sh->disks; 3545 struct r5dev *dev; 3546 int i; 3547 int do_recovery = 0; 3548 3549 memset(s, 0, sizeof(*s)); 3550 3551 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3552 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3553 s->failed_num[0] = -1; 3554 s->failed_num[1] = -1; 3555 3556 /* Now to look around and see what can be done */ 3557 rcu_read_lock(); 3558 for (i=disks; i--; ) { 3559 struct md_rdev *rdev; 3560 sector_t first_bad; 3561 int bad_sectors; 3562 int is_bad = 0; 3563 3564 dev = &sh->dev[i]; 3565 3566 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3567 i, dev->flags, 3568 dev->toread, dev->towrite, dev->written); 3569 /* maybe we can reply to a read 3570 * 3571 * new wantfill requests are only permitted while 3572 * ops_complete_biofill is guaranteed to be inactive 3573 */ 3574 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3575 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3576 set_bit(R5_Wantfill, &dev->flags); 3577 3578 /* now count some things */ 3579 if (test_bit(R5_LOCKED, &dev->flags)) 3580 s->locked++; 3581 if (test_bit(R5_UPTODATE, &dev->flags)) 3582 s->uptodate++; 3583 if (test_bit(R5_Wantcompute, &dev->flags)) { 3584 s->compute++; 3585 BUG_ON(s->compute > 2); 3586 } 3587 3588 if (test_bit(R5_Wantfill, &dev->flags)) 3589 s->to_fill++; 3590 else if (dev->toread) 3591 s->to_read++; 3592 if (dev->towrite) { 3593 s->to_write++; 3594 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3595 s->non_overwrite++; 3596 } 3597 if (dev->written) 3598 s->written++; 3599 /* Prefer to use the replacement for reads, but only 3600 * if it is recovered enough and has no bad blocks. 
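 * Concretely (summarising the checks just below): R5_ReadRepl is only
 * set when the replacement is present, is not Faulty, has a
 * recovery_offset covering this whole stripe, and has no bad block in
 * [sh->sector, sh->sector + STRIPE_SECTORS); otherwise the read falls
 * back to the original rdev.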
3601 */ 3602 rdev = rcu_dereference(conf->disks[i].replacement); 3603 if (rdev && !test_bit(Faulty, &rdev->flags) && 3604 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 3605 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3606 &first_bad, &bad_sectors)) 3607 set_bit(R5_ReadRepl, &dev->flags); 3608 else { 3609 if (rdev) 3610 set_bit(R5_NeedReplace, &dev->flags); 3611 rdev = rcu_dereference(conf->disks[i].rdev); 3612 clear_bit(R5_ReadRepl, &dev->flags); 3613 } 3614 if (rdev && test_bit(Faulty, &rdev->flags)) 3615 rdev = NULL; 3616 if (rdev) { 3617 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 3618 &first_bad, &bad_sectors); 3619 if (s->blocked_rdev == NULL 3620 && (test_bit(Blocked, &rdev->flags) 3621 || is_bad < 0)) { 3622 if (is_bad < 0) 3623 set_bit(BlockedBadBlocks, 3624 &rdev->flags); 3625 s->blocked_rdev = rdev; 3626 atomic_inc(&rdev->nr_pending); 3627 } 3628 } 3629 clear_bit(R5_Insync, &dev->flags); 3630 if (!rdev) 3631 /* Not in-sync */; 3632 else if (is_bad) { 3633 /* also not in-sync */ 3634 if (!test_bit(WriteErrorSeen, &rdev->flags) && 3635 test_bit(R5_UPTODATE, &dev->flags)) { 3636 /* treat as in-sync, but with a read error 3637 * which we can now try to correct 3638 */ 3639 set_bit(R5_Insync, &dev->flags); 3640 set_bit(R5_ReadError, &dev->flags); 3641 } 3642 } else if (test_bit(In_sync, &rdev->flags)) 3643 set_bit(R5_Insync, &dev->flags); 3644 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3645 /* in sync if before recovery_offset */ 3646 set_bit(R5_Insync, &dev->flags); 3647 else if (test_bit(R5_UPTODATE, &dev->flags) && 3648 test_bit(R5_Expanded, &dev->flags)) 3649 /* If we've reshaped into here, we assume it is Insync. 3650 * We will shortly update recovery_offset to make 3651 * it official. 3652 */ 3653 set_bit(R5_Insync, &dev->flags); 3654 3655 if (test_bit(R5_WriteError, &dev->flags)) { 3656 /* This flag does not apply to '.replacement' 3657 * only to .rdev, so make sure to check that*/ 3658 struct md_rdev *rdev2 = rcu_dereference( 3659 conf->disks[i].rdev); 3660 if (rdev2 == rdev) 3661 clear_bit(R5_Insync, &dev->flags); 3662 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3663 s->handle_bad_blocks = 1; 3664 atomic_inc(&rdev2->nr_pending); 3665 } else 3666 clear_bit(R5_WriteError, &dev->flags); 3667 } 3668 if (test_bit(R5_MadeGood, &dev->flags)) { 3669 /* This flag does not apply to '.replacement' 3670 * only to .rdev, so make sure to check that*/ 3671 struct md_rdev *rdev2 = rcu_dereference( 3672 conf->disks[i].rdev); 3673 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3674 s->handle_bad_blocks = 1; 3675 atomic_inc(&rdev2->nr_pending); 3676 } else 3677 clear_bit(R5_MadeGood, &dev->flags); 3678 } 3679 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 3680 struct md_rdev *rdev2 = rcu_dereference( 3681 conf->disks[i].replacement); 3682 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 3683 s->handle_bad_blocks = 1; 3684 atomic_inc(&rdev2->nr_pending); 3685 } else 3686 clear_bit(R5_MadeGoodRepl, &dev->flags); 3687 } 3688 if (!test_bit(R5_Insync, &dev->flags)) { 3689 /* The ReadError flag will just be confusing now */ 3690 clear_bit(R5_ReadError, &dev->flags); 3691 clear_bit(R5_ReWrite, &dev->flags); 3692 } 3693 if (test_bit(R5_ReadError, &dev->flags)) 3694 clear_bit(R5_Insync, &dev->flags); 3695 if (!test_bit(R5_Insync, &dev->flags)) { 3696 if (s->failed < 2) 3697 s->failed_num[s->failed] = i; 3698 s->failed++; 3699 if (rdev && !test_bit(Faulty, &rdev->flags)) 3700 do_recovery = 1; 3701 } 3702 } 3703 if (test_bit(STRIPE_SYNCING, &sh->state)) { 
3704 /* If there is a failed device being replaced, 3705 * we must be recovering. 3706 * else if we are after recovery_cp, we must be syncing 3707 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 3708 * else we can only be replacing 3709 * sync and recovery both need to read all devices, and so 3710 * use the same flag. 3711 */ 3712 if (do_recovery || 3713 sh->sector >= conf->mddev->recovery_cp || 3714 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 3715 s->syncing = 1; 3716 else 3717 s->replacing = 1; 3718 } 3719 rcu_read_unlock(); 3720 } 3721 3722 static void handle_stripe(struct stripe_head *sh) 3723 { 3724 struct stripe_head_state s; 3725 struct r5conf *conf = sh->raid_conf; 3726 int i; 3727 int prexor; 3728 int disks = sh->disks; 3729 struct r5dev *pdev, *qdev; 3730 3731 clear_bit(STRIPE_HANDLE, &sh->state); 3732 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 3733 /* already being handled, ensure it gets handled 3734 * again when current action finishes */ 3735 set_bit(STRIPE_HANDLE, &sh->state); 3736 return; 3737 } 3738 3739 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3740 spin_lock(&sh->stripe_lock); 3741 /* Cannot process 'sync' concurrently with 'discard' */ 3742 if (!test_bit(STRIPE_DISCARD, &sh->state) && 3743 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 3744 set_bit(STRIPE_SYNCING, &sh->state); 3745 clear_bit(STRIPE_INSYNC, &sh->state); 3746 clear_bit(STRIPE_REPLACED, &sh->state); 3747 } 3748 spin_unlock(&sh->stripe_lock); 3749 } 3750 clear_bit(STRIPE_DELAYED, &sh->state); 3751 3752 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3753 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n", 3754 (unsigned long long)sh->sector, sh->state, 3755 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 3756 sh->check_state, sh->reconstruct_state); 3757 3758 analyse_stripe(sh, &s); 3759 3760 if (s.handle_bad_blocks) { 3761 set_bit(STRIPE_HANDLE, &sh->state); 3762 goto finish; 3763 } 3764 3765 if (unlikely(s.blocked_rdev)) { 3766 if (s.syncing || s.expanding || s.expanded || 3767 s.replacing || s.to_write || s.written) { 3768 set_bit(STRIPE_HANDLE, &sh->state); 3769 goto finish; 3770 } 3771 /* There is nothing for the blocked_rdev to block */ 3772 rdev_dec_pending(s.blocked_rdev, conf->mddev); 3773 s.blocked_rdev = NULL; 3774 } 3775 3776 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3777 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3778 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3779 } 3780 3781 pr_debug("locked=%d uptodate=%d to_read=%d" 3782 " to_write=%d failed=%d failed_num=%d,%d\n", 3783 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3784 s.failed_num[0], s.failed_num[1]); 3785 /* check if the array has lost more than max_degraded devices and, 3786 * if so, some requests might need to be failed.
3787 */ 3788 if (s.failed > conf->max_degraded) { 3789 sh->check_state = 0; 3790 sh->reconstruct_state = 0; 3791 if (s.to_read+s.to_write+s.written) 3792 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 3793 if (s.syncing + s.replacing) 3794 handle_failed_sync(conf, sh, &s); 3795 } 3796 3797 /* Now we check to see if any write operations have recently 3798 * completed 3799 */ 3800 prexor = 0; 3801 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3802 prexor = 1; 3803 if (sh->reconstruct_state == reconstruct_state_drain_result || 3804 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3805 sh->reconstruct_state = reconstruct_state_idle; 3806 3807 /* All the 'written' buffers and the parity block are ready to 3808 * be written back to disk 3809 */ 3810 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3811 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 3812 BUG_ON(sh->qd_idx >= 0 && 3813 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3814 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 3815 for (i = disks; i--; ) { 3816 struct r5dev *dev = &sh->dev[i]; 3817 if (test_bit(R5_LOCKED, &dev->flags) && 3818 (i == sh->pd_idx || i == sh->qd_idx || 3819 dev->written)) { 3820 pr_debug("Writing block %d\n", i); 3821 set_bit(R5_Wantwrite, &dev->flags); 3822 if (prexor) 3823 continue; 3824 if (s.failed > 1) 3825 continue; 3826 if (!test_bit(R5_Insync, &dev->flags) || 3827 ((i == sh->pd_idx || i == sh->qd_idx) && 3828 s.failed == 0)) 3829 set_bit(STRIPE_INSYNC, &sh->state); 3830 } 3831 } 3832 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3833 s.dec_preread_active = 1; 3834 } 3835 3836 /* 3837 * might be able to return some write requests if the parity blocks 3838 * are safe, or on a failed drive 3839 */ 3840 pdev = &sh->dev[sh->pd_idx]; 3841 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3842 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3843 qdev = &sh->dev[sh->qd_idx]; 3844 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3845 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3846 || conf->level < 6; 3847 3848 if (s.written && 3849 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3850 && !test_bit(R5_LOCKED, &pdev->flags) 3851 && (test_bit(R5_UPTODATE, &pdev->flags) || 3852 test_bit(R5_Discard, &pdev->flags))))) && 3853 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3854 && !test_bit(R5_LOCKED, &qdev->flags) 3855 && (test_bit(R5_UPTODATE, &qdev->flags) || 3856 test_bit(R5_Discard, &qdev->flags)))))) 3857 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3858 3859 /* Now we might consider reading some blocks, either to check/generate 3860 * parity, or to satisfy requests 3861 * or to load a block that is being partially written. 3862 */ 3863 if (s.to_read || s.non_overwrite 3864 || (conf->level == 6 && s.to_write && s.failed) 3865 || (s.syncing && (s.uptodate + s.compute < disks)) 3866 || s.replacing 3867 || s.expanding) 3868 handle_stripe_fill(sh, &s, disks); 3869 3870 /* Now to consider new write requests and what else, if anything 3871 * should be read. We do not handle new writes when: 3872 * 1/ A 'write' operation (copy+xor) is already in flight. 3873 * 2/ A 'check' operation is in flight, as it may clobber the parity 3874 * block. 
3875 */ 3876 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3877 handle_stripe_dirtying(conf, sh, &s, disks); 3878 3879 /* maybe we need to check and possibly fix the parity for this stripe 3880 * Any reads will already have been scheduled, so we just see if enough 3881 * data is available. The parity check is held off while parity 3882 * dependent operations are in flight. 3883 */ 3884 if (sh->check_state || 3885 (s.syncing && s.locked == 0 && 3886 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3887 !test_bit(STRIPE_INSYNC, &sh->state))) { 3888 if (conf->level == 6) 3889 handle_parity_checks6(conf, sh, &s, disks); 3890 else 3891 handle_parity_checks5(conf, sh, &s, disks); 3892 } 3893 3894 if ((s.replacing || s.syncing) && s.locked == 0 3895 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 3896 && !test_bit(STRIPE_REPLACED, &sh->state)) { 3897 /* Write out to replacement devices where possible */ 3898 for (i = 0; i < conf->raid_disks; i++) 3899 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 3900 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3901 set_bit(R5_WantReplace, &sh->dev[i].flags); 3902 set_bit(R5_LOCKED, &sh->dev[i].flags); 3903 s.locked++; 3904 } 3905 if (s.replacing) 3906 set_bit(STRIPE_INSYNC, &sh->state); 3907 set_bit(STRIPE_REPLACED, &sh->state); 3908 } 3909 if ((s.syncing || s.replacing) && s.locked == 0 && 3910 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3911 test_bit(STRIPE_INSYNC, &sh->state)) { 3912 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3913 clear_bit(STRIPE_SYNCING, &sh->state); 3914 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3915 wake_up(&conf->wait_for_overlap); 3916 } 3917 3918 /* If the failed drives are just a ReadError, then we might need 3919 * to progress the repair/check process 3920 */ 3921 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 3922 for (i = 0; i < s.failed; i++) { 3923 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 3924 if (test_bit(R5_ReadError, &dev->flags) 3925 && !test_bit(R5_LOCKED, &dev->flags) 3926 && test_bit(R5_UPTODATE, &dev->flags) 3927 ) { 3928 if (!test_bit(R5_ReWrite, &dev->flags)) { 3929 set_bit(R5_Wantwrite, &dev->flags); 3930 set_bit(R5_ReWrite, &dev->flags); 3931 set_bit(R5_LOCKED, &dev->flags); 3932 s.locked++; 3933 } else { 3934 /* let's read it back */ 3935 set_bit(R5_Wantread, &dev->flags); 3936 set_bit(R5_LOCKED, &dev->flags); 3937 s.locked++; 3938 } 3939 } 3940 } 3941 3942 3943 /* Finish reconstruct operations initiated by the expansion process */ 3944 if (sh->reconstruct_state == reconstruct_state_result) { 3945 struct stripe_head *sh_src 3946 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3947 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 3948 /* sh cannot be written until sh_src has been read. 
3949 * so arrange for sh to be delayed a little 3950 */ 3951 set_bit(STRIPE_DELAYED, &sh->state); 3952 set_bit(STRIPE_HANDLE, &sh->state); 3953 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3954 &sh_src->state)) 3955 atomic_inc(&conf->preread_active_stripes); 3956 release_stripe(sh_src); 3957 goto finish; 3958 } 3959 if (sh_src) 3960 release_stripe(sh_src); 3961 3962 sh->reconstruct_state = reconstruct_state_idle; 3963 clear_bit(STRIPE_EXPANDING, &sh->state); 3964 for (i = conf->raid_disks; i--; ) { 3965 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3966 set_bit(R5_LOCKED, &sh->dev[i].flags); 3967 s.locked++; 3968 } 3969 } 3970 3971 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3972 !sh->reconstruct_state) { 3973 /* Need to write out all blocks after computing parity */ 3974 sh->disks = conf->raid_disks; 3975 stripe_set_idx(sh->sector, conf, 0, sh); 3976 schedule_reconstruction(sh, &s, 1, 1); 3977 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3978 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3979 atomic_dec(&conf->reshape_stripes); 3980 wake_up(&conf->wait_for_overlap); 3981 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3982 } 3983 3984 if (s.expanding && s.locked == 0 && 3985 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3986 handle_stripe_expansion(conf, sh); 3987 3988 finish: 3989 /* wait for this device to become unblocked */ 3990 if (unlikely(s.blocked_rdev)) { 3991 if (conf->mddev->external) 3992 md_wait_for_blocked_rdev(s.blocked_rdev, 3993 conf->mddev); 3994 else 3995 /* Internal metadata will immediately 3996 * be written by raid5d, so we don't 3997 * need to wait here. 3998 */ 3999 rdev_dec_pending(s.blocked_rdev, 4000 conf->mddev); 4001 } 4002 4003 if (s.handle_bad_blocks) 4004 for (i = disks; i--; ) { 4005 struct md_rdev *rdev; 4006 struct r5dev *dev = &sh->dev[i]; 4007 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4008 /* We own a safe reference to the rdev */ 4009 rdev = conf->disks[i].rdev; 4010 if (!rdev_set_badblocks(rdev, sh->sector, 4011 STRIPE_SECTORS, 0)) 4012 md_error(conf->mddev, rdev); 4013 rdev_dec_pending(rdev, conf->mddev); 4014 } 4015 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4016 rdev = conf->disks[i].rdev; 4017 rdev_clear_badblocks(rdev, sh->sector, 4018 STRIPE_SECTORS, 0); 4019 rdev_dec_pending(rdev, conf->mddev); 4020 } 4021 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 4022 rdev = conf->disks[i].replacement; 4023 if (!rdev) 4024 /* rdev have been moved down */ 4025 rdev = conf->disks[i].rdev; 4026 rdev_clear_badblocks(rdev, sh->sector, 4027 STRIPE_SECTORS, 0); 4028 rdev_dec_pending(rdev, conf->mddev); 4029 } 4030 } 4031 4032 if (s.ops_request) 4033 raid_run_ops(sh, s.ops_request); 4034 4035 ops_run_io(sh, &s); 4036 4037 if (s.dec_preread_active) { 4038 /* We delay this until after ops_run_io so that if make_request 4039 * is waiting on a flush, it won't continue until the writes 4040 * have actually been submitted. 
4041 */ 4042 atomic_dec(&conf->preread_active_stripes); 4043 if (atomic_read(&conf->preread_active_stripes) < 4044 IO_THRESHOLD) 4045 md_wakeup_thread(conf->mddev->thread); 4046 } 4047 4048 return_io(s.return_bi); 4049 4050 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4051 } 4052 4053 static void raid5_activate_delayed(struct r5conf *conf) 4054 { 4055 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 4056 while (!list_empty(&conf->delayed_list)) { 4057 struct list_head *l = conf->delayed_list.next; 4058 struct stripe_head *sh; 4059 sh = list_entry(l, struct stripe_head, lru); 4060 list_del_init(l); 4061 clear_bit(STRIPE_DELAYED, &sh->state); 4062 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4063 atomic_inc(&conf->preread_active_stripes); 4064 list_add_tail(&sh->lru, &conf->hold_list); 4065 raid5_wakeup_stripe_thread(sh); 4066 } 4067 } 4068 } 4069 4070 static void activate_bit_delay(struct r5conf *conf, 4071 struct list_head *temp_inactive_list) 4072 { 4073 /* device_lock is held */ 4074 struct list_head head; 4075 list_add(&head, &conf->bitmap_list); 4076 list_del_init(&conf->bitmap_list); 4077 while (!list_empty(&head)) { 4078 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4079 int hash; 4080 list_del_init(&sh->lru); 4081 atomic_inc(&sh->count); 4082 hash = sh->hash_lock_index; 4083 __release_stripe(conf, sh, &temp_inactive_list[hash]); 4084 } 4085 } 4086 4087 int md_raid5_congested(struct mddev *mddev, int bits) 4088 { 4089 struct r5conf *conf = mddev->private; 4090 4091 /* No difference between reads and writes. Just check 4092 * how busy the stripe_cache is 4093 */ 4094 4095 if (conf->inactive_blocked) 4096 return 1; 4097 if (conf->quiesce) 4098 return 1; 4099 if (atomic_read(&conf->empty_inactive_list_nr)) 4100 return 1; 4101 4102 return 0; 4103 } 4104 EXPORT_SYMBOL_GPL(md_raid5_congested); 4105 4106 static int raid5_congested(void *data, int bits) 4107 { 4108 struct mddev *mddev = data; 4109 4110 return mddev_congested(mddev, bits) || 4111 md_raid5_congested(mddev, bits); 4112 } 4113 4114 /* We want read requests to align with chunks where possible, 4115 * but write requests don't need to. 4116 */ 4117 static int raid5_mergeable_bvec(struct request_queue *q, 4118 struct bvec_merge_data *bvm, 4119 struct bio_vec *biovec) 4120 { 4121 struct mddev *mddev = q->queuedata; 4122 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 4123 int max; 4124 unsigned int chunk_sectors = mddev->chunk_sectors; 4125 unsigned int bio_sectors = bvm->bi_size >> 9; 4126 4127 if ((bvm->bi_rw & 1) == WRITE) 4128 return biovec->bv_len; /* always allow writes to be mergeable */ 4129 4130 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 4131 chunk_sectors = mddev->new_chunk_sectors; 4132 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 4133 if (max < 0) max = 0; 4134 if (max <= biovec->bv_len && bio_sectors == 0) 4135 return biovec->bv_len; 4136 else 4137 return max; 4138 } 4139 4140 4141 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 4142 { 4143 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); 4144 unsigned int chunk_sectors = mddev->chunk_sectors; 4145 unsigned int bio_sectors = bio_sectors(bio); 4146 4147 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 4148 chunk_sectors = mddev->new_chunk_sectors; 4149 return chunk_sectors >= 4150 ((sector & (chunk_sectors - 1)) + bio_sectors); 4151 } 4152 4153 /* 4154 * add bio to the retry LIFO ( in O(1) ... 
we are in interrupt ) 4155 * later sampled by raid5d. 4156 */ 4157 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 4158 { 4159 unsigned long flags; 4160 4161 spin_lock_irqsave(&conf->device_lock, flags); 4162 4163 bi->bi_next = conf->retry_read_aligned_list; 4164 conf->retry_read_aligned_list = bi; 4165 4166 spin_unlock_irqrestore(&conf->device_lock, flags); 4167 md_wakeup_thread(conf->mddev->thread); 4168 } 4169 4170 4171 static struct bio *remove_bio_from_retry(struct r5conf *conf) 4172 { 4173 struct bio *bi; 4174 4175 bi = conf->retry_read_aligned; 4176 if (bi) { 4177 conf->retry_read_aligned = NULL; 4178 return bi; 4179 } 4180 bi = conf->retry_read_aligned_list; 4181 if(bi) { 4182 conf->retry_read_aligned_list = bi->bi_next; 4183 bi->bi_next = NULL; 4184 /* 4185 * this sets the active stripe count to 1 and the processed 4186 * stripe count to zero (upper 8 bits) 4187 */ 4188 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 4189 } 4190 4191 return bi; 4192 } 4193 4194 4195 /* 4196 * The "raid5_align_endio" should check if the read succeeded and if it 4197 * did, call bio_endio on the original bio (having bio_put the new bio 4198 * first). 4199 * If the read failed, the original bio is queued via add_bio_to_retry() so that raid5d can retry it through the stripe cache. 4200 */ 4201 static void raid5_align_endio(struct bio *bi, int error) 4202 { 4203 struct bio* raid_bi = bi->bi_private; 4204 struct mddev *mddev; 4205 struct r5conf *conf; 4206 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 4207 struct md_rdev *rdev; 4208 4209 bio_put(bi); 4210 4211 rdev = (void*)raid_bi->bi_next; 4212 raid_bi->bi_next = NULL; 4213 mddev = rdev->mddev; 4214 conf = mddev->private; 4215 4216 rdev_dec_pending(rdev, conf->mddev); 4217 4218 if (!error && uptodate) { 4219 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 4220 raid_bi, 0); 4221 bio_endio(raid_bi, 0); 4222 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4223 wake_up(&conf->wait_for_stripe); 4224 return; 4225 } 4226 4227 4228 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 4229 4230 add_bio_to_retry(raid_bi, conf); 4231 } 4232 4233 static int bio_fits_rdev(struct bio *bi) 4234 { 4235 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 4236 4237 if (bio_sectors(bi) > queue_max_sectors(q)) 4238 return 0; 4239 blk_recount_segments(q, bi); 4240 if (bi->bi_phys_segments > queue_max_segments(q)) 4241 return 0; 4242 4243 if (q->merge_bvec_fn) 4244 /* it's too hard to apply the merge_bvec_fn at this stage, 4245 * just give up 4246 */ 4247 return 0; 4248 4249 return 1; 4250 } 4251 4252 4253 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 4254 { 4255 struct r5conf *conf = mddev->private; 4256 int dd_idx; 4257 struct bio* align_bi; 4258 struct md_rdev *rdev; 4259 sector_t end_sector; 4260 4261 if (!in_chunk_boundary(mddev, raid_bio)) { 4262 pr_debug("chunk_aligned_read : non aligned\n"); 4263 return 0; 4264 } 4265 /* 4266 * use bio_clone_mddev to make a copy of the bio 4267 */ 4268 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 4269 if (!align_bi) 4270 return 0; 4271 /* 4272 * set bi_end_io to a new function, and set bi_private to the 4273 * original bio.
4274 */ 4275 align_bi->bi_end_io = raid5_align_endio; 4276 align_bi->bi_private = raid_bio; 4277 /* 4278 * compute position 4279 */ 4280 align_bi->bi_iter.bi_sector = 4281 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 4282 0, &dd_idx, NULL); 4283 4284 end_sector = bio_end_sector(align_bi); 4285 rcu_read_lock(); 4286 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 4287 if (!rdev || test_bit(Faulty, &rdev->flags) || 4288 rdev->recovery_offset < end_sector) { 4289 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 4290 if (rdev && 4291 (test_bit(Faulty, &rdev->flags) || 4292 !(test_bit(In_sync, &rdev->flags) || 4293 rdev->recovery_offset >= end_sector))) 4294 rdev = NULL; 4295 } 4296 if (rdev) { 4297 sector_t first_bad; 4298 int bad_sectors; 4299 4300 atomic_inc(&rdev->nr_pending); 4301 rcu_read_unlock(); 4302 raid_bio->bi_next = (void*)rdev; 4303 align_bi->bi_bdev = rdev->bdev; 4304 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4305 4306 if (!bio_fits_rdev(align_bi) || 4307 is_badblock(rdev, align_bi->bi_iter.bi_sector, 4308 bio_sectors(align_bi), 4309 &first_bad, &bad_sectors)) { 4310 /* too big in some way, or has a known bad block */ 4311 bio_put(align_bi); 4312 rdev_dec_pending(rdev, mddev); 4313 return 0; 4314 } 4315 4316 /* No reshape active, so we can trust rdev->data_offset */ 4317 align_bi->bi_iter.bi_sector += rdev->data_offset; 4318 4319 spin_lock_irq(&conf->device_lock); 4320 wait_event_lock_irq(conf->wait_for_stripe, 4321 conf->quiesce == 0, 4322 conf->device_lock); 4323 atomic_inc(&conf->active_aligned_reads); 4324 spin_unlock_irq(&conf->device_lock); 4325 4326 if (mddev->gendisk) 4327 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4328 align_bi, disk_devt(mddev->gendisk), 4329 raid_bio->bi_iter.bi_sector); 4330 generic_make_request(align_bi); 4331 return 1; 4332 } else { 4333 rcu_read_unlock(); 4334 bio_put(align_bi); 4335 return 0; 4336 } 4337 } 4338 4339 /* __get_priority_stripe - get the next stripe to process 4340 * 4341 * Full stripe writes are allowed to pass preread active stripes up until 4342 * the bypass_threshold is exceeded. In general the bypass_count 4343 * increments when the handle_list is handled before the hold_list; however, it 4344 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 4345 * stripe with in flight i/o. The bypass_count will be reset when the 4346 * head of the hold_list has changed, i.e. the head was promoted to the 4347 * handle_list. 4348 */ 4349 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 4350 { 4351 struct stripe_head *sh = NULL, *tmp; 4352 struct list_head *handle_list = NULL; 4353 struct r5worker_group *wg = NULL; 4354 4355 if (conf->worker_cnt_per_group == 0) { 4356 handle_list = &conf->handle_list; 4357 } else if (group != ANY_GROUP) { 4358 handle_list = &conf->worker_groups[group].handle_list; 4359 wg = &conf->worker_groups[group]; 4360 } else { 4361 int i; 4362 for (i = 0; i < conf->group_cnt; i++) { 4363 handle_list = &conf->worker_groups[i].handle_list; 4364 wg = &conf->worker_groups[i]; 4365 if (!list_empty(handle_list)) 4366 break; 4367 } 4368 } 4369 4370 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4371 __func__, 4372 list_empty(handle_list) ? "empty" : "busy", 4373 list_empty(&conf->hold_list) ? 
"empty" : "busy", 4374 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4375 4376 if (!list_empty(handle_list)) { 4377 sh = list_entry(handle_list->next, typeof(*sh), lru); 4378 4379 if (list_empty(&conf->hold_list)) 4380 conf->bypass_count = 0; 4381 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 4382 if (conf->hold_list.next == conf->last_hold) 4383 conf->bypass_count++; 4384 else { 4385 conf->last_hold = conf->hold_list.next; 4386 conf->bypass_count -= conf->bypass_threshold; 4387 if (conf->bypass_count < 0) 4388 conf->bypass_count = 0; 4389 } 4390 } 4391 } else if (!list_empty(&conf->hold_list) && 4392 ((conf->bypass_threshold && 4393 conf->bypass_count > conf->bypass_threshold) || 4394 atomic_read(&conf->pending_full_writes) == 0)) { 4395 4396 list_for_each_entry(tmp, &conf->hold_list, lru) { 4397 if (conf->worker_cnt_per_group == 0 || 4398 group == ANY_GROUP || 4399 !cpu_online(tmp->cpu) || 4400 cpu_to_group(tmp->cpu) == group) { 4401 sh = tmp; 4402 break; 4403 } 4404 } 4405 4406 if (sh) { 4407 conf->bypass_count -= conf->bypass_threshold; 4408 if (conf->bypass_count < 0) 4409 conf->bypass_count = 0; 4410 } 4411 wg = NULL; 4412 } 4413 4414 if (!sh) 4415 return NULL; 4416 4417 if (wg) { 4418 wg->stripes_cnt--; 4419 sh->group = NULL; 4420 } 4421 list_del_init(&sh->lru); 4422 BUG_ON(atomic_inc_return(&sh->count) != 1); 4423 return sh; 4424 } 4425 4426 struct raid5_plug_cb { 4427 struct blk_plug_cb cb; 4428 struct list_head list; 4429 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 4430 }; 4431 4432 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4433 { 4434 struct raid5_plug_cb *cb = container_of( 4435 blk_cb, struct raid5_plug_cb, cb); 4436 struct stripe_head *sh; 4437 struct mddev *mddev = cb->cb.data; 4438 struct r5conf *conf = mddev->private; 4439 int cnt = 0; 4440 int hash; 4441 4442 if (cb->list.next && !list_empty(&cb->list)) { 4443 spin_lock_irq(&conf->device_lock); 4444 while (!list_empty(&cb->list)) { 4445 sh = list_first_entry(&cb->list, struct stripe_head, lru); 4446 list_del_init(&sh->lru); 4447 /* 4448 * avoid race release_stripe_plug() sees 4449 * STRIPE_ON_UNPLUG_LIST clear but the stripe 4450 * is still in our list 4451 */ 4452 smp_mb__before_atomic(); 4453 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4454 /* 4455 * STRIPE_ON_RELEASE_LIST could be set here. 
In that 4456 * case, the count is always > 1 here 4457 */ 4458 hash = sh->hash_lock_index; 4459 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 4460 cnt++; 4461 } 4462 spin_unlock_irq(&conf->device_lock); 4463 } 4464 release_inactive_stripe_list(conf, cb->temp_inactive_list, 4465 NR_STRIPE_HASH_LOCKS); 4466 if (mddev->queue) 4467 trace_block_unplug(mddev->queue, cnt, !from_schedule); 4468 kfree(cb); 4469 } 4470 4471 static void release_stripe_plug(struct mddev *mddev, 4472 struct stripe_head *sh) 4473 { 4474 struct blk_plug_cb *blk_cb = blk_check_plugged( 4475 raid5_unplug, mddev, 4476 sizeof(struct raid5_plug_cb)); 4477 struct raid5_plug_cb *cb; 4478 4479 if (!blk_cb) { 4480 release_stripe(sh); 4481 return; 4482 } 4483 4484 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 4485 4486 if (cb->list.next == NULL) { 4487 int i; 4488 INIT_LIST_HEAD(&cb->list); 4489 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 4490 INIT_LIST_HEAD(cb->temp_inactive_list + i); 4491 } 4492 4493 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 4494 list_add_tail(&sh->lru, &cb->list); 4495 else 4496 release_stripe(sh); 4497 } 4498 4499 static void make_discard_request(struct mddev *mddev, struct bio *bi) 4500 { 4501 struct r5conf *conf = mddev->private; 4502 sector_t logical_sector, last_sector; 4503 struct stripe_head *sh; 4504 int remaining; 4505 int stripe_sectors; 4506 4507 if (mddev->reshape_position != MaxSector) 4508 /* Skip discard while reshape is happening */ 4509 return; 4510 4511 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4512 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 4513 4514 bi->bi_next = NULL; 4515 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4516 4517 stripe_sectors = conf->chunk_sectors * 4518 (conf->raid_disks - conf->max_degraded); 4519 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 4520 stripe_sectors); 4521 sector_div(last_sector, stripe_sectors); 4522 4523 logical_sector *= conf->chunk_sectors; 4524 last_sector *= conf->chunk_sectors; 4525 4526 for (; logical_sector < last_sector; 4527 logical_sector += STRIPE_SECTORS) { 4528 DEFINE_WAIT(w); 4529 int d; 4530 again: 4531 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4532 prepare_to_wait(&conf->wait_for_overlap, &w, 4533 TASK_UNINTERRUPTIBLE); 4534 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4535 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4536 release_stripe(sh); 4537 schedule(); 4538 goto again; 4539 } 4540 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 4541 spin_lock_irq(&sh->stripe_lock); 4542 for (d = 0; d < conf->raid_disks; d++) { 4543 if (d == sh->pd_idx || d == sh->qd_idx) 4544 continue; 4545 if (sh->dev[d].towrite || sh->dev[d].toread) { 4546 set_bit(R5_Overlap, &sh->dev[d].flags); 4547 spin_unlock_irq(&sh->stripe_lock); 4548 release_stripe(sh); 4549 schedule(); 4550 goto again; 4551 } 4552 } 4553 set_bit(STRIPE_DISCARD, &sh->state); 4554 finish_wait(&conf->wait_for_overlap, &w); 4555 for (d = 0; d < conf->raid_disks; d++) { 4556 if (d == sh->pd_idx || d == sh->qd_idx) 4557 continue; 4558 sh->dev[d].towrite = bi; 4559 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 4560 raid5_inc_bi_active_stripes(bi); 4561 } 4562 spin_unlock_irq(&sh->stripe_lock); 4563 if (conf->mddev->bitmap) { 4564 for (d = 0; 4565 d < conf->raid_disks - conf->max_degraded; 4566 d++) 4567 bitmap_startwrite(mddev->bitmap, 4568 sh->sector, 4569 STRIPE_SECTORS, 4570 0); 4571 sh->bm_seq = conf->seq_flush + 1; 4572 set_bit(STRIPE_BIT_DELAY, &sh->state); 4573 
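	/*
	 * A rough worked example of the rounding done at the top of this
	 * function (the numbers are invented for illustration and assume
	 * 4KiB pages, i.e. STRIPE_SECTORS == 8):
	 *
	 *   6 drives, RAID-6, chunk_sectors = 128  =>  4 data disks, so
	 *   stripe_sectors = 128 * 4 = 512 sectors of logical space per
	 *   full stripe.
	 *
	 *   A discard of logical sectors [1000, 4000) becomes
	 *     start = DIV_ROUND_UP(1000, 512) * 128 = 2 * 128 = 256
	 *     end   = (4000 / 512) * 128           = 7 * 128 = 896
	 *   in per-device (sh->sector) terms, so this loop walks
	 *   sh->sector = 256, 264, ..., 888: only the five stripes wholly
	 *   inside the request are discarded, and the partial head and
	 *   tail of the range are left untouched.
	 */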
} 4574 4575 set_bit(STRIPE_HANDLE, &sh->state); 4576 clear_bit(STRIPE_DELAYED, &sh->state); 4577 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4578 atomic_inc(&conf->preread_active_stripes); 4579 release_stripe_plug(mddev, sh); 4580 } 4581 4582 remaining = raid5_dec_bi_active_stripes(bi); 4583 if (remaining == 0) { 4584 md_write_end(mddev); 4585 bio_endio(bi, 0); 4586 } 4587 } 4588 4589 static void make_request(struct mddev *mddev, struct bio * bi) 4590 { 4591 struct r5conf *conf = mddev->private; 4592 int dd_idx; 4593 sector_t new_sector; 4594 sector_t logical_sector, last_sector; 4595 struct stripe_head *sh; 4596 const int rw = bio_data_dir(bi); 4597 int remaining; 4598 DEFINE_WAIT(w); 4599 bool do_prepare; 4600 4601 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 4602 md_flush_request(mddev, bi); 4603 return; 4604 } 4605 4606 md_write_start(mddev, bi); 4607 4608 if (rw == READ && 4609 mddev->reshape_position == MaxSector && 4610 chunk_aligned_read(mddev,bi)) 4611 return; 4612 4613 if (unlikely(bi->bi_rw & REQ_DISCARD)) { 4614 make_discard_request(mddev, bi); 4615 return; 4616 } 4617 4618 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4619 last_sector = bio_end_sector(bi); 4620 bi->bi_next = NULL; 4621 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4622 4623 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4624 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4625 int previous; 4626 int seq; 4627 4628 do_prepare = false; 4629 retry: 4630 seq = read_seqcount_begin(&conf->gen_lock); 4631 previous = 0; 4632 if (do_prepare) 4633 prepare_to_wait(&conf->wait_for_overlap, &w, 4634 TASK_UNINTERRUPTIBLE); 4635 if (unlikely(conf->reshape_progress != MaxSector)) { 4636 /* spinlock is needed as reshape_progress may be 4637 * 64bit on a 32bit platform, and so it might be 4638 * possible to see a half-updated value 4639 * Of course reshape_progress could change after 4640 * the lock is dropped, so once we get a reference 4641 * to the stripe that we think it is, we will have 4642 * to check again. 4643 */ 4644 spin_lock_irq(&conf->device_lock); 4645 if (mddev->reshape_backwards 4646 ? logical_sector < conf->reshape_progress 4647 : logical_sector >= conf->reshape_progress) { 4648 previous = 1; 4649 } else { 4650 if (mddev->reshape_backwards 4651 ? logical_sector < conf->reshape_safe 4652 : logical_sector >= conf->reshape_safe) { 4653 spin_unlock_irq(&conf->device_lock); 4654 schedule(); 4655 do_prepare = true; 4656 goto retry; 4657 } 4658 } 4659 spin_unlock_irq(&conf->device_lock); 4660 } 4661 4662 new_sector = raid5_compute_sector(conf, logical_sector, 4663 previous, 4664 &dd_idx, NULL); 4665 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4666 (unsigned long long)new_sector, 4667 (unsigned long long)logical_sector); 4668 4669 sh = get_active_stripe(conf, new_sector, previous, 4670 (bi->bi_rw&RWA_MASK), 0); 4671 if (sh) { 4672 if (unlikely(previous)) { 4673 /* expansion might have moved on while waiting for a 4674 * stripe, so we must do the range check again. 4675 * Expansion could still move past after this 4676 * test, but as we are holding a reference to 4677 * 'sh', we know that if that happens, 4678 * STRIPE_EXPANDING will get set and the expansion 4679 * won't proceed until we finish with the stripe. 4680 */ 4681 int must_retry = 0; 4682 spin_lock_irq(&conf->device_lock); 4683 if (mddev->reshape_backwards 4684 ? 
logical_sector >= conf->reshape_progress 4685 : logical_sector < conf->reshape_progress) 4686 /* mismatch, need to try again */ 4687 must_retry = 1; 4688 spin_unlock_irq(&conf->device_lock); 4689 if (must_retry) { 4690 release_stripe(sh); 4691 schedule(); 4692 do_prepare = true; 4693 goto retry; 4694 } 4695 } 4696 if (read_seqcount_retry(&conf->gen_lock, seq)) { 4697 /* Might have got the wrong stripe_head 4698 * by accident 4699 */ 4700 release_stripe(sh); 4701 goto retry; 4702 } 4703 4704 if (rw == WRITE && 4705 logical_sector >= mddev->suspend_lo && 4706 logical_sector < mddev->suspend_hi) { 4707 release_stripe(sh); 4708 /* As the suspend_* range is controlled by 4709 * userspace, we want an interruptible 4710 * wait. 4711 */ 4712 flush_signals(current); 4713 prepare_to_wait(&conf->wait_for_overlap, 4714 &w, TASK_INTERRUPTIBLE); 4715 if (logical_sector >= mddev->suspend_lo && 4716 logical_sector < mddev->suspend_hi) { 4717 schedule(); 4718 do_prepare = true; 4719 } 4720 goto retry; 4721 } 4722 4723 if (test_bit(STRIPE_EXPANDING, &sh->state) || 4724 !add_stripe_bio(sh, bi, dd_idx, rw)) { 4725 /* Stripe is busy expanding or 4726 * add failed due to overlap. Flush everything 4727 * and wait a while 4728 */ 4729 md_wakeup_thread(mddev->thread); 4730 release_stripe(sh); 4731 schedule(); 4732 do_prepare = true; 4733 goto retry; 4734 } 4735 set_bit(STRIPE_HANDLE, &sh->state); 4736 clear_bit(STRIPE_DELAYED, &sh->state); 4737 if ((bi->bi_rw & REQ_SYNC) && 4738 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4739 atomic_inc(&conf->preread_active_stripes); 4740 release_stripe_plug(mddev, sh); 4741 } else { 4742 /* cannot get stripe for read-ahead, just give-up */ 4743 clear_bit(BIO_UPTODATE, &bi->bi_flags); 4744 break; 4745 } 4746 } 4747 finish_wait(&conf->wait_for_overlap, &w); 4748 4749 remaining = raid5_dec_bi_active_stripes(bi); 4750 if (remaining == 0) { 4751 4752 if ( rw == WRITE ) 4753 md_write_end(mddev); 4754 4755 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 4756 bi, 0); 4757 bio_endio(bi, 0); 4758 } 4759 } 4760 4761 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 4762 4763 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 4764 { 4765 /* reshaping is quite different to recovery/resync so it is 4766 * handled quite separately ... here. 4767 * 4768 * On each call to sync_request, we gather one chunk worth of 4769 * destination stripes and flag them as expanding. 4770 * Then we find all the source stripes and request reads. 4771 * As the reads complete, handle_stripe will copy the data 4772 * into the destination stripe and release that stripe. 
4773 */ 4774 struct r5conf *conf = mddev->private; 4775 struct stripe_head *sh; 4776 sector_t first_sector, last_sector; 4777 int raid_disks = conf->previous_raid_disks; 4778 int data_disks = raid_disks - conf->max_degraded; 4779 int new_data_disks = conf->raid_disks - conf->max_degraded; 4780 int i; 4781 int dd_idx; 4782 sector_t writepos, readpos, safepos; 4783 sector_t stripe_addr; 4784 int reshape_sectors; 4785 struct list_head stripes; 4786 4787 if (sector_nr == 0) { 4788 /* If restarting in the middle, skip the initial sectors */ 4789 if (mddev->reshape_backwards && 4790 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4791 sector_nr = raid5_size(mddev, 0, 0) 4792 - conf->reshape_progress; 4793 } else if (!mddev->reshape_backwards && 4794 conf->reshape_progress > 0) 4795 sector_nr = conf->reshape_progress; 4796 sector_div(sector_nr, new_data_disks); 4797 if (sector_nr) { 4798 mddev->curr_resync_completed = sector_nr; 4799 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4800 *skipped = 1; 4801 return sector_nr; 4802 } 4803 } 4804 4805 /* We need to process a full chunk at a time. 4806 * If old and new chunk sizes differ, we need to process the 4807 * largest of these 4808 */ 4809 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4810 reshape_sectors = mddev->new_chunk_sectors; 4811 else 4812 reshape_sectors = mddev->chunk_sectors; 4813 4814 /* We update the metadata at least every 10 seconds, or when 4815 * the data about to be copied would over-write the source of 4816 * the data at the front of the range. i.e. one new_stripe 4817 * along from reshape_progress new_maps to after where 4818 * reshape_safe old_maps to 4819 */ 4820 writepos = conf->reshape_progress; 4821 sector_div(writepos, new_data_disks); 4822 readpos = conf->reshape_progress; 4823 sector_div(readpos, data_disks); 4824 safepos = conf->reshape_safe; 4825 sector_div(safepos, data_disks); 4826 if (mddev->reshape_backwards) { 4827 writepos -= min_t(sector_t, reshape_sectors, writepos); 4828 readpos += reshape_sectors; 4829 safepos += reshape_sectors; 4830 } else { 4831 writepos += reshape_sectors; 4832 readpos -= min_t(sector_t, reshape_sectors, readpos); 4833 safepos -= min_t(sector_t, reshape_sectors, safepos); 4834 } 4835 4836 /* Having calculated the 'writepos' possibly use it 4837 * to set 'stripe_addr' which is where we will write to. 4838 */ 4839 if (mddev->reshape_backwards) { 4840 BUG_ON(conf->reshape_progress == 0); 4841 stripe_addr = writepos; 4842 BUG_ON((mddev->dev_sectors & 4843 ~((sector_t)reshape_sectors - 1)) 4844 - reshape_sectors - stripe_addr 4845 != sector_nr); 4846 } else { 4847 BUG_ON(writepos != sector_nr + reshape_sectors); 4848 stripe_addr = sector_nr; 4849 } 4850 4851 /* 'writepos' is the most advanced device address we might write. 4852 * 'readpos' is the least advanced device address we might read. 4853 * 'safepos' is the least address recorded in the metadata as having 4854 * been reshaped. 4855 * If there is a min_offset_diff, these are adjusted either by 4856 * increasing the safepos/readpos if diff is negative, or 4857 * increasing writepos if diff is positive. 4858 * If 'readpos' is then behind 'writepos', there is no way that we can 4859 * ensure safety in the face of a crash - that must be done by userspace 4860 * making a backup of the data. So in that case there is no particular 4861 * rush to update metadata. 
4862 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4863 * update the metadata to advance 'safepos' to match 'readpos' so that 4864 * we can be safe in the event of a crash. 4865 * So we insist on updating metadata if safepos is behind writepos and 4866 * readpos is beyond writepos. 4867 * In any case, update the metadata every 10 seconds. 4868 * Maybe that number should be configurable, but I'm not sure it is 4869 * worth it.... maybe it could be a multiple of safemode_delay??? 4870 */ 4871 if (conf->min_offset_diff < 0) { 4872 safepos += -conf->min_offset_diff; 4873 readpos += -conf->min_offset_diff; 4874 } else 4875 writepos += conf->min_offset_diff; 4876 4877 if ((mddev->reshape_backwards 4878 ? (safepos > writepos && readpos < writepos) 4879 : (safepos < writepos && readpos > writepos)) || 4880 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4881 /* Cannot proceed until we've updated the superblock... */ 4882 wait_event(conf->wait_for_overlap, 4883 atomic_read(&conf->reshape_stripes)==0 4884 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4885 if (atomic_read(&conf->reshape_stripes) != 0) 4886 return 0; 4887 mddev->reshape_position = conf->reshape_progress; 4888 mddev->curr_resync_completed = sector_nr; 4889 conf->reshape_checkpoint = jiffies; 4890 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4891 md_wakeup_thread(mddev->thread); 4892 wait_event(mddev->sb_wait, mddev->flags == 0 || 4893 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4894 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 4895 return 0; 4896 spin_lock_irq(&conf->device_lock); 4897 conf->reshape_safe = mddev->reshape_position; 4898 spin_unlock_irq(&conf->device_lock); 4899 wake_up(&conf->wait_for_overlap); 4900 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4901 } 4902 4903 INIT_LIST_HEAD(&stripes); 4904 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4905 int j; 4906 int skipped_disk = 0; 4907 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4908 set_bit(STRIPE_EXPANDING, &sh->state); 4909 atomic_inc(&conf->reshape_stripes); 4910 /* If any of this stripe is beyond the end of the old 4911 * array, then we need to zero those blocks 4912 */ 4913 for (j=sh->disks; j--;) { 4914 sector_t s; 4915 if (j == sh->pd_idx) 4916 continue; 4917 if (conf->level == 6 && 4918 j == sh->qd_idx) 4919 continue; 4920 s = compute_blocknr(sh, j, 0); 4921 if (s < raid5_size(mddev, 0, 0)) { 4922 skipped_disk = 1; 4923 continue; 4924 } 4925 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4926 set_bit(R5_Expanded, &sh->dev[j].flags); 4927 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4928 } 4929 if (!skipped_disk) { 4930 set_bit(STRIPE_EXPAND_READY, &sh->state); 4931 set_bit(STRIPE_HANDLE, &sh->state); 4932 } 4933 list_add(&sh->lru, &stripes); 4934 } 4935 spin_lock_irq(&conf->device_lock); 4936 if (mddev->reshape_backwards) 4937 conf->reshape_progress -= reshape_sectors * new_data_disks; 4938 else 4939 conf->reshape_progress += reshape_sectors * new_data_disks; 4940 spin_unlock_irq(&conf->device_lock); 4941 /* Ok, those stripe are ready. We can start scheduling 4942 * reads on the source stripes. 4943 * The source stripes are determined by mapping the first and last 4944 * block on the destination stripes. 
4945 */ 4946 first_sector = 4947 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4948 1, &dd_idx, NULL); 4949 last_sector = 4950 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4951 * new_data_disks - 1), 4952 1, &dd_idx, NULL); 4953 if (last_sector >= mddev->dev_sectors) 4954 last_sector = mddev->dev_sectors - 1; 4955 while (first_sector <= last_sector) { 4956 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4957 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4958 set_bit(STRIPE_HANDLE, &sh->state); 4959 release_stripe(sh); 4960 first_sector += STRIPE_SECTORS; 4961 } 4962 /* Now that the sources are clearly marked, we can release 4963 * the destination stripes 4964 */ 4965 while (!list_empty(&stripes)) { 4966 sh = list_entry(stripes.next, struct stripe_head, lru); 4967 list_del_init(&sh->lru); 4968 release_stripe(sh); 4969 } 4970 /* If this takes us to the resync_max point where we have to pause, 4971 * then we need to write out the superblock. 4972 */ 4973 sector_nr += reshape_sectors; 4974 if ((sector_nr - mddev->curr_resync_completed) * 2 4975 >= mddev->resync_max - mddev->curr_resync_completed) { 4976 /* Cannot proceed until we've updated the superblock... */ 4977 wait_event(conf->wait_for_overlap, 4978 atomic_read(&conf->reshape_stripes) == 0 4979 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4980 if (atomic_read(&conf->reshape_stripes) != 0) 4981 goto ret; 4982 mddev->reshape_position = conf->reshape_progress; 4983 mddev->curr_resync_completed = sector_nr; 4984 conf->reshape_checkpoint = jiffies; 4985 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4986 md_wakeup_thread(mddev->thread); 4987 wait_event(mddev->sb_wait, 4988 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4989 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 4990 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 4991 goto ret; 4992 spin_lock_irq(&conf->device_lock); 4993 conf->reshape_safe = mddev->reshape_position; 4994 spin_unlock_irq(&conf->device_lock); 4995 wake_up(&conf->wait_for_overlap); 4996 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4997 } 4998 ret: 4999 return reshape_sectors; 5000 } 5001 5002 /* FIXME go_faster isn't used */ 5003 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) 5004 { 5005 struct r5conf *conf = mddev->private; 5006 struct stripe_head *sh; 5007 sector_t max_sector = mddev->dev_sectors; 5008 sector_t sync_blocks; 5009 int still_degraded = 0; 5010 int i; 5011 5012 if (sector_nr >= max_sector) { 5013 /* just being told to finish up .. 
nothing much to do */ 5014 5015 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 5016 end_reshape(conf); 5017 return 0; 5018 } 5019 5020 if (mddev->curr_resync < max_sector) /* aborted */ 5021 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 5022 &sync_blocks, 1); 5023 else /* completed sync */ 5024 conf->fullsync = 0; 5025 bitmap_close_sync(mddev->bitmap); 5026 5027 return 0; 5028 } 5029 5030 /* Allow raid5_quiesce to complete */ 5031 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 5032 5033 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5034 return reshape_request(mddev, sector_nr, skipped); 5035 5036 /* No need to check resync_max as we never do more than one 5037 * stripe, and as resync_max will always be on a chunk boundary, 5038 * if the check in md_do_sync didn't fire, there is no chance 5039 * of overstepping resync_max here 5040 */ 5041 5042 /* if there is too many failed drives and we are trying 5043 * to resync, then assert that we are finished, because there is 5044 * nothing we can do. 5045 */ 5046 if (mddev->degraded >= conf->max_degraded && 5047 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5048 sector_t rv = mddev->dev_sectors - sector_nr; 5049 *skipped = 1; 5050 return rv; 5051 } 5052 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 5053 !conf->fullsync && 5054 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 5055 sync_blocks >= STRIPE_SECTORS) { 5056 /* we can skip this block, and probably more */ 5057 sync_blocks /= STRIPE_SECTORS; 5058 *skipped = 1; 5059 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 5060 } 5061 5062 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 5063 5064 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 5065 if (sh == NULL) { 5066 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 5067 /* make sure we don't swamp the stripe cache if someone else 5068 * is trying to get access 5069 */ 5070 schedule_timeout_uninterruptible(1); 5071 } 5072 /* Need to check if array will still be degraded after recovery/resync 5073 * We don't need to check the 'failed' flag as when that gets set, 5074 * recovery aborts. 5075 */ 5076 for (i = 0; i < conf->raid_disks; i++) 5077 if (conf->disks[i].rdev == NULL) 5078 still_degraded = 1; 5079 5080 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 5081 5082 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 5083 set_bit(STRIPE_HANDLE, &sh->state); 5084 5085 release_stripe(sh); 5086 5087 return STRIPE_SECTORS; 5088 } 5089 5090 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 5091 { 5092 /* We may not be able to submit a whole bio at once as there 5093 * may not be enough stripe_heads available. 5094 * We cannot pre-allocate enough stripe_heads as we may need 5095 * more than exist in the cache (if we allow ever large chunks). 5096 * So we do one stripe head at a time and record in 5097 * ->bi_hw_segments how many have been done. 5098 * 5099 * We *know* that this entire raid_bio is in one chunk, so 5100 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
5101 */ 5102 struct stripe_head *sh; 5103 int dd_idx; 5104 sector_t sector, logical_sector, last_sector; 5105 int scnt = 0; 5106 int remaining; 5107 int handled = 0; 5108 5109 logical_sector = raid_bio->bi_iter.bi_sector & 5110 ~((sector_t)STRIPE_SECTORS-1); 5111 sector = raid5_compute_sector(conf, logical_sector, 5112 0, &dd_idx, NULL); 5113 last_sector = bio_end_sector(raid_bio); 5114 5115 for (; logical_sector < last_sector; 5116 logical_sector += STRIPE_SECTORS, 5117 sector += STRIPE_SECTORS, 5118 scnt++) { 5119 5120 if (scnt < raid5_bi_processed_stripes(raid_bio)) 5121 /* already done this stripe */ 5122 continue; 5123 5124 sh = get_active_stripe(conf, sector, 0, 1, 1); 5125 5126 if (!sh) { 5127 /* failed to get a stripe - must wait */ 5128 raid5_set_bi_processed_stripes(raid_bio, scnt); 5129 conf->retry_read_aligned = raid_bio; 5130 return handled; 5131 } 5132 5133 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 5134 release_stripe(sh); 5135 raid5_set_bi_processed_stripes(raid_bio, scnt); 5136 conf->retry_read_aligned = raid_bio; 5137 return handled; 5138 } 5139 5140 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 5141 handle_stripe(sh); 5142 release_stripe(sh); 5143 handled++; 5144 } 5145 remaining = raid5_dec_bi_active_stripes(raid_bio); 5146 if (remaining == 0) { 5147 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 5148 raid_bio, 0); 5149 bio_endio(raid_bio, 0); 5150 } 5151 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5152 wake_up(&conf->wait_for_stripe); 5153 return handled; 5154 } 5155 5156 static int handle_active_stripes(struct r5conf *conf, int group, 5157 struct r5worker *worker, 5158 struct list_head *temp_inactive_list) 5159 { 5160 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5161 int i, batch_size = 0, hash; 5162 bool release_inactive = false; 5163 5164 while (batch_size < MAX_STRIPE_BATCH && 5165 (sh = __get_priority_stripe(conf, group)) != NULL) 5166 batch[batch_size++] = sh; 5167 5168 if (batch_size == 0) { 5169 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5170 if (!list_empty(temp_inactive_list + i)) 5171 break; 5172 if (i == NR_STRIPE_HASH_LOCKS) 5173 return batch_size; 5174 release_inactive = true; 5175 } 5176 spin_unlock_irq(&conf->device_lock); 5177 5178 release_inactive_stripe_list(conf, temp_inactive_list, 5179 NR_STRIPE_HASH_LOCKS); 5180 5181 if (release_inactive) { 5182 spin_lock_irq(&conf->device_lock); 5183 return 0; 5184 } 5185 5186 for (i = 0; i < batch_size; i++) 5187 handle_stripe(batch[i]); 5188 5189 cond_resched(); 5190 5191 spin_lock_irq(&conf->device_lock); 5192 for (i = 0; i < batch_size; i++) { 5193 hash = batch[i]->hash_lock_index; 5194 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 5195 } 5196 return batch_size; 5197 } 5198 5199 static void raid5_do_work(struct work_struct *work) 5200 { 5201 struct r5worker *worker = container_of(work, struct r5worker, work); 5202 struct r5worker_group *group = worker->group; 5203 struct r5conf *conf = group->conf; 5204 int group_id = group - conf->worker_groups; 5205 int handled; 5206 struct blk_plug plug; 5207 5208 pr_debug("+++ raid5worker active\n"); 5209 5210 blk_start_plug(&plug); 5211 handled = 0; 5212 spin_lock_irq(&conf->device_lock); 5213 while (1) { 5214 int batch_size, released; 5215 5216 released = release_stripe_list(conf, worker->temp_inactive_list); 5217 5218 batch_size = handle_active_stripes(conf, group_id, worker, 5219 worker->temp_inactive_list); 5220 worker->working = false; 5221 if (!batch_size && !released) 5222 break; 5223 handled += batch_size; 5224 
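	/*
	 * Informal note on the loop above (a summary, not a specification):
	 * each pass first moves any stripes parked on the lockless release
	 * list back onto the normal lists, then pulls up to MAX_STRIPE_BATCH
	 * stripes belonging to this worker's group and handles them with
	 * device_lock dropped.  The worker only exits once a pass neither
	 * released nor handled anything, so work queued while a batch was
	 * in flight is picked up before the thread goes idle.
	 */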
} 5225 pr_debug("%d stripes handled\n", handled); 5226 5227 spin_unlock_irq(&conf->device_lock); 5228 blk_finish_plug(&plug); 5229 5230 pr_debug("--- raid5worker inactive\n"); 5231 } 5232 5233 /* 5234 * This is our raid5 kernel thread. 5235 * 5236 * We scan the hash table for stripes which can be handled now. 5237 * During the scan, completed stripes are saved for us by the interrupt 5238 * handler, so that they will not have to wait for our next wakeup. 5239 */ 5240 static void raid5d(struct md_thread *thread) 5241 { 5242 struct mddev *mddev = thread->mddev; 5243 struct r5conf *conf = mddev->private; 5244 int handled; 5245 struct blk_plug plug; 5246 5247 pr_debug("+++ raid5d active\n"); 5248 5249 md_check_recovery(mddev); 5250 5251 blk_start_plug(&plug); 5252 handled = 0; 5253 spin_lock_irq(&conf->device_lock); 5254 while (1) { 5255 struct bio *bio; 5256 int batch_size, released; 5257 5258 released = release_stripe_list(conf, conf->temp_inactive_list); 5259 5260 if ( 5261 !list_empty(&conf->bitmap_list)) { 5262 /* Now is a good time to flush some bitmap updates */ 5263 conf->seq_flush++; 5264 spin_unlock_irq(&conf->device_lock); 5265 bitmap_unplug(mddev->bitmap); 5266 spin_lock_irq(&conf->device_lock); 5267 conf->seq_write = conf->seq_flush; 5268 activate_bit_delay(conf, conf->temp_inactive_list); 5269 } 5270 raid5_activate_delayed(conf); 5271 5272 while ((bio = remove_bio_from_retry(conf))) { 5273 int ok; 5274 spin_unlock_irq(&conf->device_lock); 5275 ok = retry_aligned_read(conf, bio); 5276 spin_lock_irq(&conf->device_lock); 5277 if (!ok) 5278 break; 5279 handled++; 5280 } 5281 5282 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 5283 conf->temp_inactive_list); 5284 if (!batch_size && !released) 5285 break; 5286 handled += batch_size; 5287 5288 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 5289 spin_unlock_irq(&conf->device_lock); 5290 md_check_recovery(mddev); 5291 spin_lock_irq(&conf->device_lock); 5292 } 5293 } 5294 pr_debug("%d stripes handled\n", handled); 5295 5296 spin_unlock_irq(&conf->device_lock); 5297 5298 async_tx_issue_pending_all(); 5299 blk_finish_plug(&plug); 5300 5301 pr_debug("--- raid5d inactive\n"); 5302 } 5303 5304 static ssize_t 5305 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 5306 { 5307 struct r5conf *conf = mddev->private; 5308 if (conf) 5309 return sprintf(page, "%d\n", conf->max_nr_stripes); 5310 else 5311 return 0; 5312 } 5313 5314 int 5315 raid5_set_cache_size(struct mddev *mddev, int size) 5316 { 5317 struct r5conf *conf = mddev->private; 5318 int err; 5319 int hash; 5320 5321 if (size <= 16 || size > 32768) 5322 return -EINVAL; 5323 hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; 5324 while (size < conf->max_nr_stripes) { 5325 if (drop_one_stripe(conf, hash)) 5326 conf->max_nr_stripes--; 5327 else 5328 break; 5329 hash--; 5330 if (hash < 0) 5331 hash = NR_STRIPE_HASH_LOCKS - 1; 5332 } 5333 err = md_allow_write(mddev); 5334 if (err) 5335 return err; 5336 hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 5337 while (size > conf->max_nr_stripes) { 5338 if (grow_one_stripe(conf, hash)) 5339 conf->max_nr_stripes++; 5340 else break; 5341 hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; 5342 } 5343 return 0; 5344 } 5345 EXPORT_SYMBOL(raid5_set_cache_size); 5346 5347 static ssize_t 5348 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 5349 { 5350 struct r5conf *conf = mddev->private; 5351 unsigned long new; 5352 int err; 5353 5354 if (len >= PAGE_SIZE) 5355 return -EINVAL; 5356 if (!conf) 5357 return 
-ENODEV; 5358 5359 if (kstrtoul(page, 10, &new)) 5360 return -EINVAL; 5361 err = raid5_set_cache_size(mddev, new); 5362 if (err) 5363 return err; 5364 return len; 5365 } 5366 5367 static struct md_sysfs_entry 5368 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 5369 raid5_show_stripe_cache_size, 5370 raid5_store_stripe_cache_size); 5371 5372 static ssize_t 5373 raid5_show_preread_threshold(struct mddev *mddev, char *page) 5374 { 5375 struct r5conf *conf = mddev->private; 5376 if (conf) 5377 return sprintf(page, "%d\n", conf->bypass_threshold); 5378 else 5379 return 0; 5380 } 5381 5382 static ssize_t 5383 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 5384 { 5385 struct r5conf *conf = mddev->private; 5386 unsigned long new; 5387 if (len >= PAGE_SIZE) 5388 return -EINVAL; 5389 if (!conf) 5390 return -ENODEV; 5391 5392 if (kstrtoul(page, 10, &new)) 5393 return -EINVAL; 5394 if (new > conf->max_nr_stripes) 5395 return -EINVAL; 5396 conf->bypass_threshold = new; 5397 return len; 5398 } 5399 5400 static struct md_sysfs_entry 5401 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 5402 S_IRUGO | S_IWUSR, 5403 raid5_show_preread_threshold, 5404 raid5_store_preread_threshold); 5405 5406 static ssize_t 5407 raid5_show_skip_copy(struct mddev *mddev, char *page) 5408 { 5409 struct r5conf *conf = mddev->private; 5410 if (conf) 5411 return sprintf(page, "%d\n", conf->skip_copy); 5412 else 5413 return 0; 5414 } 5415 5416 static ssize_t 5417 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 5418 { 5419 struct r5conf *conf = mddev->private; 5420 unsigned long new; 5421 if (len >= PAGE_SIZE) 5422 return -EINVAL; 5423 if (!conf) 5424 return -ENODEV; 5425 5426 if (kstrtoul(page, 10, &new)) 5427 return -EINVAL; 5428 new = !!new; 5429 if (new == conf->skip_copy) 5430 return len; 5431 5432 mddev_suspend(mddev); 5433 conf->skip_copy = new; 5434 if (new) 5435 mddev->queue->backing_dev_info.capabilities |= 5436 BDI_CAP_STABLE_WRITES; 5437 else 5438 mddev->queue->backing_dev_info.capabilities &= 5439 ~BDI_CAP_STABLE_WRITES; 5440 mddev_resume(mddev); 5441 return len; 5442 } 5443 5444 static struct md_sysfs_entry 5445 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 5446 raid5_show_skip_copy, 5447 raid5_store_skip_copy); 5448 5449 5450 static ssize_t 5451 stripe_cache_active_show(struct mddev *mddev, char *page) 5452 { 5453 struct r5conf *conf = mddev->private; 5454 if (conf) 5455 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 5456 else 5457 return 0; 5458 } 5459 5460 static struct md_sysfs_entry 5461 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 5462 5463 static ssize_t 5464 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 5465 { 5466 struct r5conf *conf = mddev->private; 5467 if (conf) 5468 return sprintf(page, "%d\n", conf->worker_cnt_per_group); 5469 else 5470 return 0; 5471 } 5472 5473 static int alloc_thread_groups(struct r5conf *conf, int cnt, 5474 int *group_cnt, 5475 int *worker_cnt_per_group, 5476 struct r5worker_group **worker_groups); 5477 static ssize_t 5478 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 5479 { 5480 struct r5conf *conf = mddev->private; 5481 unsigned long new; 5482 int err; 5483 struct r5worker_group *new_groups, *old_groups; 5484 int group_cnt, worker_cnt_per_group; 5485 5486 if (len >= PAGE_SIZE) 5487 return -EINVAL; 5488 if (!conf) 5489 return -ENODEV; 5490 5491 if (kstrtoul(page, 10, &new)) 5492 return -EINVAL; 
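	/*
	 * What follows, sketched briefly for readability: if the requested
	 * count actually changes, the array is suspended, raid5_wq is
	 * flushed so no worker can still be using the old groups, a fresh
	 * set of groups is allocated, the new pointers are swapped in under
	 * device_lock, and only then is the old allocation freed and the
	 * array resumed.
	 */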
5493 5494 if (new == conf->worker_cnt_per_group) 5495 return len; 5496 5497 mddev_suspend(mddev); 5498 5499 old_groups = conf->worker_groups; 5500 if (old_groups) 5501 flush_workqueue(raid5_wq); 5502 5503 err = alloc_thread_groups(conf, new, 5504 &group_cnt, &worker_cnt_per_group, 5505 &new_groups); 5506 if (!err) { 5507 spin_lock_irq(&conf->device_lock); 5508 conf->group_cnt = group_cnt; 5509 conf->worker_cnt_per_group = worker_cnt_per_group; 5510 conf->worker_groups = new_groups; 5511 spin_unlock_irq(&conf->device_lock); 5512 5513 if (old_groups) 5514 kfree(old_groups[0].workers); 5515 kfree(old_groups); 5516 } 5517 5518 mddev_resume(mddev); 5519 5520 if (err) 5521 return err; 5522 return len; 5523 } 5524 5525 static struct md_sysfs_entry 5526 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 5527 raid5_show_group_thread_cnt, 5528 raid5_store_group_thread_cnt); 5529 5530 static struct attribute *raid5_attrs[] = { 5531 &raid5_stripecache_size.attr, 5532 &raid5_stripecache_active.attr, 5533 &raid5_preread_bypass_threshold.attr, 5534 &raid5_group_thread_cnt.attr, 5535 &raid5_skip_copy.attr, 5536 NULL, 5537 }; 5538 static struct attribute_group raid5_attrs_group = { 5539 .name = NULL, 5540 .attrs = raid5_attrs, 5541 }; 5542 5543 static int alloc_thread_groups(struct r5conf *conf, int cnt, 5544 int *group_cnt, 5545 int *worker_cnt_per_group, 5546 struct r5worker_group **worker_groups) 5547 { 5548 int i, j, k; 5549 ssize_t size; 5550 struct r5worker *workers; 5551 5552 *worker_cnt_per_group = cnt; 5553 if (cnt == 0) { 5554 *group_cnt = 0; 5555 *worker_groups = NULL; 5556 return 0; 5557 } 5558 *group_cnt = num_possible_nodes(); 5559 size = sizeof(struct r5worker) * cnt; 5560 workers = kzalloc(size * *group_cnt, GFP_NOIO); 5561 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 5562 *group_cnt, GFP_NOIO); 5563 if (!*worker_groups || !workers) { 5564 kfree(workers); 5565 kfree(*worker_groups); 5566 return -ENOMEM; 5567 } 5568 5569 for (i = 0; i < *group_cnt; i++) { 5570 struct r5worker_group *group; 5571 5572 group = &(*worker_groups)[i]; 5573 INIT_LIST_HEAD(&group->handle_list); 5574 group->conf = conf; 5575 group->workers = workers + i * cnt; 5576 5577 for (j = 0; j < cnt; j++) { 5578 struct r5worker *worker = group->workers + j; 5579 worker->group = group; 5580 INIT_WORK(&worker->work, raid5_do_work); 5581 5582 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 5583 INIT_LIST_HEAD(worker->temp_inactive_list + k); 5584 } 5585 } 5586 5587 return 0; 5588 } 5589 5590 static void free_thread_groups(struct r5conf *conf) 5591 { 5592 if (conf->worker_groups) 5593 kfree(conf->worker_groups[0].workers); 5594 kfree(conf->worker_groups); 5595 conf->worker_groups = NULL; 5596 } 5597 5598 static sector_t 5599 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 5600 { 5601 struct r5conf *conf = mddev->private; 5602 5603 if (!sectors) 5604 sectors = mddev->dev_sectors; 5605 if (!raid_disks) 5606 /* size is defined by the smallest of previous and new size */ 5607 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 5608 5609 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5610 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 5611 return sectors * (raid_disks - conf->max_degraded); 5612 } 5613 5614 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 5615 { 5616 safe_put_page(percpu->spare_page); 5617 kfree(percpu->scribble); 5618 percpu->spare_page = NULL; 5619 percpu->scribble = NULL; 5620 } 5621 5622 static int 
alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 5623 { 5624 if (conf->level == 6 && !percpu->spare_page) 5625 percpu->spare_page = alloc_page(GFP_KERNEL); 5626 if (!percpu->scribble) 5627 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 5628 5629 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 5630 free_scratch_buffer(conf, percpu); 5631 return -ENOMEM; 5632 } 5633 5634 return 0; 5635 } 5636 5637 static void raid5_free_percpu(struct r5conf *conf) 5638 { 5639 unsigned long cpu; 5640 5641 if (!conf->percpu) 5642 return; 5643 5644 #ifdef CONFIG_HOTPLUG_CPU 5645 unregister_cpu_notifier(&conf->cpu_notify); 5646 #endif 5647 5648 get_online_cpus(); 5649 for_each_possible_cpu(cpu) 5650 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 5651 put_online_cpus(); 5652 5653 free_percpu(conf->percpu); 5654 } 5655 5656 static void free_conf(struct r5conf *conf) 5657 { 5658 free_thread_groups(conf); 5659 shrink_stripes(conf); 5660 raid5_free_percpu(conf); 5661 kfree(conf->disks); 5662 kfree(conf->stripe_hashtbl); 5663 kfree(conf); 5664 } 5665 5666 #ifdef CONFIG_HOTPLUG_CPU 5667 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 5668 void *hcpu) 5669 { 5670 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 5671 long cpu = (long)hcpu; 5672 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 5673 5674 switch (action) { 5675 case CPU_UP_PREPARE: 5676 case CPU_UP_PREPARE_FROZEN: 5677 if (alloc_scratch_buffer(conf, percpu)) { 5678 pr_err("%s: failed memory allocation for cpu%ld\n", 5679 __func__, cpu); 5680 return notifier_from_errno(-ENOMEM); 5681 } 5682 break; 5683 case CPU_DEAD: 5684 case CPU_DEAD_FROZEN: 5685 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 5686 break; 5687 default: 5688 break; 5689 } 5690 return NOTIFY_OK; 5691 } 5692 #endif 5693 5694 static int raid5_alloc_percpu(struct r5conf *conf) 5695 { 5696 unsigned long cpu; 5697 int err = 0; 5698 5699 conf->percpu = alloc_percpu(struct raid5_percpu); 5700 if (!conf->percpu) 5701 return -ENOMEM; 5702 5703 #ifdef CONFIG_HOTPLUG_CPU 5704 conf->cpu_notify.notifier_call = raid456_cpu_notify; 5705 conf->cpu_notify.priority = 0; 5706 err = register_cpu_notifier(&conf->cpu_notify); 5707 if (err) 5708 return err; 5709 #endif 5710 5711 get_online_cpus(); 5712 for_each_present_cpu(cpu) { 5713 err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 5714 if (err) { 5715 pr_err("%s: failed memory allocation for cpu%ld\n", 5716 __func__, cpu); 5717 break; 5718 } 5719 } 5720 put_online_cpus(); 5721 5722 return err; 5723 } 5724 5725 static struct r5conf *setup_conf(struct mddev *mddev) 5726 { 5727 struct r5conf *conf; 5728 int raid_disk, memory, max_disks; 5729 struct md_rdev *rdev; 5730 struct disk_info *disk; 5731 char pers_name[6]; 5732 int i; 5733 int group_cnt, worker_cnt_per_group; 5734 struct r5worker_group *new_group; 5735 5736 if (mddev->new_level != 5 5737 && mddev->new_level != 4 5738 && mddev->new_level != 6) { 5739 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 5740 mdname(mddev), mddev->new_level); 5741 return ERR_PTR(-EIO); 5742 } 5743 if ((mddev->new_level == 5 5744 && !algorithm_valid_raid5(mddev->new_layout)) || 5745 (mddev->new_level == 6 5746 && !algorithm_valid_raid6(mddev->new_layout))) { 5747 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 5748 mdname(mddev), mddev->new_layout); 5749 return ERR_PTR(-EIO); 5750 } 5751 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 5752 
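		/*
		 * RAID-6 keeps two parity blocks (P and Q) in every stripe,
		 * so an array with fewer than 4 members would be left with
		 * fewer than two data blocks per stripe.
		 */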
printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 5753 mdname(mddev), mddev->raid_disks); 5754 return ERR_PTR(-EINVAL); 5755 } 5756 5757 if (!mddev->new_chunk_sectors || 5758 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 5759 !is_power_of_2(mddev->new_chunk_sectors)) { 5760 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 5761 mdname(mddev), mddev->new_chunk_sectors << 9); 5762 return ERR_PTR(-EINVAL); 5763 } 5764 5765 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 5766 if (conf == NULL) 5767 goto abort; 5768 /* Don't enable multi-threading by default*/ 5769 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 5770 &new_group)) { 5771 conf->group_cnt = group_cnt; 5772 conf->worker_cnt_per_group = worker_cnt_per_group; 5773 conf->worker_groups = new_group; 5774 } else 5775 goto abort; 5776 spin_lock_init(&conf->device_lock); 5777 seqcount_init(&conf->gen_lock); 5778 init_waitqueue_head(&conf->wait_for_stripe); 5779 init_waitqueue_head(&conf->wait_for_overlap); 5780 INIT_LIST_HEAD(&conf->handle_list); 5781 INIT_LIST_HEAD(&conf->hold_list); 5782 INIT_LIST_HEAD(&conf->delayed_list); 5783 INIT_LIST_HEAD(&conf->bitmap_list); 5784 init_llist_head(&conf->released_stripes); 5785 atomic_set(&conf->active_stripes, 0); 5786 atomic_set(&conf->preread_active_stripes, 0); 5787 atomic_set(&conf->active_aligned_reads, 0); 5788 conf->bypass_threshold = BYPASS_THRESHOLD; 5789 conf->recovery_disabled = mddev->recovery_disabled - 1; 5790 5791 conf->raid_disks = mddev->raid_disks; 5792 if (mddev->reshape_position == MaxSector) 5793 conf->previous_raid_disks = mddev->raid_disks; 5794 else 5795 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 5796 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 5797 conf->scribble_len = scribble_len(max_disks); 5798 5799 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 5800 GFP_KERNEL); 5801 if (!conf->disks) 5802 goto abort; 5803 5804 conf->mddev = mddev; 5805 5806 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 5807 goto abort; 5808 5809 /* We init hash_locks[0] separately to that it can be used 5810 * as the reference lock in the spin_lock_nest_lock() call 5811 * in lock_all_device_hash_locks_irq in order to convince 5812 * lockdep that we know what we are doing. 
5813 */ 5814 spin_lock_init(conf->hash_locks); 5815 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 5816 spin_lock_init(conf->hash_locks + i); 5817 5818 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5819 INIT_LIST_HEAD(conf->inactive_list + i); 5820 5821 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5822 INIT_LIST_HEAD(conf->temp_inactive_list + i); 5823 5824 conf->level = mddev->new_level; 5825 if (raid5_alloc_percpu(conf) != 0) 5826 goto abort; 5827 5828 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 5829 5830 rdev_for_each(rdev, mddev) { 5831 raid_disk = rdev->raid_disk; 5832 if (raid_disk >= max_disks 5833 || raid_disk < 0) 5834 continue; 5835 disk = conf->disks + raid_disk; 5836 5837 if (test_bit(Replacement, &rdev->flags)) { 5838 if (disk->replacement) 5839 goto abort; 5840 disk->replacement = rdev; 5841 } else { 5842 if (disk->rdev) 5843 goto abort; 5844 disk->rdev = rdev; 5845 } 5846 5847 if (test_bit(In_sync, &rdev->flags)) { 5848 char b[BDEVNAME_SIZE]; 5849 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 5850 " disk %d\n", 5851 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 5852 } else if (rdev->saved_raid_disk != raid_disk) 5853 /* Cannot rely on bitmap to complete recovery */ 5854 conf->fullsync = 1; 5855 } 5856 5857 conf->chunk_sectors = mddev->new_chunk_sectors; 5858 conf->level = mddev->new_level; 5859 if (conf->level == 6) 5860 conf->max_degraded = 2; 5861 else 5862 conf->max_degraded = 1; 5863 conf->algorithm = mddev->new_layout; 5864 conf->reshape_progress = mddev->reshape_position; 5865 if (conf->reshape_progress != MaxSector) { 5866 conf->prev_chunk_sectors = mddev->chunk_sectors; 5867 conf->prev_algo = mddev->layout; 5868 } 5869 5870 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 5871 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 5872 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 5873 if (grow_stripes(conf, NR_STRIPES)) { 5874 printk(KERN_ERR 5875 "md/raid:%s: couldn't allocate %dkB for buffers\n", 5876 mdname(mddev), memory); 5877 goto abort; 5878 } else 5879 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 5880 mdname(mddev), memory); 5881 5882 sprintf(pers_name, "raid%d", mddev->new_level); 5883 conf->thread = md_register_thread(raid5d, mddev, pers_name); 5884 if (!conf->thread) { 5885 printk(KERN_ERR 5886 "md/raid:%s: couldn't allocate thread.\n", 5887 mdname(mddev)); 5888 goto abort; 5889 } 5890 5891 return conf; 5892 5893 abort: 5894 if (conf) { 5895 free_conf(conf); 5896 return ERR_PTR(-EIO); 5897 } else 5898 return ERR_PTR(-ENOMEM); 5899 } 5900 5901 5902 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 5903 { 5904 switch (algo) { 5905 case ALGORITHM_PARITY_0: 5906 if (raid_disk < max_degraded) 5907 return 1; 5908 break; 5909 case ALGORITHM_PARITY_N: 5910 if (raid_disk >= raid_disks - max_degraded) 5911 return 1; 5912 break; 5913 case ALGORITHM_PARITY_0_6: 5914 if (raid_disk == 0 || 5915 raid_disk == raid_disks - 1) 5916 return 1; 5917 break; 5918 case ALGORITHM_LEFT_ASYMMETRIC_6: 5919 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5920 case ALGORITHM_LEFT_SYMMETRIC_6: 5921 case ALGORITHM_RIGHT_SYMMETRIC_6: 5922 if (raid_disk == raid_disks - 1) 5923 return 1; 5924 } 5925 return 0; 5926 } 5927 5928 static int run(struct mddev *mddev) 5929 { 5930 struct r5conf *conf; 5931 int working_disks = 0; 5932 int dirty_parity_disks = 0; 5933 struct md_rdev *rdev; 5934 sector_t reshape_offset = 0; 5935 int i; 5936 long long min_offset_diff = 0; 5937 int first = 1; 5938 5939 if (mddev->recovery_cp 
!= MaxSector) 5940 printk(KERN_NOTICE "md/raid:%s: not clean" 5941 " -- starting background reconstruction\n", 5942 mdname(mddev)); 5943 5944 rdev_for_each(rdev, mddev) { 5945 long long diff; 5946 if (rdev->raid_disk < 0) 5947 continue; 5948 diff = (rdev->new_data_offset - rdev->data_offset); 5949 if (first) { 5950 min_offset_diff = diff; 5951 first = 0; 5952 } else if (mddev->reshape_backwards && 5953 diff < min_offset_diff) 5954 min_offset_diff = diff; 5955 else if (!mddev->reshape_backwards && 5956 diff > min_offset_diff) 5957 min_offset_diff = diff; 5958 } 5959 5960 if (mddev->reshape_position != MaxSector) { 5961 /* Check that we can continue the reshape. 5962 * Difficulties arise if the stripe we would write to 5963 * next is at or after the stripe we would read from next. 5964 * For a reshape that changes the number of devices, this 5965 * is only possible for a very short time, and mdadm makes 5966 * sure that time appears to have passed before assembling 5967 * the array. So we fail if that time hasn't passed. 5968 * For a reshape that keeps the number of devices the same 5969 * mdadm must be monitoring the reshape and keeping the 5970 * critical areas read-only and backed up. It will start 5971 * the array in read-only mode, so we check for that. 5972 */ 5973 sector_t here_new, here_old; 5974 int old_disks; 5975 int max_degraded = (mddev->level == 6 ? 2 : 1); 5976 5977 if (mddev->new_level != mddev->level) { 5978 printk(KERN_ERR "md/raid:%s: unsupported reshape " 5979 "required - aborting.\n", 5980 mdname(mddev)); 5981 return -EINVAL; 5982 } 5983 old_disks = mddev->raid_disks - mddev->delta_disks; 5984 /* reshape_position must be on a new-stripe boundary, and one 5985 * further up in new geometry must map after here in old 5986 * geometry. 5987 */ 5988 here_new = mddev->reshape_position; 5989 if (sector_div(here_new, mddev->new_chunk_sectors * 5990 (mddev->raid_disks - max_degraded))) { 5991 printk(KERN_ERR "md/raid:%s: reshape_position not " 5992 "on a stripe boundary\n", mdname(mddev)); 5993 return -EINVAL; 5994 } 5995 reshape_offset = here_new * mddev->new_chunk_sectors; 5996 /* here_new is the stripe we will write to */ 5997 here_old = mddev->reshape_position; 5998 sector_div(here_old, mddev->chunk_sectors * 5999 (old_disks-max_degraded)); 6000 /* here_old is the first stripe that we might need to read 6001 * from */ 6002 if (mddev->delta_disks == 0) { 6003 if ((here_new * mddev->new_chunk_sectors != 6004 here_old * mddev->chunk_sectors)) { 6005 printk(KERN_ERR "md/raid:%s: reshape position is" 6006 " confused - aborting\n", mdname(mddev)); 6007 return -EINVAL; 6008 } 6009 /* We cannot be sure it is safe to start an in-place 6010 * reshape. It is only safe if user-space is monitoring 6011 * and taking constant backups. 6012 * mdadm always starts a situation like this in 6013 * readonly mode so it can take control before 6014 * allowing any writes. So just check for that. 6015 */ 6016 if (abs(min_offset_diff) >= mddev->chunk_sectors && 6017 abs(min_offset_diff) >= mddev->new_chunk_sectors) 6018 /* not really in-place - so OK */; 6019 else if (mddev->ro == 0) { 6020 printk(KERN_ERR "md/raid:%s: in-place reshape " 6021 "must be started in read-only mode " 6022 "- aborting\n", 6023 mdname(mddev)); 6024 return -EINVAL; 6025 } 6026 } else if (mddev->reshape_backwards 6027 ?
(here_new * mddev->new_chunk_sectors + min_offset_diff <= 6028 here_old * mddev->chunk_sectors) 6029 : (here_new * mddev->new_chunk_sectors >= 6030 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 6031 /* Reading from the same stripe as writing to - bad */ 6032 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 6033 "auto-recovery - aborting.\n", 6034 mdname(mddev)); 6035 return -EINVAL; 6036 } 6037 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 6038 mdname(mddev)); 6039 /* OK, we should be able to continue; */ 6040 } else { 6041 BUG_ON(mddev->level != mddev->new_level); 6042 BUG_ON(mddev->layout != mddev->new_layout); 6043 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 6044 BUG_ON(mddev->delta_disks != 0); 6045 } 6046 6047 if (mddev->private == NULL) 6048 conf = setup_conf(mddev); 6049 else 6050 conf = mddev->private; 6051 6052 if (IS_ERR(conf)) 6053 return PTR_ERR(conf); 6054 6055 conf->min_offset_diff = min_offset_diff; 6056 mddev->thread = conf->thread; 6057 conf->thread = NULL; 6058 mddev->private = conf; 6059 6060 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 6061 i++) { 6062 rdev = conf->disks[i].rdev; 6063 if (!rdev && conf->disks[i].replacement) { 6064 /* The replacement is all we have yet */ 6065 rdev = conf->disks[i].replacement; 6066 conf->disks[i].replacement = NULL; 6067 clear_bit(Replacement, &rdev->flags); 6068 conf->disks[i].rdev = rdev; 6069 } 6070 if (!rdev) 6071 continue; 6072 if (conf->disks[i].replacement && 6073 conf->reshape_progress != MaxSector) { 6074 /* replacements and reshape simply do not mix. */ 6075 printk(KERN_ERR "md: cannot handle concurrent " 6076 "replacement and reshape.\n"); 6077 goto abort; 6078 } 6079 if (test_bit(In_sync, &rdev->flags)) { 6080 working_disks++; 6081 continue; 6082 } 6083 /* This disk is not fully in-sync. However if it 6084 * just stored parity (beyond the recovery_offset), 6085 * then we don't need to be concerned about the 6086 * array being dirty. 6087 * When reshape goes 'backwards', we never have 6088 * partially completed devices, so we only need 6089 * to worry about reshape going forwards. 6090 */ 6091 /* Hack because v0.91 doesn't store recovery_offset properly. */ 6092 if (mddev->major_version == 0 && 6093 mddev->minor_version > 90) 6094 rdev->recovery_offset = reshape_offset; 6095 6096 if (rdev->recovery_offset < reshape_offset) { 6097 /* We need to check old and new layout */ 6098 if (!only_parity(rdev->raid_disk, 6099 conf->algorithm, 6100 conf->raid_disks, 6101 conf->max_degraded)) 6102 continue; 6103 } 6104 if (!only_parity(rdev->raid_disk, 6105 conf->prev_algo, 6106 conf->previous_raid_disks, 6107 conf->max_degraded)) 6108 continue; 6109 dirty_parity_disks++; 6110 } 6111 6112 /* 6113 * 0 for a fully functional array, 1 or 2 for a degraded array.
6114 */ 6115 mddev->degraded = calc_degraded(conf); 6116 6117 if (has_failed(conf)) { 6118 printk(KERN_ERR "md/raid:%s: not enough operational devices" 6119 " (%d/%d failed)\n", 6120 mdname(mddev), mddev->degraded, conf->raid_disks); 6121 goto abort; 6122 } 6123 6124 /* device size must be a multiple of chunk size */ 6125 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 6126 mddev->resync_max_sectors = mddev->dev_sectors; 6127 6128 if (mddev->degraded > dirty_parity_disks && 6129 mddev->recovery_cp != MaxSector) { 6130 if (mddev->ok_start_degraded) 6131 printk(KERN_WARNING 6132 "md/raid:%s: starting dirty degraded array" 6133 " - data corruption possible.\n", 6134 mdname(mddev)); 6135 else { 6136 printk(KERN_ERR 6137 "md/raid:%s: cannot start dirty degraded array.\n", 6138 mdname(mddev)); 6139 goto abort; 6140 } 6141 } 6142 6143 if (mddev->degraded == 0) 6144 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 6145 " devices, algorithm %d\n", mdname(mddev), conf->level, 6146 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 6147 mddev->new_layout); 6148 else 6149 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 6150 " out of %d devices, algorithm %d\n", 6151 mdname(mddev), conf->level, 6152 mddev->raid_disks - mddev->degraded, 6153 mddev->raid_disks, mddev->new_layout); 6154 6155 print_raid5_conf(conf); 6156 6157 if (conf->reshape_progress != MaxSector) { 6158 conf->reshape_safe = conf->reshape_progress; 6159 atomic_set(&conf->reshape_stripes, 0); 6160 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6161 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6162 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6163 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6164 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 6165 "reshape"); 6166 } 6167 6168 6169 /* Ok, everything is just fine now */ 6170 if (mddev->to_remove == &raid5_attrs_group) 6171 mddev->to_remove = NULL; 6172 else if (mddev->kobj.sd && 6173 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 6174 printk(KERN_WARNING 6175 "raid5: failed to create sysfs attributes for %s\n", 6176 mdname(mddev)); 6177 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 6178 6179 if (mddev->queue) { 6180 int chunk_size; 6181 bool discard_supported = true; 6182 /* read-ahead size must cover two whole stripes, which 6183 * is 2 * (datadisks) * chunksize where 'n' is the 6184 * number of raid devices 6185 */ 6186 int data_disks = conf->previous_raid_disks - conf->max_degraded; 6187 int stripe = data_disks * 6188 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 6189 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 6190 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 6191 6192 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 6193 6194 mddev->queue->backing_dev_info.congested_data = mddev; 6195 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 6196 6197 chunk_size = mddev->chunk_sectors << 9; 6198 blk_queue_io_min(mddev->queue, chunk_size); 6199 blk_queue_io_opt(mddev->queue, chunk_size * 6200 (conf->raid_disks - conf->max_degraded)); 6201 mddev->queue->limits.raid_partial_stripes_expensive = 1; 6202 /* 6203 * We can only discard a whole stripe. 
It doesn't make sense to 6204 * discard data disk but write parity disk 6205 */ 6206 stripe = stripe * PAGE_SIZE; 6207 /* Round up to power of 2, as discard handling 6208 * currently assumes that */ 6209 while ((stripe-1) & stripe) 6210 stripe = (stripe | (stripe-1)) + 1; 6211 mddev->queue->limits.discard_alignment = stripe; 6212 mddev->queue->limits.discard_granularity = stripe; 6213 /* 6214 * unaligned part of discard request will be ignored, so can't 6215 * guarantee discard_zeroes_data 6216 */ 6217 mddev->queue->limits.discard_zeroes_data = 0; 6218 6219 blk_queue_max_write_same_sectors(mddev->queue, 0); 6220 6221 rdev_for_each(rdev, mddev) { 6222 disk_stack_limits(mddev->gendisk, rdev->bdev, 6223 rdev->data_offset << 9); 6224 disk_stack_limits(mddev->gendisk, rdev->bdev, 6225 rdev->new_data_offset << 9); 6226 /* 6227 * discard_zeroes_data is required, otherwise data 6228 * could be lost. Consider a scenario: discard a stripe 6229 * (the stripe could be inconsistent if 6230 * discard_zeroes_data is 0); write one disk of the 6231 * stripe (the stripe could be inconsistent again 6232 * depending on which disks are used to calculate 6233 * parity); the disk is broken; The stripe data of this 6234 * disk is lost. 6235 */ 6236 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 6237 !bdev_get_queue(rdev->bdev)-> 6238 limits.discard_zeroes_data) 6239 discard_supported = false; 6240 /* Unfortunately, discard_zeroes_data is not currently 6241 * a guarantee - just a hint. So we only allow DISCARD 6242 * if the sysadmin has confirmed that only safe devices 6243 * are in use by setting a module parameter. 6244 */ 6245 if (!devices_handle_discard_safely) { 6246 if (discard_supported) { 6247 pr_info("md/raid456: discard support disabled due to uncertainty.\n"); 6248 pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); 6249 } 6250 discard_supported = false; 6251 } 6252 } 6253 6254 if (discard_supported && 6255 mddev->queue->limits.max_discard_sectors >= stripe && 6256 mddev->queue->limits.discard_granularity >= stripe) 6257 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 6258 mddev->queue); 6259 else 6260 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 6261 mddev->queue); 6262 } 6263 6264 return 0; 6265 abort: 6266 md_unregister_thread(&mddev->thread); 6267 print_raid5_conf(conf); 6268 free_conf(conf); 6269 mddev->private = NULL; 6270 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 6271 return -EIO; 6272 } 6273 6274 static int stop(struct mddev *mddev) 6275 { 6276 struct r5conf *conf = mddev->private; 6277 6278 md_unregister_thread(&mddev->thread); 6279 if (mddev->queue) 6280 mddev->queue->backing_dev_info.congested_fn = NULL; 6281 free_conf(conf); 6282 mddev->private = NULL; 6283 mddev->to_remove = &raid5_attrs_group; 6284 return 0; 6285 } 6286 6287 static void status(struct seq_file *seq, struct mddev *mddev) 6288 { 6289 struct r5conf *conf = mddev->private; 6290 int i; 6291 6292 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 6293 mddev->chunk_sectors / 2, mddev->layout); 6294 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 6295 for (i = 0; i < conf->raid_disks; i++) 6296 seq_printf (seq, "%s", 6297 conf->disks[i].rdev && 6298 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 6299 seq_printf (seq, "]"); 6300 } 6301 6302 static void print_raid5_conf (struct r5conf *conf) 6303 { 6304 int i; 6305 struct disk_info *tmp; 6306 6307 printk(KERN_DEBUG "RAID conf printout:\n"); 6308 if (!conf) { 6309 printk("(conf==NULL)\n"); 6310 return; 6311 } 6312 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 6313 conf->raid_disks, 6314 conf->raid_disks - conf->mddev->degraded); 6315 6316 for (i = 0; i < conf->raid_disks; i++) { 6317 char b[BDEVNAME_SIZE]; 6318 tmp = conf->disks + i; 6319 if (tmp->rdev) 6320 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 6321 i, !test_bit(Faulty, &tmp->rdev->flags), 6322 bdevname(tmp->rdev->bdev, b)); 6323 } 6324 } 6325 6326 static int raid5_spare_active(struct mddev *mddev) 6327 { 6328 int i; 6329 struct r5conf *conf = mddev->private; 6330 struct disk_info *tmp; 6331 int count = 0; 6332 unsigned long flags; 6333 6334 for (i = 0; i < conf->raid_disks; i++) { 6335 tmp = conf->disks + i; 6336 if (tmp->replacement 6337 && tmp->replacement->recovery_offset == MaxSector 6338 && !test_bit(Faulty, &tmp->replacement->flags) 6339 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 6340 /* Replacement has just become active. */ 6341 if (!tmp->rdev 6342 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 6343 count++; 6344 if (tmp->rdev) { 6345 /* Replaced device not technically faulty, 6346 * but we need to be sure it gets removed 6347 * and never re-added. 6348 */ 6349 set_bit(Faulty, &tmp->rdev->flags); 6350 sysfs_notify_dirent_safe( 6351 tmp->rdev->sysfs_state); 6352 } 6353 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 6354 } else if (tmp->rdev 6355 && tmp->rdev->recovery_offset == MaxSector 6356 && !test_bit(Faulty, &tmp->rdev->flags) 6357 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 6358 count++; 6359 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 6360 } 6361 } 6362 spin_lock_irqsave(&conf->device_lock, flags); 6363 mddev->degraded = calc_degraded(conf); 6364 spin_unlock_irqrestore(&conf->device_lock, flags); 6365 print_raid5_conf(conf); 6366 return count; 6367 } 6368 6369 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 6370 { 6371 struct r5conf *conf = mddev->private; 6372 int err = 0; 6373 int number = rdev->raid_disk; 6374 struct md_rdev **rdevp; 6375 struct disk_info *p = conf->disks + number; 6376 6377 print_raid5_conf(conf); 6378 if (rdev == p->rdev) 6379 rdevp = &p->rdev; 6380 else if (rdev == p->replacement) 6381 rdevp = &p->replacement; 6382 else 6383 return 0; 6384 6385 if (number >= conf->raid_disks && 6386 conf->reshape_progress == MaxSector) 6387 clear_bit(In_sync, &rdev->flags); 6388 6389 if (test_bit(In_sync, &rdev->flags) || 6390 atomic_read(&rdev->nr_pending)) { 6391 err = -EBUSY; 6392 goto abort; 6393 } 6394 /* Only remove non-faulty devices if recovery 6395 * isn't possible. 
6396 */ 6397 if (!test_bit(Faulty, &rdev->flags) && 6398 mddev->recovery_disabled != conf->recovery_disabled && 6399 !has_failed(conf) && 6400 (!p->replacement || p->replacement == rdev) && 6401 number < conf->raid_disks) { 6402 err = -EBUSY; 6403 goto abort; 6404 } 6405 *rdevp = NULL; 6406 synchronize_rcu(); 6407 if (atomic_read(&rdev->nr_pending)) { 6408 /* lost the race, try later */ 6409 err = -EBUSY; 6410 *rdevp = rdev; 6411 } else if (p->replacement) { 6412 /* We must have just cleared 'rdev' */ 6413 p->rdev = p->replacement; 6414 clear_bit(Replacement, &p->replacement->flags); 6415 smp_mb(); /* Make sure other CPUs may see both as identical 6416 * but will never see neither - if they are careful 6417 */ 6418 p->replacement = NULL; 6419 clear_bit(WantReplacement, &rdev->flags); 6420 } else 6421 /* We might have just removed the Replacement as faulty- 6422 * clear the bit just in case 6423 */ 6424 clear_bit(WantReplacement, &rdev->flags); 6425 abort: 6426 6427 print_raid5_conf(conf); 6428 return err; 6429 } 6430 6431 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 6432 { 6433 struct r5conf *conf = mddev->private; 6434 int err = -EEXIST; 6435 int disk; 6436 struct disk_info *p; 6437 int first = 0; 6438 int last = conf->raid_disks - 1; 6439 6440 if (mddev->recovery_disabled == conf->recovery_disabled) 6441 return -EBUSY; 6442 6443 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 6444 /* no point adding a device */ 6445 return -EINVAL; 6446 6447 if (rdev->raid_disk >= 0) 6448 first = last = rdev->raid_disk; 6449 6450 /* 6451 * find the disk ... but prefer rdev->saved_raid_disk 6452 * if possible. 6453 */ 6454 if (rdev->saved_raid_disk >= 0 && 6455 rdev->saved_raid_disk >= first && 6456 conf->disks[rdev->saved_raid_disk].rdev == NULL) 6457 first = rdev->saved_raid_disk; 6458 6459 for (disk = first; disk <= last; disk++) { 6460 p = conf->disks + disk; 6461 if (p->rdev == NULL) { 6462 clear_bit(In_sync, &rdev->flags); 6463 rdev->raid_disk = disk; 6464 err = 0; 6465 if (rdev->saved_raid_disk != disk) 6466 conf->fullsync = 1; 6467 rcu_assign_pointer(p->rdev, rdev); 6468 goto out; 6469 } 6470 } 6471 for (disk = first; disk <= last; disk++) { 6472 p = conf->disks + disk; 6473 if (test_bit(WantReplacement, &p->rdev->flags) && 6474 p->replacement == NULL) { 6475 clear_bit(In_sync, &rdev->flags); 6476 set_bit(Replacement, &rdev->flags); 6477 rdev->raid_disk = disk; 6478 err = 0; 6479 conf->fullsync = 1; 6480 rcu_assign_pointer(p->replacement, rdev); 6481 break; 6482 } 6483 } 6484 out: 6485 print_raid5_conf(conf); 6486 return err; 6487 } 6488 6489 static int raid5_resize(struct mddev *mddev, sector_t sectors) 6490 { 6491 /* no resync is happening, and there is enough space 6492 * on all devices, so we can resize. 6493 * We need to make sure resync covers any new space. 6494 * If the array is shrinking we should possibly wait until 6495 * any io in the removed space completes, but it hardly seems 6496 * worth it. 
6497 */ 6498 sector_t newsize; 6499 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 6500 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 6501 if (mddev->external_size && 6502 mddev->array_sectors > newsize) 6503 return -EINVAL; 6504 if (mddev->bitmap) { 6505 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 6506 if (ret) 6507 return ret; 6508 } 6509 md_set_array_sectors(mddev, newsize); 6510 set_capacity(mddev->gendisk, mddev->array_sectors); 6511 revalidate_disk(mddev->gendisk); 6512 if (sectors > mddev->dev_sectors && 6513 mddev->recovery_cp > mddev->dev_sectors) { 6514 mddev->recovery_cp = mddev->dev_sectors; 6515 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6516 } 6517 mddev->dev_sectors = sectors; 6518 mddev->resync_max_sectors = sectors; 6519 return 0; 6520 } 6521 6522 static int check_stripe_cache(struct mddev *mddev) 6523 { 6524 /* Can only proceed if there are plenty of stripe_heads. 6525 * We need a minimum of one full stripe,, and for sensible progress 6526 * it is best to have about 4 times that. 6527 * If we require 4 times, then the default 256 4K stripe_heads will 6528 * allow for chunk sizes up to 256K, which is probably OK. 6529 * If the chunk size is greater, user-space should request more 6530 * stripe_heads first. 6531 */ 6532 struct r5conf *conf = mddev->private; 6533 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 6534 > conf->max_nr_stripes || 6535 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 6536 > conf->max_nr_stripes) { 6537 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 6538 mdname(mddev), 6539 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 6540 / STRIPE_SIZE)*4); 6541 return 0; 6542 } 6543 return 1; 6544 } 6545 6546 static int check_reshape(struct mddev *mddev) 6547 { 6548 struct r5conf *conf = mddev->private; 6549 6550 if (mddev->delta_disks == 0 && 6551 mddev->new_layout == mddev->layout && 6552 mddev->new_chunk_sectors == mddev->chunk_sectors) 6553 return 0; /* nothing to do */ 6554 if (has_failed(conf)) 6555 return -EINVAL; 6556 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 6557 /* We might be able to shrink, but the devices must 6558 * be made bigger first. 6559 * For raid6, 4 is the minimum size. 6560 * Otherwise 2 is the minimum 6561 */ 6562 int min = 2; 6563 if (mddev->level == 6) 6564 min = 4; 6565 if (mddev->raid_disks + mddev->delta_disks < min) 6566 return -EINVAL; 6567 } 6568 6569 if (!check_stripe_cache(mddev)) 6570 return -ENOSPC; 6571 6572 return resize_stripes(conf, (conf->previous_raid_disks 6573 + mddev->delta_disks)); 6574 } 6575 6576 static int raid5_start_reshape(struct mddev *mddev) 6577 { 6578 struct r5conf *conf = mddev->private; 6579 struct md_rdev *rdev; 6580 int spares = 0; 6581 unsigned long flags; 6582 6583 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6584 return -EBUSY; 6585 6586 if (!check_stripe_cache(mddev)) 6587 return -ENOSPC; 6588 6589 if (has_failed(conf)) 6590 return -EINVAL; 6591 6592 rdev_for_each(rdev, mddev) { 6593 if (!test_bit(In_sync, &rdev->flags) 6594 && !test_bit(Faulty, &rdev->flags)) 6595 spares++; 6596 } 6597 6598 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 6599 /* Not enough devices even to make a degraded array 6600 * of that size 6601 */ 6602 return -EINVAL; 6603 6604 /* Refuse to reduce size of the array. Any reductions in 6605 * array size must be through explicit setting of array_size 6606 * attribute. 
6607 */ 6608 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 6609 < mddev->array_sectors) { 6610 printk(KERN_ERR "md/raid:%s: array size must be reduced " 6611 "before number of disks\n", mdname(mddev)); 6612 return -EINVAL; 6613 } 6614 6615 atomic_set(&conf->reshape_stripes, 0); 6616 spin_lock_irq(&conf->device_lock); 6617 write_seqcount_begin(&conf->gen_lock); 6618 conf->previous_raid_disks = conf->raid_disks; 6619 conf->raid_disks += mddev->delta_disks; 6620 conf->prev_chunk_sectors = conf->chunk_sectors; 6621 conf->chunk_sectors = mddev->new_chunk_sectors; 6622 conf->prev_algo = conf->algorithm; 6623 conf->algorithm = mddev->new_layout; 6624 conf->generation++; 6625 /* Code that selects data_offset needs to see the generation update 6626 * if reshape_progress has been set - so a memory barrier needed. 6627 */ 6628 smp_mb(); 6629 if (mddev->reshape_backwards) 6630 conf->reshape_progress = raid5_size(mddev, 0, 0); 6631 else 6632 conf->reshape_progress = 0; 6633 conf->reshape_safe = conf->reshape_progress; 6634 write_seqcount_end(&conf->gen_lock); 6635 spin_unlock_irq(&conf->device_lock); 6636 6637 /* Now make sure any requests that proceeded on the assumption 6638 * the reshape wasn't running - like Discard or Read - have 6639 * completed. 6640 */ 6641 mddev_suspend(mddev); 6642 mddev_resume(mddev); 6643 6644 /* Add some new drives, as many as will fit. 6645 * We know there are enough to make the newly sized array work. 6646 * Don't add devices if we are reducing the number of 6647 * devices in the array. This is because it is not possible 6648 * to correctly record the "partially reconstructed" state of 6649 * such devices during the reshape and confusion could result. 6650 */ 6651 if (mddev->delta_disks >= 0) { 6652 rdev_for_each(rdev, mddev) 6653 if (rdev->raid_disk < 0 && 6654 !test_bit(Faulty, &rdev->flags)) { 6655 if (raid5_add_disk(mddev, rdev) == 0) { 6656 if (rdev->raid_disk 6657 >= conf->previous_raid_disks) 6658 set_bit(In_sync, &rdev->flags); 6659 else 6660 rdev->recovery_offset = 0; 6661 6662 if (sysfs_link_rdev(mddev, rdev)) 6663 /* Failure here is OK */; 6664 } 6665 } else if (rdev->raid_disk >= conf->previous_raid_disks 6666 && !test_bit(Faulty, &rdev->flags)) { 6667 /* This is a spare that was manually added */ 6668 set_bit(In_sync, &rdev->flags); 6669 } 6670 6671 /* When a reshape changes the number of devices, 6672 * ->degraded is measured against the larger of the 6673 * pre and post number of devices. 
6674 */ 6675 spin_lock_irqsave(&conf->device_lock, flags); 6676 mddev->degraded = calc_degraded(conf); 6677 spin_unlock_irqrestore(&conf->device_lock, flags); 6678 } 6679 mddev->raid_disks = conf->raid_disks; 6680 mddev->reshape_position = conf->reshape_progress; 6681 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6682 6683 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6684 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6685 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6686 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6687 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 6688 "reshape"); 6689 if (!mddev->sync_thread) { 6690 mddev->recovery = 0; 6691 spin_lock_irq(&conf->device_lock); 6692 write_seqcount_begin(&conf->gen_lock); 6693 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 6694 mddev->new_chunk_sectors = 6695 conf->chunk_sectors = conf->prev_chunk_sectors; 6696 mddev->new_layout = conf->algorithm = conf->prev_algo; 6697 rdev_for_each(rdev, mddev) 6698 rdev->new_data_offset = rdev->data_offset; 6699 smp_wmb(); 6700 conf->generation --; 6701 conf->reshape_progress = MaxSector; 6702 mddev->reshape_position = MaxSector; 6703 write_seqcount_end(&conf->gen_lock); 6704 spin_unlock_irq(&conf->device_lock); 6705 return -EAGAIN; 6706 } 6707 conf->reshape_checkpoint = jiffies; 6708 md_wakeup_thread(mddev->sync_thread); 6709 md_new_event(mddev); 6710 return 0; 6711 } 6712 6713 /* This is called from the reshape thread and should make any 6714 * changes needed in 'conf' 6715 */ 6716 static void end_reshape(struct r5conf *conf) 6717 { 6718 6719 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 6720 struct md_rdev *rdev; 6721 6722 spin_lock_irq(&conf->device_lock); 6723 conf->previous_raid_disks = conf->raid_disks; 6724 rdev_for_each(rdev, conf->mddev) 6725 rdev->data_offset = rdev->new_data_offset; 6726 smp_wmb(); 6727 conf->reshape_progress = MaxSector; 6728 spin_unlock_irq(&conf->device_lock); 6729 wake_up(&conf->wait_for_overlap); 6730 6731 /* read-ahead size must cover two whole stripes, which is 6732 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 6733 */ 6734 if (conf->mddev->queue) { 6735 int data_disks = conf->raid_disks - conf->max_degraded; 6736 int stripe = data_disks * ((conf->chunk_sectors << 9) 6737 / PAGE_SIZE); 6738 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 6739 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 6740 } 6741 } 6742 } 6743 6744 /* This is called from the raid5d thread with mddev_lock held. 6745 * It makes config changes to the device. 
6746 */ 6747 static void raid5_finish_reshape(struct mddev *mddev) 6748 { 6749 struct r5conf *conf = mddev->private; 6750 6751 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6752 6753 if (mddev->delta_disks > 0) { 6754 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 6755 set_capacity(mddev->gendisk, mddev->array_sectors); 6756 revalidate_disk(mddev->gendisk); 6757 } else { 6758 int d; 6759 spin_lock_irq(&conf->device_lock); 6760 mddev->degraded = calc_degraded(conf); 6761 spin_unlock_irq(&conf->device_lock); 6762 for (d = conf->raid_disks ; 6763 d < conf->raid_disks - mddev->delta_disks; 6764 d++) { 6765 struct md_rdev *rdev = conf->disks[d].rdev; 6766 if (rdev) 6767 clear_bit(In_sync, &rdev->flags); 6768 rdev = conf->disks[d].replacement; 6769 if (rdev) 6770 clear_bit(In_sync, &rdev->flags); 6771 } 6772 } 6773 mddev->layout = conf->algorithm; 6774 mddev->chunk_sectors = conf->chunk_sectors; 6775 mddev->reshape_position = MaxSector; 6776 mddev->delta_disks = 0; 6777 mddev->reshape_backwards = 0; 6778 } 6779 } 6780 6781 static void raid5_quiesce(struct mddev *mddev, int state) 6782 { 6783 struct r5conf *conf = mddev->private; 6784 6785 switch(state) { 6786 case 2: /* resume for a suspend */ 6787 wake_up(&conf->wait_for_overlap); 6788 break; 6789 6790 case 1: /* stop all writes */ 6791 lock_all_device_hash_locks_irq(conf); 6792 /* '2' tells resync/reshape to pause so that all 6793 * active stripes can drain 6794 */ 6795 conf->quiesce = 2; 6796 wait_event_cmd(conf->wait_for_stripe, 6797 atomic_read(&conf->active_stripes) == 0 && 6798 atomic_read(&conf->active_aligned_reads) == 0, 6799 unlock_all_device_hash_locks_irq(conf), 6800 lock_all_device_hash_locks_irq(conf)); 6801 conf->quiesce = 1; 6802 unlock_all_device_hash_locks_irq(conf); 6803 /* allow reshape to continue */ 6804 wake_up(&conf->wait_for_overlap); 6805 break; 6806 6807 case 0: /* re-enable writes */ 6808 lock_all_device_hash_locks_irq(conf); 6809 conf->quiesce = 0; 6810 wake_up(&conf->wait_for_stripe); 6811 wake_up(&conf->wait_for_overlap); 6812 unlock_all_device_hash_locks_irq(conf); 6813 break; 6814 } 6815 } 6816 6817 6818 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 6819 { 6820 struct r0conf *raid0_conf = mddev->private; 6821 sector_t sectors; 6822 6823 /* for raid0 takeover only one zone is supported */ 6824 if (raid0_conf->nr_strip_zones > 1) { 6825 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 6826 mdname(mddev)); 6827 return ERR_PTR(-EINVAL); 6828 } 6829 6830 sectors = raid0_conf->strip_zone[0].zone_end; 6831 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 6832 mddev->dev_sectors = sectors; 6833 mddev->new_level = level; 6834 mddev->new_layout = ALGORITHM_PARITY_N; 6835 mddev->new_chunk_sectors = mddev->chunk_sectors; 6836 mddev->raid_disks += 1; 6837 mddev->delta_disks = 1; 6838 /* make sure it will be not marked as dirty */ 6839 mddev->recovery_cp = MaxSector; 6840 6841 return setup_conf(mddev); 6842 } 6843 6844 6845 static void *raid5_takeover_raid1(struct mddev *mddev) 6846 { 6847 int chunksect; 6848 6849 if (mddev->raid_disks != 2 || 6850 mddev->degraded > 1) 6851 return ERR_PTR(-EINVAL); 6852 6853 /* Should check if there are write-behind devices? 
*/ 6854 6855 chunksect = 64*2; /* 64K by default */ 6856 6857 /* The array must be an exact multiple of chunksize */ 6858 while (chunksect && (mddev->array_sectors & (chunksect-1))) 6859 chunksect >>= 1; 6860 6861 if ((chunksect<<9) < STRIPE_SIZE) 6862 /* array size does not allow a suitable chunk size */ 6863 return ERR_PTR(-EINVAL); 6864 6865 mddev->new_level = 5; 6866 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 6867 mddev->new_chunk_sectors = chunksect; 6868 6869 return setup_conf(mddev); 6870 } 6871 6872 static void *raid5_takeover_raid6(struct mddev *mddev) 6873 { 6874 int new_layout; 6875 6876 switch (mddev->layout) { 6877 case ALGORITHM_LEFT_ASYMMETRIC_6: 6878 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 6879 break; 6880 case ALGORITHM_RIGHT_ASYMMETRIC_6: 6881 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 6882 break; 6883 case ALGORITHM_LEFT_SYMMETRIC_6: 6884 new_layout = ALGORITHM_LEFT_SYMMETRIC; 6885 break; 6886 case ALGORITHM_RIGHT_SYMMETRIC_6: 6887 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 6888 break; 6889 case ALGORITHM_PARITY_0_6: 6890 new_layout = ALGORITHM_PARITY_0; 6891 break; 6892 case ALGORITHM_PARITY_N: 6893 new_layout = ALGORITHM_PARITY_N; 6894 break; 6895 default: 6896 return ERR_PTR(-EINVAL); 6897 } 6898 mddev->new_level = 5; 6899 mddev->new_layout = new_layout; 6900 mddev->delta_disks = -1; 6901 mddev->raid_disks -= 1; 6902 return setup_conf(mddev); 6903 } 6904 6905 6906 static int raid5_check_reshape(struct mddev *mddev) 6907 { 6908 /* For a 2-drive array, the layout and chunk size can be changed 6909 * immediately as not restriping is needed. 6910 * For larger arrays we record the new value - after validation 6911 * to be used by a reshape pass. 6912 */ 6913 struct r5conf *conf = mddev->private; 6914 int new_chunk = mddev->new_chunk_sectors; 6915 6916 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 6917 return -EINVAL; 6918 if (new_chunk > 0) { 6919 if (!is_power_of_2(new_chunk)) 6920 return -EINVAL; 6921 if (new_chunk < (PAGE_SIZE>>9)) 6922 return -EINVAL; 6923 if (mddev->array_sectors & (new_chunk-1)) 6924 /* not factor of array size */ 6925 return -EINVAL; 6926 } 6927 6928 /* They look valid */ 6929 6930 if (mddev->raid_disks == 2) { 6931 /* can make the change immediately */ 6932 if (mddev->new_layout >= 0) { 6933 conf->algorithm = mddev->new_layout; 6934 mddev->layout = mddev->new_layout; 6935 } 6936 if (new_chunk > 0) { 6937 conf->chunk_sectors = new_chunk ; 6938 mddev->chunk_sectors = new_chunk; 6939 } 6940 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6941 md_wakeup_thread(mddev->thread); 6942 } 6943 return check_reshape(mddev); 6944 } 6945 6946 static int raid6_check_reshape(struct mddev *mddev) 6947 { 6948 int new_chunk = mddev->new_chunk_sectors; 6949 6950 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 6951 return -EINVAL; 6952 if (new_chunk > 0) { 6953 if (!is_power_of_2(new_chunk)) 6954 return -EINVAL; 6955 if (new_chunk < (PAGE_SIZE >> 9)) 6956 return -EINVAL; 6957 if (mddev->array_sectors & (new_chunk-1)) 6958 /* not factor of array size */ 6959 return -EINVAL; 6960 } 6961 6962 /* They look valid */ 6963 return check_reshape(mddev); 6964 } 6965 6966 static void *raid5_takeover(struct mddev *mddev) 6967 { 6968 /* raid5 can take over: 6969 * raid0 - if there is only one strip zone - make it a raid4 layout 6970 * raid1 - if there are two drives. We need to know the chunk size 6971 * raid4 - trivial - just use a raid4 layout. 
6972 * raid6 - Providing it is a *_6 layout 6973 */ 6974 if (mddev->level == 0) 6975 return raid45_takeover_raid0(mddev, 5); 6976 if (mddev->level == 1) 6977 return raid5_takeover_raid1(mddev); 6978 if (mddev->level == 4) { 6979 mddev->new_layout = ALGORITHM_PARITY_N; 6980 mddev->new_level = 5; 6981 return setup_conf(mddev); 6982 } 6983 if (mddev->level == 6) 6984 return raid5_takeover_raid6(mddev); 6985 6986 return ERR_PTR(-EINVAL); 6987 } 6988 6989 static void *raid4_takeover(struct mddev *mddev) 6990 { 6991 /* raid4 can take over: 6992 * raid0 - if there is only one strip zone 6993 * raid5 - if layout is right 6994 */ 6995 if (mddev->level == 0) 6996 return raid45_takeover_raid0(mddev, 4); 6997 if (mddev->level == 5 && 6998 mddev->layout == ALGORITHM_PARITY_N) { 6999 mddev->new_layout = 0; 7000 mddev->new_level = 4; 7001 return setup_conf(mddev); 7002 } 7003 return ERR_PTR(-EINVAL); 7004 } 7005 7006 static struct md_personality raid5_personality; 7007 7008 static void *raid6_takeover(struct mddev *mddev) 7009 { 7010 /* Currently can only take over a raid5. We map the 7011 * personality to an equivalent raid6 personality 7012 * with the Q block at the end. 7013 */ 7014 int new_layout; 7015 7016 if (mddev->pers != &raid5_personality) 7017 return ERR_PTR(-EINVAL); 7018 if (mddev->degraded > 1) 7019 return ERR_PTR(-EINVAL); 7020 if (mddev->raid_disks > 253) 7021 return ERR_PTR(-EINVAL); 7022 if (mddev->raid_disks < 3) 7023 return ERR_PTR(-EINVAL); 7024 7025 switch (mddev->layout) { 7026 case ALGORITHM_LEFT_ASYMMETRIC: 7027 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 7028 break; 7029 case ALGORITHM_RIGHT_ASYMMETRIC: 7030 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 7031 break; 7032 case ALGORITHM_LEFT_SYMMETRIC: 7033 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 7034 break; 7035 case ALGORITHM_RIGHT_SYMMETRIC: 7036 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 7037 break; 7038 case ALGORITHM_PARITY_0: 7039 new_layout = ALGORITHM_PARITY_0_6; 7040 break; 7041 case ALGORITHM_PARITY_N: 7042 new_layout = ALGORITHM_PARITY_N; 7043 break; 7044 default: 7045 return ERR_PTR(-EINVAL); 7046 } 7047 mddev->new_level = 6; 7048 mddev->new_layout = new_layout; 7049 mddev->delta_disks = 1; 7050 mddev->raid_disks += 1; 7051 return setup_conf(mddev); 7052 } 7053 7054 7055 static struct md_personality raid6_personality = 7056 { 7057 .name = "raid6", 7058 .level = 6, 7059 .owner = THIS_MODULE, 7060 .make_request = make_request, 7061 .run = run, 7062 .stop = stop, 7063 .status = status, 7064 .error_handler = error, 7065 .hot_add_disk = raid5_add_disk, 7066 .hot_remove_disk= raid5_remove_disk, 7067 .spare_active = raid5_spare_active, 7068 .sync_request = sync_request, 7069 .resize = raid5_resize, 7070 .size = raid5_size, 7071 .check_reshape = raid6_check_reshape, 7072 .start_reshape = raid5_start_reshape, 7073 .finish_reshape = raid5_finish_reshape, 7074 .quiesce = raid5_quiesce, 7075 .takeover = raid6_takeover, 7076 }; 7077 static struct md_personality raid5_personality = 7078 { 7079 .name = "raid5", 7080 .level = 5, 7081 .owner = THIS_MODULE, 7082 .make_request = make_request, 7083 .run = run, 7084 .stop = stop, 7085 .status = status, 7086 .error_handler = error, 7087 .hot_add_disk = raid5_add_disk, 7088 .hot_remove_disk= raid5_remove_disk, 7089 .spare_active = raid5_spare_active, 7090 .sync_request = sync_request, 7091 .resize = raid5_resize, 7092 .size = raid5_size, 7093 .check_reshape = raid5_check_reshape, 7094 .start_reshape = raid5_start_reshape, 7095 .finish_reshape = raid5_finish_reshape, 7096 .quiesce = 
raid5_quiesce, 7097 .takeover = raid5_takeover, 7098 }; 7099 7100 static struct md_personality raid4_personality = 7101 { 7102 .name = "raid4", 7103 .level = 4, 7104 .owner = THIS_MODULE, 7105 .make_request = make_request, 7106 .run = run, 7107 .stop = stop, 7108 .status = status, 7109 .error_handler = error, 7110 .hot_add_disk = raid5_add_disk, 7111 .hot_remove_disk= raid5_remove_disk, 7112 .spare_active = raid5_spare_active, 7113 .sync_request = sync_request, 7114 .resize = raid5_resize, 7115 .size = raid5_size, 7116 .check_reshape = raid5_check_reshape, 7117 .start_reshape = raid5_start_reshape, 7118 .finish_reshape = raid5_finish_reshape, 7119 .quiesce = raid5_quiesce, 7120 .takeover = raid4_takeover, 7121 }; 7122 7123 static int __init raid5_init(void) 7124 { 7125 raid5_wq = alloc_workqueue("raid5wq", 7126 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 7127 if (!raid5_wq) 7128 return -ENOMEM; 7129 register_md_personality(&raid6_personality); 7130 register_md_personality(&raid5_personality); 7131 register_md_personality(&raid4_personality); 7132 return 0; 7133 } 7134 7135 static void raid5_exit(void) 7136 { 7137 unregister_md_personality(&raid6_personality); 7138 unregister_md_personality(&raid5_personality); 7139 unregister_md_personality(&raid4_personality); 7140 destroy_workqueue(raid5_wq); 7141 } 7142 7143 module_init(raid5_init); 7144 module_exit(raid5_exit); 7145 MODULE_LICENSE("GPL"); 7146 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 7147 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 7148 MODULE_ALIAS("md-raid5"); 7149 MODULE_ALIAS("md-raid4"); 7150 MODULE_ALIAS("md-level-5"); 7151 MODULE_ALIAS("md-level-4"); 7152 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 7153 MODULE_ALIAS("md-raid6"); 7154 MODULE_ALIAS("md-level-6"); 7155 7156 /* This used to be two separate modules, they were: */ 7157 MODULE_ALIAS("raid5"); 7158 MODULE_ALIAS("raid6"); 7159
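
/*
 * Worked example (a user-space sketch, not driver code; the device
 * parameters below are made up).  It mirrors two pieces of arithmetic from
 * this file that are easy to get wrong when reading it: the usable-size
 * calculation in raid5_size() (device sectors rounded down to a whole
 * number of chunks, times the number of non-parity members), and the
 * round-up-to-a-power-of-two loop used for the discard granularity in
 * run():
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long long dev_sectors = 1953525168ULL; // ~1TB member
 *		unsigned long long chunk_sectors = 1024;        // 512KiB chunk
 *		int raid_disks = 4, max_degraded = 1;           // 4-disk raid5
 *
 *		// raid5_size(): round down to a chunk multiple, then
 *		// multiply by the number of data-bearing members.
 *		unsigned long long sectors = dev_sectors & ~(chunk_sectors - 1);
 *		printf("array size: %llu sectors\n",
 *		       sectors * (raid_disks - max_degraded));
 *
 *		// run(): full-stripe size in bytes, rounded up to a power
 *		// of two with the same loop used for discard_granularity.
 *		unsigned long long stripe = (raid_disks - max_degraded) *
 *					    chunk_sectors * 512;
 *		while ((stripe - 1) & stripe)
 *			stripe = (stripe | (stripe - 1)) + 1;
 *		printf("discard granularity: %llu bytes\n", stripe);
 *		return 0;
 *	}
 */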