/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 *    miss any bits.
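 *
 * As a hypothetical example: with seq_write == seq_flush == 7, a new
 * write is recorded with sh->bm_seq == 8; the next unplug advances
 * seq_flush to 8, and once the batch-8 bitmap updates are on disk
 * seq_write is advanced to 8 as well, so the stripes queued on
 * bitmap_list are free to proceed.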
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
#include <linux/sched/signal.h>

#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	local_irq_disable();
	spin_lock(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
		spin_unlock(conf->hash_locks + i - 1);
	local_irq_enable();
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1. This helper does that mapping.
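 * As a hypothetical example, take a 6-device md-layout RAID-6 stripe with
 * pd_idx == 4 and qd_idx == 5: raid6_d0() is 0, and the walk maps disks
 * 0..3 to slots 0..3, disk 4 (P) to slot 4 (== syndrome_disks) and
 * disk 5 (Q) to slot 5.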
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void return_io(struct bio_list *return_bi)
{
	struct bio *bi;
	while ((bi = bio_list_pop(return_bi)) != NULL) {
		bi->bi_iter.bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi);
	}
}

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
	 * data in journal, so they are not released to cached lists
	 */
	if (conf->quiesce && r5c_is_writeback(conf->log) &&
	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				list_add_tail(&sh->lru, &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
 *
 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 * given time. Adding stripes only takes device lock, while deleting stripes
 * only takes hash lock.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, raid5_get_active_stripe() might
		 * remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (atomic_read(&conf->active_stripes) == 0)
			wake_up(&conf->wait_for_quiescent);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh, *t;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	llist_for_each_entry_safe(sh, t, head, release_list) {
		int hash;

		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry the bit is set here, because if the bit is set
372 * again, the count is always > 1. This is true for 373 * STRIPE_ON_UNPLUG_LIST bit too. 374 */ 375 hash = sh->hash_lock_index; 376 __release_stripe(conf, sh, &temp_inactive_list[hash]); 377 count++; 378 } 379 380 return count; 381 } 382 383 void raid5_release_stripe(struct stripe_head *sh) 384 { 385 struct r5conf *conf = sh->raid_conf; 386 unsigned long flags; 387 struct list_head list; 388 int hash; 389 bool wakeup; 390 391 /* Avoid release_list until the last reference. 392 */ 393 if (atomic_add_unless(&sh->count, -1, 1)) 394 return; 395 396 if (unlikely(!conf->mddev->thread) || 397 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 398 goto slow_path; 399 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 400 if (wakeup) 401 md_wakeup_thread(conf->mddev->thread); 402 return; 403 slow_path: 404 local_irq_save(flags); 405 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 406 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 407 INIT_LIST_HEAD(&list); 408 hash = sh->hash_lock_index; 409 do_release_stripe(conf, sh, &list); 410 spin_unlock(&conf->device_lock); 411 release_inactive_stripe_list(conf, &list, hash); 412 } 413 local_irq_restore(flags); 414 } 415 416 static inline void remove_hash(struct stripe_head *sh) 417 { 418 pr_debug("remove_hash(), stripe %llu\n", 419 (unsigned long long)sh->sector); 420 421 hlist_del_init(&sh->hash); 422 } 423 424 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 425 { 426 struct hlist_head *hp = stripe_hash(conf, sh->sector); 427 428 pr_debug("insert_hash(), stripe %llu\n", 429 (unsigned long long)sh->sector); 430 431 hlist_add_head(&sh->hash, hp); 432 } 433 434 /* find an idle stripe, make sure it is unhashed, and return it. */ 435 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 436 { 437 struct stripe_head *sh = NULL; 438 struct list_head *first; 439 440 if (list_empty(conf->inactive_list + hash)) 441 goto out; 442 first = (conf->inactive_list + hash)->next; 443 sh = list_entry(first, struct stripe_head, lru); 444 list_del_init(first); 445 remove_hash(sh); 446 atomic_inc(&conf->active_stripes); 447 BUG_ON(hash != sh->hash_lock_index); 448 if (list_empty(conf->inactive_list + hash)) 449 atomic_inc(&conf->empty_inactive_list_nr); 450 out: 451 return sh; 452 } 453 454 static void shrink_buffers(struct stripe_head *sh) 455 { 456 struct page *p; 457 int i; 458 int num = sh->raid_conf->pool_size; 459 460 for (i = 0; i < num ; i++) { 461 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 462 p = sh->dev[i].page; 463 if (!p) 464 continue; 465 sh->dev[i].page = NULL; 466 put_page(p); 467 } 468 } 469 470 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 471 { 472 int i; 473 int num = sh->raid_conf->pool_size; 474 475 for (i = 0; i < num; i++) { 476 struct page *page; 477 478 if (!(page = alloc_page(gfp))) { 479 return 1; 480 } 481 sh->dev[i].page = page; 482 sh->dev[i].orig_page = page; 483 } 484 return 0; 485 } 486 487 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 488 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 489 struct stripe_head *sh); 490 491 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 492 { 493 struct r5conf *conf = sh->raid_conf; 494 int i, seq; 495 496 BUG_ON(atomic_read(&sh->count) != 0); 497 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 498 BUG_ON(stripe_operations_active(sh)); 499 BUG_ON(sh->batch_head); 500 501 pr_debug("init_stripe called, stripe 
%llu\n", 502 (unsigned long long)sector); 503 retry: 504 seq = read_seqcount_begin(&conf->gen_lock); 505 sh->generation = conf->generation - previous; 506 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 507 sh->sector = sector; 508 stripe_set_idx(sector, conf, previous, sh); 509 sh->state = 0; 510 511 for (i = sh->disks; i--; ) { 512 struct r5dev *dev = &sh->dev[i]; 513 514 if (dev->toread || dev->read || dev->towrite || dev->written || 515 test_bit(R5_LOCKED, &dev->flags)) { 516 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 517 (unsigned long long)sh->sector, i, dev->toread, 518 dev->read, dev->towrite, dev->written, 519 test_bit(R5_LOCKED, &dev->flags)); 520 WARN_ON(1); 521 } 522 dev->flags = 0; 523 raid5_build_block(sh, i, previous); 524 } 525 if (read_seqcount_retry(&conf->gen_lock, seq)) 526 goto retry; 527 sh->overwrite_disks = 0; 528 insert_hash(conf, sh); 529 sh->cpu = smp_processor_id(); 530 set_bit(STRIPE_BATCH_READY, &sh->state); 531 } 532 533 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 534 short generation) 535 { 536 struct stripe_head *sh; 537 538 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 539 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 540 if (sh->sector == sector && sh->generation == generation) 541 return sh; 542 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 543 return NULL; 544 } 545 546 /* 547 * Need to check if array has failed when deciding whether to: 548 * - start an array 549 * - remove non-faulty devices 550 * - add a spare 551 * - allow a reshape 552 * This determination is simple when no reshape is happening. 553 * However if there is a reshape, we need to carefully check 554 * both the before and after sections. 555 * This is because some failed devices may only affect one 556 * of the two sections, and some non-in_sync devices may 557 * be insync in the section most affected by failed devices. 558 */ 559 int raid5_calc_degraded(struct r5conf *conf) 560 { 561 int degraded, degraded2; 562 int i; 563 564 rcu_read_lock(); 565 degraded = 0; 566 for (i = 0; i < conf->previous_raid_disks; i++) { 567 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 568 if (rdev && test_bit(Faulty, &rdev->flags)) 569 rdev = rcu_dereference(conf->disks[i].replacement); 570 if (!rdev || test_bit(Faulty, &rdev->flags)) 571 degraded++; 572 else if (test_bit(In_sync, &rdev->flags)) 573 ; 574 else 575 /* not in-sync or faulty. 576 * If the reshape increases the number of devices, 577 * this is being recovered by the reshape, so 578 * this 'previous' section is not in_sync. 579 * If the number of devices is being reduced however, 580 * the device can only be part of the array if 581 * we are reverting a reshape, so this section will 582 * be in-sync. 583 */ 584 if (conf->raid_disks >= conf->previous_raid_disks) 585 degraded++; 586 } 587 rcu_read_unlock(); 588 if (conf->raid_disks == conf->previous_raid_disks) 589 return degraded; 590 rcu_read_lock(); 591 degraded2 = 0; 592 for (i = 0; i < conf->raid_disks; i++) { 593 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 594 if (rdev && test_bit(Faulty, &rdev->flags)) 595 rdev = rcu_dereference(conf->disks[i].replacement); 596 if (!rdev || test_bit(Faulty, &rdev->flags)) 597 degraded2++; 598 else if (test_bit(In_sync, &rdev->flags)) 599 ; 600 else 601 /* not in-sync or faulty. 602 * If reshape increases the number of devices, this 603 * section has already been recovered, else it 604 * almost certainly hasn't. 
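 * (As an illustration: while growing a four-device RAID5 to five
 * devices, the newly added device is not yet In_sync, but the stripes
 * that have already been reshaped were written to it by the reshape
 * itself, so it is not counted as degraded here.)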
605 */ 606 if (conf->raid_disks <= conf->previous_raid_disks) 607 degraded2++; 608 } 609 rcu_read_unlock(); 610 if (degraded2 > degraded) 611 return degraded2; 612 return degraded; 613 } 614 615 static int has_failed(struct r5conf *conf) 616 { 617 int degraded; 618 619 if (conf->mddev->reshape_position == MaxSector) 620 return conf->mddev->degraded > conf->max_degraded; 621 622 degraded = raid5_calc_degraded(conf); 623 if (degraded > conf->max_degraded) 624 return 1; 625 return 0; 626 } 627 628 struct stripe_head * 629 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 630 int previous, int noblock, int noquiesce) 631 { 632 struct stripe_head *sh; 633 int hash = stripe_hash_locks_hash(sector); 634 int inc_empty_inactive_list_flag; 635 636 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 637 638 spin_lock_irq(conf->hash_locks + hash); 639 640 do { 641 wait_event_lock_irq(conf->wait_for_quiescent, 642 conf->quiesce == 0 || noquiesce, 643 *(conf->hash_locks + hash)); 644 sh = __find_stripe(conf, sector, conf->generation - previous); 645 if (!sh) { 646 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 647 sh = get_free_stripe(conf, hash); 648 if (!sh && !test_bit(R5_DID_ALLOC, 649 &conf->cache_state)) 650 set_bit(R5_ALLOC_MORE, 651 &conf->cache_state); 652 } 653 if (noblock && sh == NULL) 654 break; 655 656 r5c_check_stripe_cache_usage(conf); 657 if (!sh) { 658 set_bit(R5_INACTIVE_BLOCKED, 659 &conf->cache_state); 660 r5l_wake_reclaim(conf->log, 0); 661 wait_event_lock_irq( 662 conf->wait_for_stripe, 663 !list_empty(conf->inactive_list + hash) && 664 (atomic_read(&conf->active_stripes) 665 < (conf->max_nr_stripes * 3 / 4) 666 || !test_bit(R5_INACTIVE_BLOCKED, 667 &conf->cache_state)), 668 *(conf->hash_locks + hash)); 669 clear_bit(R5_INACTIVE_BLOCKED, 670 &conf->cache_state); 671 } else { 672 init_stripe(sh, sector, previous); 673 atomic_inc(&sh->count); 674 } 675 } else if (!atomic_inc_not_zero(&sh->count)) { 676 spin_lock(&conf->device_lock); 677 if (!atomic_read(&sh->count)) { 678 if (!test_bit(STRIPE_HANDLE, &sh->state)) 679 atomic_inc(&conf->active_stripes); 680 BUG_ON(list_empty(&sh->lru) && 681 !test_bit(STRIPE_EXPANDING, &sh->state)); 682 inc_empty_inactive_list_flag = 0; 683 if (!list_empty(conf->inactive_list + hash)) 684 inc_empty_inactive_list_flag = 1; 685 list_del_init(&sh->lru); 686 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 687 atomic_inc(&conf->empty_inactive_list_nr); 688 if (sh->group) { 689 sh->group->stripes_cnt--; 690 sh->group = NULL; 691 } 692 } 693 atomic_inc(&sh->count); 694 spin_unlock(&conf->device_lock); 695 } 696 } while (sh == NULL); 697 698 spin_unlock_irq(conf->hash_locks + hash); 699 return sh; 700 } 701 702 static bool is_full_stripe_write(struct stripe_head *sh) 703 { 704 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 705 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 706 } 707 708 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 709 { 710 local_irq_disable(); 711 if (sh1 > sh2) { 712 spin_lock(&sh2->stripe_lock); 713 spin_lock_nested(&sh1->stripe_lock, 1); 714 } else { 715 spin_lock(&sh1->stripe_lock); 716 spin_lock_nested(&sh2->stripe_lock, 1); 717 } 718 } 719 720 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 721 { 722 spin_unlock(&sh1->stripe_lock); 723 spin_unlock(&sh2->stripe_lock); 724 local_irq_enable(); 725 } 726 727 /* Only freshly new full stripe normal write 
stripe can be added to a batch list */ 728 static bool stripe_can_batch(struct stripe_head *sh) 729 { 730 struct r5conf *conf = sh->raid_conf; 731 732 if (conf->log) 733 return false; 734 return test_bit(STRIPE_BATCH_READY, &sh->state) && 735 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 736 is_full_stripe_write(sh); 737 } 738 739 /* we only do back search */ 740 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 741 { 742 struct stripe_head *head; 743 sector_t head_sector, tmp_sec; 744 int hash; 745 int dd_idx; 746 int inc_empty_inactive_list_flag; 747 748 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 749 tmp_sec = sh->sector; 750 if (!sector_div(tmp_sec, conf->chunk_sectors)) 751 return; 752 head_sector = sh->sector - STRIPE_SECTORS; 753 754 hash = stripe_hash_locks_hash(head_sector); 755 spin_lock_irq(conf->hash_locks + hash); 756 head = __find_stripe(conf, head_sector, conf->generation); 757 if (head && !atomic_inc_not_zero(&head->count)) { 758 spin_lock(&conf->device_lock); 759 if (!atomic_read(&head->count)) { 760 if (!test_bit(STRIPE_HANDLE, &head->state)) 761 atomic_inc(&conf->active_stripes); 762 BUG_ON(list_empty(&head->lru) && 763 !test_bit(STRIPE_EXPANDING, &head->state)); 764 inc_empty_inactive_list_flag = 0; 765 if (!list_empty(conf->inactive_list + hash)) 766 inc_empty_inactive_list_flag = 1; 767 list_del_init(&head->lru); 768 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 769 atomic_inc(&conf->empty_inactive_list_nr); 770 if (head->group) { 771 head->group->stripes_cnt--; 772 head->group = NULL; 773 } 774 } 775 atomic_inc(&head->count); 776 spin_unlock(&conf->device_lock); 777 } 778 spin_unlock_irq(conf->hash_locks + hash); 779 780 if (!head) 781 return; 782 if (!stripe_can_batch(head)) 783 goto out; 784 785 lock_two_stripes(head, sh); 786 /* clear_batch_ready clear the flag */ 787 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 788 goto unlock_out; 789 790 if (sh->batch_head) 791 goto unlock_out; 792 793 dd_idx = 0; 794 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 795 dd_idx++; 796 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 797 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 798 goto unlock_out; 799 800 if (head->batch_head) { 801 spin_lock(&head->batch_head->batch_lock); 802 /* This batch list is already running */ 803 if (!stripe_can_batch(head)) { 804 spin_unlock(&head->batch_head->batch_lock); 805 goto unlock_out; 806 } 807 808 /* 809 * at this point, head's BATCH_READY could be cleared, but we 810 * can still add the stripe to batch list 811 */ 812 list_add(&sh->batch_list, &head->batch_list); 813 spin_unlock(&head->batch_head->batch_lock); 814 815 sh->batch_head = head->batch_head; 816 } else { 817 head->batch_head = head; 818 sh->batch_head = head->batch_head; 819 spin_lock(&head->batch_lock); 820 list_add_tail(&sh->batch_list, &head->batch_list); 821 spin_unlock(&head->batch_lock); 822 } 823 824 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 825 if (atomic_dec_return(&conf->preread_active_stripes) 826 < IO_THRESHOLD) 827 md_wakeup_thread(conf->mddev->thread); 828 829 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 830 int seq = sh->bm_seq; 831 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 832 sh->batch_head->bm_seq > seq) 833 seq = sh->batch_head->bm_seq; 834 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 835 sh->batch_head->bm_seq = seq; 836 } 837 838 atomic_inc(&sh->count); 839 
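/*
 * Whether or not sh was added to head's batch, we end up here to drop the
 * two stripe locks taken above; 'out' then drops the reference we obtained
 * on 'head'.
 */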
unlock_out: 840 unlock_two_stripes(head, sh); 841 out: 842 raid5_release_stripe(head); 843 } 844 845 /* Determine if 'data_offset' or 'new_data_offset' should be used 846 * in this stripe_head. 847 */ 848 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 849 { 850 sector_t progress = conf->reshape_progress; 851 /* Need a memory barrier to make sure we see the value 852 * of conf->generation, or ->data_offset that was set before 853 * reshape_progress was updated. 854 */ 855 smp_rmb(); 856 if (progress == MaxSector) 857 return 0; 858 if (sh->generation == conf->generation - 1) 859 return 0; 860 /* We are in a reshape, and this is a new-generation stripe, 861 * so use new_data_offset. 862 */ 863 return 1; 864 } 865 866 static void flush_deferred_bios(struct r5conf *conf) 867 { 868 struct bio_list tmp; 869 struct bio *bio; 870 871 if (!conf->batch_bio_dispatch || !conf->group_cnt) 872 return; 873 874 bio_list_init(&tmp); 875 spin_lock(&conf->pending_bios_lock); 876 bio_list_merge(&tmp, &conf->pending_bios); 877 bio_list_init(&conf->pending_bios); 878 spin_unlock(&conf->pending_bios_lock); 879 880 while ((bio = bio_list_pop(&tmp))) 881 generic_make_request(bio); 882 } 883 884 static void defer_bio_issue(struct r5conf *conf, struct bio *bio) 885 { 886 /* 887 * change group_cnt will drain all bios, so this is safe 888 * 889 * A read generally means a read-modify-write, which usually means a 890 * randwrite, so we don't delay it 891 */ 892 if (!conf->batch_bio_dispatch || !conf->group_cnt || 893 bio_op(bio) == REQ_OP_READ) { 894 generic_make_request(bio); 895 return; 896 } 897 spin_lock(&conf->pending_bios_lock); 898 bio_list_add(&conf->pending_bios, bio); 899 spin_unlock(&conf->pending_bios_lock); 900 md_wakeup_thread(conf->mddev->thread); 901 } 902 903 static void 904 raid5_end_read_request(struct bio *bi); 905 static void 906 raid5_end_write_request(struct bio *bi); 907 908 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 909 { 910 struct r5conf *conf = sh->raid_conf; 911 int i, disks = sh->disks; 912 struct stripe_head *head_sh = sh; 913 914 might_sleep(); 915 916 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 917 /* writing out phase */ 918 if (s->waiting_extra_page) 919 return; 920 if (r5l_write_stripe(conf->log, sh) == 0) 921 return; 922 } else { /* caching phase */ 923 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { 924 r5c_cache_data(conf->log, sh, s); 925 return; 926 } 927 } 928 929 for (i = disks; i--; ) { 930 int op, op_flags = 0; 931 int replace_only = 0; 932 struct bio *bi, *rbi; 933 struct md_rdev *rdev, *rrdev = NULL; 934 935 sh = head_sh; 936 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 937 op = REQ_OP_WRITE; 938 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 939 op_flags = REQ_FUA; 940 if (test_bit(R5_Discard, &sh->dev[i].flags)) 941 op = REQ_OP_DISCARD; 942 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 943 op = REQ_OP_READ; 944 else if (test_and_clear_bit(R5_WantReplace, 945 &sh->dev[i].flags)) { 946 op = REQ_OP_WRITE; 947 replace_only = 1; 948 } else 949 continue; 950 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 951 op_flags |= REQ_SYNC; 952 953 again: 954 bi = &sh->dev[i].req; 955 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 956 957 rcu_read_lock(); 958 rrdev = rcu_dereference(conf->disks[i].replacement); 959 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 960 rdev = rcu_dereference(conf->disks[i].rdev); 961 if (!rdev) { 962 rdev = rrdev; 963 rrdev = NULL; 
		}
		if (op_is_write(op)) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
		 */
		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->sb_flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance*/
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bi->bi_bdev = rdev->bdev;
			bio_set_op_attrs(bi, op, op_flags);
			bi->bi_end_io = op_is_write(op)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
				bi->bi_opf |= REQ_NOMERGE;

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));

			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_iter.bi_size = STRIPE_SIZE;
			/*
			 * If this is discard request, set bi_vcnt 0.
We don't 1072 * want to confuse SCSI because SCSI will replace payload 1073 */ 1074 if (op == REQ_OP_DISCARD) 1075 bi->bi_vcnt = 0; 1076 if (rrdev) 1077 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1078 1079 if (conf->mddev->gendisk) 1080 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 1081 bi, disk_devt(conf->mddev->gendisk), 1082 sh->dev[i].sector); 1083 defer_bio_issue(conf, bi); 1084 } 1085 if (rrdev) { 1086 if (s->syncing || s->expanding || s->expanded 1087 || s->replacing) 1088 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1089 1090 set_bit(STRIPE_IO_STARTED, &sh->state); 1091 1092 rbi->bi_bdev = rrdev->bdev; 1093 bio_set_op_attrs(rbi, op, op_flags); 1094 BUG_ON(!op_is_write(op)); 1095 rbi->bi_end_io = raid5_end_write_request; 1096 rbi->bi_private = sh; 1097 1098 pr_debug("%s: for %llu schedule op %d on " 1099 "replacement disc %d\n", 1100 __func__, (unsigned long long)sh->sector, 1101 rbi->bi_opf, i); 1102 atomic_inc(&sh->count); 1103 if (sh != head_sh) 1104 atomic_inc(&head_sh->count); 1105 if (use_new_offset(conf, sh)) 1106 rbi->bi_iter.bi_sector = (sh->sector 1107 + rrdev->new_data_offset); 1108 else 1109 rbi->bi_iter.bi_sector = (sh->sector 1110 + rrdev->data_offset); 1111 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1112 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1113 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1114 rbi->bi_vcnt = 1; 1115 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1116 rbi->bi_io_vec[0].bv_offset = 0; 1117 rbi->bi_iter.bi_size = STRIPE_SIZE; 1118 /* 1119 * If this is discard request, set bi_vcnt 0. We don't 1120 * want to confuse SCSI because SCSI will replace payload 1121 */ 1122 if (op == REQ_OP_DISCARD) 1123 rbi->bi_vcnt = 0; 1124 if (conf->mddev->gendisk) 1125 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 1126 rbi, disk_devt(conf->mddev->gendisk), 1127 sh->dev[i].sector); 1128 defer_bio_issue(conf, rbi); 1129 } 1130 if (!rdev && !rrdev) { 1131 if (op_is_write(op)) 1132 set_bit(STRIPE_DEGRADED, &sh->state); 1133 pr_debug("skip op %d on disc %d for sector %llu\n", 1134 bi->bi_opf, i, (unsigned long long)sh->sector); 1135 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1136 set_bit(STRIPE_HANDLE, &sh->state); 1137 } 1138 1139 if (!head_sh->batch_head) 1140 continue; 1141 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1142 batch_list); 1143 if (sh != head_sh) 1144 goto again; 1145 } 1146 } 1147 1148 static struct dma_async_tx_descriptor * 1149 async_copy_data(int frombio, struct bio *bio, struct page **page, 1150 sector_t sector, struct dma_async_tx_descriptor *tx, 1151 struct stripe_head *sh, int no_skipcopy) 1152 { 1153 struct bio_vec bvl; 1154 struct bvec_iter iter; 1155 struct page *bio_page; 1156 int page_offset; 1157 struct async_submit_ctl submit; 1158 enum async_tx_flags flags = 0; 1159 1160 if (bio->bi_iter.bi_sector >= sector) 1161 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1162 else 1163 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1164 1165 if (frombio) 1166 flags |= ASYNC_TX_FENCE; 1167 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1168 1169 bio_for_each_segment(bvl, bio, iter) { 1170 int len = bvl.bv_len; 1171 int clen; 1172 int b_offset = 0; 1173 1174 if (page_offset < 0) { 1175 b_offset = -page_offset; 1176 page_offset += b_offset; 1177 len -= b_offset; 1178 } 1179 1180 if (len > 0 && page_offset + len > STRIPE_SIZE) 1181 clen = STRIPE_SIZE - page_offset; 1182 else 1183 clen = len; 1184 1185 if (clen > 0) { 1186 b_offset += bvl.bv_offset; 1187 bio_page = bvl.bv_page; 1188 if 
(frombio) { 1189 if (sh->raid_conf->skip_copy && 1190 b_offset == 0 && page_offset == 0 && 1191 clen == STRIPE_SIZE && 1192 !no_skipcopy) 1193 *page = bio_page; 1194 else 1195 tx = async_memcpy(*page, bio_page, page_offset, 1196 b_offset, clen, &submit); 1197 } else 1198 tx = async_memcpy(bio_page, *page, b_offset, 1199 page_offset, clen, &submit); 1200 } 1201 /* chain the operations */ 1202 submit.depend_tx = tx; 1203 1204 if (clen < len) /* hit end of page */ 1205 break; 1206 page_offset += len; 1207 } 1208 1209 return tx; 1210 } 1211 1212 static void ops_complete_biofill(void *stripe_head_ref) 1213 { 1214 struct stripe_head *sh = stripe_head_ref; 1215 struct bio_list return_bi = BIO_EMPTY_LIST; 1216 int i; 1217 1218 pr_debug("%s: stripe %llu\n", __func__, 1219 (unsigned long long)sh->sector); 1220 1221 /* clear completed biofills */ 1222 for (i = sh->disks; i--; ) { 1223 struct r5dev *dev = &sh->dev[i]; 1224 1225 /* acknowledge completion of a biofill operation */ 1226 /* and check if we need to reply to a read request, 1227 * new R5_Wantfill requests are held off until 1228 * !STRIPE_BIOFILL_RUN 1229 */ 1230 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1231 struct bio *rbi, *rbi2; 1232 1233 BUG_ON(!dev->read); 1234 rbi = dev->read; 1235 dev->read = NULL; 1236 while (rbi && rbi->bi_iter.bi_sector < 1237 dev->sector + STRIPE_SECTORS) { 1238 rbi2 = r5_next_bio(rbi, dev->sector); 1239 if (!raid5_dec_bi_active_stripes(rbi)) 1240 bio_list_add(&return_bi, rbi); 1241 rbi = rbi2; 1242 } 1243 } 1244 } 1245 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1246 1247 return_io(&return_bi); 1248 1249 set_bit(STRIPE_HANDLE, &sh->state); 1250 raid5_release_stripe(sh); 1251 } 1252 1253 static void ops_run_biofill(struct stripe_head *sh) 1254 { 1255 struct dma_async_tx_descriptor *tx = NULL; 1256 struct async_submit_ctl submit; 1257 int i; 1258 1259 BUG_ON(sh->batch_head); 1260 pr_debug("%s: stripe %llu\n", __func__, 1261 (unsigned long long)sh->sector); 1262 1263 for (i = sh->disks; i--; ) { 1264 struct r5dev *dev = &sh->dev[i]; 1265 if (test_bit(R5_Wantfill, &dev->flags)) { 1266 struct bio *rbi; 1267 spin_lock_irq(&sh->stripe_lock); 1268 dev->read = rbi = dev->toread; 1269 dev->toread = NULL; 1270 spin_unlock_irq(&sh->stripe_lock); 1271 while (rbi && rbi->bi_iter.bi_sector < 1272 dev->sector + STRIPE_SECTORS) { 1273 tx = async_copy_data(0, rbi, &dev->page, 1274 dev->sector, tx, sh, 0); 1275 rbi = r5_next_bio(rbi, dev->sector); 1276 } 1277 } 1278 } 1279 1280 atomic_inc(&sh->count); 1281 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1282 async_trigger_callback(&submit); 1283 } 1284 1285 static void mark_target_uptodate(struct stripe_head *sh, int target) 1286 { 1287 struct r5dev *tgt; 1288 1289 if (target < 0) 1290 return; 1291 1292 tgt = &sh->dev[target]; 1293 set_bit(R5_UPTODATE, &tgt->flags); 1294 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1295 clear_bit(R5_Wantcompute, &tgt->flags); 1296 } 1297 1298 static void ops_complete_compute(void *stripe_head_ref) 1299 { 1300 struct stripe_head *sh = stripe_head_ref; 1301 1302 pr_debug("%s: stripe %llu\n", __func__, 1303 (unsigned long long)sh->sector); 1304 1305 /* mark the computed target(s) as uptodate */ 1306 mark_target_uptodate(sh, sh->ops.target); 1307 mark_target_uptodate(sh, sh->ops.target2); 1308 1309 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1310 if (sh->check_state == check_state_compute_run) 1311 sh->check_state = check_state_compute_result; 1312 set_bit(STRIPE_HANDLE, &sh->state); 1313 
	raid5_release_stripe(sh);
}

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu, int i)
{
	void *addr;

	addr = flex_array_get(percpu->scribble, i);
	return addr + sizeof(struct page *) * (sh->disks + 2);
}

/* return a pointer to the page pointer region of the scribble buffer */
static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{
	void *addr;

	addr = flex_array_get(percpu->scribble, i);
	return addr;
}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	BUG_ON(sh->batch_head);

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome. The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
 */
static int set_syndrome_sources(struct page **srcs,
				struct stripe_head *sh,
				int srctype)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ?
disks : (disks - 2); 1385 int d0_idx = raid6_d0(sh); 1386 int count; 1387 int i; 1388 1389 for (i = 0; i < disks; i++) 1390 srcs[i] = NULL; 1391 1392 count = 0; 1393 i = d0_idx; 1394 do { 1395 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1396 struct r5dev *dev = &sh->dev[i]; 1397 1398 if (i == sh->qd_idx || i == sh->pd_idx || 1399 (srctype == SYNDROME_SRC_ALL) || 1400 (srctype == SYNDROME_SRC_WANT_DRAIN && 1401 (test_bit(R5_Wantdrain, &dev->flags) || 1402 test_bit(R5_InJournal, &dev->flags))) || 1403 (srctype == SYNDROME_SRC_WRITTEN && 1404 dev->written)) { 1405 if (test_bit(R5_InJournal, &dev->flags)) 1406 srcs[slot] = sh->dev[i].orig_page; 1407 else 1408 srcs[slot] = sh->dev[i].page; 1409 } 1410 i = raid6_next_disk(i, disks); 1411 } while (i != d0_idx); 1412 1413 return syndrome_disks; 1414 } 1415 1416 static struct dma_async_tx_descriptor * 1417 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1418 { 1419 int disks = sh->disks; 1420 struct page **blocks = to_addr_page(percpu, 0); 1421 int target; 1422 int qd_idx = sh->qd_idx; 1423 struct dma_async_tx_descriptor *tx; 1424 struct async_submit_ctl submit; 1425 struct r5dev *tgt; 1426 struct page *dest; 1427 int i; 1428 int count; 1429 1430 BUG_ON(sh->batch_head); 1431 if (sh->ops.target < 0) 1432 target = sh->ops.target2; 1433 else if (sh->ops.target2 < 0) 1434 target = sh->ops.target; 1435 else 1436 /* we should only have one valid target */ 1437 BUG(); 1438 BUG_ON(target < 0); 1439 pr_debug("%s: stripe %llu block: %d\n", 1440 __func__, (unsigned long long)sh->sector, target); 1441 1442 tgt = &sh->dev[target]; 1443 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1444 dest = tgt->page; 1445 1446 atomic_inc(&sh->count); 1447 1448 if (target == qd_idx) { 1449 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1450 blocks[count] = NULL; /* regenerating p is not necessary */ 1451 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1452 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1453 ops_complete_compute, sh, 1454 to_addr_conv(sh, percpu, 0)); 1455 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1456 } else { 1457 /* Compute any data- or p-drive using XOR */ 1458 count = 0; 1459 for (i = disks; i-- ; ) { 1460 if (i == target || i == qd_idx) 1461 continue; 1462 blocks[count++] = sh->dev[i].page; 1463 } 1464 1465 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1466 NULL, ops_complete_compute, sh, 1467 to_addr_conv(sh, percpu, 0)); 1468 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1469 } 1470 1471 return tx; 1472 } 1473 1474 static struct dma_async_tx_descriptor * 1475 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1476 { 1477 int i, count, disks = sh->disks; 1478 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 1479 int d0_idx = raid6_d0(sh); 1480 int faila = -1, failb = -1; 1481 int target = sh->ops.target; 1482 int target2 = sh->ops.target2; 1483 struct r5dev *tgt = &sh->dev[target]; 1484 struct r5dev *tgt2 = &sh->dev[target2]; 1485 struct dma_async_tx_descriptor *tx; 1486 struct page **blocks = to_addr_page(percpu, 0); 1487 struct async_submit_ctl submit; 1488 1489 BUG_ON(sh->batch_head); 1490 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1491 __func__, (unsigned long long)sh->sector, target, target2); 1492 BUG_ON(target < 0 || target2 < 0); 1493 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1494 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1495 1496 /* we need to open-code set_syndrome_sources to handle the 1497 * slot number conversion for 'faila' and 'failb' 1498 */ 1499 for (i = 0; i < disks ; i++) 1500 blocks[i] = NULL; 1501 count = 0; 1502 i = d0_idx; 1503 do { 1504 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1505 1506 blocks[slot] = sh->dev[i].page; 1507 1508 if (i == target) 1509 faila = slot; 1510 if (i == target2) 1511 failb = slot; 1512 i = raid6_next_disk(i, disks); 1513 } while (i != d0_idx); 1514 1515 BUG_ON(faila == failb); 1516 if (failb < faila) 1517 swap(faila, failb); 1518 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1519 __func__, (unsigned long long)sh->sector, faila, failb); 1520 1521 atomic_inc(&sh->count); 1522 1523 if (failb == syndrome_disks+1) { 1524 /* Q disk is one of the missing disks */ 1525 if (faila == syndrome_disks) { 1526 /* Missing P+Q, just recompute */ 1527 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1528 ops_complete_compute, sh, 1529 to_addr_conv(sh, percpu, 0)); 1530 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1531 STRIPE_SIZE, &submit); 1532 } else { 1533 struct page *dest; 1534 int data_target; 1535 int qd_idx = sh->qd_idx; 1536 1537 /* Missing D+Q: recompute D from P, then recompute Q */ 1538 if (target == qd_idx) 1539 data_target = target2; 1540 else 1541 data_target = target; 1542 1543 count = 0; 1544 for (i = disks; i-- ; ) { 1545 if (i == data_target || i == qd_idx) 1546 continue; 1547 blocks[count++] = sh->dev[i].page; 1548 } 1549 dest = sh->dev[data_target].page; 1550 init_async_submit(&submit, 1551 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1552 NULL, NULL, NULL, 1553 to_addr_conv(sh, percpu, 0)); 1554 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1555 &submit); 1556 1557 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1558 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1559 ops_complete_compute, sh, 1560 to_addr_conv(sh, percpu, 0)); 1561 return async_gen_syndrome(blocks, 0, count+2, 1562 STRIPE_SIZE, &submit); 1563 } 1564 } else { 1565 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1566 ops_complete_compute, sh, 1567 to_addr_conv(sh, percpu, 0)); 1568 if (failb == syndrome_disks) { 1569 /* We're missing D+P. */ 1570 return async_raid6_datap_recov(syndrome_disks+2, 1571 STRIPE_SIZE, faila, 1572 blocks, &submit); 1573 } else { 1574 /* We're missing D+D. */ 1575 return async_raid6_2data_recov(syndrome_disks+2, 1576 STRIPE_SIZE, faila, failb, 1577 blocks, &submit); 1578 } 1579 } 1580 } 1581 1582 static void ops_complete_prexor(void *stripe_head_ref) 1583 { 1584 struct stripe_head *sh = stripe_head_ref; 1585 1586 pr_debug("%s: stripe %llu\n", __func__, 1587 (unsigned long long)sh->sector); 1588 1589 if (r5c_is_writeback(sh->raid_conf->log)) 1590 /* 1591 * raid5-cache write back uses orig_page during prexor. 
1592 * After prexor, it is time to free orig_page 1593 */ 1594 r5c_release_extra_page(sh); 1595 } 1596 1597 static struct dma_async_tx_descriptor * 1598 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1599 struct dma_async_tx_descriptor *tx) 1600 { 1601 int disks = sh->disks; 1602 struct page **xor_srcs = to_addr_page(percpu, 0); 1603 int count = 0, pd_idx = sh->pd_idx, i; 1604 struct async_submit_ctl submit; 1605 1606 /* existing parity data subtracted */ 1607 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1608 1609 BUG_ON(sh->batch_head); 1610 pr_debug("%s: stripe %llu\n", __func__, 1611 (unsigned long long)sh->sector); 1612 1613 for (i = disks; i--; ) { 1614 struct r5dev *dev = &sh->dev[i]; 1615 /* Only process blocks that are known to be uptodate */ 1616 if (test_bit(R5_InJournal, &dev->flags)) 1617 xor_srcs[count++] = dev->orig_page; 1618 else if (test_bit(R5_Wantdrain, &dev->flags)) 1619 xor_srcs[count++] = dev->page; 1620 } 1621 1622 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1623 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1624 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1625 1626 return tx; 1627 } 1628 1629 static struct dma_async_tx_descriptor * 1630 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1631 struct dma_async_tx_descriptor *tx) 1632 { 1633 struct page **blocks = to_addr_page(percpu, 0); 1634 int count; 1635 struct async_submit_ctl submit; 1636 1637 pr_debug("%s: stripe %llu\n", __func__, 1638 (unsigned long long)sh->sector); 1639 1640 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1641 1642 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1643 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1644 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1645 1646 return tx; 1647 } 1648 1649 static struct dma_async_tx_descriptor * 1650 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1651 { 1652 struct r5conf *conf = sh->raid_conf; 1653 int disks = sh->disks; 1654 int i; 1655 struct stripe_head *head_sh = sh; 1656 1657 pr_debug("%s: stripe %llu\n", __func__, 1658 (unsigned long long)sh->sector); 1659 1660 for (i = disks; i--; ) { 1661 struct r5dev *dev; 1662 struct bio *chosen; 1663 1664 sh = head_sh; 1665 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1666 struct bio *wbi; 1667 1668 again: 1669 dev = &sh->dev[i]; 1670 /* 1671 * clear R5_InJournal, so when rewriting a page in 1672 * journal, it is not skipped by r5l_log_stripe() 1673 */ 1674 clear_bit(R5_InJournal, &dev->flags); 1675 spin_lock_irq(&sh->stripe_lock); 1676 chosen = dev->towrite; 1677 dev->towrite = NULL; 1678 sh->overwrite_disks = 0; 1679 BUG_ON(dev->written); 1680 wbi = dev->written = chosen; 1681 spin_unlock_irq(&sh->stripe_lock); 1682 WARN_ON(dev->page != dev->orig_page); 1683 1684 while (wbi && wbi->bi_iter.bi_sector < 1685 dev->sector + STRIPE_SECTORS) { 1686 if (wbi->bi_opf & REQ_FUA) 1687 set_bit(R5_WantFUA, &dev->flags); 1688 if (wbi->bi_opf & REQ_SYNC) 1689 set_bit(R5_SyncIO, &dev->flags); 1690 if (bio_op(wbi) == REQ_OP_DISCARD) 1691 set_bit(R5_Discard, &dev->flags); 1692 else { 1693 tx = async_copy_data(1, wbi, &dev->page, 1694 dev->sector, tx, sh, 1695 r5c_is_writeback(conf->log)); 1696 if (dev->page != dev->orig_page && 1697 !r5c_is_writeback(conf->log)) { 1698 set_bit(R5_SkipCopy, &dev->flags); 1699 clear_bit(R5_UPTODATE, &dev->flags); 1700 clear_bit(R5_OVERWRITE, &dev->flags); 
1701 } 1702 } 1703 wbi = r5_next_bio(wbi, dev->sector); 1704 } 1705 1706 if (head_sh->batch_head) { 1707 sh = list_first_entry(&sh->batch_list, 1708 struct stripe_head, 1709 batch_list); 1710 if (sh == head_sh) 1711 continue; 1712 goto again; 1713 } 1714 } 1715 } 1716 1717 return tx; 1718 } 1719 1720 static void ops_complete_reconstruct(void *stripe_head_ref) 1721 { 1722 struct stripe_head *sh = stripe_head_ref; 1723 int disks = sh->disks; 1724 int pd_idx = sh->pd_idx; 1725 int qd_idx = sh->qd_idx; 1726 int i; 1727 bool fua = false, sync = false, discard = false; 1728 1729 pr_debug("%s: stripe %llu\n", __func__, 1730 (unsigned long long)sh->sector); 1731 1732 for (i = disks; i--; ) { 1733 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1734 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1735 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1736 } 1737 1738 for (i = disks; i--; ) { 1739 struct r5dev *dev = &sh->dev[i]; 1740 1741 if (dev->written || i == pd_idx || i == qd_idx) { 1742 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1743 set_bit(R5_UPTODATE, &dev->flags); 1744 if (fua) 1745 set_bit(R5_WantFUA, &dev->flags); 1746 if (sync) 1747 set_bit(R5_SyncIO, &dev->flags); 1748 } 1749 } 1750 1751 if (sh->reconstruct_state == reconstruct_state_drain_run) 1752 sh->reconstruct_state = reconstruct_state_drain_result; 1753 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1754 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1755 else { 1756 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1757 sh->reconstruct_state = reconstruct_state_result; 1758 } 1759 1760 set_bit(STRIPE_HANDLE, &sh->state); 1761 raid5_release_stripe(sh); 1762 } 1763 1764 static void 1765 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1766 struct dma_async_tx_descriptor *tx) 1767 { 1768 int disks = sh->disks; 1769 struct page **xor_srcs; 1770 struct async_submit_ctl submit; 1771 int count, pd_idx = sh->pd_idx, i; 1772 struct page *xor_dest; 1773 int prexor = 0; 1774 unsigned long flags; 1775 int j = 0; 1776 struct stripe_head *head_sh = sh; 1777 int last_stripe; 1778 1779 pr_debug("%s: stripe %llu\n", __func__, 1780 (unsigned long long)sh->sector); 1781 1782 for (i = 0; i < sh->disks; i++) { 1783 if (pd_idx == i) 1784 continue; 1785 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1786 break; 1787 } 1788 if (i >= sh->disks) { 1789 atomic_inc(&sh->count); 1790 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1791 ops_complete_reconstruct(sh); 1792 return; 1793 } 1794 again: 1795 count = 0; 1796 xor_srcs = to_addr_page(percpu, j); 1797 /* check if prexor is active which means only process blocks 1798 * that are part of a read-modify-write (written) 1799 */ 1800 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1801 prexor = 1; 1802 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1803 for (i = disks; i--; ) { 1804 struct r5dev *dev = &sh->dev[i]; 1805 if (head_sh->dev[i].written || 1806 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1807 xor_srcs[count++] = dev->page; 1808 } 1809 } else { 1810 xor_dest = sh->dev[pd_idx].page; 1811 for (i = disks; i--; ) { 1812 struct r5dev *dev = &sh->dev[i]; 1813 if (i != pd_idx) 1814 xor_srcs[count++] = dev->page; 1815 } 1816 } 1817 1818 /* 1/ if we prexor'd then the dest is reused as a source 1819 * 2/ if we did not prexor then we are redoing the parity 1820 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1821 * for the synchronous xor case 1822 */ 1823 last_stripe = !head_sh->batch_head 
|| 1824 list_first_entry(&sh->batch_list, 1825 struct stripe_head, batch_list) == head_sh; 1826 if (last_stripe) { 1827 flags = ASYNC_TX_ACK | 1828 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1829 1830 atomic_inc(&head_sh->count); 1831 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1832 to_addr_conv(sh, percpu, j)); 1833 } else { 1834 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1835 init_async_submit(&submit, flags, tx, NULL, NULL, 1836 to_addr_conv(sh, percpu, j)); 1837 } 1838 1839 if (unlikely(count == 1)) 1840 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1841 else 1842 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1843 if (!last_stripe) { 1844 j++; 1845 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1846 batch_list); 1847 goto again; 1848 } 1849 } 1850 1851 static void 1852 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1853 struct dma_async_tx_descriptor *tx) 1854 { 1855 struct async_submit_ctl submit; 1856 struct page **blocks; 1857 int count, i, j = 0; 1858 struct stripe_head *head_sh = sh; 1859 int last_stripe; 1860 int synflags; 1861 unsigned long txflags; 1862 1863 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1864 1865 for (i = 0; i < sh->disks; i++) { 1866 if (sh->pd_idx == i || sh->qd_idx == i) 1867 continue; 1868 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1869 break; 1870 } 1871 if (i >= sh->disks) { 1872 atomic_inc(&sh->count); 1873 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1874 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1875 ops_complete_reconstruct(sh); 1876 return; 1877 } 1878 1879 again: 1880 blocks = to_addr_page(percpu, j); 1881 1882 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1883 synflags = SYNDROME_SRC_WRITTEN; 1884 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1885 } else { 1886 synflags = SYNDROME_SRC_ALL; 1887 txflags = ASYNC_TX_ACK; 1888 } 1889 1890 count = set_syndrome_sources(blocks, sh, synflags); 1891 last_stripe = !head_sh->batch_head || 1892 list_first_entry(&sh->batch_list, 1893 struct stripe_head, batch_list) == head_sh; 1894 1895 if (last_stripe) { 1896 atomic_inc(&head_sh->count); 1897 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1898 head_sh, to_addr_conv(sh, percpu, j)); 1899 } else 1900 init_async_submit(&submit, 0, tx, NULL, NULL, 1901 to_addr_conv(sh, percpu, j)); 1902 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1903 if (!last_stripe) { 1904 j++; 1905 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1906 batch_list); 1907 goto again; 1908 } 1909 } 1910 1911 static void ops_complete_check(void *stripe_head_ref) 1912 { 1913 struct stripe_head *sh = stripe_head_ref; 1914 1915 pr_debug("%s: stripe %llu\n", __func__, 1916 (unsigned long long)sh->sector); 1917 1918 sh->check_state = check_state_check_result; 1919 set_bit(STRIPE_HANDLE, &sh->state); 1920 raid5_release_stripe(sh); 1921 } 1922 1923 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1924 { 1925 int disks = sh->disks; 1926 int pd_idx = sh->pd_idx; 1927 int qd_idx = sh->qd_idx; 1928 struct page *xor_dest; 1929 struct page **xor_srcs = to_addr_page(percpu, 0); 1930 struct dma_async_tx_descriptor *tx; 1931 struct async_submit_ctl submit; 1932 int count; 1933 int i; 1934 1935 pr_debug("%s: stripe %llu\n", __func__, 1936 (unsigned long long)sh->sector); 1937 1938 BUG_ON(sh->batch_head); 1939 count = 0; 1940 
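	/*
	 * Feed the current parity block and every data block into a single
	 * xor-with-zero-sum check: for a consistent stripe the xor of all of
	 * them is zero, and the outcome is recorded in sh->ops.zero_sum_result.
	 */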
xor_dest = sh->dev[pd_idx].page; 1941 xor_srcs[count++] = xor_dest; 1942 for (i = disks; i--; ) { 1943 if (i == pd_idx || i == qd_idx) 1944 continue; 1945 xor_srcs[count++] = sh->dev[i].page; 1946 } 1947 1948 init_async_submit(&submit, 0, NULL, NULL, NULL, 1949 to_addr_conv(sh, percpu, 0)); 1950 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1951 &sh->ops.zero_sum_result, &submit); 1952 1953 atomic_inc(&sh->count); 1954 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1955 tx = async_trigger_callback(&submit); 1956 } 1957 1958 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1959 { 1960 struct page **srcs = to_addr_page(percpu, 0); 1961 struct async_submit_ctl submit; 1962 int count; 1963 1964 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1965 (unsigned long long)sh->sector, checkp); 1966 1967 BUG_ON(sh->batch_head); 1968 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 1969 if (!checkp) 1970 srcs[count] = NULL; 1971 1972 atomic_inc(&sh->count); 1973 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1974 sh, to_addr_conv(sh, percpu, 0)); 1975 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1976 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1977 } 1978 1979 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1980 { 1981 int overlap_clear = 0, i, disks = sh->disks; 1982 struct dma_async_tx_descriptor *tx = NULL; 1983 struct r5conf *conf = sh->raid_conf; 1984 int level = conf->level; 1985 struct raid5_percpu *percpu; 1986 unsigned long cpu; 1987 1988 cpu = get_cpu(); 1989 percpu = per_cpu_ptr(conf->percpu, cpu); 1990 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1991 ops_run_biofill(sh); 1992 overlap_clear++; 1993 } 1994 1995 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1996 if (level < 6) 1997 tx = ops_run_compute5(sh, percpu); 1998 else { 1999 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2000 tx = ops_run_compute6_1(sh, percpu); 2001 else 2002 tx = ops_run_compute6_2(sh, percpu); 2003 } 2004 /* terminate the chain if reconstruct is not set to be run */ 2005 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2006 async_tx_ack(tx); 2007 } 2008 2009 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2010 if (level < 6) 2011 tx = ops_run_prexor5(sh, percpu, tx); 2012 else 2013 tx = ops_run_prexor6(sh, percpu, tx); 2014 } 2015 2016 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2017 tx = ops_run_biodrain(sh, tx); 2018 overlap_clear++; 2019 } 2020 2021 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2022 if (level < 6) 2023 ops_run_reconstruct5(sh, percpu, tx); 2024 else 2025 ops_run_reconstruct6(sh, percpu, tx); 2026 } 2027 2028 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2029 if (sh->check_state == check_state_run) 2030 ops_run_check_p(sh, percpu); 2031 else if (sh->check_state == check_state_run_q) 2032 ops_run_check_pq(sh, percpu, 0); 2033 else if (sh->check_state == check_state_run_pq) 2034 ops_run_check_pq(sh, percpu, 1); 2035 else 2036 BUG(); 2037 } 2038 2039 if (overlap_clear && !sh->batch_head) 2040 for (i = disks; i--; ) { 2041 struct r5dev *dev = &sh->dev[i]; 2042 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2043 wake_up(&sh->raid_conf->wait_for_overlap); 2044 } 2045 put_cpu(); 2046 } 2047 2048 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2049 int disks) 2050 { 2051 struct stripe_head *sh; 2052 int i; 2053 2054 sh = kmem_cache_zalloc(sc, gfp); 2055 if (sh) { 2056 
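/*
 * kmem_cache_zalloc() returned the stripe_head fully zeroed, so every
 * r5dev already starts with flags == 0 and no pages attached; only the
 * locks, list heads and the fields set below need explicit initialisation.
 */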
spin_lock_init(&sh->stripe_lock); 2057 spin_lock_init(&sh->batch_lock); 2058 INIT_LIST_HEAD(&sh->batch_list); 2059 INIT_LIST_HEAD(&sh->lru); 2060 INIT_LIST_HEAD(&sh->r5c); 2061 INIT_LIST_HEAD(&sh->log_list); 2062 atomic_set(&sh->count, 1); 2063 sh->log_start = MaxSector; 2064 for (i = 0; i < disks; i++) { 2065 struct r5dev *dev = &sh->dev[i]; 2066 2067 bio_init(&dev->req, &dev->vec, 1); 2068 bio_init(&dev->rreq, &dev->rvec, 1); 2069 } 2070 } 2071 return sh; 2072 } 2073 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2074 { 2075 struct stripe_head *sh; 2076 2077 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); 2078 if (!sh) 2079 return 0; 2080 2081 sh->raid_conf = conf; 2082 2083 if (grow_buffers(sh, gfp)) { 2084 shrink_buffers(sh); 2085 kmem_cache_free(conf->slab_cache, sh); 2086 return 0; 2087 } 2088 sh->hash_lock_index = 2089 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2090 /* we just created an active stripe so... */ 2091 atomic_inc(&conf->active_stripes); 2092 2093 raid5_release_stripe(sh); 2094 conf->max_nr_stripes++; 2095 return 1; 2096 } 2097 2098 static int grow_stripes(struct r5conf *conf, int num) 2099 { 2100 struct kmem_cache *sc; 2101 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2102 2103 if (conf->mddev->gendisk) 2104 sprintf(conf->cache_name[0], 2105 "raid%d-%s", conf->level, mdname(conf->mddev)); 2106 else 2107 sprintf(conf->cache_name[0], 2108 "raid%d-%p", conf->level, conf->mddev); 2109 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 2110 2111 conf->active_name = 0; 2112 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2113 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2114 0, 0, NULL); 2115 if (!sc) 2116 return 1; 2117 conf->slab_cache = sc; 2118 conf->pool_size = devs; 2119 while (num--) 2120 if (!grow_one_stripe(conf, GFP_KERNEL)) 2121 return 1; 2122 2123 return 0; 2124 } 2125 2126 /** 2127 * scribble_len - return the required size of the scribble region 2128 * @num - total number of disks in the array 2129 * 2130 * The size must be enough to contain: 2131 * 1/ a struct page pointer for each device in the array +2 2132 * 2/ room to convert each entry in (1) to its corresponding dma 2133 * (dma_map_page()) or page (page_address()) address. 2134 * 2135 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2136 * calculate over all devices (not just the data blocks), using zeros in place 2137 * of the P and Q blocks. 2138 */ 2139 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2140 { 2141 struct flex_array *ret; 2142 size_t len; 2143 2144 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2145 ret = flex_array_alloc(len, cnt, flags); 2146 if (!ret) 2147 return NULL; 2148 /* always prealloc all elements, so no locking is required */ 2149 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2150 flex_array_free(ret); 2151 return NULL; 2152 } 2153 return ret; 2154 } 2155 2156 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2157 { 2158 unsigned long cpu; 2159 int err = 0; 2160 2161 /* 2162 * Never shrink. And mddev_suspend() could deadlock if this is called 2163 * from raid5d. 
In that case, scribble_disks and scribble_sectors 2164 * should be equal to new_disks and new_sectors 2165 */ 2166 if (conf->scribble_disks >= new_disks && 2167 conf->scribble_sectors >= new_sectors) 2168 return 0; 2169 mddev_suspend(conf->mddev); 2170 get_online_cpus(); 2171 for_each_present_cpu(cpu) { 2172 struct raid5_percpu *percpu; 2173 struct flex_array *scribble; 2174 2175 percpu = per_cpu_ptr(conf->percpu, cpu); 2176 scribble = scribble_alloc(new_disks, 2177 new_sectors / STRIPE_SECTORS, 2178 GFP_NOIO); 2179 2180 if (scribble) { 2181 flex_array_free(percpu->scribble); 2182 percpu->scribble = scribble; 2183 } else { 2184 err = -ENOMEM; 2185 break; 2186 } 2187 } 2188 put_online_cpus(); 2189 mddev_resume(conf->mddev); 2190 if (!err) { 2191 conf->scribble_disks = new_disks; 2192 conf->scribble_sectors = new_sectors; 2193 } 2194 return err; 2195 } 2196 2197 static int resize_stripes(struct r5conf *conf, int newsize) 2198 { 2199 /* Make all the stripes able to hold 'newsize' devices. 2200 * New slots in each stripe get 'page' set to a new page. 2201 * 2202 * This happens in stages: 2203 * 1/ create a new kmem_cache and allocate the required number of 2204 * stripe_heads. 2205 * 2/ gather all the old stripe_heads and transfer the pages across 2206 * to the new stripe_heads. This will have the side effect of 2207 * freezing the array as once all stripe_heads have been collected, 2208 * no IO will be possible. Old stripe heads are freed once their 2209 * pages have been transferred over, and the old kmem_cache is 2210 * freed when all stripes are done. 2211 * 3/ reallocate conf->disks to be suitably bigger. If this fails, 2212 * we simply return a failure status - no need to clean anything up. 2213 * 4/ allocate new pages for the new slots in the new stripe_heads. 2214 * If this fails, we don't bother trying to shrink the 2215 * stripe_heads down again, we just leave them as they are. 2216 * As each stripe_head is processed the new one is released into 2217 * active service. 2218 * 2219 * Once step2 is started, we cannot afford to wait for a write, 2220 * so we use GFP_NOIO allocations. 2221 */ 2222 struct stripe_head *osh, *nsh; 2223 LIST_HEAD(newstripes); 2224 struct disk_info *ndisks; 2225 int err; 2226 struct kmem_cache *sc; 2227 int i; 2228 int hash, cnt; 2229 2230 if (newsize <= conf->pool_size) 2231 return 0; /* never bother to shrink */ 2232 2233 err = md_allow_write(conf->mddev); 2234 if (err) 2235 return err; 2236 2237 /* Step 1 */ 2238 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2239 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2240 0, 0, NULL); 2241 if (!sc) 2242 return -ENOMEM; 2243 2244 /* Need to ensure auto-resizing doesn't interfere */ 2245 mutex_lock(&conf->cache_size_mutex); 2246 2247 for (i = conf->max_nr_stripes; i; i--) { 2248 nsh = alloc_stripe(sc, GFP_KERNEL, newsize); 2249 if (!nsh) 2250 break; 2251 2252 nsh->raid_conf = conf; 2253 list_add(&nsh->lru, &newstripes); 2254 } 2255 if (i) { 2256 /* didn't get enough, give up */ 2257 while (!list_empty(&newstripes)) { 2258 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2259 list_del(&nsh->lru); 2260 kmem_cache_free(sc, nsh); 2261 } 2262 kmem_cache_destroy(sc); 2263 mutex_unlock(&conf->cache_size_mutex); 2264 return -ENOMEM; 2265 } 2266 /* Step 2 - Must use GFP_NOIO now.
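 * (GFP_KERNEL allocations could block waiting for writeback that has to
 * go through this very array, which cannot make progress while its
 * stripes are being held here, hence GFP_NOIO from this point on.)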
2267 * OK, we have enough stripes, start collecting inactive 2268 * stripes and copying them over 2269 */ 2270 hash = 0; 2271 cnt = 0; 2272 list_for_each_entry(nsh, &newstripes, lru) { 2273 lock_device_hash_lock(conf, hash); 2274 wait_event_cmd(conf->wait_for_stripe, 2275 !list_empty(conf->inactive_list + hash), 2276 unlock_device_hash_lock(conf, hash), 2277 lock_device_hash_lock(conf, hash)); 2278 osh = get_free_stripe(conf, hash); 2279 unlock_device_hash_lock(conf, hash); 2280 2281 for(i=0; i<conf->pool_size; i++) { 2282 nsh->dev[i].page = osh->dev[i].page; 2283 nsh->dev[i].orig_page = osh->dev[i].page; 2284 } 2285 nsh->hash_lock_index = hash; 2286 kmem_cache_free(conf->slab_cache, osh); 2287 cnt++; 2288 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2289 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2290 hash++; 2291 cnt = 0; 2292 } 2293 } 2294 kmem_cache_destroy(conf->slab_cache); 2295 2296 /* Step 3. 2297 * At this point, we are holding all the stripes so the array 2298 * is completely stalled, so now is a good time to resize 2299 * conf->disks and the scribble region 2300 */ 2301 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2302 if (ndisks) { 2303 for (i = 0; i < conf->pool_size; i++) 2304 ndisks[i] = conf->disks[i]; 2305 2306 for (i = conf->pool_size; i < newsize; i++) { 2307 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2308 if (!ndisks[i].extra_page) 2309 err = -ENOMEM; 2310 } 2311 2312 if (err) { 2313 for (i = conf->pool_size; i < newsize; i++) 2314 if (ndisks[i].extra_page) 2315 put_page(ndisks[i].extra_page); 2316 kfree(ndisks); 2317 } else { 2318 kfree(conf->disks); 2319 conf->disks = ndisks; 2320 } 2321 } else 2322 err = -ENOMEM; 2323 2324 mutex_unlock(&conf->cache_size_mutex); 2325 /* Step 4, return new stripes to service */ 2326 while(!list_empty(&newstripes)) { 2327 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2328 list_del_init(&nsh->lru); 2329 2330 for (i=conf->raid_disks; i < newsize; i++) 2331 if (nsh->dev[i].page == NULL) { 2332 struct page *p = alloc_page(GFP_NOIO); 2333 nsh->dev[i].page = p; 2334 nsh->dev[i].orig_page = p; 2335 if (!p) 2336 err = -ENOMEM; 2337 } 2338 raid5_release_stripe(nsh); 2339 } 2340 /* critical section pass, GFP_NOIO no longer needed */ 2341 2342 conf->slab_cache = sc; 2343 conf->active_name = 1-conf->active_name; 2344 if (!err) 2345 conf->pool_size = newsize; 2346 return err; 2347 } 2348 2349 static int drop_one_stripe(struct r5conf *conf) 2350 { 2351 struct stripe_head *sh; 2352 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2353 2354 spin_lock_irq(conf->hash_locks + hash); 2355 sh = get_free_stripe(conf, hash); 2356 spin_unlock_irq(conf->hash_locks + hash); 2357 if (!sh) 2358 return 0; 2359 BUG_ON(atomic_read(&sh->count)); 2360 shrink_buffers(sh); 2361 kmem_cache_free(conf->slab_cache, sh); 2362 atomic_dec(&conf->active_stripes); 2363 conf->max_nr_stripes--; 2364 return 1; 2365 } 2366 2367 static void shrink_stripes(struct r5conf *conf) 2368 { 2369 while (conf->max_nr_stripes && 2370 drop_one_stripe(conf)) 2371 ; 2372 2373 kmem_cache_destroy(conf->slab_cache); 2374 conf->slab_cache = NULL; 2375 } 2376 2377 static void raid5_end_read_request(struct bio * bi) 2378 { 2379 struct stripe_head *sh = bi->bi_private; 2380 struct r5conf *conf = sh->raid_conf; 2381 int disks = sh->disks, i; 2382 char b[BDEVNAME_SIZE]; 2383 struct md_rdev *rdev = NULL; 2384 sector_t s; 2385 2386 for (i=0 ; i<disks; i++) 2387 if (bi == &sh->dev[i].req) 2388 break; 2389 2390 
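/*
 * 'i' now identifies which r5dev's embedded read bio completed;
 * i == disks means the bio matched no device and is treated as a
 * bug below.
 */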
pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2391 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2392 bi->bi_error); 2393 if (i == disks) { 2394 bio_reset(bi); 2395 BUG(); 2396 return; 2397 } 2398 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2399 /* If replacement finished while this request was outstanding, 2400 * 'replacement' might be NULL already. 2401 * In that case it moved down to 'rdev'. 2402 * rdev is not removed until all requests are finished. 2403 */ 2404 rdev = conf->disks[i].replacement; 2405 if (!rdev) 2406 rdev = conf->disks[i].rdev; 2407 2408 if (use_new_offset(conf, sh)) 2409 s = sh->sector + rdev->new_data_offset; 2410 else 2411 s = sh->sector + rdev->data_offset; 2412 if (!bi->bi_error) { 2413 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2414 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2415 /* Note that this cannot happen on a 2416 * replacement device. We just fail those on 2417 * any error 2418 */ 2419 pr_info_ratelimited( 2420 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2421 mdname(conf->mddev), STRIPE_SECTORS, 2422 (unsigned long long)s, 2423 bdevname(rdev->bdev, b)); 2424 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2425 clear_bit(R5_ReadError, &sh->dev[i].flags); 2426 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2427 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2428 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2429 2430 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2431 /* 2432 * end read for a page in journal, this 2433 * must be preparing for prexor in rmw 2434 */ 2435 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2436 2437 if (atomic_read(&rdev->read_errors)) 2438 atomic_set(&rdev->read_errors, 0); 2439 } else { 2440 const char *bdn = bdevname(rdev->bdev, b); 2441 int retry = 0; 2442 int set_bad = 0; 2443 2444 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2445 atomic_inc(&rdev->read_errors); 2446 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2447 pr_warn_ratelimited( 2448 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2449 mdname(conf->mddev), 2450 (unsigned long long)s, 2451 bdn); 2452 else if (conf->mddev->degraded >= conf->max_degraded) { 2453 set_bad = 1; 2454 pr_warn_ratelimited( 2455 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2456 mdname(conf->mddev), 2457 (unsigned long long)s, 2458 bdn); 2459 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2460 /* Oh, no!!! */ 2461 set_bad = 1; 2462 pr_warn_ratelimited( 2463 "md/raid:%s: read error NOT corrected!! 
(sector %llu on %s).\n", 2464 mdname(conf->mddev), 2465 (unsigned long long)s, 2466 bdn); 2467 } else if (atomic_read(&rdev->read_errors) 2468 > conf->max_nr_stripes) 2469 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2470 mdname(conf->mddev), bdn); 2471 else 2472 retry = 1; 2473 if (set_bad && test_bit(In_sync, &rdev->flags) 2474 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2475 retry = 1; 2476 if (retry) 2477 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2478 set_bit(R5_ReadError, &sh->dev[i].flags); 2479 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2480 } else 2481 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2482 else { 2483 clear_bit(R5_ReadError, &sh->dev[i].flags); 2484 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2485 if (!(set_bad 2486 && test_bit(In_sync, &rdev->flags) 2487 && rdev_set_badblocks( 2488 rdev, sh->sector, STRIPE_SECTORS, 0))) 2489 md_error(conf->mddev, rdev); 2490 } 2491 } 2492 rdev_dec_pending(rdev, conf->mddev); 2493 bio_reset(bi); 2494 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2495 set_bit(STRIPE_HANDLE, &sh->state); 2496 raid5_release_stripe(sh); 2497 } 2498 2499 static void raid5_end_write_request(struct bio *bi) 2500 { 2501 struct stripe_head *sh = bi->bi_private; 2502 struct r5conf *conf = sh->raid_conf; 2503 int disks = sh->disks, i; 2504 struct md_rdev *uninitialized_var(rdev); 2505 sector_t first_bad; 2506 int bad_sectors; 2507 int replacement = 0; 2508 2509 for (i = 0 ; i < disks; i++) { 2510 if (bi == &sh->dev[i].req) { 2511 rdev = conf->disks[i].rdev; 2512 break; 2513 } 2514 if (bi == &sh->dev[i].rreq) { 2515 rdev = conf->disks[i].replacement; 2516 if (rdev) 2517 replacement = 1; 2518 else 2519 /* rdev was removed and 'replacement' 2520 * replaced it. rdev is not removed 2521 * until all requests are finished. 2522 */ 2523 rdev = conf->disks[i].rdev; 2524 break; 2525 } 2526 } 2527 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2528 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2529 bi->bi_error); 2530 if (i == disks) { 2531 bio_reset(bi); 2532 BUG(); 2533 return; 2534 } 2535 2536 if (replacement) { 2537 if (bi->bi_error) 2538 md_error(conf->mddev, rdev); 2539 else if (is_badblock(rdev, sh->sector, 2540 STRIPE_SECTORS, 2541 &first_bad, &bad_sectors)) 2542 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2543 } else { 2544 if (bi->bi_error) { 2545 set_bit(STRIPE_DEGRADED, &sh->state); 2546 set_bit(WriteErrorSeen, &rdev->flags); 2547 set_bit(R5_WriteError, &sh->dev[i].flags); 2548 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2549 set_bit(MD_RECOVERY_NEEDED, 2550 &rdev->mddev->recovery); 2551 } else if (is_badblock(rdev, sh->sector, 2552 STRIPE_SECTORS, 2553 &first_bad, &bad_sectors)) { 2554 set_bit(R5_MadeGood, &sh->dev[i].flags); 2555 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2556 /* That was a successful write so make 2557 * sure it looks like we already did 2558 * a re-write. 
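 * (The successful write has overwritten the sectors that previously
 * failed to read, which is what the read-error recovery path would
 * have done anyway, so R5_ReWrite is set as if that rewrite had
 * already been issued.)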
2559 */ 2560 set_bit(R5_ReWrite, &sh->dev[i].flags); 2561 } 2562 } 2563 rdev_dec_pending(rdev, conf->mddev); 2564 2565 if (sh->batch_head && bi->bi_error && !replacement) 2566 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2567 2568 bio_reset(bi); 2569 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2570 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2571 set_bit(STRIPE_HANDLE, &sh->state); 2572 raid5_release_stripe(sh); 2573 2574 if (sh->batch_head && sh != sh->batch_head) 2575 raid5_release_stripe(sh->batch_head); 2576 } 2577 2578 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2579 { 2580 struct r5dev *dev = &sh->dev[i]; 2581 2582 dev->flags = 0; 2583 dev->sector = raid5_compute_blocknr(sh, i, previous); 2584 } 2585 2586 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2587 { 2588 char b[BDEVNAME_SIZE]; 2589 struct r5conf *conf = mddev->private; 2590 unsigned long flags; 2591 pr_debug("raid456: error called\n"); 2592 2593 spin_lock_irqsave(&conf->device_lock, flags); 2594 clear_bit(In_sync, &rdev->flags); 2595 mddev->degraded = raid5_calc_degraded(conf); 2596 spin_unlock_irqrestore(&conf->device_lock, flags); 2597 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2598 2599 set_bit(Blocked, &rdev->flags); 2600 set_bit(Faulty, &rdev->flags); 2601 set_mask_bits(&mddev->sb_flags, 0, 2602 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2603 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2604 "md/raid:%s: Operation continuing on %d devices.\n", 2605 mdname(mddev), 2606 bdevname(rdev->bdev, b), 2607 mdname(mddev), 2608 conf->raid_disks - mddev->degraded); 2609 r5c_update_on_rdev_error(mddev); 2610 } 2611 2612 /* 2613 * Input: a 'big' sector number, 2614 * Output: index of the data and parity disk, and the sector # in them. 2615 */ 2616 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2617 int previous, int *dd_idx, 2618 struct stripe_head *sh) 2619 { 2620 sector_t stripe, stripe2; 2621 sector_t chunk_number; 2622 unsigned int chunk_offset; 2623 int pd_idx, qd_idx; 2624 int ddf_layout = 0; 2625 sector_t new_sector; 2626 int algorithm = previous ? conf->prev_algo 2627 : conf->algorithm; 2628 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2629 : conf->chunk_sectors; 2630 int raid_disks = previous ? conf->previous_raid_disks 2631 : conf->raid_disks; 2632 int data_disks = raid_disks - conf->max_degraded; 2633 2634 /* First compute the information on this sector */ 2635 2636 /* 2637 * Compute the chunk number and the sector offset inside the chunk 2638 */ 2639 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2640 chunk_number = r_sector; 2641 2642 /* 2643 * Compute the stripe number 2644 */ 2645 stripe = chunk_number; 2646 *dd_idx = sector_div(stripe, data_disks); 2647 stripe2 = stripe; 2648 /* 2649 * Select the parity disk based on the user selected algorithm. 
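 * As an illustrative example (numbers not taken from any particular
 * array): with a 4-disk RAID5 using ALGORITHM_LEFT_SYMMETRIC, the
 * switch below puts parity on disk 3 for stripe 0, disk 2 for stripe 1,
 * disk 1 for stripe 2 and disk 0 for stripe 3, after which the pattern
 * repeats; the data blocks of each stripe start on the disk immediately
 * after the parity disk and wrap around the array.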
2650 */ 2651 pd_idx = qd_idx = -1; 2652 switch(conf->level) { 2653 case 4: 2654 pd_idx = data_disks; 2655 break; 2656 case 5: 2657 switch (algorithm) { 2658 case ALGORITHM_LEFT_ASYMMETRIC: 2659 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2660 if (*dd_idx >= pd_idx) 2661 (*dd_idx)++; 2662 break; 2663 case ALGORITHM_RIGHT_ASYMMETRIC: 2664 pd_idx = sector_div(stripe2, raid_disks); 2665 if (*dd_idx >= pd_idx) 2666 (*dd_idx)++; 2667 break; 2668 case ALGORITHM_LEFT_SYMMETRIC: 2669 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2670 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2671 break; 2672 case ALGORITHM_RIGHT_SYMMETRIC: 2673 pd_idx = sector_div(stripe2, raid_disks); 2674 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2675 break; 2676 case ALGORITHM_PARITY_0: 2677 pd_idx = 0; 2678 (*dd_idx)++; 2679 break; 2680 case ALGORITHM_PARITY_N: 2681 pd_idx = data_disks; 2682 break; 2683 default: 2684 BUG(); 2685 } 2686 break; 2687 case 6: 2688 2689 switch (algorithm) { 2690 case ALGORITHM_LEFT_ASYMMETRIC: 2691 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2692 qd_idx = pd_idx + 1; 2693 if (pd_idx == raid_disks-1) { 2694 (*dd_idx)++; /* Q D D D P */ 2695 qd_idx = 0; 2696 } else if (*dd_idx >= pd_idx) 2697 (*dd_idx) += 2; /* D D P Q D */ 2698 break; 2699 case ALGORITHM_RIGHT_ASYMMETRIC: 2700 pd_idx = sector_div(stripe2, raid_disks); 2701 qd_idx = pd_idx + 1; 2702 if (pd_idx == raid_disks-1) { 2703 (*dd_idx)++; /* Q D D D P */ 2704 qd_idx = 0; 2705 } else if (*dd_idx >= pd_idx) 2706 (*dd_idx) += 2; /* D D P Q D */ 2707 break; 2708 case ALGORITHM_LEFT_SYMMETRIC: 2709 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2710 qd_idx = (pd_idx + 1) % raid_disks; 2711 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2712 break; 2713 case ALGORITHM_RIGHT_SYMMETRIC: 2714 pd_idx = sector_div(stripe2, raid_disks); 2715 qd_idx = (pd_idx + 1) % raid_disks; 2716 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2717 break; 2718 2719 case ALGORITHM_PARITY_0: 2720 pd_idx = 0; 2721 qd_idx = 1; 2722 (*dd_idx) += 2; 2723 break; 2724 case ALGORITHM_PARITY_N: 2725 pd_idx = data_disks; 2726 qd_idx = data_disks + 1; 2727 break; 2728 2729 case ALGORITHM_ROTATING_ZERO_RESTART: 2730 /* Exactly the same as RIGHT_ASYMMETRIC, but the order 2731 * of blocks for computing Q is different.
2732 */ 2733 pd_idx = sector_div(stripe2, raid_disks); 2734 qd_idx = pd_idx + 1; 2735 if (pd_idx == raid_disks-1) { 2736 (*dd_idx)++; /* Q D D D P */ 2737 qd_idx = 0; 2738 } else if (*dd_idx >= pd_idx) 2739 (*dd_idx) += 2; /* D D P Q D */ 2740 ddf_layout = 1; 2741 break; 2742 2743 case ALGORITHM_ROTATING_N_RESTART: 2744 /* Same as left_asymmetric, but the first stripe is 2745 * D D D P Q rather than 2746 * Q D D D P 2747 */ 2748 stripe2 += 1; 2749 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2750 qd_idx = pd_idx + 1; 2751 if (pd_idx == raid_disks-1) { 2752 (*dd_idx)++; /* Q D D D P */ 2753 qd_idx = 0; 2754 } else if (*dd_idx >= pd_idx) 2755 (*dd_idx) += 2; /* D D P Q D */ 2756 ddf_layout = 1; 2757 break; 2758 2759 case ALGORITHM_ROTATING_N_CONTINUE: 2760 /* Same as left_symmetric but Q is before P */ 2761 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2762 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2763 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2764 ddf_layout = 1; 2765 break; 2766 2767 case ALGORITHM_LEFT_ASYMMETRIC_6: 2768 /* RAID5 left_asymmetric, with Q on last device */ 2769 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2770 if (*dd_idx >= pd_idx) 2771 (*dd_idx)++; 2772 qd_idx = raid_disks - 1; 2773 break; 2774 2775 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2776 pd_idx = sector_div(stripe2, raid_disks-1); 2777 if (*dd_idx >= pd_idx) 2778 (*dd_idx)++; 2779 qd_idx = raid_disks - 1; 2780 break; 2781 2782 case ALGORITHM_LEFT_SYMMETRIC_6: 2783 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2784 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2785 qd_idx = raid_disks - 1; 2786 break; 2787 2788 case ALGORITHM_RIGHT_SYMMETRIC_6: 2789 pd_idx = sector_div(stripe2, raid_disks-1); 2790 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2791 qd_idx = raid_disks - 1; 2792 break; 2793 2794 case ALGORITHM_PARITY_0_6: 2795 pd_idx = 0; 2796 (*dd_idx)++; 2797 qd_idx = raid_disks - 1; 2798 break; 2799 2800 default: 2801 BUG(); 2802 } 2803 break; 2804 } 2805 2806 if (sh) { 2807 sh->pd_idx = pd_idx; 2808 sh->qd_idx = qd_idx; 2809 sh->ddf_layout = ddf_layout; 2810 } 2811 /* 2812 * Finally, compute the new sector number 2813 */ 2814 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2815 return new_sector; 2816 } 2817 2818 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2819 { 2820 struct r5conf *conf = sh->raid_conf; 2821 int raid_disks = sh->disks; 2822 int data_disks = raid_disks - conf->max_degraded; 2823 sector_t new_sector = sh->sector, check; 2824 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2825 : conf->chunk_sectors; 2826 int algorithm = previous ?
conf->prev_algo 2827 : conf->algorithm; 2828 sector_t stripe; 2829 int chunk_offset; 2830 sector_t chunk_number; 2831 int dummy1, dd_idx = i; 2832 sector_t r_sector; 2833 struct stripe_head sh2; 2834 2835 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2836 stripe = new_sector; 2837 2838 if (i == sh->pd_idx) 2839 return 0; 2840 switch(conf->level) { 2841 case 4: break; 2842 case 5: 2843 switch (algorithm) { 2844 case ALGORITHM_LEFT_ASYMMETRIC: 2845 case ALGORITHM_RIGHT_ASYMMETRIC: 2846 if (i > sh->pd_idx) 2847 i--; 2848 break; 2849 case ALGORITHM_LEFT_SYMMETRIC: 2850 case ALGORITHM_RIGHT_SYMMETRIC: 2851 if (i < sh->pd_idx) 2852 i += raid_disks; 2853 i -= (sh->pd_idx + 1); 2854 break; 2855 case ALGORITHM_PARITY_0: 2856 i -= 1; 2857 break; 2858 case ALGORITHM_PARITY_N: 2859 break; 2860 default: 2861 BUG(); 2862 } 2863 break; 2864 case 6: 2865 if (i == sh->qd_idx) 2866 return 0; /* It is the Q disk */ 2867 switch (algorithm) { 2868 case ALGORITHM_LEFT_ASYMMETRIC: 2869 case ALGORITHM_RIGHT_ASYMMETRIC: 2870 case ALGORITHM_ROTATING_ZERO_RESTART: 2871 case ALGORITHM_ROTATING_N_RESTART: 2872 if (sh->pd_idx == raid_disks-1) 2873 i--; /* Q D D D P */ 2874 else if (i > sh->pd_idx) 2875 i -= 2; /* D D P Q D */ 2876 break; 2877 case ALGORITHM_LEFT_SYMMETRIC: 2878 case ALGORITHM_RIGHT_SYMMETRIC: 2879 if (sh->pd_idx == raid_disks-1) 2880 i--; /* Q D D D P */ 2881 else { 2882 /* D D P Q D */ 2883 if (i < sh->pd_idx) 2884 i += raid_disks; 2885 i -= (sh->pd_idx + 2); 2886 } 2887 break; 2888 case ALGORITHM_PARITY_0: 2889 i -= 2; 2890 break; 2891 case ALGORITHM_PARITY_N: 2892 break; 2893 case ALGORITHM_ROTATING_N_CONTINUE: 2894 /* Like left_symmetric, but P is before Q */ 2895 if (sh->pd_idx == 0) 2896 i--; /* P D D D Q */ 2897 else { 2898 /* D D Q P D */ 2899 if (i < sh->pd_idx) 2900 i += raid_disks; 2901 i -= (sh->pd_idx + 1); 2902 } 2903 break; 2904 case ALGORITHM_LEFT_ASYMMETRIC_6: 2905 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2906 if (i > sh->pd_idx) 2907 i--; 2908 break; 2909 case ALGORITHM_LEFT_SYMMETRIC_6: 2910 case ALGORITHM_RIGHT_SYMMETRIC_6: 2911 if (i < sh->pd_idx) 2912 i += data_disks + 1; 2913 i -= (sh->pd_idx + 1); 2914 break; 2915 case ALGORITHM_PARITY_0_6: 2916 i -= 1; 2917 break; 2918 default: 2919 BUG(); 2920 } 2921 break; 2922 } 2923 2924 chunk_number = stripe * data_disks + i; 2925 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2926 2927 check = raid5_compute_sector(conf, r_sector, 2928 previous, &dummy1, &sh2); 2929 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2930 || sh2.qd_idx != sh->qd_idx) { 2931 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 2932 mdname(conf->mddev)); 2933 return 0; 2934 } 2935 return r_sector; 2936 } 2937 2938 /* 2939 * There are cases where we want handle_stripe_dirtying() and 2940 * schedule_reconstruction() to delay towrite to some dev of a stripe. 2941 * 2942 * This function checks whether we want to delay the towrite. Specifically, 2943 * we delay the towrite when: 2944 * 2945 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 2946 * stripe has data in journal (for other devices). 2947 * 2948 * In this case, when reading data for the non-overwrite dev, it is 2949 * necessary to handle complex rmw of write back cache (prexor with 2950 * orig_page, and xor with page). To keep read path simple, we would 2951 * like to flush data in journal to RAID disks first, so complex rmw 2952 * is handled in the write patch (handle_stripe_dirtying). 2953 * 2954 * 2. 
when journal space is critical (R5C_LOG_CRITICAL=1) 2955 * 2956 * It is important to be able to flush all stripes in raid5-cache. 2957 * Therefore, we need reserve some space on the journal device for 2958 * these flushes. If flush operation includes pending writes to the 2959 * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe 2960 * for the flush out. If we exclude these pending writes from flush 2961 * operation, we only need (conf->max_degraded + 1) pages per stripe. 2962 * Therefore, excluding pending writes in these cases enables more 2963 * efficient use of the journal device. 2964 * 2965 * Note: To make sure the stripe makes progress, we only delay 2966 * towrite for stripes with data already in journal (injournal > 0). 2967 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 2968 * no_space_stripes list. 2969 * 2970 */ 2971 static inline bool delay_towrite(struct r5conf *conf, 2972 struct r5dev *dev, 2973 struct stripe_head_state *s) 2974 { 2975 /* case 1 above */ 2976 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2977 !test_bit(R5_Insync, &dev->flags) && s->injournal) 2978 return true; 2979 /* case 2 above */ 2980 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 2981 s->injournal > 0) 2982 return true; 2983 return false; 2984 } 2985 2986 static void 2987 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2988 int rcw, int expand) 2989 { 2990 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 2991 struct r5conf *conf = sh->raid_conf; 2992 int level = conf->level; 2993 2994 if (rcw) { 2995 /* 2996 * In some cases, handle_stripe_dirtying initially decided to 2997 * run rmw and allocates extra page for prexor. However, rcw is 2998 * cheaper later on. We need to free the extra page now, 2999 * because we won't be able to do that in ops_complete_prexor(). 
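 * (On the rmw path the extra page is normally given back once the
 * prexor completes; the rcw path never issues a prexor, so this is
 * the last chance to release it.)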
3000 */ 3001 r5c_release_extra_page(sh); 3002 3003 for (i = disks; i--; ) { 3004 struct r5dev *dev = &sh->dev[i]; 3005 3006 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3007 set_bit(R5_LOCKED, &dev->flags); 3008 set_bit(R5_Wantdrain, &dev->flags); 3009 if (!expand) 3010 clear_bit(R5_UPTODATE, &dev->flags); 3011 s->locked++; 3012 } else if (test_bit(R5_InJournal, &dev->flags)) { 3013 set_bit(R5_LOCKED, &dev->flags); 3014 s->locked++; 3015 } 3016 } 3017 /* if we are not expanding this is a proper write request, and 3018 * there will be bios with new data to be drained into the 3019 * stripe cache 3020 */ 3021 if (!expand) { 3022 if (!s->locked) 3023 /* False alarm, nothing to do */ 3024 return; 3025 sh->reconstruct_state = reconstruct_state_drain_run; 3026 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3027 } else 3028 sh->reconstruct_state = reconstruct_state_run; 3029 3030 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3031 3032 if (s->locked + conf->max_degraded == disks) 3033 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3034 atomic_inc(&conf->pending_full_writes); 3035 } else { 3036 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3037 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3038 BUG_ON(level == 6 && 3039 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3040 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3041 3042 for (i = disks; i--; ) { 3043 struct r5dev *dev = &sh->dev[i]; 3044 if (i == pd_idx || i == qd_idx) 3045 continue; 3046 3047 if (dev->towrite && 3048 (test_bit(R5_UPTODATE, &dev->flags) || 3049 test_bit(R5_Wantcompute, &dev->flags))) { 3050 set_bit(R5_Wantdrain, &dev->flags); 3051 set_bit(R5_LOCKED, &dev->flags); 3052 clear_bit(R5_UPTODATE, &dev->flags); 3053 s->locked++; 3054 } else if (test_bit(R5_InJournal, &dev->flags)) { 3055 set_bit(R5_LOCKED, &dev->flags); 3056 s->locked++; 3057 } 3058 } 3059 if (!s->locked) 3060 /* False alarm - nothing to do */ 3061 return; 3062 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3063 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3064 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3065 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3066 } 3067 3068 /* keep the parity disk(s) locked while asynchronous operations 3069 * are in flight 3070 */ 3071 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3072 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3073 s->locked++; 3074 3075 if (level == 6) { 3076 int qd_idx = sh->qd_idx; 3077 struct r5dev *dev = &sh->dev[qd_idx]; 3078 3079 set_bit(R5_LOCKED, &dev->flags); 3080 clear_bit(R5_UPTODATE, &dev->flags); 3081 s->locked++; 3082 } 3083 3084 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3085 __func__, (unsigned long long)sh->sector, 3086 s->locked, s->ops_request); 3087 } 3088 3089 /* 3090 * Each stripe/dev can have one or more bion attached. 3091 * toread/towrite point to the first in a chain. 3092 * The bi_next chain must be in order. 3093 */ 3094 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3095 int forwrite, int previous) 3096 { 3097 struct bio **bip; 3098 struct r5conf *conf = sh->raid_conf; 3099 int firstwrite=0; 3100 3101 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3102 (unsigned long long)bi->bi_iter.bi_sector, 3103 (unsigned long long)sh->sector); 3104 3105 /* 3106 * If several bio share a stripe. The bio bi_phys_segments acts as a 3107 * reference count to avoid race. 
The reference count should already be 3108 * increased before this function is called (for example, in 3109 * raid5_make_request()), so other bio sharing this stripe will not free the 3110 * stripe. If a stripe is owned by one stripe, the stripe lock will 3111 * protect it. 3112 */ 3113 spin_lock_irq(&sh->stripe_lock); 3114 /* Don't allow new IO added to stripes in batch list */ 3115 if (sh->batch_head) 3116 goto overlap; 3117 if (forwrite) { 3118 bip = &sh->dev[dd_idx].towrite; 3119 if (*bip == NULL) 3120 firstwrite = 1; 3121 } else 3122 bip = &sh->dev[dd_idx].toread; 3123 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3124 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3125 goto overlap; 3126 bip = & (*bip)->bi_next; 3127 } 3128 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3129 goto overlap; 3130 3131 if (!forwrite || previous) 3132 clear_bit(STRIPE_BATCH_READY, &sh->state); 3133 3134 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3135 if (*bip) 3136 bi->bi_next = *bip; 3137 *bip = bi; 3138 raid5_inc_bi_active_stripes(bi); 3139 3140 if (forwrite) { 3141 /* check if page is covered */ 3142 sector_t sector = sh->dev[dd_idx].sector; 3143 for (bi=sh->dev[dd_idx].towrite; 3144 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3145 bi && bi->bi_iter.bi_sector <= sector; 3146 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3147 if (bio_end_sector(bi) >= sector) 3148 sector = bio_end_sector(bi); 3149 } 3150 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3151 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3152 sh->overwrite_disks++; 3153 } 3154 3155 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3156 (unsigned long long)(*bip)->bi_iter.bi_sector, 3157 (unsigned long long)sh->sector, dd_idx); 3158 3159 if (conf->mddev->bitmap && firstwrite) { 3160 /* Cannot hold spinlock over bitmap_startwrite, 3161 * but must ensure this isn't added to a batch until 3162 * we have added to the bitmap and set bm_seq. 3163 * So set STRIPE_BITMAP_PENDING to prevent 3164 * batching. 3165 * If multiple add_stripe_bio() calls race here they 3166 * much all set STRIPE_BITMAP_PENDING. So only the first one 3167 * to complete "bitmap_startwrite" gets to set 3168 * STRIPE_BIT_DELAY. This is important as once a stripe 3169 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3170 * any more. 3171 */ 3172 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3173 spin_unlock_irq(&sh->stripe_lock); 3174 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3175 STRIPE_SECTORS, 0); 3176 spin_lock_irq(&sh->stripe_lock); 3177 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3178 if (!sh->batch_head) { 3179 sh->bm_seq = conf->seq_flush+1; 3180 set_bit(STRIPE_BIT_DELAY, &sh->state); 3181 } 3182 } 3183 spin_unlock_irq(&sh->stripe_lock); 3184 3185 if (stripe_can_batch(sh)) 3186 stripe_add_to_batch_list(conf, sh); 3187 return 1; 3188 3189 overlap: 3190 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3191 spin_unlock_irq(&sh->stripe_lock); 3192 return 0; 3193 } 3194 3195 static void end_reshape(struct r5conf *conf); 3196 3197 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3198 struct stripe_head *sh) 3199 { 3200 int sectors_per_chunk = 3201 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3202 int dd_idx; 3203 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3204 int disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 3205 3206 raid5_compute_sector(conf, 3207 stripe * (disks - conf->max_degraded) 3208 *sectors_per_chunk + chunk_offset, 3209 previous, 3210 &dd_idx, sh); 3211 } 3212 3213 static void 3214 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3215 struct stripe_head_state *s, int disks, 3216 struct bio_list *return_bi) 3217 { 3218 int i; 3219 BUG_ON(sh->batch_head); 3220 for (i = disks; i--; ) { 3221 struct bio *bi; 3222 int bitmap_end = 0; 3223 3224 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3225 struct md_rdev *rdev; 3226 rcu_read_lock(); 3227 rdev = rcu_dereference(conf->disks[i].rdev); 3228 if (rdev && test_bit(In_sync, &rdev->flags) && 3229 !test_bit(Faulty, &rdev->flags)) 3230 atomic_inc(&rdev->nr_pending); 3231 else 3232 rdev = NULL; 3233 rcu_read_unlock(); 3234 if (rdev) { 3235 if (!rdev_set_badblocks( 3236 rdev, 3237 sh->sector, 3238 STRIPE_SECTORS, 0)) 3239 md_error(conf->mddev, rdev); 3240 rdev_dec_pending(rdev, conf->mddev); 3241 } 3242 } 3243 spin_lock_irq(&sh->stripe_lock); 3244 /* fail all writes first */ 3245 bi = sh->dev[i].towrite; 3246 sh->dev[i].towrite = NULL; 3247 sh->overwrite_disks = 0; 3248 spin_unlock_irq(&sh->stripe_lock); 3249 if (bi) 3250 bitmap_end = 1; 3251 3252 r5l_stripe_write_finished(sh); 3253 3254 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3255 wake_up(&conf->wait_for_overlap); 3256 3257 while (bi && bi->bi_iter.bi_sector < 3258 sh->dev[i].sector + STRIPE_SECTORS) { 3259 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3260 3261 bi->bi_error = -EIO; 3262 if (!raid5_dec_bi_active_stripes(bi)) { 3263 md_write_end(conf->mddev); 3264 bio_list_add(return_bi, bi); 3265 } 3266 bi = nextbi; 3267 } 3268 if (bitmap_end) 3269 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3270 STRIPE_SECTORS, 0, 0); 3271 bitmap_end = 0; 3272 /* and fail all 'written' */ 3273 bi = sh->dev[i].written; 3274 sh->dev[i].written = NULL; 3275 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3276 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3277 sh->dev[i].page = sh->dev[i].orig_page; 3278 } 3279 3280 if (bi) bitmap_end = 1; 3281 while (bi && bi->bi_iter.bi_sector < 3282 sh->dev[i].sector + STRIPE_SECTORS) { 3283 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3284 3285 bi->bi_error = -EIO; 3286 if (!raid5_dec_bi_active_stripes(bi)) { 3287 md_write_end(conf->mddev); 3288 bio_list_add(return_bi, bi); 3289 } 3290 bi = bi2; 3291 } 3292 3293 /* fail any reads if this device is non-operational and 3294 * the data has not reached the cache yet. 
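 * Reads are failed outright only when more devices have failed than
 * the redundancy can cover (s->failed > conf->max_degraded) and this
 * device itself is either not in-sync or has seen a read error; see
 * the test just below.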
3295 */ 3296 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3297 s->failed > conf->max_degraded && 3298 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3299 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3300 spin_lock_irq(&sh->stripe_lock); 3301 bi = sh->dev[i].toread; 3302 sh->dev[i].toread = NULL; 3303 spin_unlock_irq(&sh->stripe_lock); 3304 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3305 wake_up(&conf->wait_for_overlap); 3306 if (bi) 3307 s->to_read--; 3308 while (bi && bi->bi_iter.bi_sector < 3309 sh->dev[i].sector + STRIPE_SECTORS) { 3310 struct bio *nextbi = 3311 r5_next_bio(bi, sh->dev[i].sector); 3312 3313 bi->bi_error = -EIO; 3314 if (!raid5_dec_bi_active_stripes(bi)) 3315 bio_list_add(return_bi, bi); 3316 bi = nextbi; 3317 } 3318 } 3319 if (bitmap_end) 3320 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3321 STRIPE_SECTORS, 0, 0); 3322 /* If we were in the middle of a write the parity block might 3323 * still be locked - so just clear all R5_LOCKED flags 3324 */ 3325 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3326 } 3327 s->to_write = 0; 3328 s->written = 0; 3329 3330 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3331 if (atomic_dec_and_test(&conf->pending_full_writes)) 3332 md_wakeup_thread(conf->mddev->thread); 3333 } 3334 3335 static void 3336 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3337 struct stripe_head_state *s) 3338 { 3339 int abort = 0; 3340 int i; 3341 3342 BUG_ON(sh->batch_head); 3343 clear_bit(STRIPE_SYNCING, &sh->state); 3344 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3345 wake_up(&conf->wait_for_overlap); 3346 s->syncing = 0; 3347 s->replacing = 0; 3348 /* There is nothing more to do for sync/check/repair. 3349 * Don't even need to abort as that is handled elsewhere 3350 * if needed, and not always wanted e.g. if there is a known 3351 * bad block here. 
3352 * For recover/replace we need to record a bad block on all 3353 * non-sync devices, or abort the recovery 3354 */ 3355 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3356 /* During recovery devices cannot be removed, so 3357 * locking and refcounting of rdevs is not needed 3358 */ 3359 rcu_read_lock(); 3360 for (i = 0; i < conf->raid_disks; i++) { 3361 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3362 if (rdev 3363 && !test_bit(Faulty, &rdev->flags) 3364 && !test_bit(In_sync, &rdev->flags) 3365 && !rdev_set_badblocks(rdev, sh->sector, 3366 STRIPE_SECTORS, 0)) 3367 abort = 1; 3368 rdev = rcu_dereference(conf->disks[i].replacement); 3369 if (rdev 3370 && !test_bit(Faulty, &rdev->flags) 3371 && !test_bit(In_sync, &rdev->flags) 3372 && !rdev_set_badblocks(rdev, sh->sector, 3373 STRIPE_SECTORS, 0)) 3374 abort = 1; 3375 } 3376 rcu_read_unlock(); 3377 if (abort) 3378 conf->recovery_disabled = 3379 conf->mddev->recovery_disabled; 3380 } 3381 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3382 } 3383 3384 static int want_replace(struct stripe_head *sh, int disk_idx) 3385 { 3386 struct md_rdev *rdev; 3387 int rv = 0; 3388 3389 rcu_read_lock(); 3390 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3391 if (rdev 3392 && !test_bit(Faulty, &rdev->flags) 3393 && !test_bit(In_sync, &rdev->flags) 3394 && (rdev->recovery_offset <= sh->sector 3395 || rdev->mddev->recovery_cp <= sh->sector)) 3396 rv = 1; 3397 rcu_read_unlock(); 3398 return rv; 3399 } 3400 3401 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3402 int disk_idx, int disks) 3403 { 3404 struct r5dev *dev = &sh->dev[disk_idx]; 3405 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3406 &sh->dev[s->failed_num[1]] }; 3407 int i; 3408 3409 3410 if (test_bit(R5_LOCKED, &dev->flags) || 3411 test_bit(R5_UPTODATE, &dev->flags)) 3412 /* No point reading this as we already have it or have 3413 * decided to get it. 3414 */ 3415 return 0; 3416 3417 if (dev->toread || 3418 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3419 /* We need this block to directly satisfy a request */ 3420 return 1; 3421 3422 if (s->syncing || s->expanding || 3423 (s->replacing && want_replace(sh, disk_idx))) 3424 /* When syncing or expanding, we read everything. 3425 * When replacing, we need the replaced block. 3426 */ 3427 return 1; 3428 3429 if ((s->failed >= 1 && fdev[0]->toread) || 3430 (s->failed >= 2 && fdev[1]->toread)) 3431 /* If we want to read from a failed device, then 3432 * we need to actually read every other device. 3433 */ 3434 return 1; 3435 3436 /* Sometimes neither read-modify-write nor reconstruct-write 3437 * cycles can work. In those cases we read every block we 3438 * can. Then the parity-update is certain to have enough to 3439 * work with. 3440 * This can only be a problem when we need to write something, 3441 * and some device has failed. If either of those tests 3442 * fails, we need look no further. 3443 */ 3444 if (!s->failed || !s->to_write) 3445 return 0; 3446 3447 if (test_bit(R5_Insync, &dev->flags) && 3448 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3449 /* Pre-reads are not permitted until after a short delay 3450 * to gather multiple requests. However if this 3451 * device is not Insync, the block could only be computed 3452 * and there is no need to delay that.
3453 */ 3454 return 0; 3455 3456 for (i = 0; i < s->failed && i < 2; i++) { 3457 if (fdev[i]->towrite && 3458 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3459 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3460 /* If we have a partial write to a failed 3461 * device, then we will need to reconstruct 3462 * the content of that device, so all other 3463 * devices must be read. 3464 */ 3465 return 1; 3466 } 3467 3468 /* If we are forced to do a reconstruct-write, either because 3469 * the current RAID6 implementation only supports that, or 3470 * or because parity cannot be trusted and we are currently 3471 * recovering it, there is extra need to be careful. 3472 * If one of the devices that we would need to read, because 3473 * it is not being overwritten (and maybe not written at all) 3474 * is missing/faulty, then we need to read everything we can. 3475 */ 3476 if (sh->raid_conf->level != 6 && 3477 sh->sector < sh->raid_conf->mddev->recovery_cp) 3478 /* reconstruct-write isn't being forced */ 3479 return 0; 3480 for (i = 0; i < s->failed && i < 2; i++) { 3481 if (s->failed_num[i] != sh->pd_idx && 3482 s->failed_num[i] != sh->qd_idx && 3483 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3484 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3485 return 1; 3486 } 3487 3488 return 0; 3489 } 3490 3491 /* fetch_block - checks the given member device to see if its data needs 3492 * to be read or computed to satisfy a request. 3493 * 3494 * Returns 1 when no more member devices need to be checked, otherwise returns 3495 * 0 to tell the loop in handle_stripe_fill to continue 3496 */ 3497 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3498 int disk_idx, int disks) 3499 { 3500 struct r5dev *dev = &sh->dev[disk_idx]; 3501 3502 /* is the data in this block needed, and can we get it? */ 3503 if (need_this_block(sh, s, disk_idx, disks)) { 3504 /* we would like to get this block, possibly by computing it, 3505 * otherwise read it if the backing disk is insync 3506 */ 3507 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3508 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3509 BUG_ON(sh->batch_head); 3510 if ((s->uptodate == disks - 1) && 3511 (s->failed && (disk_idx == s->failed_num[0] || 3512 disk_idx == s->failed_num[1]))) { 3513 /* have disk failed, and we're requested to fetch it; 3514 * do compute it 3515 */ 3516 pr_debug("Computing stripe %llu block %d\n", 3517 (unsigned long long)sh->sector, disk_idx); 3518 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3519 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3520 set_bit(R5_Wantcompute, &dev->flags); 3521 sh->ops.target = disk_idx; 3522 sh->ops.target2 = -1; /* no 2nd target */ 3523 s->req_compute = 1; 3524 /* Careful: from this point on 'uptodate' is in the eye 3525 * of raid_run_ops which services 'compute' operations 3526 * before writes. R5_Wantcompute flags a block that will 3527 * be R5_UPTODATE by the time it is needed for a 3528 * subsequent operation. 
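 * That is why s->uptodate is incremented just below even though no
 * read has actually completed for this block yet.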
3529 */ 3530 s->uptodate++; 3531 return 1; 3532 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3533 /* Computing 2-failure is *very* expensive; only 3534 * do it if failed >= 2 3535 */ 3536 int other; 3537 for (other = disks; other--; ) { 3538 if (other == disk_idx) 3539 continue; 3540 if (!test_bit(R5_UPTODATE, 3541 &sh->dev[other].flags)) 3542 break; 3543 } 3544 BUG_ON(other < 0); 3545 pr_debug("Computing stripe %llu blocks %d,%d\n", 3546 (unsigned long long)sh->sector, 3547 disk_idx, other); 3548 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3549 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3550 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3551 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3552 sh->ops.target = disk_idx; 3553 sh->ops.target2 = other; 3554 s->uptodate += 2; 3555 s->req_compute = 1; 3556 return 1; 3557 } else if (test_bit(R5_Insync, &dev->flags)) { 3558 set_bit(R5_LOCKED, &dev->flags); 3559 set_bit(R5_Wantread, &dev->flags); 3560 s->locked++; 3561 pr_debug("Reading block %d (sync=%d)\n", 3562 disk_idx, s->syncing); 3563 } 3564 } 3565 3566 return 0; 3567 } 3568 3569 /** 3570 * handle_stripe_fill - read or compute data to satisfy pending requests. 3571 */ 3572 static void handle_stripe_fill(struct stripe_head *sh, 3573 struct stripe_head_state *s, 3574 int disks) 3575 { 3576 int i; 3577 3578 /* look for blocks to read/compute, skip this if a compute 3579 * is already in flight, or if the stripe contents are in the 3580 * midst of changing due to a write 3581 */ 3582 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3583 !sh->reconstruct_state) { 3584 3585 /* 3586 * For degraded stripe with data in journal, do not handle 3587 * read requests yet, instead, flush the stripe to raid 3588 * disks first, this avoids handling complex rmw of write 3589 * back cache (prexor with orig_page, and then xor with 3590 * page) in the read path 3591 */ 3592 if (s->injournal && s->failed) { 3593 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3594 r5c_make_stripe_write_out(sh); 3595 goto out; 3596 } 3597 3598 for (i = disks; i--; ) 3599 if (fetch_block(sh, s, i, disks)) 3600 break; 3601 } 3602 out: 3603 set_bit(STRIPE_HANDLE, &sh->state); 3604 } 3605 3606 static void break_stripe_batch_list(struct stripe_head *head_sh, 3607 unsigned long handle_flags); 3608 /* handle_stripe_clean_event 3609 * any written block on an uptodate or failed drive can be returned. 3610 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3611 * never LOCKED, so we don't need to test 'failed' directly. 
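 * Concretely, the function below ends the bios chained on each dev's
 * ->written list, clears the corresponding bitmap bits, and repeats
 * the same work for every member of a batch when the stripe is batched.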
3612 */ 3613 static void handle_stripe_clean_event(struct r5conf *conf, 3614 struct stripe_head *sh, int disks, struct bio_list *return_bi) 3615 { 3616 int i; 3617 struct r5dev *dev; 3618 int discard_pending = 0; 3619 struct stripe_head *head_sh = sh; 3620 bool do_endio = false; 3621 3622 for (i = disks; i--; ) 3623 if (sh->dev[i].written) { 3624 dev = &sh->dev[i]; 3625 if (!test_bit(R5_LOCKED, &dev->flags) && 3626 (test_bit(R5_UPTODATE, &dev->flags) || 3627 test_bit(R5_Discard, &dev->flags) || 3628 test_bit(R5_SkipCopy, &dev->flags))) { 3629 /* We can return any write requests */ 3630 struct bio *wbi, *wbi2; 3631 pr_debug("Return write for disc %d\n", i); 3632 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3633 clear_bit(R5_UPTODATE, &dev->flags); 3634 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3635 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3636 } 3637 do_endio = true; 3638 3639 returnbi: 3640 dev->page = dev->orig_page; 3641 wbi = dev->written; 3642 dev->written = NULL; 3643 while (wbi && wbi->bi_iter.bi_sector < 3644 dev->sector + STRIPE_SECTORS) { 3645 wbi2 = r5_next_bio(wbi, dev->sector); 3646 if (!raid5_dec_bi_active_stripes(wbi)) { 3647 md_write_end(conf->mddev); 3648 bio_list_add(return_bi, wbi); 3649 } 3650 wbi = wbi2; 3651 } 3652 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3653 STRIPE_SECTORS, 3654 !test_bit(STRIPE_DEGRADED, &sh->state), 3655 0); 3656 if (head_sh->batch_head) { 3657 sh = list_first_entry(&sh->batch_list, 3658 struct stripe_head, 3659 batch_list); 3660 if (sh != head_sh) { 3661 dev = &sh->dev[i]; 3662 goto returnbi; 3663 } 3664 } 3665 sh = head_sh; 3666 dev = &sh->dev[i]; 3667 } else if (test_bit(R5_Discard, &dev->flags)) 3668 discard_pending = 1; 3669 } 3670 3671 r5l_stripe_write_finished(sh); 3672 3673 if (!discard_pending && 3674 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3675 int hash; 3676 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3677 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3678 if (sh->qd_idx >= 0) { 3679 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3680 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3681 } 3682 /* now that discard is done we can proceed with any sync */ 3683 clear_bit(STRIPE_DISCARD, &sh->state); 3684 /* 3685 * SCSI discard will change some bio fields and the stripe has 3686 * no updated data, so remove it from hash list and the stripe 3687 * will be reinitialized 3688 */ 3689 unhash: 3690 hash = sh->hash_lock_index; 3691 spin_lock_irq(conf->hash_locks + hash); 3692 remove_hash(sh); 3693 spin_unlock_irq(conf->hash_locks + hash); 3694 if (head_sh->batch_head) { 3695 sh = list_first_entry(&sh->batch_list, 3696 struct stripe_head, batch_list); 3697 if (sh != head_sh) 3698 goto unhash; 3699 } 3700 sh = head_sh; 3701 3702 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3703 set_bit(STRIPE_HANDLE, &sh->state); 3704 3705 } 3706 3707 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3708 if (atomic_dec_and_test(&conf->pending_full_writes)) 3709 md_wakeup_thread(conf->mddev->thread); 3710 3711 if (head_sh->batch_head && do_endio) 3712 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3713 } 3714 3715 /* 3716 * For RMW in write back cache, we need extra page in prexor to store the 3717 * old data. This page is stored in dev->orig_page. 3718 * 3719 * This function checks whether we have data for prexor. 
The exact logic 3720 * is: 3721 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3722 */ 3723 static inline bool uptodate_for_rmw(struct r5dev *dev) 3724 { 3725 return (test_bit(R5_UPTODATE, &dev->flags)) && 3726 (!test_bit(R5_InJournal, &dev->flags) || 3727 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3728 } 3729 3730 static int handle_stripe_dirtying(struct r5conf *conf, 3731 struct stripe_head *sh, 3732 struct stripe_head_state *s, 3733 int disks) 3734 { 3735 int rmw = 0, rcw = 0, i; 3736 sector_t recovery_cp = conf->mddev->recovery_cp; 3737 3738 /* Check whether resync is now happening or should start. 3739 * If yes, then the array is dirty (after unclean shutdown or 3740 * initial creation), so parity in some stripes might be inconsistent. 3741 * In this case, we need to always do reconstruct-write, to ensure 3742 * that in case of drive failure or read-error correction, we 3743 * generate correct data from the parity. 3744 */ 3745 if (conf->rmw_level == PARITY_DISABLE_RMW || 3746 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3747 s->failed == 0)) { 3748 /* Calculate the real rcw later - for now make it 3749 * look like rcw is cheaper 3750 */ 3751 rcw = 1; rmw = 2; 3752 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3753 conf->rmw_level, (unsigned long long)recovery_cp, 3754 (unsigned long long)sh->sector); 3755 } else for (i = disks; i--; ) { 3756 /* would I have to read this buffer for read_modify_write */ 3757 struct r5dev *dev = &sh->dev[i]; 3758 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3759 i == sh->pd_idx || i == sh->qd_idx || 3760 test_bit(R5_InJournal, &dev->flags)) && 3761 !test_bit(R5_LOCKED, &dev->flags) && 3762 !(uptodate_for_rmw(dev) || 3763 test_bit(R5_Wantcompute, &dev->flags))) { 3764 if (test_bit(R5_Insync, &dev->flags)) 3765 rmw++; 3766 else 3767 rmw += 2*disks; /* cannot read it */ 3768 } 3769 /* Would I have to read this buffer for reconstruct_write */ 3770 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3771 i != sh->pd_idx && i != sh->qd_idx && 3772 !test_bit(R5_LOCKED, &dev->flags) && 3773 !(test_bit(R5_UPTODATE, &dev->flags) || 3774 test_bit(R5_Wantcompute, &dev->flags))) { 3775 if (test_bit(R5_Insync, &dev->flags)) 3776 rcw++; 3777 else 3778 rcw += 2*disks; 3779 } 3780 } 3781 3782 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 3783 (unsigned long long)sh->sector, sh->state, rmw, rcw); 3784 set_bit(STRIPE_HANDLE, &sh->state); 3785 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3786 /* prefer read-modify-write, but need to get some data */ 3787 if (conf->mddev->queue) 3788 blk_add_trace_msg(conf->mddev->queue, 3789 "raid5 rmw %llu %d", 3790 (unsigned long long)sh->sector, rmw); 3791 for (i = disks; i--; ) { 3792 struct r5dev *dev = &sh->dev[i]; 3793 if (test_bit(R5_InJournal, &dev->flags) && 3794 dev->page == dev->orig_page && 3795 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3796 /* alloc page for prexor */ 3797 struct page *p = alloc_page(GFP_NOIO); 3798 3799 if (p) { 3800 dev->orig_page = p; 3801 continue; 3802 } 3803 3804 /* 3805 * alloc_page() failed, try use 3806 * disk_info->extra_page 3807 */ 3808 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3809 &conf->cache_state)) { 3810 r5c_use_extra_page(sh); 3811 break; 3812 } 3813 3814 /* extra_page in use, add to delayed_list */ 3815 set_bit(STRIPE_DELAYED, &sh->state); 3816 s->waiting_extra_page = 1; 3817 return -EAGAIN; 3818 } 3819 } 3820 3821 for (i = disks; i--; ) { 3822 struct r5dev *dev = 
&sh->dev[i]; 3823 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3824 i == sh->pd_idx || i == sh->qd_idx || 3825 test_bit(R5_InJournal, &dev->flags)) && 3826 !test_bit(R5_LOCKED, &dev->flags) && 3827 !(uptodate_for_rmw(dev) || 3828 test_bit(R5_Wantcompute, &dev->flags)) && 3829 test_bit(R5_Insync, &dev->flags)) { 3830 if (test_bit(STRIPE_PREREAD_ACTIVE, 3831 &sh->state)) { 3832 pr_debug("Read_old block %d for r-m-w\n", 3833 i); 3834 set_bit(R5_LOCKED, &dev->flags); 3835 set_bit(R5_Wantread, &dev->flags); 3836 s->locked++; 3837 } else { 3838 set_bit(STRIPE_DELAYED, &sh->state); 3839 set_bit(STRIPE_HANDLE, &sh->state); 3840 } 3841 } 3842 } 3843 } 3844 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3845 /* want reconstruct write, but need to get some data */ 3846 int qread =0; 3847 rcw = 0; 3848 for (i = disks; i--; ) { 3849 struct r5dev *dev = &sh->dev[i]; 3850 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3851 i != sh->pd_idx && i != sh->qd_idx && 3852 !test_bit(R5_LOCKED, &dev->flags) && 3853 !(test_bit(R5_UPTODATE, &dev->flags) || 3854 test_bit(R5_Wantcompute, &dev->flags))) { 3855 rcw++; 3856 if (test_bit(R5_Insync, &dev->flags) && 3857 test_bit(STRIPE_PREREAD_ACTIVE, 3858 &sh->state)) { 3859 pr_debug("Read_old block " 3860 "%d for Reconstruct\n", i); 3861 set_bit(R5_LOCKED, &dev->flags); 3862 set_bit(R5_Wantread, &dev->flags); 3863 s->locked++; 3864 qread++; 3865 } else { 3866 set_bit(STRIPE_DELAYED, &sh->state); 3867 set_bit(STRIPE_HANDLE, &sh->state); 3868 } 3869 } 3870 } 3871 if (rcw && conf->mddev->queue) 3872 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3873 (unsigned long long)sh->sector, 3874 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3875 } 3876 3877 if (rcw > disks && rmw > disks && 3878 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3879 set_bit(STRIPE_DELAYED, &sh->state); 3880 3881 /* now if nothing is locked, and if we have enough data, 3882 * we can start a write request 3883 */ 3884 /* since handle_stripe can be called at any time we need to handle the 3885 * case where a compute block operation has been submitted and then a 3886 * subsequent call wants to start a write request. raid_run_ops only 3887 * handles the case where compute block and reconstruct are requested 3888 * simultaneously. If this is not the case then new writes need to be 3889 * held off until the compute completes. 
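*
* Illustrative example only (hypothetical 5-drive RAID5 with one parity
* disk, all devices R5_Insync, nothing locked or already cached): a
* request that fully overwrites a single data block leaves that block
* and the parity block needing an old-data read, so the loops above
* count rmw == 2 against rcw == 3 (the three untouched data blocks),
* and the read-modify-write path is preferred.  A full-stripe write
* sets R5_OVERWRITE on every data block, so rcw stays 0 and, once
* nothing is locked, schedule_reconstruction() below can start the
* write with no pre-reads at all.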
3890 */ 3891 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 3892 (s->locked == 0 && (rcw == 0 || rmw == 0) && 3893 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 3894 schedule_reconstruction(sh, s, rcw == 0, 0); 3895 return 0; 3896 } 3897 3898 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 3899 struct stripe_head_state *s, int disks) 3900 { 3901 struct r5dev *dev = NULL; 3902 3903 BUG_ON(sh->batch_head); 3904 set_bit(STRIPE_HANDLE, &sh->state); 3905 3906 switch (sh->check_state) { 3907 case check_state_idle: 3908 /* start a new check operation if there are no failures */ 3909 if (s->failed == 0) { 3910 BUG_ON(s->uptodate != disks); 3911 sh->check_state = check_state_run; 3912 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3913 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3914 s->uptodate--; 3915 break; 3916 } 3917 dev = &sh->dev[s->failed_num[0]]; 3918 /* fall through */ 3919 case check_state_compute_result: 3920 sh->check_state = check_state_idle; 3921 if (!dev) 3922 dev = &sh->dev[sh->pd_idx]; 3923 3924 /* check that a write has not made the stripe insync */ 3925 if (test_bit(STRIPE_INSYNC, &sh->state)) 3926 break; 3927 3928 /* either failed parity check, or recovery is happening */ 3929 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3930 BUG_ON(s->uptodate != disks); 3931 3932 set_bit(R5_LOCKED, &dev->flags); 3933 s->locked++; 3934 set_bit(R5_Wantwrite, &dev->flags); 3935 3936 clear_bit(STRIPE_DEGRADED, &sh->state); 3937 set_bit(STRIPE_INSYNC, &sh->state); 3938 break; 3939 case check_state_run: 3940 break; /* we will be called again upon completion */ 3941 case check_state_check_result: 3942 sh->check_state = check_state_idle; 3943 3944 /* if a failure occurred during the check operation, leave 3945 * STRIPE_INSYNC not set and let the stripe be handled again 3946 */ 3947 if (s->failed) 3948 break; 3949 3950 /* handle a successful check operation, if parity is correct 3951 * we are done. Otherwise update the mismatch count and repair 3952 * parity if !MD_RECOVERY_CHECK 3953 */ 3954 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 3955 /* parity is correct (on disc, 3956 * not in buffer any more) 3957 */ 3958 set_bit(STRIPE_INSYNC, &sh->state); 3959 else { 3960 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3961 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3962 /* don't try to repair!! */ 3963 set_bit(STRIPE_INSYNC, &sh->state); 3964 else { 3965 sh->check_state = check_state_compute_run; 3966 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3967 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3968 set_bit(R5_Wantcompute, 3969 &sh->dev[sh->pd_idx].flags); 3970 sh->ops.target = sh->pd_idx; 3971 sh->ops.target2 = -1; 3972 s->uptodate++; 3973 } 3974 } 3975 break; 3976 case check_state_compute_run: 3977 break; 3978 default: 3979 pr_err("%s: unknown check_state: %d sector: %llu\n", 3980 __func__, sh->check_state, 3981 (unsigned long long) sh->sector); 3982 BUG(); 3983 } 3984 } 3985 3986 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3987 struct stripe_head_state *s, 3988 int disks) 3989 { 3990 int pd_idx = sh->pd_idx; 3991 int qd_idx = sh->qd_idx; 3992 struct r5dev *dev; 3993 3994 BUG_ON(sh->batch_head); 3995 set_bit(STRIPE_HANDLE, &sh->state); 3996 3997 BUG_ON(s->failed > 2); 3998 3999 /* Want to check and possibly repair P and Q. 
4000 * However there could be one 'failed' device, in which 4001 * case we can only check one of them, possibly using the 4002 * other to generate missing data 4003 */ 4004 4005 switch (sh->check_state) { 4006 case check_state_idle: 4007 /* start a new check operation if there are < 2 failures */ 4008 if (s->failed == s->q_failed) { 4009 /* The only possible failed device holds Q, so it 4010 * makes sense to check P (If anything else were failed, 4011 * we would have used P to recreate it). 4012 */ 4013 sh->check_state = check_state_run; 4014 } 4015 if (!s->q_failed && s->failed < 2) { 4016 /* Q is not failed, and we didn't use it to generate 4017 * anything, so it makes sense to check it 4018 */ 4019 if (sh->check_state == check_state_run) 4020 sh->check_state = check_state_run_pq; 4021 else 4022 sh->check_state = check_state_run_q; 4023 } 4024 4025 /* discard potentially stale zero_sum_result */ 4026 sh->ops.zero_sum_result = 0; 4027 4028 if (sh->check_state == check_state_run) { 4029 /* async_xor_zero_sum destroys the contents of P */ 4030 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4031 s->uptodate--; 4032 } 4033 if (sh->check_state >= check_state_run && 4034 sh->check_state <= check_state_run_pq) { 4035 /* async_syndrome_zero_sum preserves P and Q, so 4036 * no need to mark them !uptodate here 4037 */ 4038 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4039 break; 4040 } 4041 4042 /* we have 2-disk failure */ 4043 BUG_ON(s->failed != 2); 4044 /* fall through */ 4045 case check_state_compute_result: 4046 sh->check_state = check_state_idle; 4047 4048 /* check that a write has not made the stripe insync */ 4049 if (test_bit(STRIPE_INSYNC, &sh->state)) 4050 break; 4051 4052 /* now write out any block on a failed drive, 4053 * or P or Q if they were recomputed 4054 */ 4055 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 4056 if (s->failed == 2) { 4057 dev = &sh->dev[s->failed_num[1]]; 4058 s->locked++; 4059 set_bit(R5_LOCKED, &dev->flags); 4060 set_bit(R5_Wantwrite, &dev->flags); 4061 } 4062 if (s->failed >= 1) { 4063 dev = &sh->dev[s->failed_num[0]]; 4064 s->locked++; 4065 set_bit(R5_LOCKED, &dev->flags); 4066 set_bit(R5_Wantwrite, &dev->flags); 4067 } 4068 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4069 dev = &sh->dev[pd_idx]; 4070 s->locked++; 4071 set_bit(R5_LOCKED, &dev->flags); 4072 set_bit(R5_Wantwrite, &dev->flags); 4073 } 4074 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4075 dev = &sh->dev[qd_idx]; 4076 s->locked++; 4077 set_bit(R5_LOCKED, &dev->flags); 4078 set_bit(R5_Wantwrite, &dev->flags); 4079 } 4080 clear_bit(STRIPE_DEGRADED, &sh->state); 4081 4082 set_bit(STRIPE_INSYNC, &sh->state); 4083 break; 4084 case check_state_run: 4085 case check_state_run_q: 4086 case check_state_run_pq: 4087 break; /* we will be called again upon completion */ 4088 case check_state_check_result: 4089 sh->check_state = check_state_idle; 4090 4091 /* handle a successful check operation, if parity is correct 4092 * we are done. 
Otherwise update the mismatch count and repair 4093 * parity if !MD_RECOVERY_CHECK 4094 */ 4095 if (sh->ops.zero_sum_result == 0) { 4096 /* both parities are correct */ 4097 if (!s->failed) 4098 set_bit(STRIPE_INSYNC, &sh->state); 4099 else { 4100 /* in contrast to the raid5 case we can validate 4101 * parity, but still have a failure to write 4102 * back 4103 */ 4104 sh->check_state = check_state_compute_result; 4105 /* Returning at this point means that we may go 4106 * off and bring p and/or q uptodate again so 4107 * we make sure to check zero_sum_result again 4108 * to verify if p or q need writeback 4109 */ 4110 } 4111 } else { 4112 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4113 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 4114 /* don't try to repair!! */ 4115 set_bit(STRIPE_INSYNC, &sh->state); 4116 else { 4117 int *target = &sh->ops.target; 4118 4119 sh->ops.target = -1; 4120 sh->ops.target2 = -1; 4121 sh->check_state = check_state_compute_run; 4122 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4123 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4124 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4125 set_bit(R5_Wantcompute, 4126 &sh->dev[pd_idx].flags); 4127 *target = pd_idx; 4128 target = &sh->ops.target2; 4129 s->uptodate++; 4130 } 4131 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4132 set_bit(R5_Wantcompute, 4133 &sh->dev[qd_idx].flags); 4134 *target = qd_idx; 4135 s->uptodate++; 4136 } 4137 } 4138 } 4139 break; 4140 case check_state_compute_run: 4141 break; 4142 default: 4143 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4144 __func__, sh->check_state, 4145 (unsigned long long) sh->sector); 4146 BUG(); 4147 } 4148 } 4149 4150 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4151 { 4152 int i; 4153 4154 /* We have read all the blocks in this stripe and now we need to 4155 * copy some of them into a target stripe for expand. 4156 */ 4157 struct dma_async_tx_descriptor *tx = NULL; 4158 BUG_ON(sh->batch_head); 4159 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4160 for (i = 0; i < sh->disks; i++) 4161 if (i != sh->pd_idx && i != sh->qd_idx) { 4162 int dd_idx, j; 4163 struct stripe_head *sh2; 4164 struct async_submit_ctl submit; 4165 4166 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4167 sector_t s = raid5_compute_sector(conf, bn, 0, 4168 &dd_idx, NULL); 4169 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4170 if (sh2 == NULL) 4171 /* so far only the early blocks of this stripe 4172 * have been requested. 
When later blocks 4173 * get requested, we will try again 4174 */ 4175 continue; 4176 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4177 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4178 /* must have already done this block */ 4179 raid5_release_stripe(sh2); 4180 continue; 4181 } 4182 4183 /* place all the copies on one channel */ 4184 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4185 tx = async_memcpy(sh2->dev[dd_idx].page, 4186 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4187 &submit); 4188 4189 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4190 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4191 for (j = 0; j < conf->raid_disks; j++) 4192 if (j != sh2->pd_idx && 4193 j != sh2->qd_idx && 4194 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4195 break; 4196 if (j == conf->raid_disks) { 4197 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4198 set_bit(STRIPE_HANDLE, &sh2->state); 4199 } 4200 raid5_release_stripe(sh2); 4201 4202 } 4203 /* done submitting copies, wait for them to complete */ 4204 async_tx_quiesce(&tx); 4205 } 4206 4207 /* 4208 * handle_stripe - do things to a stripe. 4209 * 4210 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4211 * state of various bits to see what needs to be done. 4212 * Possible results: 4213 * return some read requests which now have data 4214 * return some write requests which are safely on storage 4215 * schedule a read on some buffers 4216 * schedule a write of some buffers 4217 * return confirmation of parity correctness 4218 * 4219 */ 4220 4221 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4222 { 4223 struct r5conf *conf = sh->raid_conf; 4224 int disks = sh->disks; 4225 struct r5dev *dev; 4226 int i; 4227 int do_recovery = 0; 4228 4229 memset(s, 0, sizeof(*s)); 4230 4231 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4232 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4233 s->failed_num[0] = -1; 4234 s->failed_num[1] = -1; 4235 s->log_failed = r5l_log_disk_error(conf); 4236 4237 /* Now to look around and see what can be done */ 4238 rcu_read_lock(); 4239 for (i=disks; i--; ) { 4240 struct md_rdev *rdev; 4241 sector_t first_bad; 4242 int bad_sectors; 4243 int is_bad = 0; 4244 4245 dev = &sh->dev[i]; 4246 4247 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4248 i, dev->flags, 4249 dev->toread, dev->towrite, dev->written); 4250 /* maybe we can reply to a read 4251 * 4252 * new wantfill requests are only permitted while 4253 * ops_complete_biofill is guaranteed to be inactive 4254 */ 4255 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4256 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4257 set_bit(R5_Wantfill, &dev->flags); 4258 4259 /* now count some things */ 4260 if (test_bit(R5_LOCKED, &dev->flags)) 4261 s->locked++; 4262 if (test_bit(R5_UPTODATE, &dev->flags)) 4263 s->uptodate++; 4264 if (test_bit(R5_Wantcompute, &dev->flags)) { 4265 s->compute++; 4266 BUG_ON(s->compute > 2); 4267 } 4268 4269 if (test_bit(R5_Wantfill, &dev->flags)) 4270 s->to_fill++; 4271 else if (dev->toread) 4272 s->to_read++; 4273 if (dev->towrite) { 4274 s->to_write++; 4275 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4276 s->non_overwrite++; 4277 } 4278 if (dev->written) 4279 s->written++; 4280 /* Prefer to use the replacement for reads, but only 4281 * if it is recovered enough and has no bad blocks. 
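* "Recovered enough" here means, as the code below spells out, that the
* replacement is not Faulty, its recovery_offset already covers this
* stripe's sectors, and is_badblock() reports no bad range; when those
* conditions do not hold, a live replacement is instead flagged with
* R5_NeedReplace so the stripe gets copied out to it later.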
4282 */ 4283 rdev = rcu_dereference(conf->disks[i].replacement); 4284 if (rdev && !test_bit(Faulty, &rdev->flags) && 4285 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4286 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4287 &first_bad, &bad_sectors)) 4288 set_bit(R5_ReadRepl, &dev->flags); 4289 else { 4290 if (rdev && !test_bit(Faulty, &rdev->flags)) 4291 set_bit(R5_NeedReplace, &dev->flags); 4292 else 4293 clear_bit(R5_NeedReplace, &dev->flags); 4294 rdev = rcu_dereference(conf->disks[i].rdev); 4295 clear_bit(R5_ReadRepl, &dev->flags); 4296 } 4297 if (rdev && test_bit(Faulty, &rdev->flags)) 4298 rdev = NULL; 4299 if (rdev) { 4300 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4301 &first_bad, &bad_sectors); 4302 if (s->blocked_rdev == NULL 4303 && (test_bit(Blocked, &rdev->flags) 4304 || is_bad < 0)) { 4305 if (is_bad < 0) 4306 set_bit(BlockedBadBlocks, 4307 &rdev->flags); 4308 s->blocked_rdev = rdev; 4309 atomic_inc(&rdev->nr_pending); 4310 } 4311 } 4312 clear_bit(R5_Insync, &dev->flags); 4313 if (!rdev) 4314 /* Not in-sync */; 4315 else if (is_bad) { 4316 /* also not in-sync */ 4317 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4318 test_bit(R5_UPTODATE, &dev->flags)) { 4319 /* treat as in-sync, but with a read error 4320 * which we can now try to correct 4321 */ 4322 set_bit(R5_Insync, &dev->flags); 4323 set_bit(R5_ReadError, &dev->flags); 4324 } 4325 } else if (test_bit(In_sync, &rdev->flags)) 4326 set_bit(R5_Insync, &dev->flags); 4327 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4328 /* in sync if before recovery_offset */ 4329 set_bit(R5_Insync, &dev->flags); 4330 else if (test_bit(R5_UPTODATE, &dev->flags) && 4331 test_bit(R5_Expanded, &dev->flags)) 4332 /* If we've reshaped into here, we assume it is Insync. 4333 * We will shortly update recovery_offset to make 4334 * it official. 
4335 */ 4336 set_bit(R5_Insync, &dev->flags); 4337 4338 if (test_bit(R5_WriteError, &dev->flags)) { 4339 /* This flag does not apply to '.replacement' 4340 * only to .rdev, so make sure to check that*/ 4341 struct md_rdev *rdev2 = rcu_dereference( 4342 conf->disks[i].rdev); 4343 if (rdev2 == rdev) 4344 clear_bit(R5_Insync, &dev->flags); 4345 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4346 s->handle_bad_blocks = 1; 4347 atomic_inc(&rdev2->nr_pending); 4348 } else 4349 clear_bit(R5_WriteError, &dev->flags); 4350 } 4351 if (test_bit(R5_MadeGood, &dev->flags)) { 4352 /* This flag does not apply to '.replacement' 4353 * only to .rdev, so make sure to check that*/ 4354 struct md_rdev *rdev2 = rcu_dereference( 4355 conf->disks[i].rdev); 4356 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4357 s->handle_bad_blocks = 1; 4358 atomic_inc(&rdev2->nr_pending); 4359 } else 4360 clear_bit(R5_MadeGood, &dev->flags); 4361 } 4362 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4363 struct md_rdev *rdev2 = rcu_dereference( 4364 conf->disks[i].replacement); 4365 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4366 s->handle_bad_blocks = 1; 4367 atomic_inc(&rdev2->nr_pending); 4368 } else 4369 clear_bit(R5_MadeGoodRepl, &dev->flags); 4370 } 4371 if (!test_bit(R5_Insync, &dev->flags)) { 4372 /* The ReadError flag will just be confusing now */ 4373 clear_bit(R5_ReadError, &dev->flags); 4374 clear_bit(R5_ReWrite, &dev->flags); 4375 } 4376 if (test_bit(R5_ReadError, &dev->flags)) 4377 clear_bit(R5_Insync, &dev->flags); 4378 if (!test_bit(R5_Insync, &dev->flags)) { 4379 if (s->failed < 2) 4380 s->failed_num[s->failed] = i; 4381 s->failed++; 4382 if (rdev && !test_bit(Faulty, &rdev->flags)) 4383 do_recovery = 1; 4384 } 4385 4386 if (test_bit(R5_InJournal, &dev->flags)) 4387 s->injournal++; 4388 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4389 s->just_cached++; 4390 } 4391 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4392 /* If there is a failed device being replaced, 4393 * we must be recovering. 4394 * else if we are after recovery_cp, we must be syncing 4395 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4396 * else we can only be replacing 4397 * sync and recovery both need to read all devices, and so 4398 * use the same flag. 4399 */ 4400 if (do_recovery || 4401 sh->sector >= conf->mddev->recovery_cp || 4402 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4403 s->syncing = 1; 4404 else 4405 s->replacing = 1; 4406 } 4407 rcu_read_unlock(); 4408 } 4409 4410 static int clear_batch_ready(struct stripe_head *sh) 4411 { 4412 /* Return '1' if this is a member of batch, or 4413 * '0' if it is a lone stripe or a head which can now be 4414 * handled. 4415 */ 4416 struct stripe_head *tmp; 4417 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4418 return (sh->batch_head && sh->batch_head != sh); 4419 spin_lock(&sh->stripe_lock); 4420 if (!sh->batch_head) { 4421 spin_unlock(&sh->stripe_lock); 4422 return 0; 4423 } 4424 4425 /* 4426 * this stripe could be added to a batch list before we check 4427 * BATCH_READY, skips it 4428 */ 4429 if (sh->batch_head != sh) { 4430 spin_unlock(&sh->stripe_lock); 4431 return 1; 4432 } 4433 spin_lock(&sh->batch_lock); 4434 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4435 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4436 spin_unlock(&sh->batch_lock); 4437 spin_unlock(&sh->stripe_lock); 4438 4439 /* 4440 * BATCH_READY is cleared, no new stripes can be added. 
4441 * batch_list can be accessed without lock 4442 */ 4443 return 0; 4444 } 4445 4446 static void break_stripe_batch_list(struct stripe_head *head_sh, 4447 unsigned long handle_flags) 4448 { 4449 struct stripe_head *sh, *next; 4450 int i; 4451 int do_wakeup = 0; 4452 4453 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4454 4455 list_del_init(&sh->batch_list); 4456 4457 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4458 (1 << STRIPE_SYNCING) | 4459 (1 << STRIPE_REPLACED) | 4460 (1 << STRIPE_DELAYED) | 4461 (1 << STRIPE_BIT_DELAY) | 4462 (1 << STRIPE_FULL_WRITE) | 4463 (1 << STRIPE_BIOFILL_RUN) | 4464 (1 << STRIPE_COMPUTE_RUN) | 4465 (1 << STRIPE_OPS_REQ_PENDING) | 4466 (1 << STRIPE_DISCARD) | 4467 (1 << STRIPE_BATCH_READY) | 4468 (1 << STRIPE_BATCH_ERR) | 4469 (1 << STRIPE_BITMAP_PENDING)), 4470 "stripe state: %lx\n", sh->state); 4471 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4472 (1 << STRIPE_REPLACED)), 4473 "head stripe state: %lx\n", head_sh->state); 4474 4475 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4476 (1 << STRIPE_PREREAD_ACTIVE) | 4477 (1 << STRIPE_DEGRADED)), 4478 head_sh->state & (1 << STRIPE_INSYNC)); 4479 4480 sh->check_state = head_sh->check_state; 4481 sh->reconstruct_state = head_sh->reconstruct_state; 4482 for (i = 0; i < sh->disks; i++) { 4483 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4484 do_wakeup = 1; 4485 sh->dev[i].flags = head_sh->dev[i].flags & 4486 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4487 } 4488 spin_lock_irq(&sh->stripe_lock); 4489 sh->batch_head = NULL; 4490 spin_unlock_irq(&sh->stripe_lock); 4491 if (handle_flags == 0 || 4492 sh->state & handle_flags) 4493 set_bit(STRIPE_HANDLE, &sh->state); 4494 raid5_release_stripe(sh); 4495 } 4496 spin_lock_irq(&head_sh->stripe_lock); 4497 head_sh->batch_head = NULL; 4498 spin_unlock_irq(&head_sh->stripe_lock); 4499 for (i = 0; i < head_sh->disks; i++) 4500 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4501 do_wakeup = 1; 4502 if (head_sh->state & handle_flags) 4503 set_bit(STRIPE_HANDLE, &head_sh->state); 4504 4505 if (do_wakeup) 4506 wake_up(&head_sh->raid_conf->wait_for_overlap); 4507 } 4508 4509 static void handle_stripe(struct stripe_head *sh) 4510 { 4511 struct stripe_head_state s; 4512 struct r5conf *conf = sh->raid_conf; 4513 int i; 4514 int prexor; 4515 int disks = sh->disks; 4516 struct r5dev *pdev, *qdev; 4517 4518 clear_bit(STRIPE_HANDLE, &sh->state); 4519 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4520 /* already being handled, ensure it gets handled 4521 * again when current action finishes */ 4522 set_bit(STRIPE_HANDLE, &sh->state); 4523 return; 4524 } 4525 4526 if (clear_batch_ready(sh) ) { 4527 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4528 return; 4529 } 4530 4531 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4532 break_stripe_batch_list(sh, 0); 4533 4534 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4535 spin_lock(&sh->stripe_lock); 4536 /* Cannot process 'sync' concurrently with 'discard' */ 4537 if (!test_bit(STRIPE_DISCARD, &sh->state) && 4538 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4539 set_bit(STRIPE_SYNCING, &sh->state); 4540 clear_bit(STRIPE_INSYNC, &sh->state); 4541 clear_bit(STRIPE_REPLACED, &sh->state); 4542 } 4543 spin_unlock(&sh->stripe_lock); 4544 } 4545 clear_bit(STRIPE_DELAYED, &sh->state); 4546 4547 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4548 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 4549 (unsigned long 
long)sh->sector, sh->state, 4550 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4551 sh->check_state, sh->reconstruct_state); 4552 4553 analyse_stripe(sh, &s); 4554 4555 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4556 goto finish; 4557 4558 if (s.handle_bad_blocks) { 4559 set_bit(STRIPE_HANDLE, &sh->state); 4560 goto finish; 4561 } 4562 4563 if (unlikely(s.blocked_rdev)) { 4564 if (s.syncing || s.expanding || s.expanded || 4565 s.replacing || s.to_write || s.written) { 4566 set_bit(STRIPE_HANDLE, &sh->state); 4567 goto finish; 4568 } 4569 /* There is nothing for the blocked_rdev to block */ 4570 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4571 s.blocked_rdev = NULL; 4572 } 4573 4574 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4575 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4576 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4577 } 4578 4579 pr_debug("locked=%d uptodate=%d to_read=%d" 4580 " to_write=%d failed=%d failed_num=%d,%d\n", 4581 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4582 s.failed_num[0], s.failed_num[1]); 4583 /* check if the array has lost more than max_degraded devices and, 4584 * if so, some requests might need to be failed. 4585 */ 4586 if (s.failed > conf->max_degraded || s.log_failed) { 4587 sh->check_state = 0; 4588 sh->reconstruct_state = 0; 4589 break_stripe_batch_list(sh, 0); 4590 if (s.to_read+s.to_write+s.written) 4591 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 4592 if (s.syncing + s.replacing) 4593 handle_failed_sync(conf, sh, &s); 4594 } 4595 4596 /* Now we check to see if any write operations have recently 4597 * completed 4598 */ 4599 prexor = 0; 4600 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4601 prexor = 1; 4602 if (sh->reconstruct_state == reconstruct_state_drain_result || 4603 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4604 sh->reconstruct_state = reconstruct_state_idle; 4605 4606 /* All the 'written' buffers and the parity block are ready to 4607 * be written back to disk 4608 */ 4609 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4610 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4611 BUG_ON(sh->qd_idx >= 0 && 4612 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4613 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4614 for (i = disks; i--; ) { 4615 struct r5dev *dev = &sh->dev[i]; 4616 if (test_bit(R5_LOCKED, &dev->flags) && 4617 (i == sh->pd_idx || i == sh->qd_idx || 4618 dev->written || test_bit(R5_InJournal, 4619 &dev->flags))) { 4620 pr_debug("Writing block %d\n", i); 4621 set_bit(R5_Wantwrite, &dev->flags); 4622 if (prexor) 4623 continue; 4624 if (s.failed > 1) 4625 continue; 4626 if (!test_bit(R5_Insync, &dev->flags) || 4627 ((i == sh->pd_idx || i == sh->qd_idx) && 4628 s.failed == 0)) 4629 set_bit(STRIPE_INSYNC, &sh->state); 4630 } 4631 } 4632 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4633 s.dec_preread_active = 1; 4634 } 4635 4636 /* 4637 * might be able to return some write requests if the parity blocks 4638 * are safe, or on a failed drive 4639 */ 4640 pdev = &sh->dev[sh->pd_idx]; 4641 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4642 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4643 qdev = &sh->dev[sh->qd_idx]; 4644 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4645 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4646 || conf->level < 6; 4647 4648 if (s.written && 4649 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4650 && !test_bit(R5_LOCKED, 
&pdev->flags) 4651 && (test_bit(R5_UPTODATE, &pdev->flags) || 4652 test_bit(R5_Discard, &pdev->flags))))) && 4653 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4654 && !test_bit(R5_LOCKED, &qdev->flags) 4655 && (test_bit(R5_UPTODATE, &qdev->flags) || 4656 test_bit(R5_Discard, &qdev->flags)))))) 4657 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 4658 4659 if (s.just_cached) 4660 r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); 4661 r5l_stripe_write_finished(sh); 4662 4663 /* Now we might consider reading some blocks, either to check/generate 4664 * parity, or to satisfy requests 4665 * or to load a block that is being partially written. 4666 */ 4667 if (s.to_read || s.non_overwrite 4668 || (conf->level == 6 && s.to_write && s.failed) 4669 || (s.syncing && (s.uptodate + s.compute < disks)) 4670 || s.replacing 4671 || s.expanding) 4672 handle_stripe_fill(sh, &s, disks); 4673 4674 /* 4675 * When the stripe finishes full journal write cycle (write to journal 4676 * and raid disk), this is the clean up procedure so it is ready for 4677 * next operation. 4678 */ 4679 r5c_finish_stripe_write_out(conf, sh, &s); 4680 4681 /* 4682 * Now to consider new write requests, cache write back and what else, 4683 * if anything should be read. We do not handle new writes when: 4684 * 1/ A 'write' operation (copy+xor) is already in flight. 4685 * 2/ A 'check' operation is in flight, as it may clobber the parity 4686 * block. 4687 * 3/ A r5c cache log write is in flight. 4688 */ 4689 4690 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4691 if (!r5c_is_writeback(conf->log)) { 4692 if (s.to_write) 4693 handle_stripe_dirtying(conf, sh, &s, disks); 4694 } else { /* write back cache */ 4695 int ret = 0; 4696 4697 /* First, try handle writes in caching phase */ 4698 if (s.to_write) 4699 ret = r5c_try_caching_write(conf, sh, &s, 4700 disks); 4701 /* 4702 * If caching phase failed: ret == -EAGAIN 4703 * OR 4704 * stripe under reclaim: !caching && injournal 4705 * 4706 * fall back to handle_stripe_dirtying() 4707 */ 4708 if (ret == -EAGAIN || 4709 /* stripe under reclaim: !caching && injournal */ 4710 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4711 s.injournal > 0)) { 4712 ret = handle_stripe_dirtying(conf, sh, &s, 4713 disks); 4714 if (ret == -EAGAIN) 4715 goto finish; 4716 } 4717 } 4718 } 4719 4720 /* maybe we need to check and possibly fix the parity for this stripe 4721 * Any reads will already have been scheduled, so we just see if enough 4722 * data is available. The parity check is held off while parity 4723 * dependent operations are in flight. 
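*
* As a rough sketch of the state machine driven from here for the
* RAID5 case (handle_parity_checks5() above):
*
*   idle -> run -> check_result --(parity verified)------> INSYNC
*                               --(mismatch, repair ok)--> compute_run
*                                    -> compute_result  -> write parity
*
* with the repair leg skipped (though the mismatch is still counted)
* when MD_RECOVERY_CHECK asks for a read-only check.  The RAID6
* version follows the same pattern but may check P, Q or both.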
4724 */ 4725 if (sh->check_state || 4726 (s.syncing && s.locked == 0 && 4727 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4728 !test_bit(STRIPE_INSYNC, &sh->state))) { 4729 if (conf->level == 6) 4730 handle_parity_checks6(conf, sh, &s, disks); 4731 else 4732 handle_parity_checks5(conf, sh, &s, disks); 4733 } 4734 4735 if ((s.replacing || s.syncing) && s.locked == 0 4736 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4737 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4738 /* Write out to replacement devices where possible */ 4739 for (i = 0; i < conf->raid_disks; i++) 4740 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4741 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4742 set_bit(R5_WantReplace, &sh->dev[i].flags); 4743 set_bit(R5_LOCKED, &sh->dev[i].flags); 4744 s.locked++; 4745 } 4746 if (s.replacing) 4747 set_bit(STRIPE_INSYNC, &sh->state); 4748 set_bit(STRIPE_REPLACED, &sh->state); 4749 } 4750 if ((s.syncing || s.replacing) && s.locked == 0 && 4751 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4752 test_bit(STRIPE_INSYNC, &sh->state)) { 4753 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4754 clear_bit(STRIPE_SYNCING, &sh->state); 4755 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4756 wake_up(&conf->wait_for_overlap); 4757 } 4758 4759 /* If the failed drives are just a ReadError, then we might need 4760 * to progress the repair/check process 4761 */ 4762 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4763 for (i = 0; i < s.failed; i++) { 4764 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4765 if (test_bit(R5_ReadError, &dev->flags) 4766 && !test_bit(R5_LOCKED, &dev->flags) 4767 && test_bit(R5_UPTODATE, &dev->flags) 4768 ) { 4769 if (!test_bit(R5_ReWrite, &dev->flags)) { 4770 set_bit(R5_Wantwrite, &dev->flags); 4771 set_bit(R5_ReWrite, &dev->flags); 4772 set_bit(R5_LOCKED, &dev->flags); 4773 s.locked++; 4774 } else { 4775 /* let's read it back */ 4776 set_bit(R5_Wantread, &dev->flags); 4777 set_bit(R5_LOCKED, &dev->flags); 4778 s.locked++; 4779 } 4780 } 4781 } 4782 4783 /* Finish reconstruct operations initiated by the expansion process */ 4784 if (sh->reconstruct_state == reconstruct_state_result) { 4785 struct stripe_head *sh_src 4786 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4787 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4788 /* sh cannot be written until sh_src has been read. 
4789 * so arrange for sh to be delayed a little 4790 */ 4791 set_bit(STRIPE_DELAYED, &sh->state); 4792 set_bit(STRIPE_HANDLE, &sh->state); 4793 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4794 &sh_src->state)) 4795 atomic_inc(&conf->preread_active_stripes); 4796 raid5_release_stripe(sh_src); 4797 goto finish; 4798 } 4799 if (sh_src) 4800 raid5_release_stripe(sh_src); 4801 4802 sh->reconstruct_state = reconstruct_state_idle; 4803 clear_bit(STRIPE_EXPANDING, &sh->state); 4804 for (i = conf->raid_disks; i--; ) { 4805 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4806 set_bit(R5_LOCKED, &sh->dev[i].flags); 4807 s.locked++; 4808 } 4809 } 4810 4811 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4812 !sh->reconstruct_state) { 4813 /* Need to write out all blocks after computing parity */ 4814 sh->disks = conf->raid_disks; 4815 stripe_set_idx(sh->sector, conf, 0, sh); 4816 schedule_reconstruction(sh, &s, 1, 1); 4817 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4818 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4819 atomic_dec(&conf->reshape_stripes); 4820 wake_up(&conf->wait_for_overlap); 4821 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4822 } 4823 4824 if (s.expanding && s.locked == 0 && 4825 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4826 handle_stripe_expansion(conf, sh); 4827 4828 finish: 4829 /* wait for this device to become unblocked */ 4830 if (unlikely(s.blocked_rdev)) { 4831 if (conf->mddev->external) 4832 md_wait_for_blocked_rdev(s.blocked_rdev, 4833 conf->mddev); 4834 else 4835 /* Internal metadata will immediately 4836 * be written by raid5d, so we don't 4837 * need to wait here. 4838 */ 4839 rdev_dec_pending(s.blocked_rdev, 4840 conf->mddev); 4841 } 4842 4843 if (s.handle_bad_blocks) 4844 for (i = disks; i--; ) { 4845 struct md_rdev *rdev; 4846 struct r5dev *dev = &sh->dev[i]; 4847 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4848 /* We own a safe reference to the rdev */ 4849 rdev = conf->disks[i].rdev; 4850 if (!rdev_set_badblocks(rdev, sh->sector, 4851 STRIPE_SECTORS, 0)) 4852 md_error(conf->mddev, rdev); 4853 rdev_dec_pending(rdev, conf->mddev); 4854 } 4855 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4856 rdev = conf->disks[i].rdev; 4857 rdev_clear_badblocks(rdev, sh->sector, 4858 STRIPE_SECTORS, 0); 4859 rdev_dec_pending(rdev, conf->mddev); 4860 } 4861 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 4862 rdev = conf->disks[i].replacement; 4863 if (!rdev) 4864 /* rdev have been moved down */ 4865 rdev = conf->disks[i].rdev; 4866 rdev_clear_badblocks(rdev, sh->sector, 4867 STRIPE_SECTORS, 0); 4868 rdev_dec_pending(rdev, conf->mddev); 4869 } 4870 } 4871 4872 if (s.ops_request) 4873 raid_run_ops(sh, s.ops_request); 4874 4875 ops_run_io(sh, &s); 4876 4877 if (s.dec_preread_active) { 4878 /* We delay this until after ops_run_io so that if make_request 4879 * is waiting on a flush, it won't continue until the writes 4880 * have actually been submitted. 
4881 */ 4882 atomic_dec(&conf->preread_active_stripes); 4883 if (atomic_read(&conf->preread_active_stripes) < 4884 IO_THRESHOLD) 4885 md_wakeup_thread(conf->mddev->thread); 4886 } 4887 4888 if (!bio_list_empty(&s.return_bi)) { 4889 if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4890 spin_lock_irq(&conf->device_lock); 4891 bio_list_merge(&conf->return_bi, &s.return_bi); 4892 spin_unlock_irq(&conf->device_lock); 4893 md_wakeup_thread(conf->mddev->thread); 4894 } else 4895 return_io(&s.return_bi); 4896 } 4897 4898 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4899 } 4900 4901 static void raid5_activate_delayed(struct r5conf *conf) 4902 { 4903 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 4904 while (!list_empty(&conf->delayed_list)) { 4905 struct list_head *l = conf->delayed_list.next; 4906 struct stripe_head *sh; 4907 sh = list_entry(l, struct stripe_head, lru); 4908 list_del_init(l); 4909 clear_bit(STRIPE_DELAYED, &sh->state); 4910 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4911 atomic_inc(&conf->preread_active_stripes); 4912 list_add_tail(&sh->lru, &conf->hold_list); 4913 raid5_wakeup_stripe_thread(sh); 4914 } 4915 } 4916 } 4917 4918 static void activate_bit_delay(struct r5conf *conf, 4919 struct list_head *temp_inactive_list) 4920 { 4921 /* device_lock is held */ 4922 struct list_head head; 4923 list_add(&head, &conf->bitmap_list); 4924 list_del_init(&conf->bitmap_list); 4925 while (!list_empty(&head)) { 4926 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4927 int hash; 4928 list_del_init(&sh->lru); 4929 atomic_inc(&sh->count); 4930 hash = sh->hash_lock_index; 4931 __release_stripe(conf, sh, &temp_inactive_list[hash]); 4932 } 4933 } 4934 4935 static int raid5_congested(struct mddev *mddev, int bits) 4936 { 4937 struct r5conf *conf = mddev->private; 4938 4939 /* No difference between reads and writes. Just check 4940 * how busy the stripe_cache is 4941 */ 4942 4943 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 4944 return 1; 4945 4946 /* Also checks whether there is pressure on r5cache log space */ 4947 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 4948 return 1; 4949 if (conf->quiesce) 4950 return 1; 4951 if (atomic_read(&conf->empty_inactive_list_nr)) 4952 return 1; 4953 4954 return 0; 4955 } 4956 4957 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 4958 { 4959 struct r5conf *conf = mddev->private; 4960 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); 4961 unsigned int chunk_sectors; 4962 unsigned int bio_sectors = bio_sectors(bio); 4963 4964 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 4965 return chunk_sectors >= 4966 ((sector & (chunk_sectors - 1)) + bio_sectors); 4967 } 4968 4969 /* 4970 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 4971 * later sampled by raid5d. 
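*
* For illustration: the "list" is just the bios themselves chained
* through bi_next, so pushing A and then B under device_lock leaves
*
*   conf->retry_read_aligned_list -> B -> A -> NULL
*
* and remove_bio_from_retry() pops from the same end, which is why no
* allocation is needed here.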
4972 */ 4973 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 4974 { 4975 unsigned long flags; 4976 4977 spin_lock_irqsave(&conf->device_lock, flags); 4978 4979 bi->bi_next = conf->retry_read_aligned_list; 4980 conf->retry_read_aligned_list = bi; 4981 4982 spin_unlock_irqrestore(&conf->device_lock, flags); 4983 md_wakeup_thread(conf->mddev->thread); 4984 } 4985 4986 static struct bio *remove_bio_from_retry(struct r5conf *conf) 4987 { 4988 struct bio *bi; 4989 4990 bi = conf->retry_read_aligned; 4991 if (bi) { 4992 conf->retry_read_aligned = NULL; 4993 return bi; 4994 } 4995 bi = conf->retry_read_aligned_list; 4996 if(bi) { 4997 conf->retry_read_aligned_list = bi->bi_next; 4998 bi->bi_next = NULL; 4999 /* 5000 * this sets the active strip count to 1 and the processed 5001 * strip count to zero (upper 8 bits) 5002 */ 5003 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 5004 } 5005 5006 return bi; 5007 } 5008 5009 /* 5010 * The "raid5_align_endio" should check if the read succeeded and if it 5011 * did, call bio_endio on the original bio (having bio_put the new bio 5012 * first). 5013 * If the read failed.. 5014 */ 5015 static void raid5_align_endio(struct bio *bi) 5016 { 5017 struct bio* raid_bi = bi->bi_private; 5018 struct mddev *mddev; 5019 struct r5conf *conf; 5020 struct md_rdev *rdev; 5021 int error = bi->bi_error; 5022 5023 bio_put(bi); 5024 5025 rdev = (void*)raid_bi->bi_next; 5026 raid_bi->bi_next = NULL; 5027 mddev = rdev->mddev; 5028 conf = mddev->private; 5029 5030 rdev_dec_pending(rdev, conf->mddev); 5031 5032 if (!error) { 5033 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 5034 raid_bi, 0); 5035 bio_endio(raid_bi); 5036 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5037 wake_up(&conf->wait_for_quiescent); 5038 return; 5039 } 5040 5041 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5042 5043 add_bio_to_retry(raid_bi, conf); 5044 } 5045 5046 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5047 { 5048 struct r5conf *conf = mddev->private; 5049 int dd_idx; 5050 struct bio* align_bi; 5051 struct md_rdev *rdev; 5052 sector_t end_sector; 5053 5054 if (!in_chunk_boundary(mddev, raid_bio)) { 5055 pr_debug("%s: non aligned\n", __func__); 5056 return 0; 5057 } 5058 /* 5059 * use bio_clone_fast to make a copy of the bio 5060 */ 5061 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); 5062 if (!align_bi) 5063 return 0; 5064 /* 5065 * set bi_end_io to a new function, and set bi_private to the 5066 * original bio. 
5067 */ 5068 align_bi->bi_end_io = raid5_align_endio; 5069 align_bi->bi_private = raid_bio; 5070 /* 5071 * compute position 5072 */ 5073 align_bi->bi_iter.bi_sector = 5074 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5075 0, &dd_idx, NULL); 5076 5077 end_sector = bio_end_sector(align_bi); 5078 rcu_read_lock(); 5079 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5080 if (!rdev || test_bit(Faulty, &rdev->flags) || 5081 rdev->recovery_offset < end_sector) { 5082 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5083 if (rdev && 5084 (test_bit(Faulty, &rdev->flags) || 5085 !(test_bit(In_sync, &rdev->flags) || 5086 rdev->recovery_offset >= end_sector))) 5087 rdev = NULL; 5088 } 5089 5090 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { 5091 rcu_read_unlock(); 5092 bio_put(align_bi); 5093 return 0; 5094 } 5095 5096 if (rdev) { 5097 sector_t first_bad; 5098 int bad_sectors; 5099 5100 atomic_inc(&rdev->nr_pending); 5101 rcu_read_unlock(); 5102 raid_bio->bi_next = (void*)rdev; 5103 align_bi->bi_bdev = rdev->bdev; 5104 bio_clear_flag(align_bi, BIO_SEG_VALID); 5105 5106 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5107 bio_sectors(align_bi), 5108 &first_bad, &bad_sectors)) { 5109 bio_put(align_bi); 5110 rdev_dec_pending(rdev, mddev); 5111 return 0; 5112 } 5113 5114 /* No reshape active, so we can trust rdev->data_offset */ 5115 align_bi->bi_iter.bi_sector += rdev->data_offset; 5116 5117 spin_lock_irq(&conf->device_lock); 5118 wait_event_lock_irq(conf->wait_for_quiescent, 5119 conf->quiesce == 0, 5120 conf->device_lock); 5121 atomic_inc(&conf->active_aligned_reads); 5122 spin_unlock_irq(&conf->device_lock); 5123 5124 if (mddev->gendisk) 5125 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 5126 align_bi, disk_devt(mddev->gendisk), 5127 raid_bio->bi_iter.bi_sector); 5128 generic_make_request(align_bi); 5129 return 1; 5130 } else { 5131 rcu_read_unlock(); 5132 bio_put(align_bi); 5133 return 0; 5134 } 5135 } 5136 5137 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5138 { 5139 struct bio *split; 5140 5141 do { 5142 sector_t sector = raid_bio->bi_iter.bi_sector; 5143 unsigned chunk_sects = mddev->chunk_sectors; 5144 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5145 5146 if (sectors < bio_sectors(raid_bio)) { 5147 split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); 5148 bio_chain(split, raid_bio); 5149 } else 5150 split = raid_bio; 5151 5152 if (!raid5_read_one_chunk(mddev, split)) { 5153 if (split != raid_bio) 5154 generic_make_request(raid_bio); 5155 return split; 5156 } 5157 } while (split != raid_bio); 5158 5159 return NULL; 5160 } 5161 5162 /* __get_priority_stripe - get the next stripe to process 5163 * 5164 * Full stripe writes are allowed to pass preread active stripes up until 5165 * the bypass_threshold is exceeded. In general the bypass_count 5166 * increments when the handle_list is handled before the hold_list; however, it 5167 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5168 * stripe with in flight i/o. The bypass_count will be reset when the 5169 * head of the hold_list has changed, i.e. the head was promoted to the 5170 * handle_list. 
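*
* Hypothetical illustration with bypass_threshold == 4: while the same
* partially-written stripe sits at the head of hold_list, every pass
* that services handle_list instead (without sampling STRIPE_IO_STARTED
* set) increments bypass_count.  Once handle_list is empty and either
* bypass_count has climbed past 4 or no full stripe writes are pending
* at all, the hold_list stripe is taken and bypass_count is pulled back
* down by the threshold (never below zero).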
5171 */ 5172 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5173 { 5174 struct stripe_head *sh = NULL, *tmp; 5175 struct list_head *handle_list = NULL; 5176 struct r5worker_group *wg = NULL; 5177 5178 if (conf->worker_cnt_per_group == 0) { 5179 handle_list = &conf->handle_list; 5180 } else if (group != ANY_GROUP) { 5181 handle_list = &conf->worker_groups[group].handle_list; 5182 wg = &conf->worker_groups[group]; 5183 } else { 5184 int i; 5185 for (i = 0; i < conf->group_cnt; i++) { 5186 handle_list = &conf->worker_groups[i].handle_list; 5187 wg = &conf->worker_groups[i]; 5188 if (!list_empty(handle_list)) 5189 break; 5190 } 5191 } 5192 5193 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5194 __func__, 5195 list_empty(handle_list) ? "empty" : "busy", 5196 list_empty(&conf->hold_list) ? "empty" : "busy", 5197 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5198 5199 if (!list_empty(handle_list)) { 5200 sh = list_entry(handle_list->next, typeof(*sh), lru); 5201 5202 if (list_empty(&conf->hold_list)) 5203 conf->bypass_count = 0; 5204 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5205 if (conf->hold_list.next == conf->last_hold) 5206 conf->bypass_count++; 5207 else { 5208 conf->last_hold = conf->hold_list.next; 5209 conf->bypass_count -= conf->bypass_threshold; 5210 if (conf->bypass_count < 0) 5211 conf->bypass_count = 0; 5212 } 5213 } 5214 } else if (!list_empty(&conf->hold_list) && 5215 ((conf->bypass_threshold && 5216 conf->bypass_count > conf->bypass_threshold) || 5217 atomic_read(&conf->pending_full_writes) == 0)) { 5218 5219 list_for_each_entry(tmp, &conf->hold_list, lru) { 5220 if (conf->worker_cnt_per_group == 0 || 5221 group == ANY_GROUP || 5222 !cpu_online(tmp->cpu) || 5223 cpu_to_group(tmp->cpu) == group) { 5224 sh = tmp; 5225 break; 5226 } 5227 } 5228 5229 if (sh) { 5230 conf->bypass_count -= conf->bypass_threshold; 5231 if (conf->bypass_count < 0) 5232 conf->bypass_count = 0; 5233 } 5234 wg = NULL; 5235 } 5236 5237 if (!sh) 5238 return NULL; 5239 5240 if (wg) { 5241 wg->stripes_cnt--; 5242 sh->group = NULL; 5243 } 5244 list_del_init(&sh->lru); 5245 BUG_ON(atomic_inc_return(&sh->count) != 1); 5246 return sh; 5247 } 5248 5249 struct raid5_plug_cb { 5250 struct blk_plug_cb cb; 5251 struct list_head list; 5252 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5253 }; 5254 5255 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5256 { 5257 struct raid5_plug_cb *cb = container_of( 5258 blk_cb, struct raid5_plug_cb, cb); 5259 struct stripe_head *sh; 5260 struct mddev *mddev = cb->cb.data; 5261 struct r5conf *conf = mddev->private; 5262 int cnt = 0; 5263 int hash; 5264 5265 if (cb->list.next && !list_empty(&cb->list)) { 5266 spin_lock_irq(&conf->device_lock); 5267 while (!list_empty(&cb->list)) { 5268 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5269 list_del_init(&sh->lru); 5270 /* 5271 * avoid race release_stripe_plug() sees 5272 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5273 * is still in our list 5274 */ 5275 smp_mb__before_atomic(); 5276 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5277 /* 5278 * STRIPE_ON_RELEASE_LIST could be set here. 
In that 5279 * case, the count is always > 1 here 5280 */ 5281 hash = sh->hash_lock_index; 5282 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5283 cnt++; 5284 } 5285 spin_unlock_irq(&conf->device_lock); 5286 } 5287 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5288 NR_STRIPE_HASH_LOCKS); 5289 if (mddev->queue) 5290 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5291 kfree(cb); 5292 } 5293 5294 static void release_stripe_plug(struct mddev *mddev, 5295 struct stripe_head *sh) 5296 { 5297 struct blk_plug_cb *blk_cb = blk_check_plugged( 5298 raid5_unplug, mddev, 5299 sizeof(struct raid5_plug_cb)); 5300 struct raid5_plug_cb *cb; 5301 5302 if (!blk_cb) { 5303 raid5_release_stripe(sh); 5304 return; 5305 } 5306 5307 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5308 5309 if (cb->list.next == NULL) { 5310 int i; 5311 INIT_LIST_HEAD(&cb->list); 5312 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5313 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5314 } 5315 5316 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5317 list_add_tail(&sh->lru, &cb->list); 5318 else 5319 raid5_release_stripe(sh); 5320 } 5321 5322 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5323 { 5324 struct r5conf *conf = mddev->private; 5325 sector_t logical_sector, last_sector; 5326 struct stripe_head *sh; 5327 int remaining; 5328 int stripe_sectors; 5329 5330 if (mddev->reshape_position != MaxSector) 5331 /* Skip discard while reshape is happening */ 5332 return; 5333 5334 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5335 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5336 5337 bi->bi_next = NULL; 5338 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5339 5340 stripe_sectors = conf->chunk_sectors * 5341 (conf->raid_disks - conf->max_degraded); 5342 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5343 stripe_sectors); 5344 sector_div(last_sector, stripe_sectors); 5345 5346 logical_sector *= conf->chunk_sectors; 5347 last_sector *= conf->chunk_sectors; 5348 5349 for (; logical_sector < last_sector; 5350 logical_sector += STRIPE_SECTORS) { 5351 DEFINE_WAIT(w); 5352 int d; 5353 again: 5354 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5355 prepare_to_wait(&conf->wait_for_overlap, &w, 5356 TASK_UNINTERRUPTIBLE); 5357 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5358 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5359 raid5_release_stripe(sh); 5360 schedule(); 5361 goto again; 5362 } 5363 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5364 spin_lock_irq(&sh->stripe_lock); 5365 for (d = 0; d < conf->raid_disks; d++) { 5366 if (d == sh->pd_idx || d == sh->qd_idx) 5367 continue; 5368 if (sh->dev[d].towrite || sh->dev[d].toread) { 5369 set_bit(R5_Overlap, &sh->dev[d].flags); 5370 spin_unlock_irq(&sh->stripe_lock); 5371 raid5_release_stripe(sh); 5372 schedule(); 5373 goto again; 5374 } 5375 } 5376 set_bit(STRIPE_DISCARD, &sh->state); 5377 finish_wait(&conf->wait_for_overlap, &w); 5378 sh->overwrite_disks = 0; 5379 for (d = 0; d < conf->raid_disks; d++) { 5380 if (d == sh->pd_idx || d == sh->qd_idx) 5381 continue; 5382 sh->dev[d].towrite = bi; 5383 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5384 raid5_inc_bi_active_stripes(bi); 5385 sh->overwrite_disks++; 5386 } 5387 spin_unlock_irq(&sh->stripe_lock); 5388 if (conf->mddev->bitmap) { 5389 for (d = 0; 5390 d < conf->raid_disks - conf->max_degraded; 5391 d++) 5392 bitmap_startwrite(mddev->bitmap, 5393 sh->sector, 5394 STRIPE_SECTORS, 5395 0); 
5396 sh->bm_seq = conf->seq_flush + 1; 5397 set_bit(STRIPE_BIT_DELAY, &sh->state); 5398 } 5399 5400 set_bit(STRIPE_HANDLE, &sh->state); 5401 clear_bit(STRIPE_DELAYED, &sh->state); 5402 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5403 atomic_inc(&conf->preread_active_stripes); 5404 release_stripe_plug(mddev, sh); 5405 } 5406 5407 remaining = raid5_dec_bi_active_stripes(bi); 5408 if (remaining == 0) { 5409 md_write_end(mddev); 5410 bio_endio(bi); 5411 } 5412 } 5413 5414 static void raid5_make_request(struct mddev *mddev, struct bio * bi) 5415 { 5416 struct r5conf *conf = mddev->private; 5417 int dd_idx; 5418 sector_t new_sector; 5419 sector_t logical_sector, last_sector; 5420 struct stripe_head *sh; 5421 const int rw = bio_data_dir(bi); 5422 int remaining; 5423 DEFINE_WAIT(w); 5424 bool do_prepare; 5425 bool do_flush = false; 5426 5427 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5428 int ret = r5l_handle_flush_request(conf->log, bi); 5429 5430 if (ret == 0) 5431 return; 5432 if (ret == -ENODEV) { 5433 md_flush_request(mddev, bi); 5434 return; 5435 } 5436 /* ret == -EAGAIN, fallback */ 5437 /* 5438 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5439 * we need to flush journal device 5440 */ 5441 do_flush = bi->bi_opf & REQ_PREFLUSH; 5442 } 5443 5444 md_write_start(mddev, bi); 5445 5446 /* 5447 * If array is degraded, better not do chunk aligned read because 5448 * later we might have to read it again in order to reconstruct 5449 * data on failed drives. 5450 */ 5451 if (rw == READ && mddev->degraded == 0 && 5452 mddev->reshape_position == MaxSector) { 5453 bi = chunk_aligned_read(mddev, bi); 5454 if (!bi) 5455 return; 5456 } 5457 5458 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5459 make_discard_request(mddev, bi); 5460 return; 5461 } 5462 5463 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5464 last_sector = bio_end_sector(bi); 5465 bi->bi_next = NULL; 5466 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5467 5468 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5469 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5470 int previous; 5471 int seq; 5472 5473 do_prepare = false; 5474 retry: 5475 seq = read_seqcount_begin(&conf->gen_lock); 5476 previous = 0; 5477 if (do_prepare) 5478 prepare_to_wait(&conf->wait_for_overlap, &w, 5479 TASK_UNINTERRUPTIBLE); 5480 if (unlikely(conf->reshape_progress != MaxSector)) { 5481 /* spinlock is needed as reshape_progress may be 5482 * 64bit on a 32bit platform, and so it might be 5483 * possible to see a half-updated value 5484 * Of course reshape_progress could change after 5485 * the lock is dropped, so once we get a reference 5486 * to the stripe that we think it is, we will have 5487 * to check again. 5488 */ 5489 spin_lock_irq(&conf->device_lock); 5490 if (mddev->reshape_backwards 5491 ? logical_sector < conf->reshape_progress 5492 : logical_sector >= conf->reshape_progress) { 5493 previous = 1; 5494 } else { 5495 if (mddev->reshape_backwards 5496 ? 
logical_sector < conf->reshape_safe 5497 : logical_sector >= conf->reshape_safe) { 5498 spin_unlock_irq(&conf->device_lock); 5499 schedule(); 5500 do_prepare = true; 5501 goto retry; 5502 } 5503 } 5504 spin_unlock_irq(&conf->device_lock); 5505 } 5506 5507 new_sector = raid5_compute_sector(conf, logical_sector, 5508 previous, 5509 &dd_idx, NULL); 5510 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5511 (unsigned long long)new_sector, 5512 (unsigned long long)logical_sector); 5513 5514 sh = raid5_get_active_stripe(conf, new_sector, previous, 5515 (bi->bi_opf & REQ_RAHEAD), 0); 5516 if (sh) { 5517 if (unlikely(previous)) { 5518 /* expansion might have moved on while waiting for a 5519 * stripe, so we must do the range check again. 5520 * Expansion could still move past after this 5521 * test, but as we are holding a reference to 5522 * 'sh', we know that if that happens, 5523 * STRIPE_EXPANDING will get set and the expansion 5524 * won't proceed until we finish with the stripe. 5525 */ 5526 int must_retry = 0; 5527 spin_lock_irq(&conf->device_lock); 5528 if (mddev->reshape_backwards 5529 ? logical_sector >= conf->reshape_progress 5530 : logical_sector < conf->reshape_progress) 5531 /* mismatch, need to try again */ 5532 must_retry = 1; 5533 spin_unlock_irq(&conf->device_lock); 5534 if (must_retry) { 5535 raid5_release_stripe(sh); 5536 schedule(); 5537 do_prepare = true; 5538 goto retry; 5539 } 5540 } 5541 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5542 /* Might have got the wrong stripe_head 5543 * by accident 5544 */ 5545 raid5_release_stripe(sh); 5546 goto retry; 5547 } 5548 5549 if (rw == WRITE && 5550 logical_sector >= mddev->suspend_lo && 5551 logical_sector < mddev->suspend_hi) { 5552 raid5_release_stripe(sh); 5553 /* As the suspend_* range is controlled by 5554 * userspace, we want an interruptible 5555 * wait. 5556 */ 5557 flush_signals(current); 5558 prepare_to_wait(&conf->wait_for_overlap, 5559 &w, TASK_INTERRUPTIBLE); 5560 if (logical_sector >= mddev->suspend_lo && 5561 logical_sector < mddev->suspend_hi) { 5562 schedule(); 5563 do_prepare = true; 5564 } 5565 goto retry; 5566 } 5567 5568 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5569 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5570 /* Stripe is busy expanding or 5571 * add failed due to overlap. 
Flush everything 5572 * and wait a while 5573 */ 5574 md_wakeup_thread(mddev->thread); 5575 raid5_release_stripe(sh); 5576 schedule(); 5577 do_prepare = true; 5578 goto retry; 5579 } 5580 if (do_flush) { 5581 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5582 /* we only need flush for one stripe */ 5583 do_flush = false; 5584 } 5585 5586 set_bit(STRIPE_HANDLE, &sh->state); 5587 clear_bit(STRIPE_DELAYED, &sh->state); 5588 if ((!sh->batch_head || sh == sh->batch_head) && 5589 (bi->bi_opf & REQ_SYNC) && 5590 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5591 atomic_inc(&conf->preread_active_stripes); 5592 release_stripe_plug(mddev, sh); 5593 } else { 5594 /* cannot get stripe for read-ahead, just give-up */ 5595 bi->bi_error = -EIO; 5596 break; 5597 } 5598 } 5599 finish_wait(&conf->wait_for_overlap, &w); 5600 5601 remaining = raid5_dec_bi_active_stripes(bi); 5602 if (remaining == 0) { 5603 5604 if ( rw == WRITE ) 5605 md_write_end(mddev); 5606 5607 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 5608 bi, 0); 5609 bio_endio(bi); 5610 } 5611 } 5612 5613 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5614 5615 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5616 { 5617 /* reshaping is quite different to recovery/resync so it is 5618 * handled quite separately ... here. 5619 * 5620 * On each call to sync_request, we gather one chunk worth of 5621 * destination stripes and flag them as expanding. 5622 * Then we find all the source stripes and request reads. 5623 * As the reads complete, handle_stripe will copy the data 5624 * into the destination stripe and release that stripe. 5625 */ 5626 struct r5conf *conf = mddev->private; 5627 struct stripe_head *sh; 5628 sector_t first_sector, last_sector; 5629 int raid_disks = conf->previous_raid_disks; 5630 int data_disks = raid_disks - conf->max_degraded; 5631 int new_data_disks = conf->raid_disks - conf->max_degraded; 5632 int i; 5633 int dd_idx; 5634 sector_t writepos, readpos, safepos; 5635 sector_t stripe_addr; 5636 int reshape_sectors; 5637 struct list_head stripes; 5638 sector_t retn; 5639 5640 if (sector_nr == 0) { 5641 /* If restarting in the middle, skip the initial sectors */ 5642 if (mddev->reshape_backwards && 5643 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5644 sector_nr = raid5_size(mddev, 0, 0) 5645 - conf->reshape_progress; 5646 } else if (mddev->reshape_backwards && 5647 conf->reshape_progress == MaxSector) { 5648 /* shouldn't happen, but just in case, finish up.*/ 5649 sector_nr = MaxSector; 5650 } else if (!mddev->reshape_backwards && 5651 conf->reshape_progress > 0) 5652 sector_nr = conf->reshape_progress; 5653 sector_div(sector_nr, new_data_disks); 5654 if (sector_nr) { 5655 mddev->curr_resync_completed = sector_nr; 5656 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5657 *skipped = 1; 5658 retn = sector_nr; 5659 goto finish; 5660 } 5661 } 5662 5663 /* We need to process a full chunk at a time. 5664 * If old and new chunk sizes differ, we need to process the 5665 * largest of these 5666 */ 5667 5668 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5669 5670 /* We update the metadata at least every 10 seconds, or when 5671 * the data about to be copied would over-write the source of 5672 * the data at the front of the range. i.e. 
one new_stripe 5673 * along from reshape_progress new_maps to after where 5674 * reshape_safe old_maps to 5675 */ 5676 writepos = conf->reshape_progress; 5677 sector_div(writepos, new_data_disks); 5678 readpos = conf->reshape_progress; 5679 sector_div(readpos, data_disks); 5680 safepos = conf->reshape_safe; 5681 sector_div(safepos, data_disks); 5682 if (mddev->reshape_backwards) { 5683 BUG_ON(writepos < reshape_sectors); 5684 writepos -= reshape_sectors; 5685 readpos += reshape_sectors; 5686 safepos += reshape_sectors; 5687 } else { 5688 writepos += reshape_sectors; 5689 /* readpos and safepos are worst-case calculations. 5690 * A negative number is overly pessimistic, and causes 5691 * obvious problems for unsigned storage. So clip to 0. 5692 */ 5693 readpos -= min_t(sector_t, reshape_sectors, readpos); 5694 safepos -= min_t(sector_t, reshape_sectors, safepos); 5695 } 5696 5697 /* Having calculated the 'writepos' possibly use it 5698 * to set 'stripe_addr' which is where we will write to. 5699 */ 5700 if (mddev->reshape_backwards) { 5701 BUG_ON(conf->reshape_progress == 0); 5702 stripe_addr = writepos; 5703 BUG_ON((mddev->dev_sectors & 5704 ~((sector_t)reshape_sectors - 1)) 5705 - reshape_sectors - stripe_addr 5706 != sector_nr); 5707 } else { 5708 BUG_ON(writepos != sector_nr + reshape_sectors); 5709 stripe_addr = sector_nr; 5710 } 5711 5712 /* 'writepos' is the most advanced device address we might write. 5713 * 'readpos' is the least advanced device address we might read. 5714 * 'safepos' is the least address recorded in the metadata as having 5715 * been reshaped. 5716 * If there is a min_offset_diff, these are adjusted either by 5717 * increasing the safepos/readpos if diff is negative, or 5718 * increasing writepos if diff is positive. 5719 * If 'readpos' is then behind 'writepos', there is no way that we can 5720 * ensure safety in the face of a crash - that must be done by userspace 5721 * making a backup of the data. So in that case there is no particular 5722 * rush to update metadata. 5723 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5724 * update the metadata to advance 'safepos' to match 'readpos' so that 5725 * we can be safe in the event of a crash. 5726 * So we insist on updating metadata if safepos is behind writepos and 5727 * readpos is beyond writepos. 5728 * In any case, update the metadata every 10 seconds. 5729 * Maybe that number should be configurable, but I'm not sure it is 5730 * worth it.... maybe it could be a multiple of safemode_delay??? 5731 */ 5732 if (conf->min_offset_diff < 0) { 5733 safepos += -conf->min_offset_diff; 5734 readpos += -conf->min_offset_diff; 5735 } else 5736 writepos += conf->min_offset_diff; 5737 5738 if ((mddev->reshape_backwards 5739 ? (safepos > writepos && readpos < writepos) 5740 : (safepos < writepos && readpos > writepos)) || 5741 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5742 /* Cannot proceed until we've updated the superblock... 
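* The new checkpoint has to reach stable storage before we copy data over
* the old home of anything not yet recorded as reshaped.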
*/ 5743 wait_event(conf->wait_for_overlap, 5744 atomic_read(&conf->reshape_stripes)==0 5745 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5746 if (atomic_read(&conf->reshape_stripes) != 0) 5747 return 0; 5748 mddev->reshape_position = conf->reshape_progress; 5749 mddev->curr_resync_completed = sector_nr; 5750 conf->reshape_checkpoint = jiffies; 5751 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5752 md_wakeup_thread(mddev->thread); 5753 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5754 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5755 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5756 return 0; 5757 spin_lock_irq(&conf->device_lock); 5758 conf->reshape_safe = mddev->reshape_position; 5759 spin_unlock_irq(&conf->device_lock); 5760 wake_up(&conf->wait_for_overlap); 5761 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5762 } 5763 5764 INIT_LIST_HEAD(&stripes); 5765 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5766 int j; 5767 int skipped_disk = 0; 5768 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5769 set_bit(STRIPE_EXPANDING, &sh->state); 5770 atomic_inc(&conf->reshape_stripes); 5771 /* If any of this stripe is beyond the end of the old 5772 * array, then we need to zero those blocks 5773 */ 5774 for (j=sh->disks; j--;) { 5775 sector_t s; 5776 if (j == sh->pd_idx) 5777 continue; 5778 if (conf->level == 6 && 5779 j == sh->qd_idx) 5780 continue; 5781 s = raid5_compute_blocknr(sh, j, 0); 5782 if (s < raid5_size(mddev, 0, 0)) { 5783 skipped_disk = 1; 5784 continue; 5785 } 5786 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5787 set_bit(R5_Expanded, &sh->dev[j].flags); 5788 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5789 } 5790 if (!skipped_disk) { 5791 set_bit(STRIPE_EXPAND_READY, &sh->state); 5792 set_bit(STRIPE_HANDLE, &sh->state); 5793 } 5794 list_add(&sh->lru, &stripes); 5795 } 5796 spin_lock_irq(&conf->device_lock); 5797 if (mddev->reshape_backwards) 5798 conf->reshape_progress -= reshape_sectors * new_data_disks; 5799 else 5800 conf->reshape_progress += reshape_sectors * new_data_disks; 5801 spin_unlock_irq(&conf->device_lock); 5802 /* Ok, those stripe are ready. We can start scheduling 5803 * reads on the source stripes. 5804 * The source stripes are determined by mapping the first and last 5805 * block on the destination stripes. 5806 */ 5807 first_sector = 5808 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5809 1, &dd_idx, NULL); 5810 last_sector = 5811 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5812 * new_data_disks - 1), 5813 1, &dd_idx, NULL); 5814 if (last_sector >= mddev->dev_sectors) 5815 last_sector = mddev->dev_sectors - 1; 5816 while (first_sector <= last_sector) { 5817 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5818 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5819 set_bit(STRIPE_HANDLE, &sh->state); 5820 raid5_release_stripe(sh); 5821 first_sector += STRIPE_SECTORS; 5822 } 5823 /* Now that the sources are clearly marked, we can release 5824 * the destination stripes 5825 */ 5826 while (!list_empty(&stripes)) { 5827 sh = list_entry(stripes.next, struct stripe_head, lru); 5828 list_del_init(&sh->lru); 5829 raid5_release_stripe(sh); 5830 } 5831 /* If this takes us to the resync_max point where we have to pause, 5832 * then we need to write out the superblock. 
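* That keeps the recorded progress in step with what user space reads
* from sync_completed.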
5833 */ 5834 sector_nr += reshape_sectors; 5835 retn = reshape_sectors; 5836 finish: 5837 if (mddev->curr_resync_completed > mddev->resync_max || 5838 (sector_nr - mddev->curr_resync_completed) * 2 5839 >= mddev->resync_max - mddev->curr_resync_completed) { 5840 /* Cannot proceed until we've updated the superblock... */ 5841 wait_event(conf->wait_for_overlap, 5842 atomic_read(&conf->reshape_stripes) == 0 5843 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5844 if (atomic_read(&conf->reshape_stripes) != 0) 5845 goto ret; 5846 mddev->reshape_position = conf->reshape_progress; 5847 mddev->curr_resync_completed = sector_nr; 5848 conf->reshape_checkpoint = jiffies; 5849 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5850 md_wakeup_thread(mddev->thread); 5851 wait_event(mddev->sb_wait, 5852 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 5853 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5854 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5855 goto ret; 5856 spin_lock_irq(&conf->device_lock); 5857 conf->reshape_safe = mddev->reshape_position; 5858 spin_unlock_irq(&conf->device_lock); 5859 wake_up(&conf->wait_for_overlap); 5860 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5861 } 5862 ret: 5863 return retn; 5864 } 5865 5866 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 5867 int *skipped) 5868 { 5869 struct r5conf *conf = mddev->private; 5870 struct stripe_head *sh; 5871 sector_t max_sector = mddev->dev_sectors; 5872 sector_t sync_blocks; 5873 int still_degraded = 0; 5874 int i; 5875 5876 if (sector_nr >= max_sector) { 5877 /* just being told to finish up .. nothing much to do */ 5878 5879 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 5880 end_reshape(conf); 5881 return 0; 5882 } 5883 5884 if (mddev->curr_resync < max_sector) /* aborted */ 5885 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 5886 &sync_blocks, 1); 5887 else /* completed sync */ 5888 conf->fullsync = 0; 5889 bitmap_close_sync(mddev->bitmap); 5890 5891 return 0; 5892 } 5893 5894 /* Allow raid5_quiesce to complete */ 5895 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 5896 5897 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5898 return reshape_request(mddev, sector_nr, skipped); 5899 5900 /* No need to check resync_max as we never do more than one 5901 * stripe, and as resync_max will always be on a chunk boundary, 5902 * if the check in md_do_sync didn't fire, there is no chance 5903 * of overstepping resync_max here 5904 */ 5905 5906 /* if there is too many failed drives and we are trying 5907 * to resync, then assert that we are finished, because there is 5908 * nothing we can do. 
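* (With max_degraded or more devices missing there is no redundancy left
* to check or repair.)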
5909 */ 5910 if (mddev->degraded >= conf->max_degraded && 5911 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5912 sector_t rv = mddev->dev_sectors - sector_nr; 5913 *skipped = 1; 5914 return rv; 5915 } 5916 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 5917 !conf->fullsync && 5918 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 5919 sync_blocks >= STRIPE_SECTORS) { 5920 /* we can skip this block, and probably more */ 5921 sync_blocks /= STRIPE_SECTORS; 5922 *skipped = 1; 5923 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 5924 } 5925 5926 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 5927 5928 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 5929 if (sh == NULL) { 5930 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 5931 /* make sure we don't swamp the stripe cache if someone else 5932 * is trying to get access 5933 */ 5934 schedule_timeout_uninterruptible(1); 5935 } 5936 /* Need to check if array will still be degraded after recovery/resync 5937 * Note in case of > 1 drive failures it's possible we're rebuilding 5938 * one drive while leaving another faulty drive in array. 5939 */ 5940 rcu_read_lock(); 5941 for (i = 0; i < conf->raid_disks; i++) { 5942 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); 5943 5944 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 5945 still_degraded = 1; 5946 } 5947 rcu_read_unlock(); 5948 5949 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 5950 5951 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 5952 set_bit(STRIPE_HANDLE, &sh->state); 5953 5954 raid5_release_stripe(sh); 5955 5956 return STRIPE_SECTORS; 5957 } 5958 5959 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 5960 { 5961 /* We may not be able to submit a whole bio at once as there 5962 * may not be enough stripe_heads available. 5963 * We cannot pre-allocate enough stripe_heads as we may need 5964 * more than exist in the cache (if we allow ever large chunks). 5965 * So we do one stripe head at a time and record in 5966 * ->bi_hw_segments how many have been done. 5967 * 5968 * We *know* that this entire raid_bio is in one chunk, so 5969 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
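* Each subsequent stripe is just STRIPE_SECTORS further on in both logical
* and device address space, so the loop below simply steps both counters.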
5970 */ 5971 struct stripe_head *sh; 5972 int dd_idx; 5973 sector_t sector, logical_sector, last_sector; 5974 int scnt = 0; 5975 int remaining; 5976 int handled = 0; 5977 5978 logical_sector = raid_bio->bi_iter.bi_sector & 5979 ~((sector_t)STRIPE_SECTORS-1); 5980 sector = raid5_compute_sector(conf, logical_sector, 5981 0, &dd_idx, NULL); 5982 last_sector = bio_end_sector(raid_bio); 5983 5984 for (; logical_sector < last_sector; 5985 logical_sector += STRIPE_SECTORS, 5986 sector += STRIPE_SECTORS, 5987 scnt++) { 5988 5989 if (scnt < raid5_bi_processed_stripes(raid_bio)) 5990 /* already done this stripe */ 5991 continue; 5992 5993 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 5994 5995 if (!sh) { 5996 /* failed to get a stripe - must wait */ 5997 raid5_set_bi_processed_stripes(raid_bio, scnt); 5998 conf->retry_read_aligned = raid_bio; 5999 return handled; 6000 } 6001 6002 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6003 raid5_release_stripe(sh); 6004 raid5_set_bi_processed_stripes(raid_bio, scnt); 6005 conf->retry_read_aligned = raid_bio; 6006 return handled; 6007 } 6008 6009 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6010 handle_stripe(sh); 6011 raid5_release_stripe(sh); 6012 handled++; 6013 } 6014 remaining = raid5_dec_bi_active_stripes(raid_bio); 6015 if (remaining == 0) { 6016 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 6017 raid_bio, 0); 6018 bio_endio(raid_bio); 6019 } 6020 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6021 wake_up(&conf->wait_for_quiescent); 6022 return handled; 6023 } 6024 6025 static int handle_active_stripes(struct r5conf *conf, int group, 6026 struct r5worker *worker, 6027 struct list_head *temp_inactive_list) 6028 { 6029 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6030 int i, batch_size = 0, hash; 6031 bool release_inactive = false; 6032 6033 while (batch_size < MAX_STRIPE_BATCH && 6034 (sh = __get_priority_stripe(conf, group)) != NULL) 6035 batch[batch_size++] = sh; 6036 6037 if (batch_size == 0) { 6038 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6039 if (!list_empty(temp_inactive_list + i)) 6040 break; 6041 if (i == NR_STRIPE_HASH_LOCKS) { 6042 spin_unlock_irq(&conf->device_lock); 6043 r5l_flush_stripe_to_raid(conf->log); 6044 spin_lock_irq(&conf->device_lock); 6045 return batch_size; 6046 } 6047 release_inactive = true; 6048 } 6049 spin_unlock_irq(&conf->device_lock); 6050 6051 release_inactive_stripe_list(conf, temp_inactive_list, 6052 NR_STRIPE_HASH_LOCKS); 6053 6054 r5l_flush_stripe_to_raid(conf->log); 6055 if (release_inactive) { 6056 spin_lock_irq(&conf->device_lock); 6057 return 0; 6058 } 6059 6060 for (i = 0; i < batch_size; i++) 6061 handle_stripe(batch[i]); 6062 r5l_write_stripe_run(conf->log); 6063 6064 cond_resched(); 6065 6066 spin_lock_irq(&conf->device_lock); 6067 for (i = 0; i < batch_size; i++) { 6068 hash = batch[i]->hash_lock_index; 6069 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6070 } 6071 return batch_size; 6072 } 6073 6074 static void raid5_do_work(struct work_struct *work) 6075 { 6076 struct r5worker *worker = container_of(work, struct r5worker, work); 6077 struct r5worker_group *group = worker->group; 6078 struct r5conf *conf = group->conf; 6079 int group_id = group - conf->worker_groups; 6080 int handled; 6081 struct blk_plug plug; 6082 6083 pr_debug("+++ raid5worker active\n"); 6084 6085 blk_start_plug(&plug); 6086 handled = 0; 6087 spin_lock_irq(&conf->device_lock); 6088 while (1) { 6089 int batch_size, released; 6090 6091 released = release_stripe_list(conf, 
worker->temp_inactive_list); 6092 6093 batch_size = handle_active_stripes(conf, group_id, worker, 6094 worker->temp_inactive_list); 6095 worker->working = false; 6096 if (!batch_size && !released) 6097 break; 6098 handled += batch_size; 6099 } 6100 pr_debug("%d stripes handled\n", handled); 6101 6102 spin_unlock_irq(&conf->device_lock); 6103 blk_finish_plug(&plug); 6104 6105 pr_debug("--- raid5worker inactive\n"); 6106 } 6107 6108 /* 6109 * This is our raid5 kernel thread. 6110 * 6111 * We scan the hash table for stripes which can be handled now. 6112 * During the scan, completed stripes are saved for us by the interrupt 6113 * handler, so that they will not have to wait for our next wakeup. 6114 */ 6115 static void raid5d(struct md_thread *thread) 6116 { 6117 struct mddev *mddev = thread->mddev; 6118 struct r5conf *conf = mddev->private; 6119 int handled; 6120 struct blk_plug plug; 6121 6122 pr_debug("+++ raid5d active\n"); 6123 6124 md_check_recovery(mddev); 6125 6126 if (!bio_list_empty(&conf->return_bi) && 6127 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 6128 struct bio_list tmp = BIO_EMPTY_LIST; 6129 spin_lock_irq(&conf->device_lock); 6130 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 6131 bio_list_merge(&tmp, &conf->return_bi); 6132 bio_list_init(&conf->return_bi); 6133 } 6134 spin_unlock_irq(&conf->device_lock); 6135 return_io(&tmp); 6136 } 6137 6138 blk_start_plug(&plug); 6139 handled = 0; 6140 spin_lock_irq(&conf->device_lock); 6141 while (1) { 6142 struct bio *bio; 6143 int batch_size, released; 6144 6145 released = release_stripe_list(conf, conf->temp_inactive_list); 6146 if (released) 6147 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6148 6149 if ( 6150 !list_empty(&conf->bitmap_list)) { 6151 /* Now is a good time to flush some bitmap updates */ 6152 conf->seq_flush++; 6153 spin_unlock_irq(&conf->device_lock); 6154 bitmap_unplug(mddev->bitmap); 6155 spin_lock_irq(&conf->device_lock); 6156 conf->seq_write = conf->seq_flush; 6157 activate_bit_delay(conf, conf->temp_inactive_list); 6158 } 6159 raid5_activate_delayed(conf); 6160 6161 while ((bio = remove_bio_from_retry(conf))) { 6162 int ok; 6163 spin_unlock_irq(&conf->device_lock); 6164 ok = retry_aligned_read(conf, bio); 6165 spin_lock_irq(&conf->device_lock); 6166 if (!ok) 6167 break; 6168 handled++; 6169 } 6170 6171 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6172 conf->temp_inactive_list); 6173 if (!batch_size && !released) 6174 break; 6175 handled += batch_size; 6176 6177 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6178 spin_unlock_irq(&conf->device_lock); 6179 md_check_recovery(mddev); 6180 spin_lock_irq(&conf->device_lock); 6181 } 6182 } 6183 pr_debug("%d stripes handled\n", handled); 6184 6185 spin_unlock_irq(&conf->device_lock); 6186 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6187 mutex_trylock(&conf->cache_size_mutex)) { 6188 grow_one_stripe(conf, __GFP_NOWARN); 6189 /* Set flag even if allocation failed. 
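* (R5_DID_ALLOC just records that an attempt was made at all.)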
This helps 6190 * slow down allocation requests when mem is short 6191 */ 6192 set_bit(R5_DID_ALLOC, &conf->cache_state); 6193 mutex_unlock(&conf->cache_size_mutex); 6194 } 6195 6196 flush_deferred_bios(conf); 6197 6198 r5l_flush_stripe_to_raid(conf->log); 6199 6200 async_tx_issue_pending_all(); 6201 blk_finish_plug(&plug); 6202 6203 pr_debug("--- raid5d inactive\n"); 6204 } 6205 6206 static ssize_t 6207 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6208 { 6209 struct r5conf *conf; 6210 int ret = 0; 6211 spin_lock(&mddev->lock); 6212 conf = mddev->private; 6213 if (conf) 6214 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6215 spin_unlock(&mddev->lock); 6216 return ret; 6217 } 6218 6219 int 6220 raid5_set_cache_size(struct mddev *mddev, int size) 6221 { 6222 struct r5conf *conf = mddev->private; 6223 int err; 6224 6225 if (size <= 16 || size > 32768) 6226 return -EINVAL; 6227 6228 conf->min_nr_stripes = size; 6229 mutex_lock(&conf->cache_size_mutex); 6230 while (size < conf->max_nr_stripes && 6231 drop_one_stripe(conf)) 6232 ; 6233 mutex_unlock(&conf->cache_size_mutex); 6234 6235 6236 err = md_allow_write(mddev); 6237 if (err) 6238 return err; 6239 6240 mutex_lock(&conf->cache_size_mutex); 6241 while (size > conf->max_nr_stripes) 6242 if (!grow_one_stripe(conf, GFP_KERNEL)) 6243 break; 6244 mutex_unlock(&conf->cache_size_mutex); 6245 6246 return 0; 6247 } 6248 EXPORT_SYMBOL(raid5_set_cache_size); 6249 6250 static ssize_t 6251 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6252 { 6253 struct r5conf *conf; 6254 unsigned long new; 6255 int err; 6256 6257 if (len >= PAGE_SIZE) 6258 return -EINVAL; 6259 if (kstrtoul(page, 10, &new)) 6260 return -EINVAL; 6261 err = mddev_lock(mddev); 6262 if (err) 6263 return err; 6264 conf = mddev->private; 6265 if (!conf) 6266 err = -ENODEV; 6267 else 6268 err = raid5_set_cache_size(mddev, new); 6269 mddev_unlock(mddev); 6270 6271 return err ?: len; 6272 } 6273 6274 static struct md_sysfs_entry 6275 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6276 raid5_show_stripe_cache_size, 6277 raid5_store_stripe_cache_size); 6278 6279 static ssize_t 6280 raid5_show_rmw_level(struct mddev *mddev, char *page) 6281 { 6282 struct r5conf *conf = mddev->private; 6283 if (conf) 6284 return sprintf(page, "%d\n", conf->rmw_level); 6285 else 6286 return 0; 6287 } 6288 6289 static ssize_t 6290 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6291 { 6292 struct r5conf *conf = mddev->private; 6293 unsigned long new; 6294 6295 if (!conf) 6296 return -ENODEV; 6297 6298 if (len >= PAGE_SIZE) 6299 return -EINVAL; 6300 6301 if (kstrtoul(page, 10, &new)) 6302 return -EINVAL; 6303 6304 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6305 return -EINVAL; 6306 6307 if (new != PARITY_DISABLE_RMW && 6308 new != PARITY_ENABLE_RMW && 6309 new != PARITY_PREFER_RMW) 6310 return -EINVAL; 6311 6312 conf->rmw_level = new; 6313 return len; 6314 } 6315 6316 static struct md_sysfs_entry 6317 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6318 raid5_show_rmw_level, 6319 raid5_store_rmw_level); 6320 6321 6322 static ssize_t 6323 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6324 { 6325 struct r5conf *conf; 6326 int ret = 0; 6327 spin_lock(&mddev->lock); 6328 conf = mddev->private; 6329 if (conf) 6330 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6331 spin_unlock(&mddev->lock); 6332 return ret; 6333 } 6334 6335 static ssize_t 6336 
raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6337 { 6338 struct r5conf *conf; 6339 unsigned long new; 6340 int err; 6341 6342 if (len >= PAGE_SIZE) 6343 return -EINVAL; 6344 if (kstrtoul(page, 10, &new)) 6345 return -EINVAL; 6346 6347 err = mddev_lock(mddev); 6348 if (err) 6349 return err; 6350 conf = mddev->private; 6351 if (!conf) 6352 err = -ENODEV; 6353 else if (new > conf->min_nr_stripes) 6354 err = -EINVAL; 6355 else 6356 conf->bypass_threshold = new; 6357 mddev_unlock(mddev); 6358 return err ?: len; 6359 } 6360 6361 static struct md_sysfs_entry 6362 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6363 S_IRUGO | S_IWUSR, 6364 raid5_show_preread_threshold, 6365 raid5_store_preread_threshold); 6366 6367 static ssize_t 6368 raid5_show_skip_copy(struct mddev *mddev, char *page) 6369 { 6370 struct r5conf *conf; 6371 int ret = 0; 6372 spin_lock(&mddev->lock); 6373 conf = mddev->private; 6374 if (conf) 6375 ret = sprintf(page, "%d\n", conf->skip_copy); 6376 spin_unlock(&mddev->lock); 6377 return ret; 6378 } 6379 6380 static ssize_t 6381 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6382 { 6383 struct r5conf *conf; 6384 unsigned long new; 6385 int err; 6386 6387 if (len >= PAGE_SIZE) 6388 return -EINVAL; 6389 if (kstrtoul(page, 10, &new)) 6390 return -EINVAL; 6391 new = !!new; 6392 6393 err = mddev_lock(mddev); 6394 if (err) 6395 return err; 6396 conf = mddev->private; 6397 if (!conf) 6398 err = -ENODEV; 6399 else if (new != conf->skip_copy) { 6400 mddev_suspend(mddev); 6401 conf->skip_copy = new; 6402 if (new) 6403 mddev->queue->backing_dev_info->capabilities |= 6404 BDI_CAP_STABLE_WRITES; 6405 else 6406 mddev->queue->backing_dev_info->capabilities &= 6407 ~BDI_CAP_STABLE_WRITES; 6408 mddev_resume(mddev); 6409 } 6410 mddev_unlock(mddev); 6411 return err ?: len; 6412 } 6413 6414 static struct md_sysfs_entry 6415 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6416 raid5_show_skip_copy, 6417 raid5_store_skip_copy); 6418 6419 static ssize_t 6420 stripe_cache_active_show(struct mddev *mddev, char *page) 6421 { 6422 struct r5conf *conf = mddev->private; 6423 if (conf) 6424 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6425 else 6426 return 0; 6427 } 6428 6429 static struct md_sysfs_entry 6430 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6431 6432 static ssize_t 6433 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6434 { 6435 struct r5conf *conf; 6436 int ret = 0; 6437 spin_lock(&mddev->lock); 6438 conf = mddev->private; 6439 if (conf) 6440 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6441 spin_unlock(&mddev->lock); 6442 return ret; 6443 } 6444 6445 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6446 int *group_cnt, 6447 int *worker_cnt_per_group, 6448 struct r5worker_group **worker_groups); 6449 static ssize_t 6450 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6451 { 6452 struct r5conf *conf; 6453 unsigned long new; 6454 int err; 6455 struct r5worker_group *new_groups, *old_groups; 6456 int group_cnt, worker_cnt_per_group; 6457 6458 if (len >= PAGE_SIZE) 6459 return -EINVAL; 6460 if (kstrtoul(page, 10, &new)) 6461 return -EINVAL; 6462 6463 err = mddev_lock(mddev); 6464 if (err) 6465 return err; 6466 conf = mddev->private; 6467 if (!conf) 6468 err = -ENODEV; 6469 else if (new != conf->worker_cnt_per_group) { 6470 mddev_suspend(mddev); 6471 6472 old_groups = conf->worker_groups; 6473 if (old_groups) 6474 
flush_workqueue(raid5_wq); 6475 6476 err = alloc_thread_groups(conf, new, 6477 &group_cnt, &worker_cnt_per_group, 6478 &new_groups); 6479 if (!err) { 6480 spin_lock_irq(&conf->device_lock); 6481 conf->group_cnt = group_cnt; 6482 conf->worker_cnt_per_group = worker_cnt_per_group; 6483 conf->worker_groups = new_groups; 6484 spin_unlock_irq(&conf->device_lock); 6485 6486 if (old_groups) 6487 kfree(old_groups[0].workers); 6488 kfree(old_groups); 6489 } 6490 mddev_resume(mddev); 6491 } 6492 mddev_unlock(mddev); 6493 6494 return err ?: len; 6495 } 6496 6497 static struct md_sysfs_entry 6498 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6499 raid5_show_group_thread_cnt, 6500 raid5_store_group_thread_cnt); 6501 6502 static struct attribute *raid5_attrs[] = { 6503 &raid5_stripecache_size.attr, 6504 &raid5_stripecache_active.attr, 6505 &raid5_preread_bypass_threshold.attr, 6506 &raid5_group_thread_cnt.attr, 6507 &raid5_skip_copy.attr, 6508 &raid5_rmw_level.attr, 6509 &r5c_journal_mode.attr, 6510 NULL, 6511 }; 6512 static struct attribute_group raid5_attrs_group = { 6513 .name = NULL, 6514 .attrs = raid5_attrs, 6515 }; 6516 6517 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6518 int *group_cnt, 6519 int *worker_cnt_per_group, 6520 struct r5worker_group **worker_groups) 6521 { 6522 int i, j, k; 6523 ssize_t size; 6524 struct r5worker *workers; 6525 6526 *worker_cnt_per_group = cnt; 6527 if (cnt == 0) { 6528 *group_cnt = 0; 6529 *worker_groups = NULL; 6530 return 0; 6531 } 6532 *group_cnt = num_possible_nodes(); 6533 size = sizeof(struct r5worker) * cnt; 6534 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6535 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6536 *group_cnt, GFP_NOIO); 6537 if (!*worker_groups || !workers) { 6538 kfree(workers); 6539 kfree(*worker_groups); 6540 return -ENOMEM; 6541 } 6542 6543 for (i = 0; i < *group_cnt; i++) { 6544 struct r5worker_group *group; 6545 6546 group = &(*worker_groups)[i]; 6547 INIT_LIST_HEAD(&group->handle_list); 6548 group->conf = conf; 6549 group->workers = workers + i * cnt; 6550 6551 for (j = 0; j < cnt; j++) { 6552 struct r5worker *worker = group->workers + j; 6553 worker->group = group; 6554 INIT_WORK(&worker->work, raid5_do_work); 6555 6556 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6557 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6558 } 6559 } 6560 6561 return 0; 6562 } 6563 6564 static void free_thread_groups(struct r5conf *conf) 6565 { 6566 if (conf->worker_groups) 6567 kfree(conf->worker_groups[0].workers); 6568 kfree(conf->worker_groups); 6569 conf->worker_groups = NULL; 6570 } 6571 6572 static sector_t 6573 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6574 { 6575 struct r5conf *conf = mddev->private; 6576 6577 if (!sectors) 6578 sectors = mddev->dev_sectors; 6579 if (!raid_disks) 6580 /* size is defined by the smallest of previous and new size */ 6581 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6582 6583 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6584 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6585 return sectors * (raid_disks - conf->max_degraded); 6586 } 6587 6588 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6589 { 6590 safe_put_page(percpu->spare_page); 6591 if (percpu->scribble) 6592 flex_array_free(percpu->scribble); 6593 percpu->spare_page = NULL; 6594 percpu->scribble = NULL; 6595 } 6596 6597 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6598 { 6599 if 
(conf->level == 6 && !percpu->spare_page) 6600 percpu->spare_page = alloc_page(GFP_KERNEL); 6601 if (!percpu->scribble) 6602 percpu->scribble = scribble_alloc(max(conf->raid_disks, 6603 conf->previous_raid_disks), 6604 max(conf->chunk_sectors, 6605 conf->prev_chunk_sectors) 6606 / STRIPE_SECTORS, 6607 GFP_KERNEL); 6608 6609 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6610 free_scratch_buffer(conf, percpu); 6611 return -ENOMEM; 6612 } 6613 6614 return 0; 6615 } 6616 6617 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6618 { 6619 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6620 6621 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6622 return 0; 6623 } 6624 6625 static void raid5_free_percpu(struct r5conf *conf) 6626 { 6627 if (!conf->percpu) 6628 return; 6629 6630 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6631 free_percpu(conf->percpu); 6632 } 6633 6634 static void free_conf(struct r5conf *conf) 6635 { 6636 int i; 6637 6638 if (conf->log) 6639 r5l_exit_log(conf->log); 6640 if (conf->shrinker.nr_deferred) 6641 unregister_shrinker(&conf->shrinker); 6642 6643 free_thread_groups(conf); 6644 shrink_stripes(conf); 6645 raid5_free_percpu(conf); 6646 for (i = 0; i < conf->pool_size; i++) 6647 if (conf->disks[i].extra_page) 6648 put_page(conf->disks[i].extra_page); 6649 kfree(conf->disks); 6650 kfree(conf->stripe_hashtbl); 6651 kfree(conf); 6652 } 6653 6654 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6655 { 6656 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6657 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6658 6659 if (alloc_scratch_buffer(conf, percpu)) { 6660 pr_warn("%s: failed memory allocation for cpu%u\n", 6661 __func__, cpu); 6662 return -ENOMEM; 6663 } 6664 return 0; 6665 } 6666 6667 static int raid5_alloc_percpu(struct r5conf *conf) 6668 { 6669 int err = 0; 6670 6671 conf->percpu = alloc_percpu(struct raid5_percpu); 6672 if (!conf->percpu) 6673 return -ENOMEM; 6674 6675 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6676 if (!err) { 6677 conf->scribble_disks = max(conf->raid_disks, 6678 conf->previous_raid_disks); 6679 conf->scribble_sectors = max(conf->chunk_sectors, 6680 conf->prev_chunk_sectors); 6681 } 6682 return err; 6683 } 6684 6685 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6686 struct shrink_control *sc) 6687 { 6688 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6689 unsigned long ret = SHRINK_STOP; 6690 6691 if (mutex_trylock(&conf->cache_size_mutex)) { 6692 ret= 0; 6693 while (ret < sc->nr_to_scan && 6694 conf->max_nr_stripes > conf->min_nr_stripes) { 6695 if (drop_one_stripe(conf) == 0) { 6696 ret = SHRINK_STOP; 6697 break; 6698 } 6699 ret++; 6700 } 6701 mutex_unlock(&conf->cache_size_mutex); 6702 } 6703 return ret; 6704 } 6705 6706 static unsigned long raid5_cache_count(struct shrinker *shrink, 6707 struct shrink_control *sc) 6708 { 6709 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6710 6711 if (conf->max_nr_stripes < conf->min_nr_stripes) 6712 /* unlikely, but not impossible */ 6713 return 0; 6714 return conf->max_nr_stripes - conf->min_nr_stripes; 6715 } 6716 6717 static struct r5conf *setup_conf(struct mddev *mddev) 6718 { 6719 struct r5conf *conf; 6720 int raid_disk, memory, max_disks; 6721 struct md_rdev *rdev; 6722 struct disk_info *disk; 6723 char pers_name[6]; 6724 int i; 6725 int group_cnt, 
worker_cnt_per_group; 6726 struct r5worker_group *new_group; 6727 6728 if (mddev->new_level != 5 6729 && mddev->new_level != 4 6730 && mddev->new_level != 6) { 6731 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6732 mdname(mddev), mddev->new_level); 6733 return ERR_PTR(-EIO); 6734 } 6735 if ((mddev->new_level == 5 6736 && !algorithm_valid_raid5(mddev->new_layout)) || 6737 (mddev->new_level == 6 6738 && !algorithm_valid_raid6(mddev->new_layout))) { 6739 pr_warn("md/raid:%s: layout %d not supported\n", 6740 mdname(mddev), mddev->new_layout); 6741 return ERR_PTR(-EIO); 6742 } 6743 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6744 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6745 mdname(mddev), mddev->raid_disks); 6746 return ERR_PTR(-EINVAL); 6747 } 6748 6749 if (!mddev->new_chunk_sectors || 6750 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6751 !is_power_of_2(mddev->new_chunk_sectors)) { 6752 pr_warn("md/raid:%s: invalid chunk size %d\n", 6753 mdname(mddev), mddev->new_chunk_sectors << 9); 6754 return ERR_PTR(-EINVAL); 6755 } 6756 6757 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6758 if (conf == NULL) 6759 goto abort; 6760 /* Don't enable multi-threading by default*/ 6761 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6762 &new_group)) { 6763 conf->group_cnt = group_cnt; 6764 conf->worker_cnt_per_group = worker_cnt_per_group; 6765 conf->worker_groups = new_group; 6766 } else 6767 goto abort; 6768 spin_lock_init(&conf->device_lock); 6769 seqcount_init(&conf->gen_lock); 6770 mutex_init(&conf->cache_size_mutex); 6771 init_waitqueue_head(&conf->wait_for_quiescent); 6772 init_waitqueue_head(&conf->wait_for_stripe); 6773 init_waitqueue_head(&conf->wait_for_overlap); 6774 INIT_LIST_HEAD(&conf->handle_list); 6775 INIT_LIST_HEAD(&conf->hold_list); 6776 INIT_LIST_HEAD(&conf->delayed_list); 6777 INIT_LIST_HEAD(&conf->bitmap_list); 6778 bio_list_init(&conf->return_bi); 6779 init_llist_head(&conf->released_stripes); 6780 atomic_set(&conf->active_stripes, 0); 6781 atomic_set(&conf->preread_active_stripes, 0); 6782 atomic_set(&conf->active_aligned_reads, 0); 6783 bio_list_init(&conf->pending_bios); 6784 spin_lock_init(&conf->pending_bios_lock); 6785 conf->batch_bio_dispatch = true; 6786 rdev_for_each(rdev, mddev) { 6787 if (test_bit(Journal, &rdev->flags)) 6788 continue; 6789 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 6790 conf->batch_bio_dispatch = false; 6791 break; 6792 } 6793 } 6794 6795 conf->bypass_threshold = BYPASS_THRESHOLD; 6796 conf->recovery_disabled = mddev->recovery_disabled - 1; 6797 6798 conf->raid_disks = mddev->raid_disks; 6799 if (mddev->reshape_position == MaxSector) 6800 conf->previous_raid_disks = mddev->raid_disks; 6801 else 6802 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6803 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6804 6805 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6806 GFP_KERNEL); 6807 6808 if (!conf->disks) 6809 goto abort; 6810 6811 for (i = 0; i < max_disks; i++) { 6812 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6813 if (!conf->disks[i].extra_page) 6814 goto abort; 6815 } 6816 6817 conf->mddev = mddev; 6818 6819 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6820 goto abort; 6821 6822 /* We init hash_locks[0] separately to that it can be used 6823 * as the reference lock in the spin_lock_nest_lock() call 6824 * in lock_all_device_hash_locks_irq in order to convince 6825 * lockdep that we know what 
we are doing. 6826 */ 6827 spin_lock_init(conf->hash_locks); 6828 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6829 spin_lock_init(conf->hash_locks + i); 6830 6831 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6832 INIT_LIST_HEAD(conf->inactive_list + i); 6833 6834 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6835 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6836 6837 atomic_set(&conf->r5c_cached_full_stripes, 0); 6838 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 6839 atomic_set(&conf->r5c_cached_partial_stripes, 0); 6840 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 6841 atomic_set(&conf->r5c_flushing_full_stripes, 0); 6842 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 6843 6844 conf->level = mddev->new_level; 6845 conf->chunk_sectors = mddev->new_chunk_sectors; 6846 if (raid5_alloc_percpu(conf) != 0) 6847 goto abort; 6848 6849 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 6850 6851 rdev_for_each(rdev, mddev) { 6852 raid_disk = rdev->raid_disk; 6853 if (raid_disk >= max_disks 6854 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 6855 continue; 6856 disk = conf->disks + raid_disk; 6857 6858 if (test_bit(Replacement, &rdev->flags)) { 6859 if (disk->replacement) 6860 goto abort; 6861 disk->replacement = rdev; 6862 } else { 6863 if (disk->rdev) 6864 goto abort; 6865 disk->rdev = rdev; 6866 } 6867 6868 if (test_bit(In_sync, &rdev->flags)) { 6869 char b[BDEVNAME_SIZE]; 6870 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 6871 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 6872 } else if (rdev->saved_raid_disk != raid_disk) 6873 /* Cannot rely on bitmap to complete recovery */ 6874 conf->fullsync = 1; 6875 } 6876 6877 conf->level = mddev->new_level; 6878 if (conf->level == 6) { 6879 conf->max_degraded = 2; 6880 if (raid6_call.xor_syndrome) 6881 conf->rmw_level = PARITY_ENABLE_RMW; 6882 else 6883 conf->rmw_level = PARITY_DISABLE_RMW; 6884 } else { 6885 conf->max_degraded = 1; 6886 conf->rmw_level = PARITY_ENABLE_RMW; 6887 } 6888 conf->algorithm = mddev->new_layout; 6889 conf->reshape_progress = mddev->reshape_position; 6890 if (conf->reshape_progress != MaxSector) { 6891 conf->prev_chunk_sectors = mddev->chunk_sectors; 6892 conf->prev_algo = mddev->layout; 6893 } else { 6894 conf->prev_chunk_sectors = conf->chunk_sectors; 6895 conf->prev_algo = conf->algorithm; 6896 } 6897 6898 conf->min_nr_stripes = NR_STRIPES; 6899 if (mddev->reshape_position != MaxSector) { 6900 int stripes = max_t(int, 6901 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 6902 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 6903 conf->min_nr_stripes = max(NR_STRIPES, stripes); 6904 if (conf->min_nr_stripes != NR_STRIPES) 6905 pr_info("md/raid:%s: force stripe size %d for reshape\n", 6906 mdname(mddev), conf->min_nr_stripes); 6907 } 6908 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 6909 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 6910 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 6911 if (grow_stripes(conf, conf->min_nr_stripes)) { 6912 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 6913 mdname(mddev), memory); 6914 goto abort; 6915 } else 6916 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 6917 /* 6918 * Losing a stripe head costs more than the time to refill it, 6919 * it reduces the queue depth and so can hurt throughput. 6920 * So set it rather large, scaled by number of devices. 
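* (shrinker.seeks expresses how costly it is to recreate an object; the
* larger it is, the less eagerly the VM reclaims from this cache.)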
6921 */ 6922 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 6923 conf->shrinker.scan_objects = raid5_cache_scan; 6924 conf->shrinker.count_objects = raid5_cache_count; 6925 conf->shrinker.batch = 128; 6926 conf->shrinker.flags = 0; 6927 if (register_shrinker(&conf->shrinker)) { 6928 pr_warn("md/raid:%s: couldn't register shrinker.\n", 6929 mdname(mddev)); 6930 goto abort; 6931 } 6932 6933 sprintf(pers_name, "raid%d", mddev->new_level); 6934 conf->thread = md_register_thread(raid5d, mddev, pers_name); 6935 if (!conf->thread) { 6936 pr_warn("md/raid:%s: couldn't allocate thread.\n", 6937 mdname(mddev)); 6938 goto abort; 6939 } 6940 6941 return conf; 6942 6943 abort: 6944 if (conf) { 6945 free_conf(conf); 6946 return ERR_PTR(-EIO); 6947 } else 6948 return ERR_PTR(-ENOMEM); 6949 } 6950 6951 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 6952 { 6953 switch (algo) { 6954 case ALGORITHM_PARITY_0: 6955 if (raid_disk < max_degraded) 6956 return 1; 6957 break; 6958 case ALGORITHM_PARITY_N: 6959 if (raid_disk >= raid_disks - max_degraded) 6960 return 1; 6961 break; 6962 case ALGORITHM_PARITY_0_6: 6963 if (raid_disk == 0 || 6964 raid_disk == raid_disks - 1) 6965 return 1; 6966 break; 6967 case ALGORITHM_LEFT_ASYMMETRIC_6: 6968 case ALGORITHM_RIGHT_ASYMMETRIC_6: 6969 case ALGORITHM_LEFT_SYMMETRIC_6: 6970 case ALGORITHM_RIGHT_SYMMETRIC_6: 6971 if (raid_disk == raid_disks - 1) 6972 return 1; 6973 } 6974 return 0; 6975 } 6976 6977 static int raid5_run(struct mddev *mddev) 6978 { 6979 struct r5conf *conf; 6980 int working_disks = 0; 6981 int dirty_parity_disks = 0; 6982 struct md_rdev *rdev; 6983 struct md_rdev *journal_dev = NULL; 6984 sector_t reshape_offset = 0; 6985 int i; 6986 long long min_offset_diff = 0; 6987 int first = 1; 6988 6989 if (mddev->recovery_cp != MaxSector) 6990 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 6991 mdname(mddev)); 6992 6993 rdev_for_each(rdev, mddev) { 6994 long long diff; 6995 6996 if (test_bit(Journal, &rdev->flags)) { 6997 journal_dev = rdev; 6998 continue; 6999 } 7000 if (rdev->raid_disk < 0) 7001 continue; 7002 diff = (rdev->new_data_offset - rdev->data_offset); 7003 if (first) { 7004 min_offset_diff = diff; 7005 first = 0; 7006 } else if (mddev->reshape_backwards && 7007 diff < min_offset_diff) 7008 min_offset_diff = diff; 7009 else if (!mddev->reshape_backwards && 7010 diff > min_offset_diff) 7011 min_offset_diff = diff; 7012 } 7013 7014 if (mddev->reshape_position != MaxSector) { 7015 /* Check that we can continue the reshape. 7016 * Difficulties arise if the stripe we would write to 7017 * next is at or after the stripe we would read from next. 7018 * For a reshape that changes the number of devices, this 7019 * is only possible for a very short time, and mdadm makes 7020 * sure that time appears to have past before assembling 7021 * the array. So we fail if that time hasn't passed. 7022 * For a reshape that keeps the number of devices the same 7023 * mdadm must be monitoring the reshape can keeping the 7024 * critical areas read-only and backed up. It will start 7025 * the array in read-only mode, so we check for that. 7026 */ 7027 sector_t here_new, here_old; 7028 int old_disks; 7029 int max_degraded = (mddev->level == 6 ? 
2 : 1); 7030 int chunk_sectors; 7031 int new_data_disks; 7032 7033 if (journal_dev) { 7034 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7035 mdname(mddev)); 7036 return -EINVAL; 7037 } 7038 7039 if (mddev->new_level != mddev->level) { 7040 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7041 mdname(mddev)); 7042 return -EINVAL; 7043 } 7044 old_disks = mddev->raid_disks - mddev->delta_disks; 7045 /* reshape_position must be on a new-stripe boundary, and one 7046 * further up in new geometry must map after here in old 7047 * geometry. 7048 * If the chunk sizes are different, then as we perform reshape 7049 * in units of the largest of the two, reshape_position needs 7050 * be a multiple of the largest chunk size times new data disks. 7051 */ 7052 here_new = mddev->reshape_position; 7053 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 7054 new_data_disks = mddev->raid_disks - max_degraded; 7055 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7056 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7057 mdname(mddev)); 7058 return -EINVAL; 7059 } 7060 reshape_offset = here_new * chunk_sectors; 7061 /* here_new is the stripe we will write to */ 7062 here_old = mddev->reshape_position; 7063 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 7064 /* here_old is the first stripe that we might need to read 7065 * from */ 7066 if (mddev->delta_disks == 0) { 7067 /* We cannot be sure it is safe to start an in-place 7068 * reshape. It is only safe if user-space is monitoring 7069 * and taking constant backups. 7070 * mdadm always starts a situation like this in 7071 * readonly mode so it can take control before 7072 * allowing any writes. So just check for that. 7073 */ 7074 if (abs(min_offset_diff) >= mddev->chunk_sectors && 7075 abs(min_offset_diff) >= mddev->new_chunk_sectors) 7076 /* not really in-place - so OK */; 7077 else if (mddev->ro == 0) { 7078 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7079 mdname(mddev)); 7080 return -EINVAL; 7081 } 7082 } else if (mddev->reshape_backwards 7083 ? 
(here_new * chunk_sectors + min_offset_diff <= 7084 here_old * chunk_sectors) 7085 : (here_new * chunk_sectors >= 7086 here_old * chunk_sectors + (-min_offset_diff))) { 7087 /* Reading from the same stripe as writing to - bad */ 7088 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7089 mdname(mddev)); 7090 return -EINVAL; 7091 } 7092 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7093 /* OK, we should be able to continue; */ 7094 } else { 7095 BUG_ON(mddev->level != mddev->new_level); 7096 BUG_ON(mddev->layout != mddev->new_layout); 7097 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7098 BUG_ON(mddev->delta_disks != 0); 7099 } 7100 7101 if (mddev->private == NULL) 7102 conf = setup_conf(mddev); 7103 else 7104 conf = mddev->private; 7105 7106 if (IS_ERR(conf)) 7107 return PTR_ERR(conf); 7108 7109 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7110 if (!journal_dev) { 7111 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7112 mdname(mddev)); 7113 mddev->ro = 1; 7114 set_disk_ro(mddev->gendisk, 1); 7115 } else if (mddev->recovery_cp == MaxSector) 7116 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7117 } 7118 7119 conf->min_offset_diff = min_offset_diff; 7120 mddev->thread = conf->thread; 7121 conf->thread = NULL; 7122 mddev->private = conf; 7123 7124 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7125 i++) { 7126 rdev = conf->disks[i].rdev; 7127 if (!rdev && conf->disks[i].replacement) { 7128 /* The replacement is all we have yet */ 7129 rdev = conf->disks[i].replacement; 7130 conf->disks[i].replacement = NULL; 7131 clear_bit(Replacement, &rdev->flags); 7132 conf->disks[i].rdev = rdev; 7133 } 7134 if (!rdev) 7135 continue; 7136 if (conf->disks[i].replacement && 7137 conf->reshape_progress != MaxSector) { 7138 /* replacements and reshape simply do not mix. */ 7139 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7140 goto abort; 7141 } 7142 if (test_bit(In_sync, &rdev->flags)) { 7143 working_disks++; 7144 continue; 7145 } 7146 /* This disc is not fully in-sync. However if it 7147 * just stored parity (beyond the recovery_offset), 7148 * when we don't need to be concerned about the 7149 * array being dirty. 7150 * When reshape goes 'backwards', we never have 7151 * partially completed devices, so we only need 7152 * to worry about reshape going forwards. 7153 */ 7154 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7155 if (mddev->major_version == 0 && 7156 mddev->minor_version > 90) 7157 rdev->recovery_offset = reshape_offset; 7158 7159 if (rdev->recovery_offset < reshape_offset) { 7160 /* We need to check old and new layout */ 7161 if (!only_parity(rdev->raid_disk, 7162 conf->algorithm, 7163 conf->raid_disks, 7164 conf->max_degraded)) 7165 continue; 7166 } 7167 if (!only_parity(rdev->raid_disk, 7168 conf->prev_algo, 7169 conf->previous_raid_disks, 7170 conf->max_degraded)) 7171 continue; 7172 dirty_parity_disks++; 7173 } 7174 7175 /* 7176 * 0 for a fully functional array, 1 or 2 for a degraded array. 
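* (A value of 2 is only possible for raid6, where max_degraded is 2.)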
7177 */ 7178 mddev->degraded = raid5_calc_degraded(conf); 7179 7180 if (has_failed(conf)) { 7181 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7182 mdname(mddev), mddev->degraded, conf->raid_disks); 7183 goto abort; 7184 } 7185 7186 /* device size must be a multiple of chunk size */ 7187 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7188 mddev->resync_max_sectors = mddev->dev_sectors; 7189 7190 if (mddev->degraded > dirty_parity_disks && 7191 mddev->recovery_cp != MaxSector) { 7192 if (mddev->ok_start_degraded) 7193 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7194 mdname(mddev)); 7195 else { 7196 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7197 mdname(mddev)); 7198 goto abort; 7199 } 7200 } 7201 7202 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7203 mdname(mddev), conf->level, 7204 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7205 mddev->new_layout); 7206 7207 print_raid5_conf(conf); 7208 7209 if (conf->reshape_progress != MaxSector) { 7210 conf->reshape_safe = conf->reshape_progress; 7211 atomic_set(&conf->reshape_stripes, 0); 7212 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7213 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7214 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7215 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7216 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7217 "reshape"); 7218 } 7219 7220 /* Ok, everything is just fine now */ 7221 if (mddev->to_remove == &raid5_attrs_group) 7222 mddev->to_remove = NULL; 7223 else if (mddev->kobj.sd && 7224 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7225 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7226 mdname(mddev)); 7227 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7228 7229 if (mddev->queue) { 7230 int chunk_size; 7231 bool discard_supported = true; 7232 /* read-ahead size must cover two whole stripes, which 7233 * is 2 * (datadisks) * chunksize where 'n' is the 7234 * number of raid devices 7235 */ 7236 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7237 int stripe = data_disks * 7238 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7239 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7240 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7241 7242 chunk_size = mddev->chunk_sectors << 9; 7243 blk_queue_io_min(mddev->queue, chunk_size); 7244 blk_queue_io_opt(mddev->queue, chunk_size * 7245 (conf->raid_disks - conf->max_degraded)); 7246 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7247 /* 7248 * We can only discard a whole stripe. 
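* (Here a whole stripe means one chunk from every data disk together with
* the matching parity.)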
It doesn't make sense to 7249 * discard data disk but write parity disk 7250 */ 7251 stripe = stripe * PAGE_SIZE; 7252 /* Round up to power of 2, as discard handling 7253 * currently assumes that */ 7254 while ((stripe-1) & stripe) 7255 stripe = (stripe | (stripe-1)) + 1; 7256 mddev->queue->limits.discard_alignment = stripe; 7257 mddev->queue->limits.discard_granularity = stripe; 7258 7259 /* 7260 * We use 16-bit counter of active stripes in bi_phys_segments 7261 * (minus one for over-loaded initialization) 7262 */ 7263 blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); 7264 blk_queue_max_discard_sectors(mddev->queue, 7265 0xfffe * STRIPE_SECTORS); 7266 7267 /* 7268 * unaligned part of discard request will be ignored, so can't 7269 * guarantee discard_zeroes_data 7270 */ 7271 mddev->queue->limits.discard_zeroes_data = 0; 7272 7273 blk_queue_max_write_same_sectors(mddev->queue, 0); 7274 7275 rdev_for_each(rdev, mddev) { 7276 disk_stack_limits(mddev->gendisk, rdev->bdev, 7277 rdev->data_offset << 9); 7278 disk_stack_limits(mddev->gendisk, rdev->bdev, 7279 rdev->new_data_offset << 9); 7280 /* 7281 * discard_zeroes_data is required, otherwise data 7282 * could be lost. Consider a scenario: discard a stripe 7283 * (the stripe could be inconsistent if 7284 * discard_zeroes_data is 0); write one disk of the 7285 * stripe (the stripe could be inconsistent again 7286 * depending on which disks are used to calculate 7287 * parity); the disk is broken; The stripe data of this 7288 * disk is lost. 7289 */ 7290 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 7291 !bdev_get_queue(rdev->bdev)-> 7292 limits.discard_zeroes_data) 7293 discard_supported = false; 7294 /* Unfortunately, discard_zeroes_data is not currently 7295 * a guarantee - just a hint. So we only allow DISCARD 7296 * if the sysadmin has confirmed that only safe devices 7297 * are in use by setting a module parameter. 
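* (raid456.devices_handle_discard_safely, declared near the top of this file.)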
7298 */ 7299 if (!devices_handle_discard_safely) { 7300 if (discard_supported) { 7301 pr_info("md/raid456: discard support disabled due to uncertainty.\n"); 7302 pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); 7303 } 7304 discard_supported = false; 7305 } 7306 } 7307 7308 if (discard_supported && 7309 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7310 mddev->queue->limits.discard_granularity >= stripe) 7311 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 7312 mddev->queue); 7313 else 7314 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 7315 mddev->queue); 7316 7317 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7318 } 7319 7320 if (journal_dev) { 7321 char b[BDEVNAME_SIZE]; 7322 7323 pr_debug("md/raid:%s: using device %s as journal\n", 7324 mdname(mddev), bdevname(journal_dev->bdev, b)); 7325 if (r5l_init_log(conf, journal_dev)) 7326 goto abort; 7327 } 7328 7329 return 0; 7330 abort: 7331 md_unregister_thread(&mddev->thread); 7332 print_raid5_conf(conf); 7333 free_conf(conf); 7334 mddev->private = NULL; 7335 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7336 return -EIO; 7337 } 7338 7339 static void raid5_free(struct mddev *mddev, void *priv) 7340 { 7341 struct r5conf *conf = priv; 7342 7343 free_conf(conf); 7344 mddev->to_remove = &raid5_attrs_group; 7345 } 7346 7347 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7348 { 7349 struct r5conf *conf = mddev->private; 7350 int i; 7351 7352 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7353 conf->chunk_sectors / 2, mddev->layout); 7354 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7355 rcu_read_lock(); 7356 for (i = 0; i < conf->raid_disks; i++) { 7357 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7358 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 7359 } 7360 rcu_read_unlock(); 7361 seq_printf (seq, "]"); 7362 } 7363 7364 static void print_raid5_conf (struct r5conf *conf) 7365 { 7366 int i; 7367 struct disk_info *tmp; 7368 7369 pr_debug("RAID conf printout:\n"); 7370 if (!conf) { 7371 pr_debug("(conf==NULL)\n"); 7372 return; 7373 } 7374 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7375 conf->raid_disks, 7376 conf->raid_disks - conf->mddev->degraded); 7377 7378 for (i = 0; i < conf->raid_disks; i++) { 7379 char b[BDEVNAME_SIZE]; 7380 tmp = conf->disks + i; 7381 if (tmp->rdev) 7382 pr_debug(" disk %d, o:%d, dev:%s\n", 7383 i, !test_bit(Faulty, &tmp->rdev->flags), 7384 bdevname(tmp->rdev->bdev, b)); 7385 } 7386 } 7387 7388 static int raid5_spare_active(struct mddev *mddev) 7389 { 7390 int i; 7391 struct r5conf *conf = mddev->private; 7392 struct disk_info *tmp; 7393 int count = 0; 7394 unsigned long flags; 7395 7396 for (i = 0; i < conf->raid_disks; i++) { 7397 tmp = conf->disks + i; 7398 if (tmp->replacement 7399 && tmp->replacement->recovery_offset == MaxSector 7400 && !test_bit(Faulty, &tmp->replacement->flags) 7401 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7402 /* Replacement has just become active. */ 7403 if (!tmp->rdev 7404 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7405 count++; 7406 if (tmp->rdev) { 7407 /* Replaced device not technically faulty, 7408 * but we need to be sure it gets removed 7409 * and never re-added. 
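* Setting Faulty below is how that is guaranteed.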
7410 */ 7411 set_bit(Faulty, &tmp->rdev->flags); 7412 sysfs_notify_dirent_safe( 7413 tmp->rdev->sysfs_state); 7414 } 7415 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7416 } else if (tmp->rdev 7417 && tmp->rdev->recovery_offset == MaxSector 7418 && !test_bit(Faulty, &tmp->rdev->flags) 7419 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7420 count++; 7421 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7422 } 7423 } 7424 spin_lock_irqsave(&conf->device_lock, flags); 7425 mddev->degraded = raid5_calc_degraded(conf); 7426 spin_unlock_irqrestore(&conf->device_lock, flags); 7427 print_raid5_conf(conf); 7428 return count; 7429 } 7430 7431 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7432 { 7433 struct r5conf *conf = mddev->private; 7434 int err = 0; 7435 int number = rdev->raid_disk; 7436 struct md_rdev **rdevp; 7437 struct disk_info *p = conf->disks + number; 7438 7439 print_raid5_conf(conf); 7440 if (test_bit(Journal, &rdev->flags) && conf->log) { 7441 struct r5l_log *log; 7442 /* 7443 * we can't wait pending write here, as this is called in 7444 * raid5d, wait will deadlock. 7445 */ 7446 if (atomic_read(&mddev->writes_pending)) 7447 return -EBUSY; 7448 log = conf->log; 7449 conf->log = NULL; 7450 synchronize_rcu(); 7451 r5l_exit_log(log); 7452 return 0; 7453 } 7454 if (rdev == p->rdev) 7455 rdevp = &p->rdev; 7456 else if (rdev == p->replacement) 7457 rdevp = &p->replacement; 7458 else 7459 return 0; 7460 7461 if (number >= conf->raid_disks && 7462 conf->reshape_progress == MaxSector) 7463 clear_bit(In_sync, &rdev->flags); 7464 7465 if (test_bit(In_sync, &rdev->flags) || 7466 atomic_read(&rdev->nr_pending)) { 7467 err = -EBUSY; 7468 goto abort; 7469 } 7470 /* Only remove non-faulty devices if recovery 7471 * isn't possible. 7472 */ 7473 if (!test_bit(Faulty, &rdev->flags) && 7474 mddev->recovery_disabled != conf->recovery_disabled && 7475 !has_failed(conf) && 7476 (!p->replacement || p->replacement == rdev) && 7477 number < conf->raid_disks) { 7478 err = -EBUSY; 7479 goto abort; 7480 } 7481 *rdevp = NULL; 7482 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7483 synchronize_rcu(); 7484 if (atomic_read(&rdev->nr_pending)) { 7485 /* lost the race, try later */ 7486 err = -EBUSY; 7487 *rdevp = rdev; 7488 } 7489 } 7490 if (p->replacement) { 7491 /* We must have just cleared 'rdev' */ 7492 p->rdev = p->replacement; 7493 clear_bit(Replacement, &p->replacement->flags); 7494 smp_mb(); /* Make sure other CPUs may see both as identical 7495 * but will never see neither - if they are careful 7496 */ 7497 p->replacement = NULL; 7498 clear_bit(WantReplacement, &rdev->flags); 7499 } else 7500 /* We might have just removed the Replacement as faulty- 7501 * clear the bit just in case 7502 */ 7503 clear_bit(WantReplacement, &rdev->flags); 7504 abort: 7505 7506 print_raid5_conf(conf); 7507 return err; 7508 } 7509 7510 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7511 { 7512 struct r5conf *conf = mddev->private; 7513 int err = -EEXIST; 7514 int disk; 7515 struct disk_info *p; 7516 int first = 0; 7517 int last = conf->raid_disks - 1; 7518 7519 if (test_bit(Journal, &rdev->flags)) { 7520 char b[BDEVNAME_SIZE]; 7521 if (conf->log) 7522 return -EBUSY; 7523 7524 rdev->raid_disk = 0; 7525 /* 7526 * The array is in readonly mode if journal is missing, so no 7527 * write requests running. 
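* (raid5_run() forces the array read-only when the journal is missing,
* see the MD_HAS_JOURNAL handling above.)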
		 * We should be safe.
		 */
		r5l_init_log(conf, rdev);
		pr_debug("md/raid:%s: using device %s as journal\n",
			 mdname(mddev), bdevname(rdev->bdev, b));
		return 0;
	}
	if (mddev->recovery_disabled == conf->recovery_disabled)
		return -EBUSY;

	if (rdev->saved_raid_disk < 0 && has_failed(conf))
		/* no point adding a device */
		return -EINVAL;

	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/*
	 * find the disk ... but prefer rdev->saved_raid_disk
	 * if possible.
	 */
	if (rdev->saved_raid_disk >= 0 &&
	    rdev->saved_raid_disk >= first &&
	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
		first = rdev->saved_raid_disk;

	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (p->rdev == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->raid_disk = disk;
			err = 0;
			if (rdev->saved_raid_disk != disk)
				conf->fullsync = 1;
			rcu_assign_pointer(p->rdev, rdev);
			goto out;
		}
	}
	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (test_bit(WantReplacement, &p->rdev->flags) &&
		    p->replacement == NULL) {
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = disk;
			err = 0;
			conf->fullsync = 1;
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}
	}
out:
	print_raid5_conf(conf);
	return err;
}

static int raid5_resize(struct mddev *mddev, sector_t sectors)
{
	/* No resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
	sector_t newsize;
	struct r5conf *conf = mddev->private;

	if (conf->log)
		return -EINVAL;
	sectors &= ~((sector_t)conf->chunk_sectors - 1);
	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
	if (mddev->external_size &&
	    mddev->array_sectors > newsize)
		return -EINVAL;
	if (mddev->bitmap) {
		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
		if (ret)
			return ret;
	}
	md_set_array_sectors(mddev, newsize);
	set_capacity(mddev->gendisk, mddev->array_sectors);
	revalidate_disk(mddev->gendisk);
	if (sectors > mddev->dev_sectors &&
	    mddev->recovery_cp > mddev->dev_sectors) {
		mddev->recovery_cp = mddev->dev_sectors;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	mddev->dev_sectors = sectors;
	mddev->resync_max_sectors = sectors;
	return 0;
}

static int check_stripe_cache(struct mddev *mddev)
{
	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
	 * it is best to have about 4 times that.
	 * If we require 4 times, then the default 256 4K stripe_heads will
	 * allow for chunk sizes up to 256K, which is probably OK.
	 * If the chunk size is greater, user-space should request more
	 * stripe_heads first.
	 */
	struct r5conf *conf = mddev->private;
	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
	    > conf->min_nr_stripes ||
	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
	    > conf->min_nr_stripes) {
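		/*
		 * Worked example (assuming 4K STRIPE_SIZE and the default
		 * 256 stripe_heads mentioned above): a 512K chunk needs
		 * (512K / 4K) * 4 = 512 stripe_heads, so this check fails
		 * until more are requested, typically via the
		 * stripe_cache_size sysfs attribute.
		 */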
		pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
			mdname(mddev),
			((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
			 / STRIPE_SIZE) * 4);
		return 0;
	}
	return 1;
}

static int check_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (conf->log)
		return -EINVAL;
	if (mddev->delta_disks == 0 &&
	    mddev->new_layout == mddev->layout &&
	    mddev->new_chunk_sectors == mddev->chunk_sectors)
		return 0; /* nothing to do */
	if (has_failed(conf))
		return -EINVAL;
	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
		int min = 2;
		if (mddev->level == 6)
			min = 4;
		if (mddev->raid_disks + mddev->delta_disks < min)
			return -EINVAL;
	}

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
	    mddev->delta_disks > 0)
		if (resize_chunks(conf,
				  conf->previous_raid_disks
				  + max(0, mddev->delta_disks),
				  max(mddev->new_chunk_sectors,
				      mddev->chunk_sectors)
				  ) < 0)
			return -ENOMEM;
	return resize_stripes(conf, (conf->previous_raid_disks
				     + mddev->delta_disks));
}

static int raid5_start_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	unsigned long flags;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (has_failed(conf))
		return -EINVAL;

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
	}

	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size.
		 */
		return -EINVAL;

	/* Refuse to reduce size of the array.  Any reductions in
	 * array size must be through explicit setting of the
	 * array_size attribute.
	 */
	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
	    < mddev->array_sectors) {
		pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
			mdname(mddev));
		return -EINVAL;
	}

	atomic_set(&conf->reshape_stripes, 0);
	spin_lock_irq(&conf->device_lock);
	write_seqcount_begin(&conf->gen_lock);
	conf->previous_raid_disks = conf->raid_disks;
	conf->raid_disks += mddev->delta_disks;
	conf->prev_chunk_sectors = conf->chunk_sectors;
	conf->chunk_sectors = mddev->new_chunk_sectors;
	conf->prev_algo = conf->algorithm;
	conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier is needed.
	 */
	smp_mb();
	if (mddev->reshape_backwards)
		conf->reshape_progress = raid5_size(mddev, 0, 0);
	else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	write_seqcount_end(&conf->gen_lock);
	spin_unlock_irq(&conf->device_lock);

	/* Now make sure any requests that proceeded on the assumption
	 * the reshape wasn't running - like Discard or Read - have
	 * completed.
	 */
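	/*
	 * mddev_suspend() waits for in-flight requests to drain and
	 * quiesces the array before returning, so the suspend/resume pair
	 * below is used purely as a barrier: anything that was submitted
	 * against the old geometry has finished by the time we continue.
	 */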
	mddev_suspend(mddev);
	mddev_resume(mddev);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "partially reconstructed" state of
	 * such devices during the reshape and confusion could result.
	 */
	if (mddev->delta_disks >= 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid5_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk
					    >= conf->previous_raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
				}
			} else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = raid5_calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		write_seqcount_begin(&conf->gen_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
		mddev->new_chunk_sectors =
			conf->chunk_sectors = conf->prev_chunk_sectors;
		mddev->new_layout = conf->algorithm = conf->prev_algo;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->generation--;
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		write_seqcount_end(&conf->gen_lock);
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{

	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
		struct md_rdev *rdev;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		rdev_for_each(rdev, conf->mddev)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		conf->mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		/* The read-ahead size must cover two whole stripes, which
		 * is 2 * (number of data disks) * chunksize.
		 */
		if (conf->mddev->queue) {
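			/*
			 * For example (assuming 4K pages): 4 data disks with
			 * a 512K chunk give one stripe of 4 * 512K / 4K = 512
			 * pages, so ra_pages is raised to at least 1024 pages
			 * (4MB) of read-ahead.
			 */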
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
		}
	}
}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			if (mddev->queue) {
				set_capacity(mddev->gendisk, mddev->array_sectors);
				revalidate_disk(mddev->gendisk);
			}
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = raid5_calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch (state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		r5c_flush_cache(conf, INT_MAX);
		conf->quiesce = 2;
		wait_event_cmd(conf->wait_for_quiescent,
			       atomic_read(&conf->active_stripes) == 0 &&
			       atomic_read(&conf->active_aligned_reads) == 0,
			       unlock_all_device_hash_locks_irq(conf),
			       lock_all_device_hash_locks_irq(conf));
		conf->quiesce = 1;
		unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		lock_all_device_hash_locks_irq(conf);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_quiescent);
		wake_up(&conf->wait_for_overlap);
		unlock_all_device_hash_locks_irq(conf);
		break;
	}
	r5l_quiesce(conf->log, state);
}

static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
			mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid1(struct mddev *mddev)
{
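	/*
	 * Pick the largest power-of-two chunk, up to the 64K default used
	 * below, that exactly divides the array size: e.g. an array whose
	 * size in sectors is a multiple of 32 but not of 64 ends up with a
	 * 16K chunk.  Anything smaller than STRIPE_SIZE is rejected.
	 */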
	int chunksect;
	void *ret;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64 * 2;	/* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect - 1)))
		chunksect >>= 1;

	if ((chunksect << 9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	ret = setup_conf(mddev);
	if (!IS_ERR(ret))
		mddev_clear_unsupported_flags(mddev,
			UNSUPPORTED_MDDEV_FLAGS);
	return ret;
}

static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}

static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately, as no restriping is needed.
	 * For larger arrays we record the new value - after validation -
	 * to be used by a reshape pass.
	 */
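	/*
	 * With only two devices each stripe is one data block plus its XOR
	 * parity, and the parity of a single block equals that block, so
	 * both devices hold identical data at identical offsets whatever
	 * the chunk size or layout - hence nothing on disk needs to move.
	 */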
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
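	/*
	 * The *_6 layouts keep the raid5 parity rotation and put a fixed Q
	 * block on the last device, so a left-symmetric raid5, for example,
	 * becomes ALGORITHM_LEFT_SYMMETRIC_6 and only the added Q device
	 * needs to be filled in; existing data and P blocks stay where
	 * they are.
	 */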
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}

static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
	.congested	= raid5_congested,
};
static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
	.congested	= raid5_congested,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape  = raid5_start_reshape,
	.finish_reshape = raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
	.congested	= raid5_congested,
};

static int __init raid5_init(void)
{
	int ret;

	raid5_wq = alloc_workqueue("raid5wq",
		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
	if (!raid5_wq)
		return -ENOMEM;

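	/*
	 * Register a multi-instance CPU hotplug state.  Each array adds
	 * itself as an instance so that raid456_cpu_up_prepare() can set up
	 * its per-CPU working state when a CPU comes online and
	 * raid456_cpu_dead() can tear it down again.
	 */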
	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
				      "md/raid5:prepare",
				      raid456_cpu_up_prepare,
				      raid456_cpu_dead);
	if (ret) {
		destroy_workqueue(raid5_wq);
		return ret;
	}
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
	destroy_workqueue(raid5_wq);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");