1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include <linux/nodemask.h> 57 #include <linux/flex_array.h> 58 #include <linux/sched/signal.h> 59 60 #include <trace/events/block.h> 61 #include <linux/list_sort.h> 62 63 #include "md.h" 64 #include "raid5.h" 65 #include "raid0.h" 66 #include "bitmap.h" 67 #include "raid5-log.h" 68 69 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) 70 71 #define cpu_to_group(cpu) cpu_to_node(cpu) 72 #define ANY_GROUP NUMA_NO_NODE 73 74 static bool devices_handle_discard_safely = false; 75 module_param(devices_handle_discard_safely, bool, 0644); 76 MODULE_PARM_DESC(devices_handle_discard_safely, 77 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 78 static struct workqueue_struct *raid5_wq; 79 80 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 81 { 82 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 83 return &conf->stripe_hashtbl[hash]; 84 } 85 86 static inline int stripe_hash_locks_hash(sector_t sect) 87 { 88 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; 89 } 90 91 static inline void lock_device_hash_lock(struct r5conf *conf, int hash) 92 { 93 spin_lock_irq(conf->hash_locks + hash); 94 spin_lock(&conf->device_lock); 95 } 96 97 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) 98 { 99 spin_unlock(&conf->device_lock); 100 spin_unlock_irq(conf->hash_locks + hash); 101 } 102 103 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) 104 { 105 int i; 106 local_irq_disable(); 107 spin_lock(conf->hash_locks); 108 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 109 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); 110 spin_lock(&conf->device_lock); 111 } 112 113 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) 114 { 115 int i; 116 spin_unlock(&conf->device_lock); 117 for (i = NR_STRIPE_HASH_LOCKS; i; i--) 118 spin_unlock(conf->hash_locks + i - 1); 119 local_irq_enable(); 120 } 121 122 /* Find first data disk in a raid6 stripe */ 123 static inline int raid6_d0(struct stripe_head *sh) 124 { 125 if (sh->ddf_layout) 126 /* ddf always start from first device */ 127 return 0; 128 /* md starts just after Q block */ 129 if (sh->qd_idx == sh->disks - 1) 130 return 0; 131 else 132 return sh->qd_idx + 1; 133 } 134 static inline int raid6_next_disk(int disk, int raid_disks) 135 { 136 disk++; 137 return (disk < raid_disks) ? disk : 0; 138 } 139 140 /* When walking through the disks in a raid5, starting at raid6_d0, 141 * We need to map each disk to a 'slot', where the data disks are slot 142 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 143 * is raid_disks-1. This help does that mapping. 144 */ 145 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 146 int *count, int syndrome_disks) 147 { 148 int slot = *count; 149 150 if (sh->ddf_layout) 151 (*count)++; 152 if (idx == sh->pd_idx) 153 return syndrome_disks; 154 if (idx == sh->qd_idx) 155 return syndrome_disks + 1; 156 if (!sh->ddf_layout) 157 (*count)++; 158 return slot; 159 } 160 161 static void print_raid5_conf (struct r5conf *conf); 162 163 static int stripe_operations_active(struct stripe_head *sh) 164 { 165 return sh->check_state || sh->reconstruct_state || 166 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 167 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 168 } 169 170 static bool stripe_is_lowprio(struct stripe_head *sh) 171 { 172 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || 173 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && 174 !test_bit(STRIPE_R5C_CACHING, &sh->state); 175 } 176 177 static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 178 { 179 struct r5conf *conf = sh->raid_conf; 180 struct r5worker_group *group; 181 int thread_cnt; 182 int i, cpu = sh->cpu; 183 184 if (!cpu_online(cpu)) { 185 cpu = cpumask_any(cpu_online_mask); 186 sh->cpu = cpu; 187 } 188 189 if (list_empty(&sh->lru)) { 190 struct r5worker_group *group; 191 group = conf->worker_groups + cpu_to_group(cpu); 192 if (stripe_is_lowprio(sh)) 193 list_add_tail(&sh->lru, &group->loprio_list); 194 else 195 list_add_tail(&sh->lru, &group->handle_list); 196 group->stripes_cnt++; 197 sh->group = group; 198 } 199 200 if (conf->worker_cnt_per_group == 0) { 201 md_wakeup_thread(conf->mddev->thread); 202 return; 203 } 204 205 group = conf->worker_groups + cpu_to_group(sh->cpu); 206 207 group->workers[0].working = true; 208 /* at least one worker should run to avoid race */ 209 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 210 211 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 212 /* wakeup more workers */ 213 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 214 if (group->workers[i].working == false) { 215 group->workers[i].working = true; 216 queue_work_on(sh->cpu, raid5_wq, 217 &group->workers[i].work); 218 thread_cnt--; 219 } 220 } 221 } 222 223 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, 224 struct list_head *temp_inactive_list) 225 { 226 int i; 227 int injournal = 0; /* number of date pages with R5_InJournal */ 228 229 BUG_ON(!list_empty(&sh->lru)); 230 BUG_ON(atomic_read(&conf->active_stripes)==0); 231 232 if (r5c_is_writeback(conf->log)) 233 for (i = sh->disks; i--; ) 234 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 235 injournal++; 236 /* 237 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with 238 * data in journal, so they are not released to cached lists 239 */ 240 if (conf->quiesce && r5c_is_writeback(conf->log) && 241 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { 242 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 243 r5c_make_stripe_write_out(sh); 244 set_bit(STRIPE_HANDLE, &sh->state); 245 } 246 247 if (test_bit(STRIPE_HANDLE, &sh->state)) { 248 if (test_bit(STRIPE_DELAYED, &sh->state) && 249 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 250 list_add_tail(&sh->lru, &conf->delayed_list); 251 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 252 sh->bm_seq - conf->seq_write > 0) 253 list_add_tail(&sh->lru, &conf->bitmap_list); 254 else { 255 clear_bit(STRIPE_DELAYED, &sh->state); 256 clear_bit(STRIPE_BIT_DELAY, &sh->state); 257 if (conf->worker_cnt_per_group == 0) { 258 if (stripe_is_lowprio(sh)) 259 list_add_tail(&sh->lru, 260 &conf->loprio_list); 261 else 262 list_add_tail(&sh->lru, 263 &conf->handle_list); 264 } else { 265 raid5_wakeup_stripe_thread(sh); 266 return; 267 } 268 } 269 md_wakeup_thread(conf->mddev->thread); 270 } else { 271 BUG_ON(stripe_operations_active(sh)); 272 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 273 if (atomic_dec_return(&conf->preread_active_stripes) 274 < IO_THRESHOLD) 275 md_wakeup_thread(conf->mddev->thread); 276 atomic_dec(&conf->active_stripes); 277 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 278 if (!r5c_is_writeback(conf->log)) 279 list_add_tail(&sh->lru, temp_inactive_list); 280 else { 281 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); 282 if (injournal == 0) 283 list_add_tail(&sh->lru, temp_inactive_list); 284 else if (injournal == conf->raid_disks - conf->max_degraded) { 285 /* full stripe */ 286 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) 287 atomic_inc(&conf->r5c_cached_full_stripes); 288 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 289 atomic_dec(&conf->r5c_cached_partial_stripes); 290 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); 291 r5c_check_cached_full_stripe(conf); 292 } else 293 /* 294 * STRIPE_R5C_PARTIAL_STRIPE is set in 295 * r5c_try_caching_write(). No need to 296 * set it again. 297 */ 298 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); 299 } 300 } 301 } 302 } 303 304 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, 305 struct list_head *temp_inactive_list) 306 { 307 if (atomic_dec_and_test(&sh->count)) 308 do_release_stripe(conf, sh, temp_inactive_list); 309 } 310 311 /* 312 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list 313 * 314 * Be careful: Only one task can add/delete stripes from temp_inactive_list at 315 * given time. Adding stripes only takes device lock, while deleting stripes 316 * only takes hash lock. 317 */ 318 static void release_inactive_stripe_list(struct r5conf *conf, 319 struct list_head *temp_inactive_list, 320 int hash) 321 { 322 int size; 323 bool do_wakeup = false; 324 unsigned long flags; 325 326 if (hash == NR_STRIPE_HASH_LOCKS) { 327 size = NR_STRIPE_HASH_LOCKS; 328 hash = NR_STRIPE_HASH_LOCKS - 1; 329 } else 330 size = 1; 331 while (size) { 332 struct list_head *list = &temp_inactive_list[size - 1]; 333 334 /* 335 * We don't hold any lock here yet, raid5_get_active_stripe() might 336 * remove stripes from the list 337 */ 338 if (!list_empty_careful(list)) { 339 spin_lock_irqsave(conf->hash_locks + hash, flags); 340 if (list_empty(conf->inactive_list + hash) && 341 !list_empty(list)) 342 atomic_dec(&conf->empty_inactive_list_nr); 343 list_splice_tail_init(list, conf->inactive_list + hash); 344 do_wakeup = true; 345 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 346 } 347 size--; 348 hash--; 349 } 350 351 if (do_wakeup) { 352 wake_up(&conf->wait_for_stripe); 353 if (atomic_read(&conf->active_stripes) == 0) 354 wake_up(&conf->wait_for_quiescent); 355 if (conf->retry_read_aligned) 356 md_wakeup_thread(conf->mddev->thread); 357 } 358 } 359 360 /* should hold conf->device_lock already */ 361 static int release_stripe_list(struct r5conf *conf, 362 struct list_head *temp_inactive_list) 363 { 364 struct stripe_head *sh, *t; 365 int count = 0; 366 struct llist_node *head; 367 368 head = llist_del_all(&conf->released_stripes); 369 head = llist_reverse_order(head); 370 llist_for_each_entry_safe(sh, t, head, release_list) { 371 int hash; 372 373 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 374 smp_mb(); 375 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 376 /* 377 * Don't worry the bit is set here, because if the bit is set 378 * again, the count is always > 1. This is true for 379 * STRIPE_ON_UNPLUG_LIST bit too. 380 */ 381 hash = sh->hash_lock_index; 382 __release_stripe(conf, sh, &temp_inactive_list[hash]); 383 count++; 384 } 385 386 return count; 387 } 388 389 void raid5_release_stripe(struct stripe_head *sh) 390 { 391 struct r5conf *conf = sh->raid_conf; 392 unsigned long flags; 393 struct list_head list; 394 int hash; 395 bool wakeup; 396 397 /* Avoid release_list until the last reference. 398 */ 399 if (atomic_add_unless(&sh->count, -1, 1)) 400 return; 401 402 if (unlikely(!conf->mddev->thread) || 403 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 404 goto slow_path; 405 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 406 if (wakeup) 407 md_wakeup_thread(conf->mddev->thread); 408 return; 409 slow_path: 410 local_irq_save(flags); 411 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 412 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 413 INIT_LIST_HEAD(&list); 414 hash = sh->hash_lock_index; 415 do_release_stripe(conf, sh, &list); 416 spin_unlock(&conf->device_lock); 417 release_inactive_stripe_list(conf, &list, hash); 418 } 419 local_irq_restore(flags); 420 } 421 422 static inline void remove_hash(struct stripe_head *sh) 423 { 424 pr_debug("remove_hash(), stripe %llu\n", 425 (unsigned long long)sh->sector); 426 427 hlist_del_init(&sh->hash); 428 } 429 430 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 431 { 432 struct hlist_head *hp = stripe_hash(conf, sh->sector); 433 434 pr_debug("insert_hash(), stripe %llu\n", 435 (unsigned long long)sh->sector); 436 437 hlist_add_head(&sh->hash, hp); 438 } 439 440 /* find an idle stripe, make sure it is unhashed, and return it. */ 441 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 442 { 443 struct stripe_head *sh = NULL; 444 struct list_head *first; 445 446 if (list_empty(conf->inactive_list + hash)) 447 goto out; 448 first = (conf->inactive_list + hash)->next; 449 sh = list_entry(first, struct stripe_head, lru); 450 list_del_init(first); 451 remove_hash(sh); 452 atomic_inc(&conf->active_stripes); 453 BUG_ON(hash != sh->hash_lock_index); 454 if (list_empty(conf->inactive_list + hash)) 455 atomic_inc(&conf->empty_inactive_list_nr); 456 out: 457 return sh; 458 } 459 460 static void shrink_buffers(struct stripe_head *sh) 461 { 462 struct page *p; 463 int i; 464 int num = sh->raid_conf->pool_size; 465 466 for (i = 0; i < num ; i++) { 467 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 468 p = sh->dev[i].page; 469 if (!p) 470 continue; 471 sh->dev[i].page = NULL; 472 put_page(p); 473 } 474 475 if (sh->ppl_page) { 476 put_page(sh->ppl_page); 477 sh->ppl_page = NULL; 478 } 479 } 480 481 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 482 { 483 int i; 484 int num = sh->raid_conf->pool_size; 485 486 for (i = 0; i < num; i++) { 487 struct page *page; 488 489 if (!(page = alloc_page(gfp))) { 490 return 1; 491 } 492 sh->dev[i].page = page; 493 sh->dev[i].orig_page = page; 494 } 495 496 if (raid5_has_ppl(sh->raid_conf)) { 497 sh->ppl_page = alloc_page(gfp); 498 if (!sh->ppl_page) 499 return 1; 500 } 501 502 return 0; 503 } 504 505 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 506 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 507 struct stripe_head *sh); 508 509 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 510 { 511 struct r5conf *conf = sh->raid_conf; 512 int i, seq; 513 514 BUG_ON(atomic_read(&sh->count) != 0); 515 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 516 BUG_ON(stripe_operations_active(sh)); 517 BUG_ON(sh->batch_head); 518 519 pr_debug("init_stripe called, stripe %llu\n", 520 (unsigned long long)sector); 521 retry: 522 seq = read_seqcount_begin(&conf->gen_lock); 523 sh->generation = conf->generation - previous; 524 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 525 sh->sector = sector; 526 stripe_set_idx(sector, conf, previous, sh); 527 sh->state = 0; 528 529 for (i = sh->disks; i--; ) { 530 struct r5dev *dev = &sh->dev[i]; 531 532 if (dev->toread || dev->read || dev->towrite || dev->written || 533 test_bit(R5_LOCKED, &dev->flags)) { 534 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 535 (unsigned long long)sh->sector, i, dev->toread, 536 dev->read, dev->towrite, dev->written, 537 test_bit(R5_LOCKED, &dev->flags)); 538 WARN_ON(1); 539 } 540 dev->flags = 0; 541 raid5_build_block(sh, i, previous); 542 } 543 if (read_seqcount_retry(&conf->gen_lock, seq)) 544 goto retry; 545 sh->overwrite_disks = 0; 546 insert_hash(conf, sh); 547 sh->cpu = smp_processor_id(); 548 set_bit(STRIPE_BATCH_READY, &sh->state); 549 } 550 551 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 552 short generation) 553 { 554 struct stripe_head *sh; 555 556 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 557 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 558 if (sh->sector == sector && sh->generation == generation) 559 return sh; 560 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 561 return NULL; 562 } 563 564 /* 565 * Need to check if array has failed when deciding whether to: 566 * - start an array 567 * - remove non-faulty devices 568 * - add a spare 569 * - allow a reshape 570 * This determination is simple when no reshape is happening. 571 * However if there is a reshape, we need to carefully check 572 * both the before and after sections. 573 * This is because some failed devices may only affect one 574 * of the two sections, and some non-in_sync devices may 575 * be insync in the section most affected by failed devices. 576 */ 577 int raid5_calc_degraded(struct r5conf *conf) 578 { 579 int degraded, degraded2; 580 int i; 581 582 rcu_read_lock(); 583 degraded = 0; 584 for (i = 0; i < conf->previous_raid_disks; i++) { 585 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 586 if (rdev && test_bit(Faulty, &rdev->flags)) 587 rdev = rcu_dereference(conf->disks[i].replacement); 588 if (!rdev || test_bit(Faulty, &rdev->flags)) 589 degraded++; 590 else if (test_bit(In_sync, &rdev->flags)) 591 ; 592 else 593 /* not in-sync or faulty. 594 * If the reshape increases the number of devices, 595 * this is being recovered by the reshape, so 596 * this 'previous' section is not in_sync. 597 * If the number of devices is being reduced however, 598 * the device can only be part of the array if 599 * we are reverting a reshape, so this section will 600 * be in-sync. 601 */ 602 if (conf->raid_disks >= conf->previous_raid_disks) 603 degraded++; 604 } 605 rcu_read_unlock(); 606 if (conf->raid_disks == conf->previous_raid_disks) 607 return degraded; 608 rcu_read_lock(); 609 degraded2 = 0; 610 for (i = 0; i < conf->raid_disks; i++) { 611 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 612 if (rdev && test_bit(Faulty, &rdev->flags)) 613 rdev = rcu_dereference(conf->disks[i].replacement); 614 if (!rdev || test_bit(Faulty, &rdev->flags)) 615 degraded2++; 616 else if (test_bit(In_sync, &rdev->flags)) 617 ; 618 else 619 /* not in-sync or faulty. 620 * If reshape increases the number of devices, this 621 * section has already been recovered, else it 622 * almost certainly hasn't. 623 */ 624 if (conf->raid_disks <= conf->previous_raid_disks) 625 degraded2++; 626 } 627 rcu_read_unlock(); 628 if (degraded2 > degraded) 629 return degraded2; 630 return degraded; 631 } 632 633 static int has_failed(struct r5conf *conf) 634 { 635 int degraded; 636 637 if (conf->mddev->reshape_position == MaxSector) 638 return conf->mddev->degraded > conf->max_degraded; 639 640 degraded = raid5_calc_degraded(conf); 641 if (degraded > conf->max_degraded) 642 return 1; 643 return 0; 644 } 645 646 struct stripe_head * 647 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 648 int previous, int noblock, int noquiesce) 649 { 650 struct stripe_head *sh; 651 int hash = stripe_hash_locks_hash(sector); 652 int inc_empty_inactive_list_flag; 653 654 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 655 656 spin_lock_irq(conf->hash_locks + hash); 657 658 do { 659 wait_event_lock_irq(conf->wait_for_quiescent, 660 conf->quiesce == 0 || noquiesce, 661 *(conf->hash_locks + hash)); 662 sh = __find_stripe(conf, sector, conf->generation - previous); 663 if (!sh) { 664 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 665 sh = get_free_stripe(conf, hash); 666 if (!sh && !test_bit(R5_DID_ALLOC, 667 &conf->cache_state)) 668 set_bit(R5_ALLOC_MORE, 669 &conf->cache_state); 670 } 671 if (noblock && sh == NULL) 672 break; 673 674 r5c_check_stripe_cache_usage(conf); 675 if (!sh) { 676 set_bit(R5_INACTIVE_BLOCKED, 677 &conf->cache_state); 678 r5l_wake_reclaim(conf->log, 0); 679 wait_event_lock_irq( 680 conf->wait_for_stripe, 681 !list_empty(conf->inactive_list + hash) && 682 (atomic_read(&conf->active_stripes) 683 < (conf->max_nr_stripes * 3 / 4) 684 || !test_bit(R5_INACTIVE_BLOCKED, 685 &conf->cache_state)), 686 *(conf->hash_locks + hash)); 687 clear_bit(R5_INACTIVE_BLOCKED, 688 &conf->cache_state); 689 } else { 690 init_stripe(sh, sector, previous); 691 atomic_inc(&sh->count); 692 } 693 } else if (!atomic_inc_not_zero(&sh->count)) { 694 spin_lock(&conf->device_lock); 695 if (!atomic_read(&sh->count)) { 696 if (!test_bit(STRIPE_HANDLE, &sh->state)) 697 atomic_inc(&conf->active_stripes); 698 BUG_ON(list_empty(&sh->lru) && 699 !test_bit(STRIPE_EXPANDING, &sh->state)); 700 inc_empty_inactive_list_flag = 0; 701 if (!list_empty(conf->inactive_list + hash)) 702 inc_empty_inactive_list_flag = 1; 703 list_del_init(&sh->lru); 704 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 705 atomic_inc(&conf->empty_inactive_list_nr); 706 if (sh->group) { 707 sh->group->stripes_cnt--; 708 sh->group = NULL; 709 } 710 } 711 atomic_inc(&sh->count); 712 spin_unlock(&conf->device_lock); 713 } 714 } while (sh == NULL); 715 716 spin_unlock_irq(conf->hash_locks + hash); 717 return sh; 718 } 719 720 static bool is_full_stripe_write(struct stripe_head *sh) 721 { 722 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 723 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 724 } 725 726 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 727 { 728 local_irq_disable(); 729 if (sh1 > sh2) { 730 spin_lock(&sh2->stripe_lock); 731 spin_lock_nested(&sh1->stripe_lock, 1); 732 } else { 733 spin_lock(&sh1->stripe_lock); 734 spin_lock_nested(&sh2->stripe_lock, 1); 735 } 736 } 737 738 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 739 { 740 spin_unlock(&sh1->stripe_lock); 741 spin_unlock(&sh2->stripe_lock); 742 local_irq_enable(); 743 } 744 745 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 746 static bool stripe_can_batch(struct stripe_head *sh) 747 { 748 struct r5conf *conf = sh->raid_conf; 749 750 if (conf->log || raid5_has_ppl(conf)) 751 return false; 752 return test_bit(STRIPE_BATCH_READY, &sh->state) && 753 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 754 is_full_stripe_write(sh); 755 } 756 757 /* we only do back search */ 758 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 759 { 760 struct stripe_head *head; 761 sector_t head_sector, tmp_sec; 762 int hash; 763 int dd_idx; 764 int inc_empty_inactive_list_flag; 765 766 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 767 tmp_sec = sh->sector; 768 if (!sector_div(tmp_sec, conf->chunk_sectors)) 769 return; 770 head_sector = sh->sector - STRIPE_SECTORS; 771 772 hash = stripe_hash_locks_hash(head_sector); 773 spin_lock_irq(conf->hash_locks + hash); 774 head = __find_stripe(conf, head_sector, conf->generation); 775 if (head && !atomic_inc_not_zero(&head->count)) { 776 spin_lock(&conf->device_lock); 777 if (!atomic_read(&head->count)) { 778 if (!test_bit(STRIPE_HANDLE, &head->state)) 779 atomic_inc(&conf->active_stripes); 780 BUG_ON(list_empty(&head->lru) && 781 !test_bit(STRIPE_EXPANDING, &head->state)); 782 inc_empty_inactive_list_flag = 0; 783 if (!list_empty(conf->inactive_list + hash)) 784 inc_empty_inactive_list_flag = 1; 785 list_del_init(&head->lru); 786 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 787 atomic_inc(&conf->empty_inactive_list_nr); 788 if (head->group) { 789 head->group->stripes_cnt--; 790 head->group = NULL; 791 } 792 } 793 atomic_inc(&head->count); 794 spin_unlock(&conf->device_lock); 795 } 796 spin_unlock_irq(conf->hash_locks + hash); 797 798 if (!head) 799 return; 800 if (!stripe_can_batch(head)) 801 goto out; 802 803 lock_two_stripes(head, sh); 804 /* clear_batch_ready clear the flag */ 805 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 806 goto unlock_out; 807 808 if (sh->batch_head) 809 goto unlock_out; 810 811 dd_idx = 0; 812 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 813 dd_idx++; 814 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 815 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 816 goto unlock_out; 817 818 if (head->batch_head) { 819 spin_lock(&head->batch_head->batch_lock); 820 /* This batch list is already running */ 821 if (!stripe_can_batch(head)) { 822 spin_unlock(&head->batch_head->batch_lock); 823 goto unlock_out; 824 } 825 826 /* 827 * at this point, head's BATCH_READY could be cleared, but we 828 * can still add the stripe to batch list 829 */ 830 list_add(&sh->batch_list, &head->batch_list); 831 spin_unlock(&head->batch_head->batch_lock); 832 833 sh->batch_head = head->batch_head; 834 } else { 835 head->batch_head = head; 836 sh->batch_head = head->batch_head; 837 spin_lock(&head->batch_lock); 838 list_add_tail(&sh->batch_list, &head->batch_list); 839 spin_unlock(&head->batch_lock); 840 } 841 842 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 843 if (atomic_dec_return(&conf->preread_active_stripes) 844 < IO_THRESHOLD) 845 md_wakeup_thread(conf->mddev->thread); 846 847 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 848 int seq = sh->bm_seq; 849 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 850 sh->batch_head->bm_seq > seq) 851 seq = sh->batch_head->bm_seq; 852 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 853 sh->batch_head->bm_seq = seq; 854 } 855 856 atomic_inc(&sh->count); 857 unlock_out: 858 unlock_two_stripes(head, sh); 859 out: 860 raid5_release_stripe(head); 861 } 862 863 /* Determine if 'data_offset' or 'new_data_offset' should be used 864 * in this stripe_head. 865 */ 866 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 867 { 868 sector_t progress = conf->reshape_progress; 869 /* Need a memory barrier to make sure we see the value 870 * of conf->generation, or ->data_offset that was set before 871 * reshape_progress was updated. 872 */ 873 smp_rmb(); 874 if (progress == MaxSector) 875 return 0; 876 if (sh->generation == conf->generation - 1) 877 return 0; 878 /* We are in a reshape, and this is a new-generation stripe, 879 * so use new_data_offset. 880 */ 881 return 1; 882 } 883 884 static void dispatch_bio_list(struct bio_list *tmp) 885 { 886 struct bio *bio; 887 888 while ((bio = bio_list_pop(tmp))) 889 generic_make_request(bio); 890 } 891 892 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) 893 { 894 const struct r5pending_data *da = list_entry(a, 895 struct r5pending_data, sibling); 896 const struct r5pending_data *db = list_entry(b, 897 struct r5pending_data, sibling); 898 if (da->sector > db->sector) 899 return 1; 900 if (da->sector < db->sector) 901 return -1; 902 return 0; 903 } 904 905 static void dispatch_defer_bios(struct r5conf *conf, int target, 906 struct bio_list *list) 907 { 908 struct r5pending_data *data; 909 struct list_head *first, *next = NULL; 910 int cnt = 0; 911 912 if (conf->pending_data_cnt == 0) 913 return; 914 915 list_sort(NULL, &conf->pending_list, cmp_stripe); 916 917 first = conf->pending_list.next; 918 919 /* temporarily move the head */ 920 if (conf->next_pending_data) 921 list_move_tail(&conf->pending_list, 922 &conf->next_pending_data->sibling); 923 924 while (!list_empty(&conf->pending_list)) { 925 data = list_first_entry(&conf->pending_list, 926 struct r5pending_data, sibling); 927 if (&data->sibling == first) 928 first = data->sibling.next; 929 next = data->sibling.next; 930 931 bio_list_merge(list, &data->bios); 932 list_move(&data->sibling, &conf->free_list); 933 cnt++; 934 if (cnt >= target) 935 break; 936 } 937 conf->pending_data_cnt -= cnt; 938 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 939 940 if (next != &conf->pending_list) 941 conf->next_pending_data = list_entry(next, 942 struct r5pending_data, sibling); 943 else 944 conf->next_pending_data = NULL; 945 /* list isn't empty */ 946 if (first != &conf->pending_list) 947 list_move_tail(&conf->pending_list, first); 948 } 949 950 static void flush_deferred_bios(struct r5conf *conf) 951 { 952 struct bio_list tmp = BIO_EMPTY_LIST; 953 954 if (conf->pending_data_cnt == 0) 955 return; 956 957 spin_lock(&conf->pending_bios_lock); 958 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 959 BUG_ON(conf->pending_data_cnt != 0); 960 spin_unlock(&conf->pending_bios_lock); 961 962 dispatch_bio_list(&tmp); 963 } 964 965 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 966 struct bio_list *bios) 967 { 968 struct bio_list tmp = BIO_EMPTY_LIST; 969 struct r5pending_data *ent; 970 971 spin_lock(&conf->pending_bios_lock); 972 ent = list_first_entry(&conf->free_list, struct r5pending_data, 973 sibling); 974 list_move_tail(&ent->sibling, &conf->pending_list); 975 ent->sector = sector; 976 bio_list_init(&ent->bios); 977 bio_list_merge(&ent->bios, bios); 978 conf->pending_data_cnt++; 979 if (conf->pending_data_cnt >= PENDING_IO_MAX) 980 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); 981 982 spin_unlock(&conf->pending_bios_lock); 983 984 dispatch_bio_list(&tmp); 985 } 986 987 static void 988 raid5_end_read_request(struct bio *bi); 989 static void 990 raid5_end_write_request(struct bio *bi); 991 992 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 993 { 994 struct r5conf *conf = sh->raid_conf; 995 int i, disks = sh->disks; 996 struct stripe_head *head_sh = sh; 997 struct bio_list pending_bios = BIO_EMPTY_LIST; 998 bool should_defer; 999 1000 might_sleep(); 1001 1002 if (log_stripe(sh, s) == 0) 1003 return; 1004 1005 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 1006 1007 for (i = disks; i--; ) { 1008 int op, op_flags = 0; 1009 int replace_only = 0; 1010 struct bio *bi, *rbi; 1011 struct md_rdev *rdev, *rrdev = NULL; 1012 1013 sh = head_sh; 1014 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1015 op = REQ_OP_WRITE; 1016 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1017 op_flags = REQ_FUA; 1018 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1019 op = REQ_OP_DISCARD; 1020 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1021 op = REQ_OP_READ; 1022 else if (test_and_clear_bit(R5_WantReplace, 1023 &sh->dev[i].flags)) { 1024 op = REQ_OP_WRITE; 1025 replace_only = 1; 1026 } else 1027 continue; 1028 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1029 op_flags |= REQ_SYNC; 1030 1031 again: 1032 bi = &sh->dev[i].req; 1033 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 1034 1035 rcu_read_lock(); 1036 rrdev = rcu_dereference(conf->disks[i].replacement); 1037 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 1038 rdev = rcu_dereference(conf->disks[i].rdev); 1039 if (!rdev) { 1040 rdev = rrdev; 1041 rrdev = NULL; 1042 } 1043 if (op_is_write(op)) { 1044 if (replace_only) 1045 rdev = NULL; 1046 if (rdev == rrdev) 1047 /* We raced and saw duplicates */ 1048 rrdev = NULL; 1049 } else { 1050 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1051 rdev = rrdev; 1052 rrdev = NULL; 1053 } 1054 1055 if (rdev && test_bit(Faulty, &rdev->flags)) 1056 rdev = NULL; 1057 if (rdev) 1058 atomic_inc(&rdev->nr_pending); 1059 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1060 rrdev = NULL; 1061 if (rrdev) 1062 atomic_inc(&rrdev->nr_pending); 1063 rcu_read_unlock(); 1064 1065 /* We have already checked bad blocks for reads. Now 1066 * need to check for writes. We never accept write errors 1067 * on the replacement, so we don't to check rrdev. 1068 */ 1069 while (op_is_write(op) && rdev && 1070 test_bit(WriteErrorSeen, &rdev->flags)) { 1071 sector_t first_bad; 1072 int bad_sectors; 1073 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 1074 &first_bad, &bad_sectors); 1075 if (!bad) 1076 break; 1077 1078 if (bad < 0) { 1079 set_bit(BlockedBadBlocks, &rdev->flags); 1080 if (!conf->mddev->external && 1081 conf->mddev->sb_flags) { 1082 /* It is very unlikely, but we might 1083 * still need to write out the 1084 * bad block log - better give it 1085 * a chance*/ 1086 md_check_recovery(conf->mddev); 1087 } 1088 /* 1089 * Because md_wait_for_blocked_rdev 1090 * will dec nr_pending, we must 1091 * increment it first. 1092 */ 1093 atomic_inc(&rdev->nr_pending); 1094 md_wait_for_blocked_rdev(rdev, conf->mddev); 1095 } else { 1096 /* Acknowledged bad block - skip the write */ 1097 rdev_dec_pending(rdev, conf->mddev); 1098 rdev = NULL; 1099 } 1100 } 1101 1102 if (rdev) { 1103 if (s->syncing || s->expanding || s->expanded 1104 || s->replacing) 1105 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1106 1107 set_bit(STRIPE_IO_STARTED, &sh->state); 1108 1109 bi->bi_bdev = rdev->bdev; 1110 bio_set_op_attrs(bi, op, op_flags); 1111 bi->bi_end_io = op_is_write(op) 1112 ? raid5_end_write_request 1113 : raid5_end_read_request; 1114 bi->bi_private = sh; 1115 1116 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1117 __func__, (unsigned long long)sh->sector, 1118 bi->bi_opf, i); 1119 atomic_inc(&sh->count); 1120 if (sh != head_sh) 1121 atomic_inc(&head_sh->count); 1122 if (use_new_offset(conf, sh)) 1123 bi->bi_iter.bi_sector = (sh->sector 1124 + rdev->new_data_offset); 1125 else 1126 bi->bi_iter.bi_sector = (sh->sector 1127 + rdev->data_offset); 1128 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1129 bi->bi_opf |= REQ_NOMERGE; 1130 1131 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1132 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1133 1134 if (!op_is_write(op) && 1135 test_bit(R5_InJournal, &sh->dev[i].flags)) 1136 /* 1137 * issuing read for a page in journal, this 1138 * must be preparing for prexor in rmw; read 1139 * the data into orig_page 1140 */ 1141 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1142 else 1143 sh->dev[i].vec.bv_page = sh->dev[i].page; 1144 bi->bi_vcnt = 1; 1145 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1146 bi->bi_io_vec[0].bv_offset = 0; 1147 bi->bi_iter.bi_size = STRIPE_SIZE; 1148 /* 1149 * If this is discard request, set bi_vcnt 0. We don't 1150 * want to confuse SCSI because SCSI will replace payload 1151 */ 1152 if (op == REQ_OP_DISCARD) 1153 bi->bi_vcnt = 0; 1154 if (rrdev) 1155 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1156 1157 if (conf->mddev->gendisk) 1158 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 1159 bi, disk_devt(conf->mddev->gendisk), 1160 sh->dev[i].sector); 1161 if (should_defer && op_is_write(op)) 1162 bio_list_add(&pending_bios, bi); 1163 else 1164 generic_make_request(bi); 1165 } 1166 if (rrdev) { 1167 if (s->syncing || s->expanding || s->expanded 1168 || s->replacing) 1169 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1170 1171 set_bit(STRIPE_IO_STARTED, &sh->state); 1172 1173 rbi->bi_bdev = rrdev->bdev; 1174 bio_set_op_attrs(rbi, op, op_flags); 1175 BUG_ON(!op_is_write(op)); 1176 rbi->bi_end_io = raid5_end_write_request; 1177 rbi->bi_private = sh; 1178 1179 pr_debug("%s: for %llu schedule op %d on " 1180 "replacement disc %d\n", 1181 __func__, (unsigned long long)sh->sector, 1182 rbi->bi_opf, i); 1183 atomic_inc(&sh->count); 1184 if (sh != head_sh) 1185 atomic_inc(&head_sh->count); 1186 if (use_new_offset(conf, sh)) 1187 rbi->bi_iter.bi_sector = (sh->sector 1188 + rrdev->new_data_offset); 1189 else 1190 rbi->bi_iter.bi_sector = (sh->sector 1191 + rrdev->data_offset); 1192 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1193 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1194 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1195 rbi->bi_vcnt = 1; 1196 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1197 rbi->bi_io_vec[0].bv_offset = 0; 1198 rbi->bi_iter.bi_size = STRIPE_SIZE; 1199 /* 1200 * If this is discard request, set bi_vcnt 0. We don't 1201 * want to confuse SCSI because SCSI will replace payload 1202 */ 1203 if (op == REQ_OP_DISCARD) 1204 rbi->bi_vcnt = 0; 1205 if (conf->mddev->gendisk) 1206 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 1207 rbi, disk_devt(conf->mddev->gendisk), 1208 sh->dev[i].sector); 1209 if (should_defer && op_is_write(op)) 1210 bio_list_add(&pending_bios, rbi); 1211 else 1212 generic_make_request(rbi); 1213 } 1214 if (!rdev && !rrdev) { 1215 if (op_is_write(op)) 1216 set_bit(STRIPE_DEGRADED, &sh->state); 1217 pr_debug("skip op %d on disc %d for sector %llu\n", 1218 bi->bi_opf, i, (unsigned long long)sh->sector); 1219 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1220 set_bit(STRIPE_HANDLE, &sh->state); 1221 } 1222 1223 if (!head_sh->batch_head) 1224 continue; 1225 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1226 batch_list); 1227 if (sh != head_sh) 1228 goto again; 1229 } 1230 1231 if (should_defer && !bio_list_empty(&pending_bios)) 1232 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1233 } 1234 1235 static struct dma_async_tx_descriptor * 1236 async_copy_data(int frombio, struct bio *bio, struct page **page, 1237 sector_t sector, struct dma_async_tx_descriptor *tx, 1238 struct stripe_head *sh, int no_skipcopy) 1239 { 1240 struct bio_vec bvl; 1241 struct bvec_iter iter; 1242 struct page *bio_page; 1243 int page_offset; 1244 struct async_submit_ctl submit; 1245 enum async_tx_flags flags = 0; 1246 1247 if (bio->bi_iter.bi_sector >= sector) 1248 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1249 else 1250 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1251 1252 if (frombio) 1253 flags |= ASYNC_TX_FENCE; 1254 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1255 1256 bio_for_each_segment(bvl, bio, iter) { 1257 int len = bvl.bv_len; 1258 int clen; 1259 int b_offset = 0; 1260 1261 if (page_offset < 0) { 1262 b_offset = -page_offset; 1263 page_offset += b_offset; 1264 len -= b_offset; 1265 } 1266 1267 if (len > 0 && page_offset + len > STRIPE_SIZE) 1268 clen = STRIPE_SIZE - page_offset; 1269 else 1270 clen = len; 1271 1272 if (clen > 0) { 1273 b_offset += bvl.bv_offset; 1274 bio_page = bvl.bv_page; 1275 if (frombio) { 1276 if (sh->raid_conf->skip_copy && 1277 b_offset == 0 && page_offset == 0 && 1278 clen == STRIPE_SIZE && 1279 !no_skipcopy) 1280 *page = bio_page; 1281 else 1282 tx = async_memcpy(*page, bio_page, page_offset, 1283 b_offset, clen, &submit); 1284 } else 1285 tx = async_memcpy(bio_page, *page, b_offset, 1286 page_offset, clen, &submit); 1287 } 1288 /* chain the operations */ 1289 submit.depend_tx = tx; 1290 1291 if (clen < len) /* hit end of page */ 1292 break; 1293 page_offset += len; 1294 } 1295 1296 return tx; 1297 } 1298 1299 static void ops_complete_biofill(void *stripe_head_ref) 1300 { 1301 struct stripe_head *sh = stripe_head_ref; 1302 int i; 1303 1304 pr_debug("%s: stripe %llu\n", __func__, 1305 (unsigned long long)sh->sector); 1306 1307 /* clear completed biofills */ 1308 for (i = sh->disks; i--; ) { 1309 struct r5dev *dev = &sh->dev[i]; 1310 1311 /* acknowledge completion of a biofill operation */ 1312 /* and check if we need to reply to a read request, 1313 * new R5_Wantfill requests are held off until 1314 * !STRIPE_BIOFILL_RUN 1315 */ 1316 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1317 struct bio *rbi, *rbi2; 1318 1319 BUG_ON(!dev->read); 1320 rbi = dev->read; 1321 dev->read = NULL; 1322 while (rbi && rbi->bi_iter.bi_sector < 1323 dev->sector + STRIPE_SECTORS) { 1324 rbi2 = r5_next_bio(rbi, dev->sector); 1325 bio_endio(rbi); 1326 rbi = rbi2; 1327 } 1328 } 1329 } 1330 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1331 1332 set_bit(STRIPE_HANDLE, &sh->state); 1333 raid5_release_stripe(sh); 1334 } 1335 1336 static void ops_run_biofill(struct stripe_head *sh) 1337 { 1338 struct dma_async_tx_descriptor *tx = NULL; 1339 struct async_submit_ctl submit; 1340 int i; 1341 1342 BUG_ON(sh->batch_head); 1343 pr_debug("%s: stripe %llu\n", __func__, 1344 (unsigned long long)sh->sector); 1345 1346 for (i = sh->disks; i--; ) { 1347 struct r5dev *dev = &sh->dev[i]; 1348 if (test_bit(R5_Wantfill, &dev->flags)) { 1349 struct bio *rbi; 1350 spin_lock_irq(&sh->stripe_lock); 1351 dev->read = rbi = dev->toread; 1352 dev->toread = NULL; 1353 spin_unlock_irq(&sh->stripe_lock); 1354 while (rbi && rbi->bi_iter.bi_sector < 1355 dev->sector + STRIPE_SECTORS) { 1356 tx = async_copy_data(0, rbi, &dev->page, 1357 dev->sector, tx, sh, 0); 1358 rbi = r5_next_bio(rbi, dev->sector); 1359 } 1360 } 1361 } 1362 1363 atomic_inc(&sh->count); 1364 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1365 async_trigger_callback(&submit); 1366 } 1367 1368 static void mark_target_uptodate(struct stripe_head *sh, int target) 1369 { 1370 struct r5dev *tgt; 1371 1372 if (target < 0) 1373 return; 1374 1375 tgt = &sh->dev[target]; 1376 set_bit(R5_UPTODATE, &tgt->flags); 1377 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1378 clear_bit(R5_Wantcompute, &tgt->flags); 1379 } 1380 1381 static void ops_complete_compute(void *stripe_head_ref) 1382 { 1383 struct stripe_head *sh = stripe_head_ref; 1384 1385 pr_debug("%s: stripe %llu\n", __func__, 1386 (unsigned long long)sh->sector); 1387 1388 /* mark the computed target(s) as uptodate */ 1389 mark_target_uptodate(sh, sh->ops.target); 1390 mark_target_uptodate(sh, sh->ops.target2); 1391 1392 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1393 if (sh->check_state == check_state_compute_run) 1394 sh->check_state = check_state_compute_result; 1395 set_bit(STRIPE_HANDLE, &sh->state); 1396 raid5_release_stripe(sh); 1397 } 1398 1399 /* return a pointer to the address conversion region of the scribble buffer */ 1400 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1401 struct raid5_percpu *percpu, int i) 1402 { 1403 void *addr; 1404 1405 addr = flex_array_get(percpu->scribble, i); 1406 return addr + sizeof(struct page *) * (sh->disks + 2); 1407 } 1408 1409 /* return a pointer to the address conversion region of the scribble buffer */ 1410 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1411 { 1412 void *addr; 1413 1414 addr = flex_array_get(percpu->scribble, i); 1415 return addr; 1416 } 1417 1418 static struct dma_async_tx_descriptor * 1419 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1420 { 1421 int disks = sh->disks; 1422 struct page **xor_srcs = to_addr_page(percpu, 0); 1423 int target = sh->ops.target; 1424 struct r5dev *tgt = &sh->dev[target]; 1425 struct page *xor_dest = tgt->page; 1426 int count = 0; 1427 struct dma_async_tx_descriptor *tx; 1428 struct async_submit_ctl submit; 1429 int i; 1430 1431 BUG_ON(sh->batch_head); 1432 1433 pr_debug("%s: stripe %llu block: %d\n", 1434 __func__, (unsigned long long)sh->sector, target); 1435 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1436 1437 for (i = disks; i--; ) 1438 if (i != target) 1439 xor_srcs[count++] = sh->dev[i].page; 1440 1441 atomic_inc(&sh->count); 1442 1443 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1444 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1445 if (unlikely(count == 1)) 1446 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1447 else 1448 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1449 1450 return tx; 1451 } 1452 1453 /* set_syndrome_sources - populate source buffers for gen_syndrome 1454 * @srcs - (struct page *) array of size sh->disks 1455 * @sh - stripe_head to parse 1456 * 1457 * Populates srcs in proper layout order for the stripe and returns the 1458 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1459 * destination buffer is recorded in srcs[count] and the Q destination 1460 * is recorded in srcs[count+1]]. 1461 */ 1462 static int set_syndrome_sources(struct page **srcs, 1463 struct stripe_head *sh, 1464 int srctype) 1465 { 1466 int disks = sh->disks; 1467 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1468 int d0_idx = raid6_d0(sh); 1469 int count; 1470 int i; 1471 1472 for (i = 0; i < disks; i++) 1473 srcs[i] = NULL; 1474 1475 count = 0; 1476 i = d0_idx; 1477 do { 1478 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1479 struct r5dev *dev = &sh->dev[i]; 1480 1481 if (i == sh->qd_idx || i == sh->pd_idx || 1482 (srctype == SYNDROME_SRC_ALL) || 1483 (srctype == SYNDROME_SRC_WANT_DRAIN && 1484 (test_bit(R5_Wantdrain, &dev->flags) || 1485 test_bit(R5_InJournal, &dev->flags))) || 1486 (srctype == SYNDROME_SRC_WRITTEN && 1487 (dev->written || 1488 test_bit(R5_InJournal, &dev->flags)))) { 1489 if (test_bit(R5_InJournal, &dev->flags)) 1490 srcs[slot] = sh->dev[i].orig_page; 1491 else 1492 srcs[slot] = sh->dev[i].page; 1493 } 1494 i = raid6_next_disk(i, disks); 1495 } while (i != d0_idx); 1496 1497 return syndrome_disks; 1498 } 1499 1500 static struct dma_async_tx_descriptor * 1501 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1502 { 1503 int disks = sh->disks; 1504 struct page **blocks = to_addr_page(percpu, 0); 1505 int target; 1506 int qd_idx = sh->qd_idx; 1507 struct dma_async_tx_descriptor *tx; 1508 struct async_submit_ctl submit; 1509 struct r5dev *tgt; 1510 struct page *dest; 1511 int i; 1512 int count; 1513 1514 BUG_ON(sh->batch_head); 1515 if (sh->ops.target < 0) 1516 target = sh->ops.target2; 1517 else if (sh->ops.target2 < 0) 1518 target = sh->ops.target; 1519 else 1520 /* we should only have one valid target */ 1521 BUG(); 1522 BUG_ON(target < 0); 1523 pr_debug("%s: stripe %llu block: %d\n", 1524 __func__, (unsigned long long)sh->sector, target); 1525 1526 tgt = &sh->dev[target]; 1527 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1528 dest = tgt->page; 1529 1530 atomic_inc(&sh->count); 1531 1532 if (target == qd_idx) { 1533 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1534 blocks[count] = NULL; /* regenerating p is not necessary */ 1535 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1536 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1537 ops_complete_compute, sh, 1538 to_addr_conv(sh, percpu, 0)); 1539 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1540 } else { 1541 /* Compute any data- or p-drive using XOR */ 1542 count = 0; 1543 for (i = disks; i-- ; ) { 1544 if (i == target || i == qd_idx) 1545 continue; 1546 blocks[count++] = sh->dev[i].page; 1547 } 1548 1549 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1550 NULL, ops_complete_compute, sh, 1551 to_addr_conv(sh, percpu, 0)); 1552 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1553 } 1554 1555 return tx; 1556 } 1557 1558 static struct dma_async_tx_descriptor * 1559 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1560 { 1561 int i, count, disks = sh->disks; 1562 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1563 int d0_idx = raid6_d0(sh); 1564 int faila = -1, failb = -1; 1565 int target = sh->ops.target; 1566 int target2 = sh->ops.target2; 1567 struct r5dev *tgt = &sh->dev[target]; 1568 struct r5dev *tgt2 = &sh->dev[target2]; 1569 struct dma_async_tx_descriptor *tx; 1570 struct page **blocks = to_addr_page(percpu, 0); 1571 struct async_submit_ctl submit; 1572 1573 BUG_ON(sh->batch_head); 1574 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1575 __func__, (unsigned long long)sh->sector, target, target2); 1576 BUG_ON(target < 0 || target2 < 0); 1577 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1578 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1579 1580 /* we need to open-code set_syndrome_sources to handle the 1581 * slot number conversion for 'faila' and 'failb' 1582 */ 1583 for (i = 0; i < disks ; i++) 1584 blocks[i] = NULL; 1585 count = 0; 1586 i = d0_idx; 1587 do { 1588 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1589 1590 blocks[slot] = sh->dev[i].page; 1591 1592 if (i == target) 1593 faila = slot; 1594 if (i == target2) 1595 failb = slot; 1596 i = raid6_next_disk(i, disks); 1597 } while (i != d0_idx); 1598 1599 BUG_ON(faila == failb); 1600 if (failb < faila) 1601 swap(faila, failb); 1602 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1603 __func__, (unsigned long long)sh->sector, faila, failb); 1604 1605 atomic_inc(&sh->count); 1606 1607 if (failb == syndrome_disks+1) { 1608 /* Q disk is one of the missing disks */ 1609 if (faila == syndrome_disks) { 1610 /* Missing P+Q, just recompute */ 1611 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1612 ops_complete_compute, sh, 1613 to_addr_conv(sh, percpu, 0)); 1614 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1615 STRIPE_SIZE, &submit); 1616 } else { 1617 struct page *dest; 1618 int data_target; 1619 int qd_idx = sh->qd_idx; 1620 1621 /* Missing D+Q: recompute D from P, then recompute Q */ 1622 if (target == qd_idx) 1623 data_target = target2; 1624 else 1625 data_target = target; 1626 1627 count = 0; 1628 for (i = disks; i-- ; ) { 1629 if (i == data_target || i == qd_idx) 1630 continue; 1631 blocks[count++] = sh->dev[i].page; 1632 } 1633 dest = sh->dev[data_target].page; 1634 init_async_submit(&submit, 1635 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1636 NULL, NULL, NULL, 1637 to_addr_conv(sh, percpu, 0)); 1638 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1639 &submit); 1640 1641 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1642 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1643 ops_complete_compute, sh, 1644 to_addr_conv(sh, percpu, 0)); 1645 return async_gen_syndrome(blocks, 0, count+2, 1646 STRIPE_SIZE, &submit); 1647 } 1648 } else { 1649 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1650 ops_complete_compute, sh, 1651 to_addr_conv(sh, percpu, 0)); 1652 if (failb == syndrome_disks) { 1653 /* We're missing D+P. */ 1654 return async_raid6_datap_recov(syndrome_disks+2, 1655 STRIPE_SIZE, faila, 1656 blocks, &submit); 1657 } else { 1658 /* We're missing D+D. */ 1659 return async_raid6_2data_recov(syndrome_disks+2, 1660 STRIPE_SIZE, faila, failb, 1661 blocks, &submit); 1662 } 1663 } 1664 } 1665 1666 static void ops_complete_prexor(void *stripe_head_ref) 1667 { 1668 struct stripe_head *sh = stripe_head_ref; 1669 1670 pr_debug("%s: stripe %llu\n", __func__, 1671 (unsigned long long)sh->sector); 1672 1673 if (r5c_is_writeback(sh->raid_conf->log)) 1674 /* 1675 * raid5-cache write back uses orig_page during prexor. 1676 * After prexor, it is time to free orig_page 1677 */ 1678 r5c_release_extra_page(sh); 1679 } 1680 1681 static struct dma_async_tx_descriptor * 1682 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1683 struct dma_async_tx_descriptor *tx) 1684 { 1685 int disks = sh->disks; 1686 struct page **xor_srcs = to_addr_page(percpu, 0); 1687 int count = 0, pd_idx = sh->pd_idx, i; 1688 struct async_submit_ctl submit; 1689 1690 /* existing parity data subtracted */ 1691 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1692 1693 BUG_ON(sh->batch_head); 1694 pr_debug("%s: stripe %llu\n", __func__, 1695 (unsigned long long)sh->sector); 1696 1697 for (i = disks; i--; ) { 1698 struct r5dev *dev = &sh->dev[i]; 1699 /* Only process blocks that are known to be uptodate */ 1700 if (test_bit(R5_InJournal, &dev->flags)) 1701 xor_srcs[count++] = dev->orig_page; 1702 else if (test_bit(R5_Wantdrain, &dev->flags)) 1703 xor_srcs[count++] = dev->page; 1704 } 1705 1706 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1707 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1708 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1709 1710 return tx; 1711 } 1712 1713 static struct dma_async_tx_descriptor * 1714 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1715 struct dma_async_tx_descriptor *tx) 1716 { 1717 struct page **blocks = to_addr_page(percpu, 0); 1718 int count; 1719 struct async_submit_ctl submit; 1720 1721 pr_debug("%s: stripe %llu\n", __func__, 1722 (unsigned long long)sh->sector); 1723 1724 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1725 1726 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1727 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1728 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1729 1730 return tx; 1731 } 1732 1733 static struct dma_async_tx_descriptor * 1734 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1735 { 1736 struct r5conf *conf = sh->raid_conf; 1737 int disks = sh->disks; 1738 int i; 1739 struct stripe_head *head_sh = sh; 1740 1741 pr_debug("%s: stripe %llu\n", __func__, 1742 (unsigned long long)sh->sector); 1743 1744 for (i = disks; i--; ) { 1745 struct r5dev *dev; 1746 struct bio *chosen; 1747 1748 sh = head_sh; 1749 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1750 struct bio *wbi; 1751 1752 again: 1753 dev = &sh->dev[i]; 1754 /* 1755 * clear R5_InJournal, so when rewriting a page in 1756 * journal, it is not skipped by r5l_log_stripe() 1757 */ 1758 clear_bit(R5_InJournal, &dev->flags); 1759 spin_lock_irq(&sh->stripe_lock); 1760 chosen = dev->towrite; 1761 dev->towrite = NULL; 1762 sh->overwrite_disks = 0; 1763 BUG_ON(dev->written); 1764 wbi = dev->written = chosen; 1765 spin_unlock_irq(&sh->stripe_lock); 1766 WARN_ON(dev->page != dev->orig_page); 1767 1768 while (wbi && wbi->bi_iter.bi_sector < 1769 dev->sector + STRIPE_SECTORS) { 1770 if (wbi->bi_opf & REQ_FUA) 1771 set_bit(R5_WantFUA, &dev->flags); 1772 if (wbi->bi_opf & REQ_SYNC) 1773 set_bit(R5_SyncIO, &dev->flags); 1774 if (bio_op(wbi) == REQ_OP_DISCARD) 1775 set_bit(R5_Discard, &dev->flags); 1776 else { 1777 tx = async_copy_data(1, wbi, &dev->page, 1778 dev->sector, tx, sh, 1779 r5c_is_writeback(conf->log)); 1780 if (dev->page != dev->orig_page && 1781 !r5c_is_writeback(conf->log)) { 1782 set_bit(R5_SkipCopy, &dev->flags); 1783 clear_bit(R5_UPTODATE, &dev->flags); 1784 clear_bit(R5_OVERWRITE, &dev->flags); 1785 } 1786 } 1787 wbi = r5_next_bio(wbi, dev->sector); 1788 } 1789 1790 if (head_sh->batch_head) { 1791 sh = list_first_entry(&sh->batch_list, 1792 struct stripe_head, 1793 batch_list); 1794 if (sh == head_sh) 1795 continue; 1796 goto again; 1797 } 1798 } 1799 } 1800 1801 return tx; 1802 } 1803 1804 static void ops_complete_reconstruct(void *stripe_head_ref) 1805 { 1806 struct stripe_head *sh = stripe_head_ref; 1807 int disks = sh->disks; 1808 int pd_idx = sh->pd_idx; 1809 int qd_idx = sh->qd_idx; 1810 int i; 1811 bool fua = false, sync = false, discard = false; 1812 1813 pr_debug("%s: stripe %llu\n", __func__, 1814 (unsigned long long)sh->sector); 1815 1816 for (i = disks; i--; ) { 1817 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1818 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1819 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1820 } 1821 1822 for (i = disks; i--; ) { 1823 struct r5dev *dev = &sh->dev[i]; 1824 1825 if (dev->written || i == pd_idx || i == qd_idx) { 1826 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1827 set_bit(R5_UPTODATE, &dev->flags); 1828 if (fua) 1829 set_bit(R5_WantFUA, &dev->flags); 1830 if (sync) 1831 set_bit(R5_SyncIO, &dev->flags); 1832 } 1833 } 1834 1835 if (sh->reconstruct_state == reconstruct_state_drain_run) 1836 sh->reconstruct_state = reconstruct_state_drain_result; 1837 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1838 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1839 else { 1840 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1841 sh->reconstruct_state = reconstruct_state_result; 1842 } 1843 1844 set_bit(STRIPE_HANDLE, &sh->state); 1845 raid5_release_stripe(sh); 1846 } 1847 1848 static void 1849 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1850 struct dma_async_tx_descriptor *tx) 1851 { 1852 int disks = sh->disks; 1853 struct page **xor_srcs; 1854 struct async_submit_ctl submit; 1855 int count, pd_idx = sh->pd_idx, i; 1856 struct page *xor_dest; 1857 int prexor = 0; 1858 unsigned long flags; 1859 int j = 0; 1860 struct stripe_head *head_sh = sh; 1861 int last_stripe; 1862 1863 pr_debug("%s: stripe %llu\n", __func__, 1864 (unsigned long long)sh->sector); 1865 1866 for (i = 0; i < sh->disks; i++) { 1867 if (pd_idx == i) 1868 continue; 1869 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1870 break; 1871 } 1872 if (i >= sh->disks) { 1873 atomic_inc(&sh->count); 1874 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1875 ops_complete_reconstruct(sh); 1876 return; 1877 } 1878 again: 1879 count = 0; 1880 xor_srcs = to_addr_page(percpu, j); 1881 /* check if prexor is active which means only process blocks 1882 * that are part of a read-modify-write (written) 1883 */ 1884 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1885 prexor = 1; 1886 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1887 for (i = disks; i--; ) { 1888 struct r5dev *dev = &sh->dev[i]; 1889 if (head_sh->dev[i].written || 1890 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1891 xor_srcs[count++] = dev->page; 1892 } 1893 } else { 1894 xor_dest = sh->dev[pd_idx].page; 1895 for (i = disks; i--; ) { 1896 struct r5dev *dev = &sh->dev[i]; 1897 if (i != pd_idx) 1898 xor_srcs[count++] = dev->page; 1899 } 1900 } 1901 1902 /* 1/ if we prexor'd then the dest is reused as a source 1903 * 2/ if we did not prexor then we are redoing the parity 1904 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1905 * for the synchronous xor case 1906 */ 1907 last_stripe = !head_sh->batch_head || 1908 list_first_entry(&sh->batch_list, 1909 struct stripe_head, batch_list) == head_sh; 1910 if (last_stripe) { 1911 flags = ASYNC_TX_ACK | 1912 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1913 1914 atomic_inc(&head_sh->count); 1915 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1916 to_addr_conv(sh, percpu, j)); 1917 } else { 1918 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1919 init_async_submit(&submit, flags, tx, NULL, NULL, 1920 to_addr_conv(sh, percpu, j)); 1921 } 1922 1923 if (unlikely(count == 1)) 1924 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1925 else 1926 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1927 if (!last_stripe) { 1928 j++; 1929 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1930 batch_list); 1931 goto again; 1932 } 1933 } 1934 1935 static void 1936 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1937 struct dma_async_tx_descriptor *tx) 1938 { 1939 struct async_submit_ctl submit; 1940 struct page **blocks; 1941 int count, i, j = 0; 1942 struct stripe_head *head_sh = sh; 1943 int last_stripe; 1944 int synflags; 1945 unsigned long txflags; 1946 1947 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1948 1949 for (i = 0; i < sh->disks; i++) { 1950 if (sh->pd_idx == i || sh->qd_idx == i) 1951 continue; 1952 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1953 break; 1954 } 1955 if (i >= sh->disks) { 1956 atomic_inc(&sh->count); 1957 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1958 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1959 ops_complete_reconstruct(sh); 1960 return; 1961 } 1962 1963 again: 1964 blocks = to_addr_page(percpu, j); 1965 1966 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1967 synflags = SYNDROME_SRC_WRITTEN; 1968 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1969 } else { 1970 synflags = SYNDROME_SRC_ALL; 1971 txflags = ASYNC_TX_ACK; 1972 } 1973 1974 count = set_syndrome_sources(blocks, sh, synflags); 1975 last_stripe = !head_sh->batch_head || 1976 list_first_entry(&sh->batch_list, 1977 struct stripe_head, batch_list) == head_sh; 1978 1979 if (last_stripe) { 1980 atomic_inc(&head_sh->count); 1981 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1982 head_sh, to_addr_conv(sh, percpu, j)); 1983 } else 1984 init_async_submit(&submit, 0, tx, NULL, NULL, 1985 to_addr_conv(sh, percpu, j)); 1986 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1987 if (!last_stripe) { 1988 j++; 1989 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1990 batch_list); 1991 goto again; 1992 } 1993 } 1994 1995 static void ops_complete_check(void *stripe_head_ref) 1996 { 1997 struct stripe_head *sh = stripe_head_ref; 1998 1999 pr_debug("%s: stripe %llu\n", __func__, 2000 (unsigned long long)sh->sector); 2001 2002 sh->check_state = check_state_check_result; 2003 set_bit(STRIPE_HANDLE, &sh->state); 2004 raid5_release_stripe(sh); 2005 } 2006 2007 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 2008 { 2009 int disks = sh->disks; 2010 int pd_idx = sh->pd_idx; 2011 int qd_idx = sh->qd_idx; 2012 struct page *xor_dest; 2013 struct page **xor_srcs = to_addr_page(percpu, 0); 2014 struct dma_async_tx_descriptor *tx; 2015 struct async_submit_ctl submit; 2016 int count; 2017 int i; 2018 2019 pr_debug("%s: stripe %llu\n", __func__, 2020 (unsigned long long)sh->sector); 2021 2022 BUG_ON(sh->batch_head); 2023 count = 0; 2024 xor_dest = sh->dev[pd_idx].page; 2025 xor_srcs[count++] = xor_dest; 2026 for (i = disks; i--; ) { 2027 if (i == pd_idx || i == qd_idx) 2028 continue; 2029 xor_srcs[count++] = sh->dev[i].page; 2030 } 2031 2032 init_async_submit(&submit, 0, NULL, NULL, NULL, 2033 to_addr_conv(sh, percpu, 0)); 2034 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 2035 &sh->ops.zero_sum_result, &submit); 2036 2037 atomic_inc(&sh->count); 2038 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2039 tx = async_trigger_callback(&submit); 2040 } 2041 2042 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2043 { 2044 struct page **srcs = to_addr_page(percpu, 0); 2045 struct async_submit_ctl submit; 2046 int count; 2047 2048 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2049 (unsigned long long)sh->sector, checkp); 2050 2051 BUG_ON(sh->batch_head); 2052 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 2053 if (!checkp) 2054 srcs[count] = NULL; 2055 2056 atomic_inc(&sh->count); 2057 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2058 sh, to_addr_conv(sh, percpu, 0)); 2059 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 2060 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 2061 } 2062 2063 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2064 { 2065 int overlap_clear = 0, i, disks = sh->disks; 2066 struct dma_async_tx_descriptor *tx = NULL; 2067 struct r5conf *conf = sh->raid_conf; 2068 int level = conf->level; 2069 struct raid5_percpu *percpu; 2070 unsigned long cpu; 2071 2072 cpu = get_cpu(); 2073 percpu = per_cpu_ptr(conf->percpu, cpu); 2074 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2075 ops_run_biofill(sh); 2076 overlap_clear++; 2077 } 2078 2079 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2080 if (level < 6) 2081 tx = ops_run_compute5(sh, percpu); 2082 else { 2083 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2084 tx = ops_run_compute6_1(sh, percpu); 2085 else 2086 tx = ops_run_compute6_2(sh, percpu); 2087 } 2088 /* terminate the chain if reconstruct is not set to be run */ 2089 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2090 async_tx_ack(tx); 2091 } 2092 2093 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2094 tx = ops_run_partial_parity(sh, percpu, tx); 2095 2096 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2097 if (level < 6) 2098 tx = ops_run_prexor5(sh, percpu, tx); 2099 else 2100 tx = ops_run_prexor6(sh, percpu, tx); 2101 } 2102 2103 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2104 tx = ops_run_biodrain(sh, tx); 2105 overlap_clear++; 2106 } 2107 2108 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2109 if (level < 6) 2110 ops_run_reconstruct5(sh, percpu, tx); 2111 else 2112 ops_run_reconstruct6(sh, percpu, tx); 2113 } 2114 2115 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2116 if (sh->check_state == check_state_run) 2117 ops_run_check_p(sh, percpu); 2118 else if (sh->check_state == check_state_run_q) 2119 ops_run_check_pq(sh, percpu, 0); 2120 else if (sh->check_state == check_state_run_pq) 2121 ops_run_check_pq(sh, percpu, 1); 2122 else 2123 BUG(); 2124 } 2125 2126 if (overlap_clear && !sh->batch_head) 2127 for (i = disks; i--; ) { 2128 struct r5dev *dev = &sh->dev[i]; 2129 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2130 wake_up(&sh->raid_conf->wait_for_overlap); 2131 } 2132 put_cpu(); 2133 } 2134 2135 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2136 int disks) 2137 { 2138 struct stripe_head *sh; 2139 int i; 2140 2141 sh = kmem_cache_zalloc(sc, gfp); 2142 if (sh) { 2143 spin_lock_init(&sh->stripe_lock); 2144 spin_lock_init(&sh->batch_lock); 2145 INIT_LIST_HEAD(&sh->batch_list); 2146 INIT_LIST_HEAD(&sh->lru); 2147 INIT_LIST_HEAD(&sh->r5c); 2148 INIT_LIST_HEAD(&sh->log_list); 2149 atomic_set(&sh->count, 1); 2150 sh->log_start = MaxSector; 2151 for (i = 0; i < disks; i++) { 2152 struct r5dev *dev = &sh->dev[i]; 2153 2154 bio_init(&dev->req, &dev->vec, 1); 2155 bio_init(&dev->rreq, &dev->rvec, 1); 2156 } 2157 } 2158 return sh; 2159 } 2160 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2161 { 2162 struct stripe_head *sh; 2163 2164 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); 2165 if (!sh) 2166 return 0; 2167 2168 sh->raid_conf = conf; 2169 2170 if (grow_buffers(sh, gfp)) { 2171 shrink_buffers(sh); 2172 kmem_cache_free(conf->slab_cache, sh); 2173 return 0; 2174 } 2175 sh->hash_lock_index = 2176 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2177 /* we just created an active stripe so... */ 2178 atomic_inc(&conf->active_stripes); 2179 2180 raid5_release_stripe(sh); 2181 conf->max_nr_stripes++; 2182 return 1; 2183 } 2184 2185 static int grow_stripes(struct r5conf *conf, int num) 2186 { 2187 struct kmem_cache *sc; 2188 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2189 2190 if (conf->mddev->gendisk) 2191 sprintf(conf->cache_name[0], 2192 "raid%d-%s", conf->level, mdname(conf->mddev)); 2193 else 2194 sprintf(conf->cache_name[0], 2195 "raid%d-%p", conf->level, conf->mddev); 2196 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 2197 2198 conf->active_name = 0; 2199 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2200 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2201 0, 0, NULL); 2202 if (!sc) 2203 return 1; 2204 conf->slab_cache = sc; 2205 conf->pool_size = devs; 2206 while (num--) 2207 if (!grow_one_stripe(conf, GFP_KERNEL)) 2208 return 1; 2209 2210 return 0; 2211 } 2212 2213 /** 2214 * scribble_len - return the required size of the scribble region 2215 * @num - total number of disks in the array 2216 * 2217 * The size must be enough to contain: 2218 * 1/ a struct page pointer for each device in the array +2 2219 * 2/ room to convert each entry in (1) to its corresponding dma 2220 * (dma_map_page()) or page (page_address()) address. 2221 * 2222 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2223 * calculate over all devices (not just the data blocks), using zeros in place 2224 * of the P and Q blocks. 2225 */ 2226 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2227 { 2228 struct flex_array *ret; 2229 size_t len; 2230 2231 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2232 ret = flex_array_alloc(len, cnt, flags); 2233 if (!ret) 2234 return NULL; 2235 /* always prealloc all elements, so no locking is required */ 2236 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2237 flex_array_free(ret); 2238 return NULL; 2239 } 2240 return ret; 2241 } 2242 2243 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2244 { 2245 unsigned long cpu; 2246 int err = 0; 2247 2248 /* 2249 * Never shrink. And mddev_suspend() could deadlock if this is called 2250 * from raid5d. In that case, scribble_disks and scribble_sectors 2251 * should equal to new_disks and new_sectors 2252 */ 2253 if (conf->scribble_disks >= new_disks && 2254 conf->scribble_sectors >= new_sectors) 2255 return 0; 2256 mddev_suspend(conf->mddev); 2257 get_online_cpus(); 2258 for_each_present_cpu(cpu) { 2259 struct raid5_percpu *percpu; 2260 struct flex_array *scribble; 2261 2262 percpu = per_cpu_ptr(conf->percpu, cpu); 2263 scribble = scribble_alloc(new_disks, 2264 new_sectors / STRIPE_SECTORS, 2265 GFP_NOIO); 2266 2267 if (scribble) { 2268 flex_array_free(percpu->scribble); 2269 percpu->scribble = scribble; 2270 } else { 2271 err = -ENOMEM; 2272 break; 2273 } 2274 } 2275 put_online_cpus(); 2276 mddev_resume(conf->mddev); 2277 if (!err) { 2278 conf->scribble_disks = new_disks; 2279 conf->scribble_sectors = new_sectors; 2280 } 2281 return err; 2282 } 2283 2284 static int resize_stripes(struct r5conf *conf, int newsize) 2285 { 2286 /* Make all the stripes able to hold 'newsize' devices. 2287 * New slots in each stripe get 'page' set to a new page. 2288 * 2289 * This happens in stages: 2290 * 1/ create a new kmem_cache and allocate the required number of 2291 * stripe_heads. 2292 * 2/ gather all the old stripe_heads and transfer the pages across 2293 * to the new stripe_heads. This will have the side effect of 2294 * freezing the array as once all stripe_heads have been collected, 2295 * no IO will be possible. Old stripe heads are freed once their 2296 * pages have been transferred over, and the old kmem_cache is 2297 * freed when all stripes are done. 2298 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 2299 * we simple return a failure status - no need to clean anything up. 2300 * 4/ allocate new pages for the new slots in the new stripe_heads. 2301 * If this fails, we don't bother trying the shrink the 2302 * stripe_heads down again, we just leave them as they are. 2303 * As each stripe_head is processed the new one is released into 2304 * active service. 2305 * 2306 * Once step2 is started, we cannot afford to wait for a write, 2307 * so we use GFP_NOIO allocations. 2308 */ 2309 struct stripe_head *osh, *nsh; 2310 LIST_HEAD(newstripes); 2311 struct disk_info *ndisks; 2312 int err; 2313 struct kmem_cache *sc; 2314 int i; 2315 int hash, cnt; 2316 2317 if (newsize <= conf->pool_size) 2318 return 0; /* never bother to shrink */ 2319 2320 err = md_allow_write(conf->mddev); 2321 if (err) 2322 return err; 2323 2324 /* Step 1 */ 2325 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2326 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2327 0, 0, NULL); 2328 if (!sc) 2329 return -ENOMEM; 2330 2331 /* Need to ensure auto-resizing doesn't interfere */ 2332 mutex_lock(&conf->cache_size_mutex); 2333 2334 for (i = conf->max_nr_stripes; i; i--) { 2335 nsh = alloc_stripe(sc, GFP_KERNEL, newsize); 2336 if (!nsh) 2337 break; 2338 2339 nsh->raid_conf = conf; 2340 list_add(&nsh->lru, &newstripes); 2341 } 2342 if (i) { 2343 /* didn't get enough, give up */ 2344 while (!list_empty(&newstripes)) { 2345 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2346 list_del(&nsh->lru); 2347 kmem_cache_free(sc, nsh); 2348 } 2349 kmem_cache_destroy(sc); 2350 mutex_unlock(&conf->cache_size_mutex); 2351 return -ENOMEM; 2352 } 2353 /* Step 2 - Must use GFP_NOIO now. 2354 * OK, we have enough stripes, start collecting inactive 2355 * stripes and copying them over 2356 */ 2357 hash = 0; 2358 cnt = 0; 2359 list_for_each_entry(nsh, &newstripes, lru) { 2360 lock_device_hash_lock(conf, hash); 2361 wait_event_cmd(conf->wait_for_stripe, 2362 !list_empty(conf->inactive_list + hash), 2363 unlock_device_hash_lock(conf, hash), 2364 lock_device_hash_lock(conf, hash)); 2365 osh = get_free_stripe(conf, hash); 2366 unlock_device_hash_lock(conf, hash); 2367 2368 for(i=0; i<conf->pool_size; i++) { 2369 nsh->dev[i].page = osh->dev[i].page; 2370 nsh->dev[i].orig_page = osh->dev[i].page; 2371 } 2372 nsh->hash_lock_index = hash; 2373 kmem_cache_free(conf->slab_cache, osh); 2374 cnt++; 2375 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2376 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2377 hash++; 2378 cnt = 0; 2379 } 2380 } 2381 kmem_cache_destroy(conf->slab_cache); 2382 2383 /* Step 3. 2384 * At this point, we are holding all the stripes so the array 2385 * is completely stalled, so now is a good time to resize 2386 * conf->disks and the scribble region 2387 */ 2388 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2389 if (ndisks) { 2390 for (i = 0; i < conf->pool_size; i++) 2391 ndisks[i] = conf->disks[i]; 2392 2393 for (i = conf->pool_size; i < newsize; i++) { 2394 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2395 if (!ndisks[i].extra_page) 2396 err = -ENOMEM; 2397 } 2398 2399 if (err) { 2400 for (i = conf->pool_size; i < newsize; i++) 2401 if (ndisks[i].extra_page) 2402 put_page(ndisks[i].extra_page); 2403 kfree(ndisks); 2404 } else { 2405 kfree(conf->disks); 2406 conf->disks = ndisks; 2407 } 2408 } else 2409 err = -ENOMEM; 2410 2411 mutex_unlock(&conf->cache_size_mutex); 2412 2413 conf->slab_cache = sc; 2414 conf->active_name = 1-conf->active_name; 2415 2416 /* Step 4, return new stripes to service */ 2417 while(!list_empty(&newstripes)) { 2418 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2419 list_del_init(&nsh->lru); 2420 2421 for (i=conf->raid_disks; i < newsize; i++) 2422 if (nsh->dev[i].page == NULL) { 2423 struct page *p = alloc_page(GFP_NOIO); 2424 nsh->dev[i].page = p; 2425 nsh->dev[i].orig_page = p; 2426 if (!p) 2427 err = -ENOMEM; 2428 } 2429 raid5_release_stripe(nsh); 2430 } 2431 /* critical section pass, GFP_NOIO no longer needed */ 2432 2433 if (!err) 2434 conf->pool_size = newsize; 2435 return err; 2436 } 2437 2438 static int drop_one_stripe(struct r5conf *conf) 2439 { 2440 struct stripe_head *sh; 2441 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2442 2443 spin_lock_irq(conf->hash_locks + hash); 2444 sh = get_free_stripe(conf, hash); 2445 spin_unlock_irq(conf->hash_locks + hash); 2446 if (!sh) 2447 return 0; 2448 BUG_ON(atomic_read(&sh->count)); 2449 shrink_buffers(sh); 2450 kmem_cache_free(conf->slab_cache, sh); 2451 atomic_dec(&conf->active_stripes); 2452 conf->max_nr_stripes--; 2453 return 1; 2454 } 2455 2456 static void shrink_stripes(struct r5conf *conf) 2457 { 2458 while (conf->max_nr_stripes && 2459 drop_one_stripe(conf)) 2460 ; 2461 2462 kmem_cache_destroy(conf->slab_cache); 2463 conf->slab_cache = NULL; 2464 } 2465 2466 static void raid5_end_read_request(struct bio * bi) 2467 { 2468 struct stripe_head *sh = bi->bi_private; 2469 struct r5conf *conf = sh->raid_conf; 2470 int disks = sh->disks, i; 2471 char b[BDEVNAME_SIZE]; 2472 struct md_rdev *rdev = NULL; 2473 sector_t s; 2474 2475 for (i=0 ; i<disks; i++) 2476 if (bi == &sh->dev[i].req) 2477 break; 2478 2479 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2480 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2481 bi->bi_error); 2482 if (i == disks) { 2483 bio_reset(bi); 2484 BUG(); 2485 return; 2486 } 2487 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2488 /* If replacement finished while this request was outstanding, 2489 * 'replacement' might be NULL already. 2490 * In that case it moved down to 'rdev'. 2491 * rdev is not removed until all requests are finished. 2492 */ 2493 rdev = conf->disks[i].replacement; 2494 if (!rdev) 2495 rdev = conf->disks[i].rdev; 2496 2497 if (use_new_offset(conf, sh)) 2498 s = sh->sector + rdev->new_data_offset; 2499 else 2500 s = sh->sector + rdev->data_offset; 2501 if (!bi->bi_error) { 2502 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2503 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2504 /* Note that this cannot happen on a 2505 * replacement device. We just fail those on 2506 * any error 2507 */ 2508 pr_info_ratelimited( 2509 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2510 mdname(conf->mddev), STRIPE_SECTORS, 2511 (unsigned long long)s, 2512 bdevname(rdev->bdev, b)); 2513 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2514 clear_bit(R5_ReadError, &sh->dev[i].flags); 2515 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2516 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2517 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2518 2519 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2520 /* 2521 * end read for a page in journal, this 2522 * must be preparing for prexor in rmw 2523 */ 2524 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2525 2526 if (atomic_read(&rdev->read_errors)) 2527 atomic_set(&rdev->read_errors, 0); 2528 } else { 2529 const char *bdn = bdevname(rdev->bdev, b); 2530 int retry = 0; 2531 int set_bad = 0; 2532 2533 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2534 atomic_inc(&rdev->read_errors); 2535 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2536 pr_warn_ratelimited( 2537 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2538 mdname(conf->mddev), 2539 (unsigned long long)s, 2540 bdn); 2541 else if (conf->mddev->degraded >= conf->max_degraded) { 2542 set_bad = 1; 2543 pr_warn_ratelimited( 2544 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2545 mdname(conf->mddev), 2546 (unsigned long long)s, 2547 bdn); 2548 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2549 /* Oh, no!!! */ 2550 set_bad = 1; 2551 pr_warn_ratelimited( 2552 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", 2553 mdname(conf->mddev), 2554 (unsigned long long)s, 2555 bdn); 2556 } else if (atomic_read(&rdev->read_errors) 2557 > conf->max_nr_stripes) 2558 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2559 mdname(conf->mddev), bdn); 2560 else 2561 retry = 1; 2562 if (set_bad && test_bit(In_sync, &rdev->flags) 2563 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2564 retry = 1; 2565 if (retry) 2566 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2567 set_bit(R5_ReadError, &sh->dev[i].flags); 2568 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2569 } else 2570 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2571 else { 2572 clear_bit(R5_ReadError, &sh->dev[i].flags); 2573 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2574 if (!(set_bad 2575 && test_bit(In_sync, &rdev->flags) 2576 && rdev_set_badblocks( 2577 rdev, sh->sector, STRIPE_SECTORS, 0))) 2578 md_error(conf->mddev, rdev); 2579 } 2580 } 2581 rdev_dec_pending(rdev, conf->mddev); 2582 bio_reset(bi); 2583 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2584 set_bit(STRIPE_HANDLE, &sh->state); 2585 raid5_release_stripe(sh); 2586 } 2587 2588 static void raid5_end_write_request(struct bio *bi) 2589 { 2590 struct stripe_head *sh = bi->bi_private; 2591 struct r5conf *conf = sh->raid_conf; 2592 int disks = sh->disks, i; 2593 struct md_rdev *uninitialized_var(rdev); 2594 sector_t first_bad; 2595 int bad_sectors; 2596 int replacement = 0; 2597 2598 for (i = 0 ; i < disks; i++) { 2599 if (bi == &sh->dev[i].req) { 2600 rdev = conf->disks[i].rdev; 2601 break; 2602 } 2603 if (bi == &sh->dev[i].rreq) { 2604 rdev = conf->disks[i].replacement; 2605 if (rdev) 2606 replacement = 1; 2607 else 2608 /* rdev was removed and 'replacement' 2609 * replaced it. rdev is not removed 2610 * until all requests are finished. 2611 */ 2612 rdev = conf->disks[i].rdev; 2613 break; 2614 } 2615 } 2616 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2617 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2618 bi->bi_error); 2619 if (i == disks) { 2620 bio_reset(bi); 2621 BUG(); 2622 return; 2623 } 2624 2625 if (replacement) { 2626 if (bi->bi_error) 2627 md_error(conf->mddev, rdev); 2628 else if (is_badblock(rdev, sh->sector, 2629 STRIPE_SECTORS, 2630 &first_bad, &bad_sectors)) 2631 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2632 } else { 2633 if (bi->bi_error) { 2634 set_bit(STRIPE_DEGRADED, &sh->state); 2635 set_bit(WriteErrorSeen, &rdev->flags); 2636 set_bit(R5_WriteError, &sh->dev[i].flags); 2637 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2638 set_bit(MD_RECOVERY_NEEDED, 2639 &rdev->mddev->recovery); 2640 } else if (is_badblock(rdev, sh->sector, 2641 STRIPE_SECTORS, 2642 &first_bad, &bad_sectors)) { 2643 set_bit(R5_MadeGood, &sh->dev[i].flags); 2644 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2645 /* That was a successful write so make 2646 * sure it looks like we already did 2647 * a re-write. 2648 */ 2649 set_bit(R5_ReWrite, &sh->dev[i].flags); 2650 } 2651 } 2652 rdev_dec_pending(rdev, conf->mddev); 2653 2654 if (sh->batch_head && bi->bi_error && !replacement) 2655 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2656 2657 bio_reset(bi); 2658 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2659 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2660 set_bit(STRIPE_HANDLE, &sh->state); 2661 raid5_release_stripe(sh); 2662 2663 if (sh->batch_head && sh != sh->batch_head) 2664 raid5_release_stripe(sh->batch_head); 2665 } 2666 2667 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2668 { 2669 struct r5dev *dev = &sh->dev[i]; 2670 2671 dev->flags = 0; 2672 dev->sector = raid5_compute_blocknr(sh, i, previous); 2673 } 2674 2675 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2676 { 2677 char b[BDEVNAME_SIZE]; 2678 struct r5conf *conf = mddev->private; 2679 unsigned long flags; 2680 pr_debug("raid456: error called\n"); 2681 2682 spin_lock_irqsave(&conf->device_lock, flags); 2683 clear_bit(In_sync, &rdev->flags); 2684 mddev->degraded = raid5_calc_degraded(conf); 2685 spin_unlock_irqrestore(&conf->device_lock, flags); 2686 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2687 2688 set_bit(Blocked, &rdev->flags); 2689 set_bit(Faulty, &rdev->flags); 2690 set_mask_bits(&mddev->sb_flags, 0, 2691 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2692 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2693 "md/raid:%s: Operation continuing on %d devices.\n", 2694 mdname(mddev), 2695 bdevname(rdev->bdev, b), 2696 mdname(mddev), 2697 conf->raid_disks - mddev->degraded); 2698 r5c_update_on_rdev_error(mddev); 2699 } 2700 2701 /* 2702 * Input: a 'big' sector number, 2703 * Output: index of the data and parity disk, and the sector # in them. 2704 */ 2705 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2706 int previous, int *dd_idx, 2707 struct stripe_head *sh) 2708 { 2709 sector_t stripe, stripe2; 2710 sector_t chunk_number; 2711 unsigned int chunk_offset; 2712 int pd_idx, qd_idx; 2713 int ddf_layout = 0; 2714 sector_t new_sector; 2715 int algorithm = previous ? conf->prev_algo 2716 : conf->algorithm; 2717 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2718 : conf->chunk_sectors; 2719 int raid_disks = previous ? conf->previous_raid_disks 2720 : conf->raid_disks; 2721 int data_disks = raid_disks - conf->max_degraded; 2722 2723 /* First compute the information on this sector */ 2724 2725 /* 2726 * Compute the chunk number and the sector offset inside the chunk 2727 */ 2728 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2729 chunk_number = r_sector; 2730 2731 /* 2732 * Compute the stripe number 2733 */ 2734 stripe = chunk_number; 2735 *dd_idx = sector_div(stripe, data_disks); 2736 stripe2 = stripe; 2737 /* 2738 * Select the parity disk based on the user selected algorithm. 2739 */ 2740 pd_idx = qd_idx = -1; 2741 switch(conf->level) { 2742 case 4: 2743 pd_idx = data_disks; 2744 break; 2745 case 5: 2746 switch (algorithm) { 2747 case ALGORITHM_LEFT_ASYMMETRIC: 2748 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2749 if (*dd_idx >= pd_idx) 2750 (*dd_idx)++; 2751 break; 2752 case ALGORITHM_RIGHT_ASYMMETRIC: 2753 pd_idx = sector_div(stripe2, raid_disks); 2754 if (*dd_idx >= pd_idx) 2755 (*dd_idx)++; 2756 break; 2757 case ALGORITHM_LEFT_SYMMETRIC: 2758 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2759 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2760 break; 2761 case ALGORITHM_RIGHT_SYMMETRIC: 2762 pd_idx = sector_div(stripe2, raid_disks); 2763 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2764 break; 2765 case ALGORITHM_PARITY_0: 2766 pd_idx = 0; 2767 (*dd_idx)++; 2768 break; 2769 case ALGORITHM_PARITY_N: 2770 pd_idx = data_disks; 2771 break; 2772 default: 2773 BUG(); 2774 } 2775 break; 2776 case 6: 2777 2778 switch (algorithm) { 2779 case ALGORITHM_LEFT_ASYMMETRIC: 2780 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2781 qd_idx = pd_idx + 1; 2782 if (pd_idx == raid_disks-1) { 2783 (*dd_idx)++; /* Q D D D P */ 2784 qd_idx = 0; 2785 } else if (*dd_idx >= pd_idx) 2786 (*dd_idx) += 2; /* D D P Q D */ 2787 break; 2788 case ALGORITHM_RIGHT_ASYMMETRIC: 2789 pd_idx = sector_div(stripe2, raid_disks); 2790 qd_idx = pd_idx + 1; 2791 if (pd_idx == raid_disks-1) { 2792 (*dd_idx)++; /* Q D D D P */ 2793 qd_idx = 0; 2794 } else if (*dd_idx >= pd_idx) 2795 (*dd_idx) += 2; /* D D P Q D */ 2796 break; 2797 case ALGORITHM_LEFT_SYMMETRIC: 2798 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2799 qd_idx = (pd_idx + 1) % raid_disks; 2800 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2801 break; 2802 case ALGORITHM_RIGHT_SYMMETRIC: 2803 pd_idx = sector_div(stripe2, raid_disks); 2804 qd_idx = (pd_idx + 1) % raid_disks; 2805 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2806 break; 2807 2808 case ALGORITHM_PARITY_0: 2809 pd_idx = 0; 2810 qd_idx = 1; 2811 (*dd_idx) += 2; 2812 break; 2813 case ALGORITHM_PARITY_N: 2814 pd_idx = data_disks; 2815 qd_idx = data_disks + 1; 2816 break; 2817 2818 case ALGORITHM_ROTATING_ZERO_RESTART: 2819 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2820 * of blocks for computing Q is different. 2821 */ 2822 pd_idx = sector_div(stripe2, raid_disks); 2823 qd_idx = pd_idx + 1; 2824 if (pd_idx == raid_disks-1) { 2825 (*dd_idx)++; /* Q D D D P */ 2826 qd_idx = 0; 2827 } else if (*dd_idx >= pd_idx) 2828 (*dd_idx) += 2; /* D D P Q D */ 2829 ddf_layout = 1; 2830 break; 2831 2832 case ALGORITHM_ROTATING_N_RESTART: 2833 /* Same a left_asymmetric, by first stripe is 2834 * D D D P Q rather than 2835 * Q D D D P 2836 */ 2837 stripe2 += 1; 2838 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2839 qd_idx = pd_idx + 1; 2840 if (pd_idx == raid_disks-1) { 2841 (*dd_idx)++; /* Q D D D P */ 2842 qd_idx = 0; 2843 } else if (*dd_idx >= pd_idx) 2844 (*dd_idx) += 2; /* D D P Q D */ 2845 ddf_layout = 1; 2846 break; 2847 2848 case ALGORITHM_ROTATING_N_CONTINUE: 2849 /* Same as left_symmetric but Q is before P */ 2850 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2851 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2852 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2853 ddf_layout = 1; 2854 break; 2855 2856 case ALGORITHM_LEFT_ASYMMETRIC_6: 2857 /* RAID5 left_asymmetric, with Q on last device */ 2858 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2859 if (*dd_idx >= pd_idx) 2860 (*dd_idx)++; 2861 qd_idx = raid_disks - 1; 2862 break; 2863 2864 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2865 pd_idx = sector_div(stripe2, raid_disks-1); 2866 if (*dd_idx >= pd_idx) 2867 (*dd_idx)++; 2868 qd_idx = raid_disks - 1; 2869 break; 2870 2871 case ALGORITHM_LEFT_SYMMETRIC_6: 2872 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2873 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2874 qd_idx = raid_disks - 1; 2875 break; 2876 2877 case ALGORITHM_RIGHT_SYMMETRIC_6: 2878 pd_idx = sector_div(stripe2, raid_disks-1); 2879 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2880 qd_idx = raid_disks - 1; 2881 break; 2882 2883 case ALGORITHM_PARITY_0_6: 2884 pd_idx = 0; 2885 (*dd_idx)++; 2886 qd_idx = raid_disks - 1; 2887 break; 2888 2889 default: 2890 BUG(); 2891 } 2892 break; 2893 } 2894 2895 if (sh) { 2896 sh->pd_idx = pd_idx; 2897 sh->qd_idx = qd_idx; 2898 sh->ddf_layout = ddf_layout; 2899 } 2900 /* 2901 * Finally, compute the new sector number 2902 */ 2903 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2904 return new_sector; 2905 } 2906 2907 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2908 { 2909 struct r5conf *conf = sh->raid_conf; 2910 int raid_disks = sh->disks; 2911 int data_disks = raid_disks - conf->max_degraded; 2912 sector_t new_sector = sh->sector, check; 2913 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2914 : conf->chunk_sectors; 2915 int algorithm = previous ? conf->prev_algo 2916 : conf->algorithm; 2917 sector_t stripe; 2918 int chunk_offset; 2919 sector_t chunk_number; 2920 int dummy1, dd_idx = i; 2921 sector_t r_sector; 2922 struct stripe_head sh2; 2923 2924 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2925 stripe = new_sector; 2926 2927 if (i == sh->pd_idx) 2928 return 0; 2929 switch(conf->level) { 2930 case 4: break; 2931 case 5: 2932 switch (algorithm) { 2933 case ALGORITHM_LEFT_ASYMMETRIC: 2934 case ALGORITHM_RIGHT_ASYMMETRIC: 2935 if (i > sh->pd_idx) 2936 i--; 2937 break; 2938 case ALGORITHM_LEFT_SYMMETRIC: 2939 case ALGORITHM_RIGHT_SYMMETRIC: 2940 if (i < sh->pd_idx) 2941 i += raid_disks; 2942 i -= (sh->pd_idx + 1); 2943 break; 2944 case ALGORITHM_PARITY_0: 2945 i -= 1; 2946 break; 2947 case ALGORITHM_PARITY_N: 2948 break; 2949 default: 2950 BUG(); 2951 } 2952 break; 2953 case 6: 2954 if (i == sh->qd_idx) 2955 return 0; /* It is the Q disk */ 2956 switch (algorithm) { 2957 case ALGORITHM_LEFT_ASYMMETRIC: 2958 case ALGORITHM_RIGHT_ASYMMETRIC: 2959 case ALGORITHM_ROTATING_ZERO_RESTART: 2960 case ALGORITHM_ROTATING_N_RESTART: 2961 if (sh->pd_idx == raid_disks-1) 2962 i--; /* Q D D D P */ 2963 else if (i > sh->pd_idx) 2964 i -= 2; /* D D P Q D */ 2965 break; 2966 case ALGORITHM_LEFT_SYMMETRIC: 2967 case ALGORITHM_RIGHT_SYMMETRIC: 2968 if (sh->pd_idx == raid_disks-1) 2969 i--; /* Q D D D P */ 2970 else { 2971 /* D D P Q D */ 2972 if (i < sh->pd_idx) 2973 i += raid_disks; 2974 i -= (sh->pd_idx + 2); 2975 } 2976 break; 2977 case ALGORITHM_PARITY_0: 2978 i -= 2; 2979 break; 2980 case ALGORITHM_PARITY_N: 2981 break; 2982 case ALGORITHM_ROTATING_N_CONTINUE: 2983 /* Like left_symmetric, but P is before Q */ 2984 if (sh->pd_idx == 0) 2985 i--; /* P D D D Q */ 2986 else { 2987 /* D D Q P D */ 2988 if (i < sh->pd_idx) 2989 i += raid_disks; 2990 i -= (sh->pd_idx + 1); 2991 } 2992 break; 2993 case ALGORITHM_LEFT_ASYMMETRIC_6: 2994 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2995 if (i > sh->pd_idx) 2996 i--; 2997 break; 2998 case ALGORITHM_LEFT_SYMMETRIC_6: 2999 case ALGORITHM_RIGHT_SYMMETRIC_6: 3000 if (i < sh->pd_idx) 3001 i += data_disks + 1; 3002 i -= (sh->pd_idx + 1); 3003 break; 3004 case ALGORITHM_PARITY_0_6: 3005 i -= 1; 3006 break; 3007 default: 3008 BUG(); 3009 } 3010 break; 3011 } 3012 3013 chunk_number = stripe * data_disks + i; 3014 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3015 3016 check = raid5_compute_sector(conf, r_sector, 3017 previous, &dummy1, &sh2); 3018 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3019 || sh2.qd_idx != sh->qd_idx) { 3020 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3021 mdname(conf->mddev)); 3022 return 0; 3023 } 3024 return r_sector; 3025 } 3026 3027 /* 3028 * There are cases where we want handle_stripe_dirtying() and 3029 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3030 * 3031 * This function checks whether we want to delay the towrite. Specifically, 3032 * we delay the towrite when: 3033 * 3034 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3035 * stripe has data in journal (for other devices). 3036 * 3037 * In this case, when reading data for the non-overwrite dev, it is 3038 * necessary to handle complex rmw of write back cache (prexor with 3039 * orig_page, and xor with page). To keep read path simple, we would 3040 * like to flush data in journal to RAID disks first, so complex rmw 3041 * is handled in the write patch (handle_stripe_dirtying). 3042 * 3043 * 2. when journal space is critical (R5C_LOG_CRITICAL=1) 3044 * 3045 * It is important to be able to flush all stripes in raid5-cache. 3046 * Therefore, we need reserve some space on the journal device for 3047 * these flushes. If flush operation includes pending writes to the 3048 * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe 3049 * for the flush out. If we exclude these pending writes from flush 3050 * operation, we only need (conf->max_degraded + 1) pages per stripe. 3051 * Therefore, excluding pending writes in these cases enables more 3052 * efficient use of the journal device. 3053 * 3054 * Note: To make sure the stripe makes progress, we only delay 3055 * towrite for stripes with data already in journal (injournal > 0). 3056 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 3057 * no_space_stripes list. 3058 * 3059 */ 3060 static inline bool delay_towrite(struct r5conf *conf, 3061 struct r5dev *dev, 3062 struct stripe_head_state *s) 3063 { 3064 /* case 1 above */ 3065 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3066 !test_bit(R5_Insync, &dev->flags) && s->injournal) 3067 return true; 3068 /* case 2 above */ 3069 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 3070 s->injournal > 0) 3071 return true; 3072 return false; 3073 } 3074 3075 static void 3076 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 3077 int rcw, int expand) 3078 { 3079 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 3080 struct r5conf *conf = sh->raid_conf; 3081 int level = conf->level; 3082 3083 if (rcw) { 3084 /* 3085 * In some cases, handle_stripe_dirtying initially decided to 3086 * run rmw and allocates extra page for prexor. However, rcw is 3087 * cheaper later on. We need to free the extra page now, 3088 * because we won't be able to do that in ops_complete_prexor(). 3089 */ 3090 r5c_release_extra_page(sh); 3091 3092 for (i = disks; i--; ) { 3093 struct r5dev *dev = &sh->dev[i]; 3094 3095 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3096 set_bit(R5_LOCKED, &dev->flags); 3097 set_bit(R5_Wantdrain, &dev->flags); 3098 if (!expand) 3099 clear_bit(R5_UPTODATE, &dev->flags); 3100 s->locked++; 3101 } else if (test_bit(R5_InJournal, &dev->flags)) { 3102 set_bit(R5_LOCKED, &dev->flags); 3103 s->locked++; 3104 } 3105 } 3106 /* if we are not expanding this is a proper write request, and 3107 * there will be bios with new data to be drained into the 3108 * stripe cache 3109 */ 3110 if (!expand) { 3111 if (!s->locked) 3112 /* False alarm, nothing to do */ 3113 return; 3114 sh->reconstruct_state = reconstruct_state_drain_run; 3115 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3116 } else 3117 sh->reconstruct_state = reconstruct_state_run; 3118 3119 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3120 3121 if (s->locked + conf->max_degraded == disks) 3122 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3123 atomic_inc(&conf->pending_full_writes); 3124 } else { 3125 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3126 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3127 BUG_ON(level == 6 && 3128 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3129 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3130 3131 for (i = disks; i--; ) { 3132 struct r5dev *dev = &sh->dev[i]; 3133 if (i == pd_idx || i == qd_idx) 3134 continue; 3135 3136 if (dev->towrite && 3137 (test_bit(R5_UPTODATE, &dev->flags) || 3138 test_bit(R5_Wantcompute, &dev->flags))) { 3139 set_bit(R5_Wantdrain, &dev->flags); 3140 set_bit(R5_LOCKED, &dev->flags); 3141 clear_bit(R5_UPTODATE, &dev->flags); 3142 s->locked++; 3143 } else if (test_bit(R5_InJournal, &dev->flags)) { 3144 set_bit(R5_LOCKED, &dev->flags); 3145 s->locked++; 3146 } 3147 } 3148 if (!s->locked) 3149 /* False alarm - nothing to do */ 3150 return; 3151 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3152 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3153 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3154 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3155 } 3156 3157 /* keep the parity disk(s) locked while asynchronous operations 3158 * are in flight 3159 */ 3160 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3161 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3162 s->locked++; 3163 3164 if (level == 6) { 3165 int qd_idx = sh->qd_idx; 3166 struct r5dev *dev = &sh->dev[qd_idx]; 3167 3168 set_bit(R5_LOCKED, &dev->flags); 3169 clear_bit(R5_UPTODATE, &dev->flags); 3170 s->locked++; 3171 } 3172 3173 if (raid5_has_ppl(sh->raid_conf) && 3174 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3175 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3176 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3177 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3178 3179 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3180 __func__, (unsigned long long)sh->sector, 3181 s->locked, s->ops_request); 3182 } 3183 3184 /* 3185 * Each stripe/dev can have one or more bion attached. 3186 * toread/towrite point to the first in a chain. 3187 * The bi_next chain must be in order. 3188 */ 3189 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3190 int forwrite, int previous) 3191 { 3192 struct bio **bip; 3193 struct r5conf *conf = sh->raid_conf; 3194 int firstwrite=0; 3195 3196 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3197 (unsigned long long)bi->bi_iter.bi_sector, 3198 (unsigned long long)sh->sector); 3199 3200 spin_lock_irq(&sh->stripe_lock); 3201 /* Don't allow new IO added to stripes in batch list */ 3202 if (sh->batch_head) 3203 goto overlap; 3204 if (forwrite) { 3205 bip = &sh->dev[dd_idx].towrite; 3206 if (*bip == NULL) 3207 firstwrite = 1; 3208 } else 3209 bip = &sh->dev[dd_idx].toread; 3210 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3211 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3212 goto overlap; 3213 bip = & (*bip)->bi_next; 3214 } 3215 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3216 goto overlap; 3217 3218 if (forwrite && raid5_has_ppl(conf)) { 3219 /* 3220 * With PPL only writes to consecutive data chunks within a 3221 * stripe are allowed because for a single stripe_head we can 3222 * only have one PPL entry at a time, which describes one data 3223 * range. Not really an overlap, but wait_for_overlap can be 3224 * used to handle this. 3225 */ 3226 sector_t sector; 3227 sector_t first = 0; 3228 sector_t last = 0; 3229 int count = 0; 3230 int i; 3231 3232 for (i = 0; i < sh->disks; i++) { 3233 if (i != sh->pd_idx && 3234 (i == dd_idx || sh->dev[i].towrite)) { 3235 sector = sh->dev[i].sector; 3236 if (count == 0 || sector < first) 3237 first = sector; 3238 if (sector > last) 3239 last = sector; 3240 count++; 3241 } 3242 } 3243 3244 if (first + conf->chunk_sectors * (count - 1) != last) 3245 goto overlap; 3246 } 3247 3248 if (!forwrite || previous) 3249 clear_bit(STRIPE_BATCH_READY, &sh->state); 3250 3251 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3252 if (*bip) 3253 bi->bi_next = *bip; 3254 *bip = bi; 3255 bio_inc_remaining(bi); 3256 md_write_inc(conf->mddev, bi); 3257 3258 if (forwrite) { 3259 /* check if page is covered */ 3260 sector_t sector = sh->dev[dd_idx].sector; 3261 for (bi=sh->dev[dd_idx].towrite; 3262 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3263 bi && bi->bi_iter.bi_sector <= sector; 3264 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3265 if (bio_end_sector(bi) >= sector) 3266 sector = bio_end_sector(bi); 3267 } 3268 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3269 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3270 sh->overwrite_disks++; 3271 } 3272 3273 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3274 (unsigned long long)(*bip)->bi_iter.bi_sector, 3275 (unsigned long long)sh->sector, dd_idx); 3276 3277 if (conf->mddev->bitmap && firstwrite) { 3278 /* Cannot hold spinlock over bitmap_startwrite, 3279 * but must ensure this isn't added to a batch until 3280 * we have added to the bitmap and set bm_seq. 3281 * So set STRIPE_BITMAP_PENDING to prevent 3282 * batching. 3283 * If multiple add_stripe_bio() calls race here they 3284 * much all set STRIPE_BITMAP_PENDING. So only the first one 3285 * to complete "bitmap_startwrite" gets to set 3286 * STRIPE_BIT_DELAY. This is important as once a stripe 3287 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3288 * any more. 3289 */ 3290 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3291 spin_unlock_irq(&sh->stripe_lock); 3292 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3293 STRIPE_SECTORS, 0); 3294 spin_lock_irq(&sh->stripe_lock); 3295 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3296 if (!sh->batch_head) { 3297 sh->bm_seq = conf->seq_flush+1; 3298 set_bit(STRIPE_BIT_DELAY, &sh->state); 3299 } 3300 } 3301 spin_unlock_irq(&sh->stripe_lock); 3302 3303 if (stripe_can_batch(sh)) 3304 stripe_add_to_batch_list(conf, sh); 3305 return 1; 3306 3307 overlap: 3308 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3309 spin_unlock_irq(&sh->stripe_lock); 3310 return 0; 3311 } 3312 3313 static void end_reshape(struct r5conf *conf); 3314 3315 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3316 struct stripe_head *sh) 3317 { 3318 int sectors_per_chunk = 3319 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3320 int dd_idx; 3321 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3322 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3323 3324 raid5_compute_sector(conf, 3325 stripe * (disks - conf->max_degraded) 3326 *sectors_per_chunk + chunk_offset, 3327 previous, 3328 &dd_idx, sh); 3329 } 3330 3331 static void 3332 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3333 struct stripe_head_state *s, int disks) 3334 { 3335 int i; 3336 BUG_ON(sh->batch_head); 3337 for (i = disks; i--; ) { 3338 struct bio *bi; 3339 int bitmap_end = 0; 3340 3341 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3342 struct md_rdev *rdev; 3343 rcu_read_lock(); 3344 rdev = rcu_dereference(conf->disks[i].rdev); 3345 if (rdev && test_bit(In_sync, &rdev->flags) && 3346 !test_bit(Faulty, &rdev->flags)) 3347 atomic_inc(&rdev->nr_pending); 3348 else 3349 rdev = NULL; 3350 rcu_read_unlock(); 3351 if (rdev) { 3352 if (!rdev_set_badblocks( 3353 rdev, 3354 sh->sector, 3355 STRIPE_SECTORS, 0)) 3356 md_error(conf->mddev, rdev); 3357 rdev_dec_pending(rdev, conf->mddev); 3358 } 3359 } 3360 spin_lock_irq(&sh->stripe_lock); 3361 /* fail all writes first */ 3362 bi = sh->dev[i].towrite; 3363 sh->dev[i].towrite = NULL; 3364 sh->overwrite_disks = 0; 3365 spin_unlock_irq(&sh->stripe_lock); 3366 if (bi) 3367 bitmap_end = 1; 3368 3369 log_stripe_write_finished(sh); 3370 3371 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3372 wake_up(&conf->wait_for_overlap); 3373 3374 while (bi && bi->bi_iter.bi_sector < 3375 sh->dev[i].sector + STRIPE_SECTORS) { 3376 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3377 3378 bi->bi_error = -EIO; 3379 md_write_end(conf->mddev); 3380 bio_endio(bi); 3381 bi = nextbi; 3382 } 3383 if (bitmap_end) 3384 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3385 STRIPE_SECTORS, 0, 0); 3386 bitmap_end = 0; 3387 /* and fail all 'written' */ 3388 bi = sh->dev[i].written; 3389 sh->dev[i].written = NULL; 3390 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3391 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3392 sh->dev[i].page = sh->dev[i].orig_page; 3393 } 3394 3395 if (bi) bitmap_end = 1; 3396 while (bi && bi->bi_iter.bi_sector < 3397 sh->dev[i].sector + STRIPE_SECTORS) { 3398 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3399 3400 bi->bi_error = -EIO; 3401 md_write_end(conf->mddev); 3402 bio_endio(bi); 3403 bi = bi2; 3404 } 3405 3406 /* fail any reads if this device is non-operational and 3407 * the data has not reached the cache yet. 3408 */ 3409 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3410 s->failed > conf->max_degraded && 3411 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3412 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3413 spin_lock_irq(&sh->stripe_lock); 3414 bi = sh->dev[i].toread; 3415 sh->dev[i].toread = NULL; 3416 spin_unlock_irq(&sh->stripe_lock); 3417 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3418 wake_up(&conf->wait_for_overlap); 3419 if (bi) 3420 s->to_read--; 3421 while (bi && bi->bi_iter.bi_sector < 3422 sh->dev[i].sector + STRIPE_SECTORS) { 3423 struct bio *nextbi = 3424 r5_next_bio(bi, sh->dev[i].sector); 3425 3426 bi->bi_error = -EIO; 3427 bio_endio(bi); 3428 bi = nextbi; 3429 } 3430 } 3431 if (bitmap_end) 3432 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3433 STRIPE_SECTORS, 0, 0); 3434 /* If we were in the middle of a write the parity block might 3435 * still be locked - so just clear all R5_LOCKED flags 3436 */ 3437 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3438 } 3439 s->to_write = 0; 3440 s->written = 0; 3441 3442 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3443 if (atomic_dec_and_test(&conf->pending_full_writes)) 3444 md_wakeup_thread(conf->mddev->thread); 3445 } 3446 3447 static void 3448 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3449 struct stripe_head_state *s) 3450 { 3451 int abort = 0; 3452 int i; 3453 3454 BUG_ON(sh->batch_head); 3455 clear_bit(STRIPE_SYNCING, &sh->state); 3456 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3457 wake_up(&conf->wait_for_overlap); 3458 s->syncing = 0; 3459 s->replacing = 0; 3460 /* There is nothing more to do for sync/check/repair. 3461 * Don't even need to abort as that is handled elsewhere 3462 * if needed, and not always wanted e.g. if there is a known 3463 * bad block here. 3464 * For recover/replace we need to record a bad block on all 3465 * non-sync devices, or abort the recovery 3466 */ 3467 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3468 /* During recovery devices cannot be removed, so 3469 * locking and refcounting of rdevs is not needed 3470 */ 3471 rcu_read_lock(); 3472 for (i = 0; i < conf->raid_disks; i++) { 3473 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3474 if (rdev 3475 && !test_bit(Faulty, &rdev->flags) 3476 && !test_bit(In_sync, &rdev->flags) 3477 && !rdev_set_badblocks(rdev, sh->sector, 3478 STRIPE_SECTORS, 0)) 3479 abort = 1; 3480 rdev = rcu_dereference(conf->disks[i].replacement); 3481 if (rdev 3482 && !test_bit(Faulty, &rdev->flags) 3483 && !test_bit(In_sync, &rdev->flags) 3484 && !rdev_set_badblocks(rdev, sh->sector, 3485 STRIPE_SECTORS, 0)) 3486 abort = 1; 3487 } 3488 rcu_read_unlock(); 3489 if (abort) 3490 conf->recovery_disabled = 3491 conf->mddev->recovery_disabled; 3492 } 3493 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3494 } 3495 3496 static int want_replace(struct stripe_head *sh, int disk_idx) 3497 { 3498 struct md_rdev *rdev; 3499 int rv = 0; 3500 3501 rcu_read_lock(); 3502 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3503 if (rdev 3504 && !test_bit(Faulty, &rdev->flags) 3505 && !test_bit(In_sync, &rdev->flags) 3506 && (rdev->recovery_offset <= sh->sector 3507 || rdev->mddev->recovery_cp <= sh->sector)) 3508 rv = 1; 3509 rcu_read_unlock(); 3510 return rv; 3511 } 3512 3513 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3514 int disk_idx, int disks) 3515 { 3516 struct r5dev *dev = &sh->dev[disk_idx]; 3517 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3518 &sh->dev[s->failed_num[1]] }; 3519 int i; 3520 3521 3522 if (test_bit(R5_LOCKED, &dev->flags) || 3523 test_bit(R5_UPTODATE, &dev->flags)) 3524 /* No point reading this as we already have it or have 3525 * decided to get it. 3526 */ 3527 return 0; 3528 3529 if (dev->toread || 3530 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3531 /* We need this block to directly satisfy a request */ 3532 return 1; 3533 3534 if (s->syncing || s->expanding || 3535 (s->replacing && want_replace(sh, disk_idx))) 3536 /* When syncing, or expanding we read everything. 3537 * When replacing, we need the replaced block. 3538 */ 3539 return 1; 3540 3541 if ((s->failed >= 1 && fdev[0]->toread) || 3542 (s->failed >= 2 && fdev[1]->toread)) 3543 /* If we want to read from a failed device, then 3544 * we need to actually read every other device. 3545 */ 3546 return 1; 3547 3548 /* Sometimes neither read-modify-write nor reconstruct-write 3549 * cycles can work. In those cases we read every block we 3550 * can. Then the parity-update is certain to have enough to 3551 * work with. 3552 * This can only be a problem when we need to write something, 3553 * and some device has failed. If either of those tests 3554 * fail we need look no further. 3555 */ 3556 if (!s->failed || !s->to_write) 3557 return 0; 3558 3559 if (test_bit(R5_Insync, &dev->flags) && 3560 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3561 /* Pre-reads at not permitted until after short delay 3562 * to gather multiple requests. However if this 3563 * device is no Insync, the block could only be computed 3564 * and there is no need to delay that. 3565 */ 3566 return 0; 3567 3568 for (i = 0; i < s->failed && i < 2; i++) { 3569 if (fdev[i]->towrite && 3570 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3571 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3572 /* If we have a partial write to a failed 3573 * device, then we will need to reconstruct 3574 * the content of that device, so all other 3575 * devices must be read. 3576 */ 3577 return 1; 3578 } 3579 3580 /* If we are forced to do a reconstruct-write, either because 3581 * the current RAID6 implementation only supports that, or 3582 * because parity cannot be trusted and we are currently 3583 * recovering it, there is extra need to be careful. 3584 * If one of the devices that we would need to read, because 3585 * it is not being overwritten (and maybe not written at all) 3586 * is missing/faulty, then we need to read everything we can. 3587 */ 3588 if (sh->raid_conf->level != 6 && 3589 sh->sector < sh->raid_conf->mddev->recovery_cp) 3590 /* reconstruct-write isn't being forced */ 3591 return 0; 3592 for (i = 0; i < s->failed && i < 2; i++) { 3593 if (s->failed_num[i] != sh->pd_idx && 3594 s->failed_num[i] != sh->qd_idx && 3595 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3596 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3597 return 1; 3598 } 3599 3600 return 0; 3601 } 3602 3603 /* fetch_block - checks the given member device to see if its data needs 3604 * to be read or computed to satisfy a request. 3605 * 3606 * Returns 1 when no more member devices need to be checked, otherwise returns 3607 * 0 to tell the loop in handle_stripe_fill to continue 3608 */ 3609 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3610 int disk_idx, int disks) 3611 { 3612 struct r5dev *dev = &sh->dev[disk_idx]; 3613 3614 /* is the data in this block needed, and can we get it? */ 3615 if (need_this_block(sh, s, disk_idx, disks)) { 3616 /* we would like to get this block, possibly by computing it, 3617 * otherwise read it if the backing disk is insync 3618 */ 3619 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3620 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3621 BUG_ON(sh->batch_head); 3622 3623 /* 3624 * In the raid6 case if the only non-uptodate disk is P 3625 * then we already trusted P to compute the other failed 3626 * drives. It is safe to compute rather than re-read P. 3627 * In other cases we only compute blocks from failed 3628 * devices, otherwise check/repair might fail to detect 3629 * a real inconsistency. 3630 */ 3631 3632 if ((s->uptodate == disks - 1) && 3633 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || 3634 (s->failed && (disk_idx == s->failed_num[0] || 3635 disk_idx == s->failed_num[1])))) { 3636 /* have disk failed, and we're requested to fetch it; 3637 * do compute it 3638 */ 3639 pr_debug("Computing stripe %llu block %d\n", 3640 (unsigned long long)sh->sector, disk_idx); 3641 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3642 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3643 set_bit(R5_Wantcompute, &dev->flags); 3644 sh->ops.target = disk_idx; 3645 sh->ops.target2 = -1; /* no 2nd target */ 3646 s->req_compute = 1; 3647 /* Careful: from this point on 'uptodate' is in the eye 3648 * of raid_run_ops which services 'compute' operations 3649 * before writes. R5_Wantcompute flags a block that will 3650 * be R5_UPTODATE by the time it is needed for a 3651 * subsequent operation. 3652 */ 3653 s->uptodate++; 3654 return 1; 3655 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3656 /* Computing 2-failure is *very* expensive; only 3657 * do it if failed >= 2 3658 */ 3659 int other; 3660 for (other = disks; other--; ) { 3661 if (other == disk_idx) 3662 continue; 3663 if (!test_bit(R5_UPTODATE, 3664 &sh->dev[other].flags)) 3665 break; 3666 } 3667 BUG_ON(other < 0); 3668 pr_debug("Computing stripe %llu blocks %d,%d\n", 3669 (unsigned long long)sh->sector, 3670 disk_idx, other); 3671 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3672 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3673 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3674 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3675 sh->ops.target = disk_idx; 3676 sh->ops.target2 = other; 3677 s->uptodate += 2; 3678 s->req_compute = 1; 3679 return 1; 3680 } else if (test_bit(R5_Insync, &dev->flags)) { 3681 set_bit(R5_LOCKED, &dev->flags); 3682 set_bit(R5_Wantread, &dev->flags); 3683 s->locked++; 3684 pr_debug("Reading block %d (sync=%d)\n", 3685 disk_idx, s->syncing); 3686 } 3687 } 3688 3689 return 0; 3690 } 3691 3692 /** 3693 * handle_stripe_fill - read or compute data to satisfy pending requests. 3694 */ 3695 static void handle_stripe_fill(struct stripe_head *sh, 3696 struct stripe_head_state *s, 3697 int disks) 3698 { 3699 int i; 3700 3701 /* look for blocks to read/compute, skip this if a compute 3702 * is already in flight, or if the stripe contents are in the 3703 * midst of changing due to a write 3704 */ 3705 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3706 !sh->reconstruct_state) { 3707 3708 /* 3709 * For degraded stripe with data in journal, do not handle 3710 * read requests yet, instead, flush the stripe to raid 3711 * disks first, this avoids handling complex rmw of write 3712 * back cache (prexor with orig_page, and then xor with 3713 * page) in the read path 3714 */ 3715 if (s->injournal && s->failed) { 3716 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3717 r5c_make_stripe_write_out(sh); 3718 goto out; 3719 } 3720 3721 for (i = disks; i--; ) 3722 if (fetch_block(sh, s, i, disks)) 3723 break; 3724 } 3725 out: 3726 set_bit(STRIPE_HANDLE, &sh->state); 3727 } 3728 3729 static void break_stripe_batch_list(struct stripe_head *head_sh, 3730 unsigned long handle_flags); 3731 /* handle_stripe_clean_event 3732 * any written block on an uptodate or failed drive can be returned. 3733 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3734 * never LOCKED, so we don't need to test 'failed' directly. 3735 */ 3736 static void handle_stripe_clean_event(struct r5conf *conf, 3737 struct stripe_head *sh, int disks) 3738 { 3739 int i; 3740 struct r5dev *dev; 3741 int discard_pending = 0; 3742 struct stripe_head *head_sh = sh; 3743 bool do_endio = false; 3744 3745 for (i = disks; i--; ) 3746 if (sh->dev[i].written) { 3747 dev = &sh->dev[i]; 3748 if (!test_bit(R5_LOCKED, &dev->flags) && 3749 (test_bit(R5_UPTODATE, &dev->flags) || 3750 test_bit(R5_Discard, &dev->flags) || 3751 test_bit(R5_SkipCopy, &dev->flags))) { 3752 /* We can return any write requests */ 3753 struct bio *wbi, *wbi2; 3754 pr_debug("Return write for disc %d\n", i); 3755 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3756 clear_bit(R5_UPTODATE, &dev->flags); 3757 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3758 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3759 } 3760 do_endio = true; 3761 3762 returnbi: 3763 dev->page = dev->orig_page; 3764 wbi = dev->written; 3765 dev->written = NULL; 3766 while (wbi && wbi->bi_iter.bi_sector < 3767 dev->sector + STRIPE_SECTORS) { 3768 wbi2 = r5_next_bio(wbi, dev->sector); 3769 md_write_end(conf->mddev); 3770 bio_endio(wbi); 3771 wbi = wbi2; 3772 } 3773 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3774 STRIPE_SECTORS, 3775 !test_bit(STRIPE_DEGRADED, &sh->state), 3776 0); 3777 if (head_sh->batch_head) { 3778 sh = list_first_entry(&sh->batch_list, 3779 struct stripe_head, 3780 batch_list); 3781 if (sh != head_sh) { 3782 dev = &sh->dev[i]; 3783 goto returnbi; 3784 } 3785 } 3786 sh = head_sh; 3787 dev = &sh->dev[i]; 3788 } else if (test_bit(R5_Discard, &dev->flags)) 3789 discard_pending = 1; 3790 } 3791 3792 log_stripe_write_finished(sh); 3793 3794 if (!discard_pending && 3795 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3796 int hash; 3797 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3798 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3799 if (sh->qd_idx >= 0) { 3800 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3801 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3802 } 3803 /* now that discard is done we can proceed with any sync */ 3804 clear_bit(STRIPE_DISCARD, &sh->state); 3805 /* 3806 * SCSI discard will change some bio fields and the stripe has 3807 * no updated data, so remove it from hash list and the stripe 3808 * will be reinitialized 3809 */ 3810 unhash: 3811 hash = sh->hash_lock_index; 3812 spin_lock_irq(conf->hash_locks + hash); 3813 remove_hash(sh); 3814 spin_unlock_irq(conf->hash_locks + hash); 3815 if (head_sh->batch_head) { 3816 sh = list_first_entry(&sh->batch_list, 3817 struct stripe_head, batch_list); 3818 if (sh != head_sh) 3819 goto unhash; 3820 } 3821 sh = head_sh; 3822 3823 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3824 set_bit(STRIPE_HANDLE, &sh->state); 3825 3826 } 3827 3828 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3829 if (atomic_dec_and_test(&conf->pending_full_writes)) 3830 md_wakeup_thread(conf->mddev->thread); 3831 3832 if (head_sh->batch_head && do_endio) 3833 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3834 } 3835 3836 /* 3837 * For RMW in write back cache, we need extra page in prexor to store the 3838 * old data. This page is stored in dev->orig_page. 3839 * 3840 * This function checks whether we have data for prexor. The exact logic 3841 * is: 3842 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3843 */ 3844 static inline bool uptodate_for_rmw(struct r5dev *dev) 3845 { 3846 return (test_bit(R5_UPTODATE, &dev->flags)) && 3847 (!test_bit(R5_InJournal, &dev->flags) || 3848 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3849 } 3850 3851 static int handle_stripe_dirtying(struct r5conf *conf, 3852 struct stripe_head *sh, 3853 struct stripe_head_state *s, 3854 int disks) 3855 { 3856 int rmw = 0, rcw = 0, i; 3857 sector_t recovery_cp = conf->mddev->recovery_cp; 3858 3859 /* Check whether resync is now happening or should start. 3860 * If yes, then the array is dirty (after unclean shutdown or 3861 * initial creation), so parity in some stripes might be inconsistent. 3862 * In this case, we need to always do reconstruct-write, to ensure 3863 * that in case of drive failure or read-error correction, we 3864 * generate correct data from the parity. 3865 */ 3866 if (conf->rmw_level == PARITY_DISABLE_RMW || 3867 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3868 s->failed == 0)) { 3869 /* Calculate the real rcw later - for now make it 3870 * look like rcw is cheaper 3871 */ 3872 rcw = 1; rmw = 2; 3873 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3874 conf->rmw_level, (unsigned long long)recovery_cp, 3875 (unsigned long long)sh->sector); 3876 } else for (i = disks; i--; ) { 3877 /* would I have to read this buffer for read_modify_write */ 3878 struct r5dev *dev = &sh->dev[i]; 3879 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3880 i == sh->pd_idx || i == sh->qd_idx || 3881 test_bit(R5_InJournal, &dev->flags)) && 3882 !test_bit(R5_LOCKED, &dev->flags) && 3883 !(uptodate_for_rmw(dev) || 3884 test_bit(R5_Wantcompute, &dev->flags))) { 3885 if (test_bit(R5_Insync, &dev->flags)) 3886 rmw++; 3887 else 3888 rmw += 2*disks; /* cannot read it */ 3889 } 3890 /* Would I have to read this buffer for reconstruct_write */ 3891 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3892 i != sh->pd_idx && i != sh->qd_idx && 3893 !test_bit(R5_LOCKED, &dev->flags) && 3894 !(test_bit(R5_UPTODATE, &dev->flags) || 3895 test_bit(R5_Wantcompute, &dev->flags))) { 3896 if (test_bit(R5_Insync, &dev->flags)) 3897 rcw++; 3898 else 3899 rcw += 2*disks; 3900 } 3901 } 3902 3903 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 3904 (unsigned long long)sh->sector, sh->state, rmw, rcw); 3905 set_bit(STRIPE_HANDLE, &sh->state); 3906 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3907 /* prefer read-modify-write, but need to get some data */ 3908 if (conf->mddev->queue) 3909 blk_add_trace_msg(conf->mddev->queue, 3910 "raid5 rmw %llu %d", 3911 (unsigned long long)sh->sector, rmw); 3912 for (i = disks; i--; ) { 3913 struct r5dev *dev = &sh->dev[i]; 3914 if (test_bit(R5_InJournal, &dev->flags) && 3915 dev->page == dev->orig_page && 3916 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3917 /* alloc page for prexor */ 3918 struct page *p = alloc_page(GFP_NOIO); 3919 3920 if (p) { 3921 dev->orig_page = p; 3922 continue; 3923 } 3924 3925 /* 3926 * alloc_page() failed, try use 3927 * disk_info->extra_page 3928 */ 3929 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3930 &conf->cache_state)) { 3931 r5c_use_extra_page(sh); 3932 break; 3933 } 3934 3935 /* extra_page in use, add to delayed_list */ 3936 set_bit(STRIPE_DELAYED, &sh->state); 3937 s->waiting_extra_page = 1; 3938 return -EAGAIN; 3939 } 3940 } 3941 3942 for (i = disks; i--; ) { 3943 struct r5dev *dev = &sh->dev[i]; 3944 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3945 i == sh->pd_idx || i == sh->qd_idx || 3946 test_bit(R5_InJournal, &dev->flags)) && 3947 !test_bit(R5_LOCKED, &dev->flags) && 3948 !(uptodate_for_rmw(dev) || 3949 test_bit(R5_Wantcompute, &dev->flags)) && 3950 test_bit(R5_Insync, &dev->flags)) { 3951 if (test_bit(STRIPE_PREREAD_ACTIVE, 3952 &sh->state)) { 3953 pr_debug("Read_old block %d for r-m-w\n", 3954 i); 3955 set_bit(R5_LOCKED, &dev->flags); 3956 set_bit(R5_Wantread, &dev->flags); 3957 s->locked++; 3958 } else { 3959 set_bit(STRIPE_DELAYED, &sh->state); 3960 set_bit(STRIPE_HANDLE, &sh->state); 3961 } 3962 } 3963 } 3964 } 3965 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3966 /* want reconstruct write, but need to get some data */ 3967 int qread =0; 3968 rcw = 0; 3969 for (i = disks; i--; ) { 3970 struct r5dev *dev = &sh->dev[i]; 3971 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3972 i != sh->pd_idx && i != sh->qd_idx && 3973 !test_bit(R5_LOCKED, &dev->flags) && 3974 !(test_bit(R5_UPTODATE, &dev->flags) || 3975 test_bit(R5_Wantcompute, &dev->flags))) { 3976 rcw++; 3977 if (test_bit(R5_Insync, &dev->flags) && 3978 test_bit(STRIPE_PREREAD_ACTIVE, 3979 &sh->state)) { 3980 pr_debug("Read_old block " 3981 "%d for Reconstruct\n", i); 3982 set_bit(R5_LOCKED, &dev->flags); 3983 set_bit(R5_Wantread, &dev->flags); 3984 s->locked++; 3985 qread++; 3986 } else { 3987 set_bit(STRIPE_DELAYED, &sh->state); 3988 set_bit(STRIPE_HANDLE, &sh->state); 3989 } 3990 } 3991 } 3992 if (rcw && conf->mddev->queue) 3993 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3994 (unsigned long long)sh->sector, 3995 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3996 } 3997 3998 if (rcw > disks && rmw > disks && 3999 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4000 set_bit(STRIPE_DELAYED, &sh->state); 4001 4002 /* now if nothing is locked, and if we have enough data, 4003 * we can start a write request 4004 */ 4005 /* since handle_stripe can be called at any time we need to handle the 4006 * case where a compute block operation has been submitted and then a 4007 * subsequent call wants to start a write request. raid_run_ops only 4008 * handles the case where compute block and reconstruct are requested 4009 * simultaneously. If this is not the case then new writes need to be 4010 * held off until the compute completes. 4011 */ 4012 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4013 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4014 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4015 schedule_reconstruction(sh, s, rcw == 0, 0); 4016 return 0; 4017 } 4018 4019 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4020 struct stripe_head_state *s, int disks) 4021 { 4022 struct r5dev *dev = NULL; 4023 4024 BUG_ON(sh->batch_head); 4025 set_bit(STRIPE_HANDLE, &sh->state); 4026 4027 switch (sh->check_state) { 4028 case check_state_idle: 4029 /* start a new check operation if there are no failures */ 4030 if (s->failed == 0) { 4031 BUG_ON(s->uptodate != disks); 4032 sh->check_state = check_state_run; 4033 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4034 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4035 s->uptodate--; 4036 break; 4037 } 4038 dev = &sh->dev[s->failed_num[0]]; 4039 /* fall through */ 4040 case check_state_compute_result: 4041 sh->check_state = check_state_idle; 4042 if (!dev) 4043 dev = &sh->dev[sh->pd_idx]; 4044 4045 /* check that a write has not made the stripe insync */ 4046 if (test_bit(STRIPE_INSYNC, &sh->state)) 4047 break; 4048 4049 /* either failed parity check, or recovery is happening */ 4050 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4051 BUG_ON(s->uptodate != disks); 4052 4053 set_bit(R5_LOCKED, &dev->flags); 4054 s->locked++; 4055 set_bit(R5_Wantwrite, &dev->flags); 4056 4057 clear_bit(STRIPE_DEGRADED, &sh->state); 4058 set_bit(STRIPE_INSYNC, &sh->state); 4059 break; 4060 case check_state_run: 4061 break; /* we will be called again upon completion */ 4062 case check_state_check_result: 4063 sh->check_state = check_state_idle; 4064 4065 /* if a failure occurred during the check operation, leave 4066 * STRIPE_INSYNC not set and let the stripe be handled again 4067 */ 4068 if (s->failed) 4069 break; 4070 4071 /* handle a successful check operation, if parity is correct 4072 * we are done. Otherwise update the mismatch count and repair 4073 * parity if !MD_RECOVERY_CHECK 4074 */ 4075 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4076 /* parity is correct (on disc, 4077 * not in buffer any more) 4078 */ 4079 set_bit(STRIPE_INSYNC, &sh->state); 4080 else { 4081 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4082 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 4083 /* don't try to repair!! */ 4084 set_bit(STRIPE_INSYNC, &sh->state); 4085 else { 4086 sh->check_state = check_state_compute_run; 4087 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4088 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4089 set_bit(R5_Wantcompute, 4090 &sh->dev[sh->pd_idx].flags); 4091 sh->ops.target = sh->pd_idx; 4092 sh->ops.target2 = -1; 4093 s->uptodate++; 4094 } 4095 } 4096 break; 4097 case check_state_compute_run: 4098 break; 4099 default: 4100 pr_err("%s: unknown check_state: %d sector: %llu\n", 4101 __func__, sh->check_state, 4102 (unsigned long long) sh->sector); 4103 BUG(); 4104 } 4105 } 4106 4107 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4108 struct stripe_head_state *s, 4109 int disks) 4110 { 4111 int pd_idx = sh->pd_idx; 4112 int qd_idx = sh->qd_idx; 4113 struct r5dev *dev; 4114 4115 BUG_ON(sh->batch_head); 4116 set_bit(STRIPE_HANDLE, &sh->state); 4117 4118 BUG_ON(s->failed > 2); 4119 4120 /* Want to check and possibly repair P and Q. 4121 * However there could be one 'failed' device, in which 4122 * case we can only check one of them, possibly using the 4123 * other to generate missing data 4124 */ 4125 4126 switch (sh->check_state) { 4127 case check_state_idle: 4128 /* start a new check operation if there are < 2 failures */ 4129 if (s->failed == s->q_failed) { 4130 /* The only possible failed device holds Q, so it 4131 * makes sense to check P (If anything else were failed, 4132 * we would have used P to recreate it). 4133 */ 4134 sh->check_state = check_state_run; 4135 } 4136 if (!s->q_failed && s->failed < 2) { 4137 /* Q is not failed, and we didn't use it to generate 4138 * anything, so it makes sense to check it 4139 */ 4140 if (sh->check_state == check_state_run) 4141 sh->check_state = check_state_run_pq; 4142 else 4143 sh->check_state = check_state_run_q; 4144 } 4145 4146 /* discard potentially stale zero_sum_result */ 4147 sh->ops.zero_sum_result = 0; 4148 4149 if (sh->check_state == check_state_run) { 4150 /* async_xor_zero_sum destroys the contents of P */ 4151 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4152 s->uptodate--; 4153 } 4154 if (sh->check_state >= check_state_run && 4155 sh->check_state <= check_state_run_pq) { 4156 /* async_syndrome_zero_sum preserves P and Q, so 4157 * no need to mark them !uptodate here 4158 */ 4159 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4160 break; 4161 } 4162 4163 /* we have 2-disk failure */ 4164 BUG_ON(s->failed != 2); 4165 /* fall through */ 4166 case check_state_compute_result: 4167 sh->check_state = check_state_idle; 4168 4169 /* check that a write has not made the stripe insync */ 4170 if (test_bit(STRIPE_INSYNC, &sh->state)) 4171 break; 4172 4173 /* now write out any block on a failed drive, 4174 * or P or Q if they were recomputed 4175 */ 4176 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 4177 if (s->failed == 2) { 4178 dev = &sh->dev[s->failed_num[1]]; 4179 s->locked++; 4180 set_bit(R5_LOCKED, &dev->flags); 4181 set_bit(R5_Wantwrite, &dev->flags); 4182 } 4183 if (s->failed >= 1) { 4184 dev = &sh->dev[s->failed_num[0]]; 4185 s->locked++; 4186 set_bit(R5_LOCKED, &dev->flags); 4187 set_bit(R5_Wantwrite, &dev->flags); 4188 } 4189 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4190 dev = &sh->dev[pd_idx]; 4191 s->locked++; 4192 set_bit(R5_LOCKED, &dev->flags); 4193 set_bit(R5_Wantwrite, &dev->flags); 4194 } 4195 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4196 dev = &sh->dev[qd_idx]; 4197 s->locked++; 4198 set_bit(R5_LOCKED, &dev->flags); 4199 set_bit(R5_Wantwrite, &dev->flags); 4200 } 4201 clear_bit(STRIPE_DEGRADED, &sh->state); 4202 4203 set_bit(STRIPE_INSYNC, &sh->state); 4204 break; 4205 case check_state_run: 4206 case check_state_run_q: 4207 case check_state_run_pq: 4208 break; /* we will be called again upon completion */ 4209 case check_state_check_result: 4210 sh->check_state = check_state_idle; 4211 4212 /* handle a successful check operation, if parity is correct 4213 * we are done. Otherwise update the mismatch count and repair 4214 * parity if !MD_RECOVERY_CHECK 4215 */ 4216 if (sh->ops.zero_sum_result == 0) { 4217 /* both parities are correct */ 4218 if (!s->failed) 4219 set_bit(STRIPE_INSYNC, &sh->state); 4220 else { 4221 /* in contrast to the raid5 case we can validate 4222 * parity, but still have a failure to write 4223 * back 4224 */ 4225 sh->check_state = check_state_compute_result; 4226 /* Returning at this point means that we may go 4227 * off and bring p and/or q uptodate again so 4228 * we make sure to check zero_sum_result again 4229 * to verify if p or q need writeback 4230 */ 4231 } 4232 } else { 4233 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4234 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 4235 /* don't try to repair!! */ 4236 set_bit(STRIPE_INSYNC, &sh->state); 4237 else { 4238 int *target = &sh->ops.target; 4239 4240 sh->ops.target = -1; 4241 sh->ops.target2 = -1; 4242 sh->check_state = check_state_compute_run; 4243 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4244 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4245 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4246 set_bit(R5_Wantcompute, 4247 &sh->dev[pd_idx].flags); 4248 *target = pd_idx; 4249 target = &sh->ops.target2; 4250 s->uptodate++; 4251 } 4252 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4253 set_bit(R5_Wantcompute, 4254 &sh->dev[qd_idx].flags); 4255 *target = qd_idx; 4256 s->uptodate++; 4257 } 4258 } 4259 } 4260 break; 4261 case check_state_compute_run: 4262 break; 4263 default: 4264 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4265 __func__, sh->check_state, 4266 (unsigned long long) sh->sector); 4267 BUG(); 4268 } 4269 } 4270 4271 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4272 { 4273 int i; 4274 4275 /* We have read all the blocks in this stripe and now we need to 4276 * copy some of them into a target stripe for expand. 4277 */ 4278 struct dma_async_tx_descriptor *tx = NULL; 4279 BUG_ON(sh->batch_head); 4280 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4281 for (i = 0; i < sh->disks; i++) 4282 if (i != sh->pd_idx && i != sh->qd_idx) { 4283 int dd_idx, j; 4284 struct stripe_head *sh2; 4285 struct async_submit_ctl submit; 4286 4287 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4288 sector_t s = raid5_compute_sector(conf, bn, 0, 4289 &dd_idx, NULL); 4290 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4291 if (sh2 == NULL) 4292 /* so far only the early blocks of this stripe 4293 * have been requested. When later blocks 4294 * get requested, we will try again 4295 */ 4296 continue; 4297 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4298 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4299 /* must have already done this block */ 4300 raid5_release_stripe(sh2); 4301 continue; 4302 } 4303 4304 /* place all the copies on one channel */ 4305 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4306 tx = async_memcpy(sh2->dev[dd_idx].page, 4307 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4308 &submit); 4309 4310 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4311 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4312 for (j = 0; j < conf->raid_disks; j++) 4313 if (j != sh2->pd_idx && 4314 j != sh2->qd_idx && 4315 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4316 break; 4317 if (j == conf->raid_disks) { 4318 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4319 set_bit(STRIPE_HANDLE, &sh2->state); 4320 } 4321 raid5_release_stripe(sh2); 4322 4323 } 4324 /* done submitting copies, wait for them to complete */ 4325 async_tx_quiesce(&tx); 4326 } 4327 4328 /* 4329 * handle_stripe - do things to a stripe. 4330 * 4331 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4332 * state of various bits to see what needs to be done. 4333 * Possible results: 4334 * return some read requests which now have data 4335 * return some write requests which are safely on storage 4336 * schedule a read on some buffers 4337 * schedule a write of some buffers 4338 * return confirmation of parity correctness 4339 * 4340 */ 4341 4342 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4343 { 4344 struct r5conf *conf = sh->raid_conf; 4345 int disks = sh->disks; 4346 struct r5dev *dev; 4347 int i; 4348 int do_recovery = 0; 4349 4350 memset(s, 0, sizeof(*s)); 4351 4352 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4353 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4354 s->failed_num[0] = -1; 4355 s->failed_num[1] = -1; 4356 s->log_failed = r5l_log_disk_error(conf); 4357 4358 /* Now to look around and see what can be done */ 4359 rcu_read_lock(); 4360 for (i=disks; i--; ) { 4361 struct md_rdev *rdev; 4362 sector_t first_bad; 4363 int bad_sectors; 4364 int is_bad = 0; 4365 4366 dev = &sh->dev[i]; 4367 4368 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4369 i, dev->flags, 4370 dev->toread, dev->towrite, dev->written); 4371 /* maybe we can reply to a read 4372 * 4373 * new wantfill requests are only permitted while 4374 * ops_complete_biofill is guaranteed to be inactive 4375 */ 4376 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4377 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4378 set_bit(R5_Wantfill, &dev->flags); 4379 4380 /* now count some things */ 4381 if (test_bit(R5_LOCKED, &dev->flags)) 4382 s->locked++; 4383 if (test_bit(R5_UPTODATE, &dev->flags)) 4384 s->uptodate++; 4385 if (test_bit(R5_Wantcompute, &dev->flags)) { 4386 s->compute++; 4387 BUG_ON(s->compute > 2); 4388 } 4389 4390 if (test_bit(R5_Wantfill, &dev->flags)) 4391 s->to_fill++; 4392 else if (dev->toread) 4393 s->to_read++; 4394 if (dev->towrite) { 4395 s->to_write++; 4396 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4397 s->non_overwrite++; 4398 } 4399 if (dev->written) 4400 s->written++; 4401 /* Prefer to use the replacement for reads, but only 4402 * if it is recovered enough and has no bad blocks. 4403 */ 4404 rdev = rcu_dereference(conf->disks[i].replacement); 4405 if (rdev && !test_bit(Faulty, &rdev->flags) && 4406 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4407 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4408 &first_bad, &bad_sectors)) 4409 set_bit(R5_ReadRepl, &dev->flags); 4410 else { 4411 if (rdev && !test_bit(Faulty, &rdev->flags)) 4412 set_bit(R5_NeedReplace, &dev->flags); 4413 else 4414 clear_bit(R5_NeedReplace, &dev->flags); 4415 rdev = rcu_dereference(conf->disks[i].rdev); 4416 clear_bit(R5_ReadRepl, &dev->flags); 4417 } 4418 if (rdev && test_bit(Faulty, &rdev->flags)) 4419 rdev = NULL; 4420 if (rdev) { 4421 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4422 &first_bad, &bad_sectors); 4423 if (s->blocked_rdev == NULL 4424 && (test_bit(Blocked, &rdev->flags) 4425 || is_bad < 0)) { 4426 if (is_bad < 0) 4427 set_bit(BlockedBadBlocks, 4428 &rdev->flags); 4429 s->blocked_rdev = rdev; 4430 atomic_inc(&rdev->nr_pending); 4431 } 4432 } 4433 clear_bit(R5_Insync, &dev->flags); 4434 if (!rdev) 4435 /* Not in-sync */; 4436 else if (is_bad) { 4437 /* also not in-sync */ 4438 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4439 test_bit(R5_UPTODATE, &dev->flags)) { 4440 /* treat as in-sync, but with a read error 4441 * which we can now try to correct 4442 */ 4443 set_bit(R5_Insync, &dev->flags); 4444 set_bit(R5_ReadError, &dev->flags); 4445 } 4446 } else if (test_bit(In_sync, &rdev->flags)) 4447 set_bit(R5_Insync, &dev->flags); 4448 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4449 /* in sync if before recovery_offset */ 4450 set_bit(R5_Insync, &dev->flags); 4451 else if (test_bit(R5_UPTODATE, &dev->flags) && 4452 test_bit(R5_Expanded, &dev->flags)) 4453 /* If we've reshaped into here, we assume it is Insync. 4454 * We will shortly update recovery_offset to make 4455 * it official. 4456 */ 4457 set_bit(R5_Insync, &dev->flags); 4458 4459 if (test_bit(R5_WriteError, &dev->flags)) { 4460 /* This flag does not apply to '.replacement' 4461 * only to .rdev, so make sure to check that*/ 4462 struct md_rdev *rdev2 = rcu_dereference( 4463 conf->disks[i].rdev); 4464 if (rdev2 == rdev) 4465 clear_bit(R5_Insync, &dev->flags); 4466 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4467 s->handle_bad_blocks = 1; 4468 atomic_inc(&rdev2->nr_pending); 4469 } else 4470 clear_bit(R5_WriteError, &dev->flags); 4471 } 4472 if (test_bit(R5_MadeGood, &dev->flags)) { 4473 /* This flag does not apply to '.replacement' 4474 * only to .rdev, so make sure to check that*/ 4475 struct md_rdev *rdev2 = rcu_dereference( 4476 conf->disks[i].rdev); 4477 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4478 s->handle_bad_blocks = 1; 4479 atomic_inc(&rdev2->nr_pending); 4480 } else 4481 clear_bit(R5_MadeGood, &dev->flags); 4482 } 4483 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4484 struct md_rdev *rdev2 = rcu_dereference( 4485 conf->disks[i].replacement); 4486 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4487 s->handle_bad_blocks = 1; 4488 atomic_inc(&rdev2->nr_pending); 4489 } else 4490 clear_bit(R5_MadeGoodRepl, &dev->flags); 4491 } 4492 if (!test_bit(R5_Insync, &dev->flags)) { 4493 /* The ReadError flag will just be confusing now */ 4494 clear_bit(R5_ReadError, &dev->flags); 4495 clear_bit(R5_ReWrite, &dev->flags); 4496 } 4497 if (test_bit(R5_ReadError, &dev->flags)) 4498 clear_bit(R5_Insync, &dev->flags); 4499 if (!test_bit(R5_Insync, &dev->flags)) { 4500 if (s->failed < 2) 4501 s->failed_num[s->failed] = i; 4502 s->failed++; 4503 if (rdev && !test_bit(Faulty, &rdev->flags)) 4504 do_recovery = 1; 4505 } 4506 4507 if (test_bit(R5_InJournal, &dev->flags)) 4508 s->injournal++; 4509 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4510 s->just_cached++; 4511 } 4512 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4513 /* If there is a failed device being replaced, 4514 * we must be recovering. 4515 * else if we are after recovery_cp, we must be syncing 4516 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4517 * else we can only be replacing 4518 * sync and recovery both need to read all devices, and so 4519 * use the same flag. 4520 */ 4521 if (do_recovery || 4522 sh->sector >= conf->mddev->recovery_cp || 4523 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4524 s->syncing = 1; 4525 else 4526 s->replacing = 1; 4527 } 4528 rcu_read_unlock(); 4529 } 4530 4531 static int clear_batch_ready(struct stripe_head *sh) 4532 { 4533 /* Return '1' if this is a member of batch, or 4534 * '0' if it is a lone stripe or a head which can now be 4535 * handled. 4536 */ 4537 struct stripe_head *tmp; 4538 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4539 return (sh->batch_head && sh->batch_head != sh); 4540 spin_lock(&sh->stripe_lock); 4541 if (!sh->batch_head) { 4542 spin_unlock(&sh->stripe_lock); 4543 return 0; 4544 } 4545 4546 /* 4547 * this stripe could be added to a batch list before we check 4548 * BATCH_READY, skips it 4549 */ 4550 if (sh->batch_head != sh) { 4551 spin_unlock(&sh->stripe_lock); 4552 return 1; 4553 } 4554 spin_lock(&sh->batch_lock); 4555 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4556 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4557 spin_unlock(&sh->batch_lock); 4558 spin_unlock(&sh->stripe_lock); 4559 4560 /* 4561 * BATCH_READY is cleared, no new stripes can be added. 4562 * batch_list can be accessed without lock 4563 */ 4564 return 0; 4565 } 4566 4567 static void break_stripe_batch_list(struct stripe_head *head_sh, 4568 unsigned long handle_flags) 4569 { 4570 struct stripe_head *sh, *next; 4571 int i; 4572 int do_wakeup = 0; 4573 4574 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4575 4576 list_del_init(&sh->batch_list); 4577 4578 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4579 (1 << STRIPE_SYNCING) | 4580 (1 << STRIPE_REPLACED) | 4581 (1 << STRIPE_DELAYED) | 4582 (1 << STRIPE_BIT_DELAY) | 4583 (1 << STRIPE_FULL_WRITE) | 4584 (1 << STRIPE_BIOFILL_RUN) | 4585 (1 << STRIPE_COMPUTE_RUN) | 4586 (1 << STRIPE_OPS_REQ_PENDING) | 4587 (1 << STRIPE_DISCARD) | 4588 (1 << STRIPE_BATCH_READY) | 4589 (1 << STRIPE_BATCH_ERR) | 4590 (1 << STRIPE_BITMAP_PENDING)), 4591 "stripe state: %lx\n", sh->state); 4592 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4593 (1 << STRIPE_REPLACED)), 4594 "head stripe state: %lx\n", head_sh->state); 4595 4596 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4597 (1 << STRIPE_PREREAD_ACTIVE) | 4598 (1 << STRIPE_DEGRADED)), 4599 head_sh->state & (1 << STRIPE_INSYNC)); 4600 4601 sh->check_state = head_sh->check_state; 4602 sh->reconstruct_state = head_sh->reconstruct_state; 4603 for (i = 0; i < sh->disks; i++) { 4604 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4605 do_wakeup = 1; 4606 sh->dev[i].flags = head_sh->dev[i].flags & 4607 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4608 } 4609 spin_lock_irq(&sh->stripe_lock); 4610 sh->batch_head = NULL; 4611 spin_unlock_irq(&sh->stripe_lock); 4612 if (handle_flags == 0 || 4613 sh->state & handle_flags) 4614 set_bit(STRIPE_HANDLE, &sh->state); 4615 raid5_release_stripe(sh); 4616 } 4617 spin_lock_irq(&head_sh->stripe_lock); 4618 head_sh->batch_head = NULL; 4619 spin_unlock_irq(&head_sh->stripe_lock); 4620 for (i = 0; i < head_sh->disks; i++) 4621 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4622 do_wakeup = 1; 4623 if (head_sh->state & handle_flags) 4624 set_bit(STRIPE_HANDLE, &head_sh->state); 4625 4626 if (do_wakeup) 4627 wake_up(&head_sh->raid_conf->wait_for_overlap); 4628 } 4629 4630 static void handle_stripe(struct stripe_head *sh) 4631 { 4632 struct stripe_head_state s; 4633 struct r5conf *conf = sh->raid_conf; 4634 int i; 4635 int prexor; 4636 int disks = sh->disks; 4637 struct r5dev *pdev, *qdev; 4638 4639 clear_bit(STRIPE_HANDLE, &sh->state); 4640 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4641 /* already being handled, ensure it gets handled 4642 * again when current action finishes */ 4643 set_bit(STRIPE_HANDLE, &sh->state); 4644 return; 4645 } 4646 4647 if (clear_batch_ready(sh) ) { 4648 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4649 return; 4650 } 4651 4652 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4653 break_stripe_batch_list(sh, 0); 4654 4655 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4656 spin_lock(&sh->stripe_lock); 4657 /* Cannot process 'sync' concurrently with 'discard' */ 4658 if (!test_bit(STRIPE_DISCARD, &sh->state) && 4659 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4660 set_bit(STRIPE_SYNCING, &sh->state); 4661 clear_bit(STRIPE_INSYNC, &sh->state); 4662 clear_bit(STRIPE_REPLACED, &sh->state); 4663 } 4664 spin_unlock(&sh->stripe_lock); 4665 } 4666 clear_bit(STRIPE_DELAYED, &sh->state); 4667 4668 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4669 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 4670 (unsigned long long)sh->sector, sh->state, 4671 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4672 sh->check_state, sh->reconstruct_state); 4673 4674 analyse_stripe(sh, &s); 4675 4676 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4677 goto finish; 4678 4679 if (s.handle_bad_blocks || 4680 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4681 set_bit(STRIPE_HANDLE, &sh->state); 4682 goto finish; 4683 } 4684 4685 if (unlikely(s.blocked_rdev)) { 4686 if (s.syncing || s.expanding || s.expanded || 4687 s.replacing || s.to_write || s.written) { 4688 set_bit(STRIPE_HANDLE, &sh->state); 4689 goto finish; 4690 } 4691 /* There is nothing for the blocked_rdev to block */ 4692 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4693 s.blocked_rdev = NULL; 4694 } 4695 4696 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4697 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4698 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4699 } 4700 4701 pr_debug("locked=%d uptodate=%d to_read=%d" 4702 " to_write=%d failed=%d failed_num=%d,%d\n", 4703 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4704 s.failed_num[0], s.failed_num[1]); 4705 /* check if the array has lost more than max_degraded devices and, 4706 * if so, some requests might need to be failed. 4707 */ 4708 if (s.failed > conf->max_degraded || s.log_failed) { 4709 sh->check_state = 0; 4710 sh->reconstruct_state = 0; 4711 break_stripe_batch_list(sh, 0); 4712 if (s.to_read+s.to_write+s.written) 4713 handle_failed_stripe(conf, sh, &s, disks); 4714 if (s.syncing + s.replacing) 4715 handle_failed_sync(conf, sh, &s); 4716 } 4717 4718 /* Now we check to see if any write operations have recently 4719 * completed 4720 */ 4721 prexor = 0; 4722 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4723 prexor = 1; 4724 if (sh->reconstruct_state == reconstruct_state_drain_result || 4725 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4726 sh->reconstruct_state = reconstruct_state_idle; 4727 4728 /* All the 'written' buffers and the parity block are ready to 4729 * be written back to disk 4730 */ 4731 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4732 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4733 BUG_ON(sh->qd_idx >= 0 && 4734 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4735 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4736 for (i = disks; i--; ) { 4737 struct r5dev *dev = &sh->dev[i]; 4738 if (test_bit(R5_LOCKED, &dev->flags) && 4739 (i == sh->pd_idx || i == sh->qd_idx || 4740 dev->written || test_bit(R5_InJournal, 4741 &dev->flags))) { 4742 pr_debug("Writing block %d\n", i); 4743 set_bit(R5_Wantwrite, &dev->flags); 4744 if (prexor) 4745 continue; 4746 if (s.failed > 1) 4747 continue; 4748 if (!test_bit(R5_Insync, &dev->flags) || 4749 ((i == sh->pd_idx || i == sh->qd_idx) && 4750 s.failed == 0)) 4751 set_bit(STRIPE_INSYNC, &sh->state); 4752 } 4753 } 4754 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4755 s.dec_preread_active = 1; 4756 } 4757 4758 /* 4759 * might be able to return some write requests if the parity blocks 4760 * are safe, or on a failed drive 4761 */ 4762 pdev = &sh->dev[sh->pd_idx]; 4763 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4764 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4765 qdev = &sh->dev[sh->qd_idx]; 4766 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4767 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4768 || conf->level < 6; 4769 4770 if (s.written && 4771 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4772 && !test_bit(R5_LOCKED, &pdev->flags) 4773 && (test_bit(R5_UPTODATE, &pdev->flags) || 4774 test_bit(R5_Discard, &pdev->flags))))) && 4775 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4776 && !test_bit(R5_LOCKED, &qdev->flags) 4777 && (test_bit(R5_UPTODATE, &qdev->flags) || 4778 test_bit(R5_Discard, &qdev->flags)))))) 4779 handle_stripe_clean_event(conf, sh, disks); 4780 4781 if (s.just_cached) 4782 r5c_handle_cached_data_endio(conf, sh, disks); 4783 log_stripe_write_finished(sh); 4784 4785 /* Now we might consider reading some blocks, either to check/generate 4786 * parity, or to satisfy requests 4787 * or to load a block that is being partially written. 4788 */ 4789 if (s.to_read || s.non_overwrite 4790 || (conf->level == 6 && s.to_write && s.failed) 4791 || (s.syncing && (s.uptodate + s.compute < disks)) 4792 || s.replacing 4793 || s.expanding) 4794 handle_stripe_fill(sh, &s, disks); 4795 4796 /* 4797 * When the stripe finishes full journal write cycle (write to journal 4798 * and raid disk), this is the clean up procedure so it is ready for 4799 * next operation. 4800 */ 4801 r5c_finish_stripe_write_out(conf, sh, &s); 4802 4803 /* 4804 * Now to consider new write requests, cache write back and what else, 4805 * if anything should be read. We do not handle new writes when: 4806 * 1/ A 'write' operation (copy+xor) is already in flight. 4807 * 2/ A 'check' operation is in flight, as it may clobber the parity 4808 * block. 4809 * 3/ A r5c cache log write is in flight. 4810 */ 4811 4812 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4813 if (!r5c_is_writeback(conf->log)) { 4814 if (s.to_write) 4815 handle_stripe_dirtying(conf, sh, &s, disks); 4816 } else { /* write back cache */ 4817 int ret = 0; 4818 4819 /* First, try handle writes in caching phase */ 4820 if (s.to_write) 4821 ret = r5c_try_caching_write(conf, sh, &s, 4822 disks); 4823 /* 4824 * If caching phase failed: ret == -EAGAIN 4825 * OR 4826 * stripe under reclaim: !caching && injournal 4827 * 4828 * fall back to handle_stripe_dirtying() 4829 */ 4830 if (ret == -EAGAIN || 4831 /* stripe under reclaim: !caching && injournal */ 4832 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4833 s.injournal > 0)) { 4834 ret = handle_stripe_dirtying(conf, sh, &s, 4835 disks); 4836 if (ret == -EAGAIN) 4837 goto finish; 4838 } 4839 } 4840 } 4841 4842 /* maybe we need to check and possibly fix the parity for this stripe 4843 * Any reads will already have been scheduled, so we just see if enough 4844 * data is available. The parity check is held off while parity 4845 * dependent operations are in flight. 4846 */ 4847 if (sh->check_state || 4848 (s.syncing && s.locked == 0 && 4849 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4850 !test_bit(STRIPE_INSYNC, &sh->state))) { 4851 if (conf->level == 6) 4852 handle_parity_checks6(conf, sh, &s, disks); 4853 else 4854 handle_parity_checks5(conf, sh, &s, disks); 4855 } 4856 4857 if ((s.replacing || s.syncing) && s.locked == 0 4858 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4859 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4860 /* Write out to replacement devices where possible */ 4861 for (i = 0; i < conf->raid_disks; i++) 4862 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4863 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4864 set_bit(R5_WantReplace, &sh->dev[i].flags); 4865 set_bit(R5_LOCKED, &sh->dev[i].flags); 4866 s.locked++; 4867 } 4868 if (s.replacing) 4869 set_bit(STRIPE_INSYNC, &sh->state); 4870 set_bit(STRIPE_REPLACED, &sh->state); 4871 } 4872 if ((s.syncing || s.replacing) && s.locked == 0 && 4873 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4874 test_bit(STRIPE_INSYNC, &sh->state)) { 4875 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4876 clear_bit(STRIPE_SYNCING, &sh->state); 4877 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4878 wake_up(&conf->wait_for_overlap); 4879 } 4880 4881 /* If the failed drives are just a ReadError, then we might need 4882 * to progress the repair/check process 4883 */ 4884 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4885 for (i = 0; i < s.failed; i++) { 4886 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4887 if (test_bit(R5_ReadError, &dev->flags) 4888 && !test_bit(R5_LOCKED, &dev->flags) 4889 && test_bit(R5_UPTODATE, &dev->flags) 4890 ) { 4891 if (!test_bit(R5_ReWrite, &dev->flags)) { 4892 set_bit(R5_Wantwrite, &dev->flags); 4893 set_bit(R5_ReWrite, &dev->flags); 4894 set_bit(R5_LOCKED, &dev->flags); 4895 s.locked++; 4896 } else { 4897 /* let's read it back */ 4898 set_bit(R5_Wantread, &dev->flags); 4899 set_bit(R5_LOCKED, &dev->flags); 4900 s.locked++; 4901 } 4902 } 4903 } 4904 4905 /* Finish reconstruct operations initiated by the expansion process */ 4906 if (sh->reconstruct_state == reconstruct_state_result) { 4907 struct stripe_head *sh_src 4908 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4909 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4910 /* sh cannot be written until sh_src has been read. 4911 * so arrange for sh to be delayed a little 4912 */ 4913 set_bit(STRIPE_DELAYED, &sh->state); 4914 set_bit(STRIPE_HANDLE, &sh->state); 4915 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4916 &sh_src->state)) 4917 atomic_inc(&conf->preread_active_stripes); 4918 raid5_release_stripe(sh_src); 4919 goto finish; 4920 } 4921 if (sh_src) 4922 raid5_release_stripe(sh_src); 4923 4924 sh->reconstruct_state = reconstruct_state_idle; 4925 clear_bit(STRIPE_EXPANDING, &sh->state); 4926 for (i = conf->raid_disks; i--; ) { 4927 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4928 set_bit(R5_LOCKED, &sh->dev[i].flags); 4929 s.locked++; 4930 } 4931 } 4932 4933 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4934 !sh->reconstruct_state) { 4935 /* Need to write out all blocks after computing parity */ 4936 sh->disks = conf->raid_disks; 4937 stripe_set_idx(sh->sector, conf, 0, sh); 4938 schedule_reconstruction(sh, &s, 1, 1); 4939 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4940 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4941 atomic_dec(&conf->reshape_stripes); 4942 wake_up(&conf->wait_for_overlap); 4943 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4944 } 4945 4946 if (s.expanding && s.locked == 0 && 4947 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4948 handle_stripe_expansion(conf, sh); 4949 4950 finish: 4951 /* wait for this device to become unblocked */ 4952 if (unlikely(s.blocked_rdev)) { 4953 if (conf->mddev->external) 4954 md_wait_for_blocked_rdev(s.blocked_rdev, 4955 conf->mddev); 4956 else 4957 /* Internal metadata will immediately 4958 * be written by raid5d, so we don't 4959 * need to wait here. 4960 */ 4961 rdev_dec_pending(s.blocked_rdev, 4962 conf->mddev); 4963 } 4964 4965 if (s.handle_bad_blocks) 4966 for (i = disks; i--; ) { 4967 struct md_rdev *rdev; 4968 struct r5dev *dev = &sh->dev[i]; 4969 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4970 /* We own a safe reference to the rdev */ 4971 rdev = conf->disks[i].rdev; 4972 if (!rdev_set_badblocks(rdev, sh->sector, 4973 STRIPE_SECTORS, 0)) 4974 md_error(conf->mddev, rdev); 4975 rdev_dec_pending(rdev, conf->mddev); 4976 } 4977 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4978 rdev = conf->disks[i].rdev; 4979 rdev_clear_badblocks(rdev, sh->sector, 4980 STRIPE_SECTORS, 0); 4981 rdev_dec_pending(rdev, conf->mddev); 4982 } 4983 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 4984 rdev = conf->disks[i].replacement; 4985 if (!rdev) 4986 /* rdev have been moved down */ 4987 rdev = conf->disks[i].rdev; 4988 rdev_clear_badblocks(rdev, sh->sector, 4989 STRIPE_SECTORS, 0); 4990 rdev_dec_pending(rdev, conf->mddev); 4991 } 4992 } 4993 4994 if (s.ops_request) 4995 raid_run_ops(sh, s.ops_request); 4996 4997 ops_run_io(sh, &s); 4998 4999 if (s.dec_preread_active) { 5000 /* We delay this until after ops_run_io so that if make_request 5001 * is waiting on a flush, it won't continue until the writes 5002 * have actually been submitted. 5003 */ 5004 atomic_dec(&conf->preread_active_stripes); 5005 if (atomic_read(&conf->preread_active_stripes) < 5006 IO_THRESHOLD) 5007 md_wakeup_thread(conf->mddev->thread); 5008 } 5009 5010 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5011 } 5012 5013 static void raid5_activate_delayed(struct r5conf *conf) 5014 { 5015 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5016 while (!list_empty(&conf->delayed_list)) { 5017 struct list_head *l = conf->delayed_list.next; 5018 struct stripe_head *sh; 5019 sh = list_entry(l, struct stripe_head, lru); 5020 list_del_init(l); 5021 clear_bit(STRIPE_DELAYED, &sh->state); 5022 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5023 atomic_inc(&conf->preread_active_stripes); 5024 list_add_tail(&sh->lru, &conf->hold_list); 5025 raid5_wakeup_stripe_thread(sh); 5026 } 5027 } 5028 } 5029 5030 static void activate_bit_delay(struct r5conf *conf, 5031 struct list_head *temp_inactive_list) 5032 { 5033 /* device_lock is held */ 5034 struct list_head head; 5035 list_add(&head, &conf->bitmap_list); 5036 list_del_init(&conf->bitmap_list); 5037 while (!list_empty(&head)) { 5038 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5039 int hash; 5040 list_del_init(&sh->lru); 5041 atomic_inc(&sh->count); 5042 hash = sh->hash_lock_index; 5043 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5044 } 5045 } 5046 5047 static int raid5_congested(struct mddev *mddev, int bits) 5048 { 5049 struct r5conf *conf = mddev->private; 5050 5051 /* No difference between reads and writes. Just check 5052 * how busy the stripe_cache is 5053 */ 5054 5055 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 5056 return 1; 5057 5058 /* Also checks whether there is pressure on r5cache log space */ 5059 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 5060 return 1; 5061 if (conf->quiesce) 5062 return 1; 5063 if (atomic_read(&conf->empty_inactive_list_nr)) 5064 return 1; 5065 5066 return 0; 5067 } 5068 5069 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5070 { 5071 struct r5conf *conf = mddev->private; 5072 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); 5073 unsigned int chunk_sectors; 5074 unsigned int bio_sectors = bio_sectors(bio); 5075 5076 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5077 return chunk_sectors >= 5078 ((sector & (chunk_sectors - 1)) + bio_sectors); 5079 } 5080 5081 /* 5082 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5083 * later sampled by raid5d. 5084 */ 5085 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 5086 { 5087 unsigned long flags; 5088 5089 spin_lock_irqsave(&conf->device_lock, flags); 5090 5091 bi->bi_next = conf->retry_read_aligned_list; 5092 conf->retry_read_aligned_list = bi; 5093 5094 spin_unlock_irqrestore(&conf->device_lock, flags); 5095 md_wakeup_thread(conf->mddev->thread); 5096 } 5097 5098 static struct bio *remove_bio_from_retry(struct r5conf *conf, 5099 unsigned int *offset) 5100 { 5101 struct bio *bi; 5102 5103 bi = conf->retry_read_aligned; 5104 if (bi) { 5105 *offset = conf->retry_read_offset; 5106 conf->retry_read_aligned = NULL; 5107 return bi; 5108 } 5109 bi = conf->retry_read_aligned_list; 5110 if(bi) { 5111 conf->retry_read_aligned_list = bi->bi_next; 5112 bi->bi_next = NULL; 5113 *offset = 0; 5114 } 5115 5116 return bi; 5117 } 5118 5119 /* 5120 * The "raid5_align_endio" should check if the read succeeded and if it 5121 * did, call bio_endio on the original bio (having bio_put the new bio 5122 * first). 5123 * If the read failed.. 5124 */ 5125 static void raid5_align_endio(struct bio *bi) 5126 { 5127 struct bio* raid_bi = bi->bi_private; 5128 struct mddev *mddev; 5129 struct r5conf *conf; 5130 struct md_rdev *rdev; 5131 int error = bi->bi_error; 5132 5133 bio_put(bi); 5134 5135 rdev = (void*)raid_bi->bi_next; 5136 raid_bi->bi_next = NULL; 5137 mddev = rdev->mddev; 5138 conf = mddev->private; 5139 5140 rdev_dec_pending(rdev, conf->mddev); 5141 5142 if (!error) { 5143 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 5144 raid_bi, 0); 5145 bio_endio(raid_bi); 5146 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5147 wake_up(&conf->wait_for_quiescent); 5148 return; 5149 } 5150 5151 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5152 5153 add_bio_to_retry(raid_bi, conf); 5154 } 5155 5156 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5157 { 5158 struct r5conf *conf = mddev->private; 5159 int dd_idx; 5160 struct bio* align_bi; 5161 struct md_rdev *rdev; 5162 sector_t end_sector; 5163 5164 if (!in_chunk_boundary(mddev, raid_bio)) { 5165 pr_debug("%s: non aligned\n", __func__); 5166 return 0; 5167 } 5168 /* 5169 * use bio_clone_fast to make a copy of the bio 5170 */ 5171 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); 5172 if (!align_bi) 5173 return 0; 5174 /* 5175 * set bi_end_io to a new function, and set bi_private to the 5176 * original bio. 5177 */ 5178 align_bi->bi_end_io = raid5_align_endio; 5179 align_bi->bi_private = raid_bio; 5180 /* 5181 * compute position 5182 */ 5183 align_bi->bi_iter.bi_sector = 5184 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5185 0, &dd_idx, NULL); 5186 5187 end_sector = bio_end_sector(align_bi); 5188 rcu_read_lock(); 5189 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5190 if (!rdev || test_bit(Faulty, &rdev->flags) || 5191 rdev->recovery_offset < end_sector) { 5192 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5193 if (rdev && 5194 (test_bit(Faulty, &rdev->flags) || 5195 !(test_bit(In_sync, &rdev->flags) || 5196 rdev->recovery_offset >= end_sector))) 5197 rdev = NULL; 5198 } 5199 5200 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { 5201 rcu_read_unlock(); 5202 bio_put(align_bi); 5203 return 0; 5204 } 5205 5206 if (rdev) { 5207 sector_t first_bad; 5208 int bad_sectors; 5209 5210 atomic_inc(&rdev->nr_pending); 5211 rcu_read_unlock(); 5212 raid_bio->bi_next = (void*)rdev; 5213 align_bi->bi_bdev = rdev->bdev; 5214 bio_clear_flag(align_bi, BIO_SEG_VALID); 5215 5216 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5217 bio_sectors(align_bi), 5218 &first_bad, &bad_sectors)) { 5219 bio_put(align_bi); 5220 rdev_dec_pending(rdev, mddev); 5221 return 0; 5222 } 5223 5224 /* No reshape active, so we can trust rdev->data_offset */ 5225 align_bi->bi_iter.bi_sector += rdev->data_offset; 5226 5227 spin_lock_irq(&conf->device_lock); 5228 wait_event_lock_irq(conf->wait_for_quiescent, 5229 conf->quiesce == 0, 5230 conf->device_lock); 5231 atomic_inc(&conf->active_aligned_reads); 5232 spin_unlock_irq(&conf->device_lock); 5233 5234 if (mddev->gendisk) 5235 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 5236 align_bi, disk_devt(mddev->gendisk), 5237 raid_bio->bi_iter.bi_sector); 5238 generic_make_request(align_bi); 5239 return 1; 5240 } else { 5241 rcu_read_unlock(); 5242 bio_put(align_bi); 5243 return 0; 5244 } 5245 } 5246 5247 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5248 { 5249 struct bio *split; 5250 5251 do { 5252 sector_t sector = raid_bio->bi_iter.bi_sector; 5253 unsigned chunk_sects = mddev->chunk_sectors; 5254 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5255 5256 if (sectors < bio_sectors(raid_bio)) { 5257 split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); 5258 bio_chain(split, raid_bio); 5259 } else 5260 split = raid_bio; 5261 5262 if (!raid5_read_one_chunk(mddev, split)) { 5263 if (split != raid_bio) 5264 generic_make_request(raid_bio); 5265 return split; 5266 } 5267 } while (split != raid_bio); 5268 5269 return NULL; 5270 } 5271 5272 /* __get_priority_stripe - get the next stripe to process 5273 * 5274 * Full stripe writes are allowed to pass preread active stripes up until 5275 * the bypass_threshold is exceeded. In general the bypass_count 5276 * increments when the handle_list is handled before the hold_list; however, it 5277 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5278 * stripe with in flight i/o. The bypass_count will be reset when the 5279 * head of the hold_list has changed, i.e. the head was promoted to the 5280 * handle_list. 5281 */ 5282 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5283 { 5284 struct stripe_head *sh, *tmp; 5285 struct list_head *handle_list = NULL; 5286 struct r5worker_group *wg; 5287 bool second_try = !r5c_is_writeback(conf->log); 5288 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); 5289 5290 again: 5291 wg = NULL; 5292 sh = NULL; 5293 if (conf->worker_cnt_per_group == 0) { 5294 handle_list = try_loprio ? &conf->loprio_list : 5295 &conf->handle_list; 5296 } else if (group != ANY_GROUP) { 5297 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5298 &conf->worker_groups[group].handle_list; 5299 wg = &conf->worker_groups[group]; 5300 } else { 5301 int i; 5302 for (i = 0; i < conf->group_cnt; i++) { 5303 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5304 &conf->worker_groups[i].handle_list; 5305 wg = &conf->worker_groups[i]; 5306 if (!list_empty(handle_list)) 5307 break; 5308 } 5309 } 5310 5311 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5312 __func__, 5313 list_empty(handle_list) ? "empty" : "busy", 5314 list_empty(&conf->hold_list) ? "empty" : "busy", 5315 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5316 5317 if (!list_empty(handle_list)) { 5318 sh = list_entry(handle_list->next, typeof(*sh), lru); 5319 5320 if (list_empty(&conf->hold_list)) 5321 conf->bypass_count = 0; 5322 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5323 if (conf->hold_list.next == conf->last_hold) 5324 conf->bypass_count++; 5325 else { 5326 conf->last_hold = conf->hold_list.next; 5327 conf->bypass_count -= conf->bypass_threshold; 5328 if (conf->bypass_count < 0) 5329 conf->bypass_count = 0; 5330 } 5331 } 5332 } else if (!list_empty(&conf->hold_list) && 5333 ((conf->bypass_threshold && 5334 conf->bypass_count > conf->bypass_threshold) || 5335 atomic_read(&conf->pending_full_writes) == 0)) { 5336 5337 list_for_each_entry(tmp, &conf->hold_list, lru) { 5338 if (conf->worker_cnt_per_group == 0 || 5339 group == ANY_GROUP || 5340 !cpu_online(tmp->cpu) || 5341 cpu_to_group(tmp->cpu) == group) { 5342 sh = tmp; 5343 break; 5344 } 5345 } 5346 5347 if (sh) { 5348 conf->bypass_count -= conf->bypass_threshold; 5349 if (conf->bypass_count < 0) 5350 conf->bypass_count = 0; 5351 } 5352 wg = NULL; 5353 } 5354 5355 if (!sh) { 5356 if (second_try) 5357 return NULL; 5358 second_try = true; 5359 try_loprio = !try_loprio; 5360 goto again; 5361 } 5362 5363 if (wg) { 5364 wg->stripes_cnt--; 5365 sh->group = NULL; 5366 } 5367 list_del_init(&sh->lru); 5368 BUG_ON(atomic_inc_return(&sh->count) != 1); 5369 return sh; 5370 } 5371 5372 struct raid5_plug_cb { 5373 struct blk_plug_cb cb; 5374 struct list_head list; 5375 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5376 }; 5377 5378 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5379 { 5380 struct raid5_plug_cb *cb = container_of( 5381 blk_cb, struct raid5_plug_cb, cb); 5382 struct stripe_head *sh; 5383 struct mddev *mddev = cb->cb.data; 5384 struct r5conf *conf = mddev->private; 5385 int cnt = 0; 5386 int hash; 5387 5388 if (cb->list.next && !list_empty(&cb->list)) { 5389 spin_lock_irq(&conf->device_lock); 5390 while (!list_empty(&cb->list)) { 5391 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5392 list_del_init(&sh->lru); 5393 /* 5394 * avoid race release_stripe_plug() sees 5395 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5396 * is still in our list 5397 */ 5398 smp_mb__before_atomic(); 5399 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5400 /* 5401 * STRIPE_ON_RELEASE_LIST could be set here. In that 5402 * case, the count is always > 1 here 5403 */ 5404 hash = sh->hash_lock_index; 5405 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5406 cnt++; 5407 } 5408 spin_unlock_irq(&conf->device_lock); 5409 } 5410 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5411 NR_STRIPE_HASH_LOCKS); 5412 if (mddev->queue) 5413 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5414 kfree(cb); 5415 } 5416 5417 static void release_stripe_plug(struct mddev *mddev, 5418 struct stripe_head *sh) 5419 { 5420 struct blk_plug_cb *blk_cb = blk_check_plugged( 5421 raid5_unplug, mddev, 5422 sizeof(struct raid5_plug_cb)); 5423 struct raid5_plug_cb *cb; 5424 5425 if (!blk_cb) { 5426 raid5_release_stripe(sh); 5427 return; 5428 } 5429 5430 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5431 5432 if (cb->list.next == NULL) { 5433 int i; 5434 INIT_LIST_HEAD(&cb->list); 5435 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5436 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5437 } 5438 5439 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5440 list_add_tail(&sh->lru, &cb->list); 5441 else 5442 raid5_release_stripe(sh); 5443 } 5444 5445 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5446 { 5447 struct r5conf *conf = mddev->private; 5448 sector_t logical_sector, last_sector; 5449 struct stripe_head *sh; 5450 int stripe_sectors; 5451 5452 if (mddev->reshape_position != MaxSector) 5453 /* Skip discard while reshape is happening */ 5454 return; 5455 5456 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5457 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5458 5459 bi->bi_next = NULL; 5460 md_write_start(mddev, bi); 5461 5462 stripe_sectors = conf->chunk_sectors * 5463 (conf->raid_disks - conf->max_degraded); 5464 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5465 stripe_sectors); 5466 sector_div(last_sector, stripe_sectors); 5467 5468 logical_sector *= conf->chunk_sectors; 5469 last_sector *= conf->chunk_sectors; 5470 5471 for (; logical_sector < last_sector; 5472 logical_sector += STRIPE_SECTORS) { 5473 DEFINE_WAIT(w); 5474 int d; 5475 again: 5476 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5477 prepare_to_wait(&conf->wait_for_overlap, &w, 5478 TASK_UNINTERRUPTIBLE); 5479 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5480 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5481 raid5_release_stripe(sh); 5482 schedule(); 5483 goto again; 5484 } 5485 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5486 spin_lock_irq(&sh->stripe_lock); 5487 for (d = 0; d < conf->raid_disks; d++) { 5488 if (d == sh->pd_idx || d == sh->qd_idx) 5489 continue; 5490 if (sh->dev[d].towrite || sh->dev[d].toread) { 5491 set_bit(R5_Overlap, &sh->dev[d].flags); 5492 spin_unlock_irq(&sh->stripe_lock); 5493 raid5_release_stripe(sh); 5494 schedule(); 5495 goto again; 5496 } 5497 } 5498 set_bit(STRIPE_DISCARD, &sh->state); 5499 finish_wait(&conf->wait_for_overlap, &w); 5500 sh->overwrite_disks = 0; 5501 for (d = 0; d < conf->raid_disks; d++) { 5502 if (d == sh->pd_idx || d == sh->qd_idx) 5503 continue; 5504 sh->dev[d].towrite = bi; 5505 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5506 bio_inc_remaining(bi); 5507 md_write_inc(mddev, bi); 5508 sh->overwrite_disks++; 5509 } 5510 spin_unlock_irq(&sh->stripe_lock); 5511 if (conf->mddev->bitmap) { 5512 for (d = 0; 5513 d < conf->raid_disks - conf->max_degraded; 5514 d++) 5515 bitmap_startwrite(mddev->bitmap, 5516 sh->sector, 5517 STRIPE_SECTORS, 5518 0); 5519 sh->bm_seq = conf->seq_flush + 1; 5520 set_bit(STRIPE_BIT_DELAY, &sh->state); 5521 } 5522 5523 set_bit(STRIPE_HANDLE, &sh->state); 5524 clear_bit(STRIPE_DELAYED, &sh->state); 5525 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5526 atomic_inc(&conf->preread_active_stripes); 5527 release_stripe_plug(mddev, sh); 5528 } 5529 5530 md_write_end(mddev); 5531 bio_endio(bi); 5532 } 5533 5534 static void raid5_make_request(struct mddev *mddev, struct bio * bi) 5535 { 5536 struct r5conf *conf = mddev->private; 5537 int dd_idx; 5538 sector_t new_sector; 5539 sector_t logical_sector, last_sector; 5540 struct stripe_head *sh; 5541 const int rw = bio_data_dir(bi); 5542 DEFINE_WAIT(w); 5543 bool do_prepare; 5544 bool do_flush = false; 5545 5546 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5547 int ret = r5l_handle_flush_request(conf->log, bi); 5548 5549 if (ret == 0) 5550 return; 5551 if (ret == -ENODEV) { 5552 md_flush_request(mddev, bi); 5553 return; 5554 } 5555 /* ret == -EAGAIN, fallback */ 5556 /* 5557 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5558 * we need to flush journal device 5559 */ 5560 do_flush = bi->bi_opf & REQ_PREFLUSH; 5561 } 5562 5563 /* 5564 * If array is degraded, better not do chunk aligned read because 5565 * later we might have to read it again in order to reconstruct 5566 * data on failed drives. 5567 */ 5568 if (rw == READ && mddev->degraded == 0 && 5569 mddev->reshape_position == MaxSector) { 5570 bi = chunk_aligned_read(mddev, bi); 5571 if (!bi) 5572 return; 5573 } 5574 5575 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5576 make_discard_request(mddev, bi); 5577 return; 5578 } 5579 5580 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5581 last_sector = bio_end_sector(bi); 5582 bi->bi_next = NULL; 5583 md_write_start(mddev, bi); 5584 5585 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5586 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5587 int previous; 5588 int seq; 5589 5590 do_prepare = false; 5591 retry: 5592 seq = read_seqcount_begin(&conf->gen_lock); 5593 previous = 0; 5594 if (do_prepare) 5595 prepare_to_wait(&conf->wait_for_overlap, &w, 5596 TASK_UNINTERRUPTIBLE); 5597 if (unlikely(conf->reshape_progress != MaxSector)) { 5598 /* spinlock is needed as reshape_progress may be 5599 * 64bit on a 32bit platform, and so it might be 5600 * possible to see a half-updated value 5601 * Of course reshape_progress could change after 5602 * the lock is dropped, so once we get a reference 5603 * to the stripe that we think it is, we will have 5604 * to check again. 5605 */ 5606 spin_lock_irq(&conf->device_lock); 5607 if (mddev->reshape_backwards 5608 ? logical_sector < conf->reshape_progress 5609 : logical_sector >= conf->reshape_progress) { 5610 previous = 1; 5611 } else { 5612 if (mddev->reshape_backwards 5613 ? logical_sector < conf->reshape_safe 5614 : logical_sector >= conf->reshape_safe) { 5615 spin_unlock_irq(&conf->device_lock); 5616 schedule(); 5617 do_prepare = true; 5618 goto retry; 5619 } 5620 } 5621 spin_unlock_irq(&conf->device_lock); 5622 } 5623 5624 new_sector = raid5_compute_sector(conf, logical_sector, 5625 previous, 5626 &dd_idx, NULL); 5627 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5628 (unsigned long long)new_sector, 5629 (unsigned long long)logical_sector); 5630 5631 sh = raid5_get_active_stripe(conf, new_sector, previous, 5632 (bi->bi_opf & REQ_RAHEAD), 0); 5633 if (sh) { 5634 if (unlikely(previous)) { 5635 /* expansion might have moved on while waiting for a 5636 * stripe, so we must do the range check again. 5637 * Expansion could still move past after this 5638 * test, but as we are holding a reference to 5639 * 'sh', we know that if that happens, 5640 * STRIPE_EXPANDING will get set and the expansion 5641 * won't proceed until we finish with the stripe. 5642 */ 5643 int must_retry = 0; 5644 spin_lock_irq(&conf->device_lock); 5645 if (mddev->reshape_backwards 5646 ? logical_sector >= conf->reshape_progress 5647 : logical_sector < conf->reshape_progress) 5648 /* mismatch, need to try again */ 5649 must_retry = 1; 5650 spin_unlock_irq(&conf->device_lock); 5651 if (must_retry) { 5652 raid5_release_stripe(sh); 5653 schedule(); 5654 do_prepare = true; 5655 goto retry; 5656 } 5657 } 5658 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5659 /* Might have got the wrong stripe_head 5660 * by accident 5661 */ 5662 raid5_release_stripe(sh); 5663 goto retry; 5664 } 5665 5666 if (rw == WRITE && 5667 logical_sector >= mddev->suspend_lo && 5668 logical_sector < mddev->suspend_hi) { 5669 raid5_release_stripe(sh); 5670 /* As the suspend_* range is controlled by 5671 * userspace, we want an interruptible 5672 * wait. 5673 */ 5674 flush_signals(current); 5675 prepare_to_wait(&conf->wait_for_overlap, 5676 &w, TASK_INTERRUPTIBLE); 5677 if (logical_sector >= mddev->suspend_lo && 5678 logical_sector < mddev->suspend_hi) { 5679 schedule(); 5680 do_prepare = true; 5681 } 5682 goto retry; 5683 } 5684 5685 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5686 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5687 /* Stripe is busy expanding or 5688 * add failed due to overlap. Flush everything 5689 * and wait a while 5690 */ 5691 md_wakeup_thread(mddev->thread); 5692 raid5_release_stripe(sh); 5693 schedule(); 5694 do_prepare = true; 5695 goto retry; 5696 } 5697 if (do_flush) { 5698 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5699 /* we only need flush for one stripe */ 5700 do_flush = false; 5701 } 5702 5703 set_bit(STRIPE_HANDLE, &sh->state); 5704 clear_bit(STRIPE_DELAYED, &sh->state); 5705 if ((!sh->batch_head || sh == sh->batch_head) && 5706 (bi->bi_opf & REQ_SYNC) && 5707 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5708 atomic_inc(&conf->preread_active_stripes); 5709 release_stripe_plug(mddev, sh); 5710 } else { 5711 /* cannot get stripe for read-ahead, just give-up */ 5712 bi->bi_error = -EIO; 5713 break; 5714 } 5715 } 5716 finish_wait(&conf->wait_for_overlap, &w); 5717 5718 if (rw == WRITE) 5719 md_write_end(mddev); 5720 bio_endio(bi); 5721 } 5722 5723 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5724 5725 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5726 { 5727 /* reshaping is quite different to recovery/resync so it is 5728 * handled quite separately ... here. 5729 * 5730 * On each call to sync_request, we gather one chunk worth of 5731 * destination stripes and flag them as expanding. 5732 * Then we find all the source stripes and request reads. 5733 * As the reads complete, handle_stripe will copy the data 5734 * into the destination stripe and release that stripe. 5735 */ 5736 struct r5conf *conf = mddev->private; 5737 struct stripe_head *sh; 5738 sector_t first_sector, last_sector; 5739 int raid_disks = conf->previous_raid_disks; 5740 int data_disks = raid_disks - conf->max_degraded; 5741 int new_data_disks = conf->raid_disks - conf->max_degraded; 5742 int i; 5743 int dd_idx; 5744 sector_t writepos, readpos, safepos; 5745 sector_t stripe_addr; 5746 int reshape_sectors; 5747 struct list_head stripes; 5748 sector_t retn; 5749 5750 if (sector_nr == 0) { 5751 /* If restarting in the middle, skip the initial sectors */ 5752 if (mddev->reshape_backwards && 5753 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5754 sector_nr = raid5_size(mddev, 0, 0) 5755 - conf->reshape_progress; 5756 } else if (mddev->reshape_backwards && 5757 conf->reshape_progress == MaxSector) { 5758 /* shouldn't happen, but just in case, finish up.*/ 5759 sector_nr = MaxSector; 5760 } else if (!mddev->reshape_backwards && 5761 conf->reshape_progress > 0) 5762 sector_nr = conf->reshape_progress; 5763 sector_div(sector_nr, new_data_disks); 5764 if (sector_nr) { 5765 mddev->curr_resync_completed = sector_nr; 5766 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5767 *skipped = 1; 5768 retn = sector_nr; 5769 goto finish; 5770 } 5771 } 5772 5773 /* We need to process a full chunk at a time. 5774 * If old and new chunk sizes differ, we need to process the 5775 * largest of these 5776 */ 5777 5778 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5779 5780 /* We update the metadata at least every 10 seconds, or when 5781 * the data about to be copied would over-write the source of 5782 * the data at the front of the range. i.e. one new_stripe 5783 * along from reshape_progress new_maps to after where 5784 * reshape_safe old_maps to 5785 */ 5786 writepos = conf->reshape_progress; 5787 sector_div(writepos, new_data_disks); 5788 readpos = conf->reshape_progress; 5789 sector_div(readpos, data_disks); 5790 safepos = conf->reshape_safe; 5791 sector_div(safepos, data_disks); 5792 if (mddev->reshape_backwards) { 5793 BUG_ON(writepos < reshape_sectors); 5794 writepos -= reshape_sectors; 5795 readpos += reshape_sectors; 5796 safepos += reshape_sectors; 5797 } else { 5798 writepos += reshape_sectors; 5799 /* readpos and safepos are worst-case calculations. 5800 * A negative number is overly pessimistic, and causes 5801 * obvious problems for unsigned storage. So clip to 0. 5802 */ 5803 readpos -= min_t(sector_t, reshape_sectors, readpos); 5804 safepos -= min_t(sector_t, reshape_sectors, safepos); 5805 } 5806 5807 /* Having calculated the 'writepos' possibly use it 5808 * to set 'stripe_addr' which is where we will write to. 5809 */ 5810 if (mddev->reshape_backwards) { 5811 BUG_ON(conf->reshape_progress == 0); 5812 stripe_addr = writepos; 5813 BUG_ON((mddev->dev_sectors & 5814 ~((sector_t)reshape_sectors - 1)) 5815 - reshape_sectors - stripe_addr 5816 != sector_nr); 5817 } else { 5818 BUG_ON(writepos != sector_nr + reshape_sectors); 5819 stripe_addr = sector_nr; 5820 } 5821 5822 /* 'writepos' is the most advanced device address we might write. 5823 * 'readpos' is the least advanced device address we might read. 5824 * 'safepos' is the least address recorded in the metadata as having 5825 * been reshaped. 5826 * If there is a min_offset_diff, these are adjusted either by 5827 * increasing the safepos/readpos if diff is negative, or 5828 * increasing writepos if diff is positive. 5829 * If 'readpos' is then behind 'writepos', there is no way that we can 5830 * ensure safety in the face of a crash - that must be done by userspace 5831 * making a backup of the data. So in that case there is no particular 5832 * rush to update metadata. 5833 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5834 * update the metadata to advance 'safepos' to match 'readpos' so that 5835 * we can be safe in the event of a crash. 5836 * So we insist on updating metadata if safepos is behind writepos and 5837 * readpos is beyond writepos. 5838 * In any case, update the metadata every 10 seconds. 5839 * Maybe that number should be configurable, but I'm not sure it is 5840 * worth it.... maybe it could be a multiple of safemode_delay??? 5841 */ 5842 if (conf->min_offset_diff < 0) { 5843 safepos += -conf->min_offset_diff; 5844 readpos += -conf->min_offset_diff; 5845 } else 5846 writepos += conf->min_offset_diff; 5847 5848 if ((mddev->reshape_backwards 5849 ? (safepos > writepos && readpos < writepos) 5850 : (safepos < writepos && readpos > writepos)) || 5851 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5852 /* Cannot proceed until we've updated the superblock... */ 5853 wait_event(conf->wait_for_overlap, 5854 atomic_read(&conf->reshape_stripes)==0 5855 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5856 if (atomic_read(&conf->reshape_stripes) != 0) 5857 return 0; 5858 mddev->reshape_position = conf->reshape_progress; 5859 mddev->curr_resync_completed = sector_nr; 5860 conf->reshape_checkpoint = jiffies; 5861 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5862 md_wakeup_thread(mddev->thread); 5863 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5864 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5865 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5866 return 0; 5867 spin_lock_irq(&conf->device_lock); 5868 conf->reshape_safe = mddev->reshape_position; 5869 spin_unlock_irq(&conf->device_lock); 5870 wake_up(&conf->wait_for_overlap); 5871 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5872 } 5873 5874 INIT_LIST_HEAD(&stripes); 5875 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5876 int j; 5877 int skipped_disk = 0; 5878 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5879 set_bit(STRIPE_EXPANDING, &sh->state); 5880 atomic_inc(&conf->reshape_stripes); 5881 /* If any of this stripe is beyond the end of the old 5882 * array, then we need to zero those blocks 5883 */ 5884 for (j=sh->disks; j--;) { 5885 sector_t s; 5886 if (j == sh->pd_idx) 5887 continue; 5888 if (conf->level == 6 && 5889 j == sh->qd_idx) 5890 continue; 5891 s = raid5_compute_blocknr(sh, j, 0); 5892 if (s < raid5_size(mddev, 0, 0)) { 5893 skipped_disk = 1; 5894 continue; 5895 } 5896 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5897 set_bit(R5_Expanded, &sh->dev[j].flags); 5898 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5899 } 5900 if (!skipped_disk) { 5901 set_bit(STRIPE_EXPAND_READY, &sh->state); 5902 set_bit(STRIPE_HANDLE, &sh->state); 5903 } 5904 list_add(&sh->lru, &stripes); 5905 } 5906 spin_lock_irq(&conf->device_lock); 5907 if (mddev->reshape_backwards) 5908 conf->reshape_progress -= reshape_sectors * new_data_disks; 5909 else 5910 conf->reshape_progress += reshape_sectors * new_data_disks; 5911 spin_unlock_irq(&conf->device_lock); 5912 /* Ok, those stripe are ready. We can start scheduling 5913 * reads on the source stripes. 5914 * The source stripes are determined by mapping the first and last 5915 * block on the destination stripes. 5916 */ 5917 first_sector = 5918 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5919 1, &dd_idx, NULL); 5920 last_sector = 5921 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5922 * new_data_disks - 1), 5923 1, &dd_idx, NULL); 5924 if (last_sector >= mddev->dev_sectors) 5925 last_sector = mddev->dev_sectors - 1; 5926 while (first_sector <= last_sector) { 5927 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5928 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5929 set_bit(STRIPE_HANDLE, &sh->state); 5930 raid5_release_stripe(sh); 5931 first_sector += STRIPE_SECTORS; 5932 } 5933 /* Now that the sources are clearly marked, we can release 5934 * the destination stripes 5935 */ 5936 while (!list_empty(&stripes)) { 5937 sh = list_entry(stripes.next, struct stripe_head, lru); 5938 list_del_init(&sh->lru); 5939 raid5_release_stripe(sh); 5940 } 5941 /* If this takes us to the resync_max point where we have to pause, 5942 * then we need to write out the superblock. 5943 */ 5944 sector_nr += reshape_sectors; 5945 retn = reshape_sectors; 5946 finish: 5947 if (mddev->curr_resync_completed > mddev->resync_max || 5948 (sector_nr - mddev->curr_resync_completed) * 2 5949 >= mddev->resync_max - mddev->curr_resync_completed) { 5950 /* Cannot proceed until we've updated the superblock... */ 5951 wait_event(conf->wait_for_overlap, 5952 atomic_read(&conf->reshape_stripes) == 0 5953 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5954 if (atomic_read(&conf->reshape_stripes) != 0) 5955 goto ret; 5956 mddev->reshape_position = conf->reshape_progress; 5957 mddev->curr_resync_completed = sector_nr; 5958 conf->reshape_checkpoint = jiffies; 5959 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5960 md_wakeup_thread(mddev->thread); 5961 wait_event(mddev->sb_wait, 5962 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 5963 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5964 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5965 goto ret; 5966 spin_lock_irq(&conf->device_lock); 5967 conf->reshape_safe = mddev->reshape_position; 5968 spin_unlock_irq(&conf->device_lock); 5969 wake_up(&conf->wait_for_overlap); 5970 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5971 } 5972 ret: 5973 return retn; 5974 } 5975 5976 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 5977 int *skipped) 5978 { 5979 struct r5conf *conf = mddev->private; 5980 struct stripe_head *sh; 5981 sector_t max_sector = mddev->dev_sectors; 5982 sector_t sync_blocks; 5983 int still_degraded = 0; 5984 int i; 5985 5986 if (sector_nr >= max_sector) { 5987 /* just being told to finish up .. nothing much to do */ 5988 5989 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 5990 end_reshape(conf); 5991 return 0; 5992 } 5993 5994 if (mddev->curr_resync < max_sector) /* aborted */ 5995 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 5996 &sync_blocks, 1); 5997 else /* completed sync */ 5998 conf->fullsync = 0; 5999 bitmap_close_sync(mddev->bitmap); 6000 6001 return 0; 6002 } 6003 6004 /* Allow raid5_quiesce to complete */ 6005 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 6006 6007 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6008 return reshape_request(mddev, sector_nr, skipped); 6009 6010 /* No need to check resync_max as we never do more than one 6011 * stripe, and as resync_max will always be on a chunk boundary, 6012 * if the check in md_do_sync didn't fire, there is no chance 6013 * of overstepping resync_max here 6014 */ 6015 6016 /* if there is too many failed drives and we are trying 6017 * to resync, then assert that we are finished, because there is 6018 * nothing we can do. 6019 */ 6020 if (mddev->degraded >= conf->max_degraded && 6021 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6022 sector_t rv = mddev->dev_sectors - sector_nr; 6023 *skipped = 1; 6024 return rv; 6025 } 6026 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6027 !conf->fullsync && 6028 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 6029 sync_blocks >= STRIPE_SECTORS) { 6030 /* we can skip this block, and probably more */ 6031 sync_blocks /= STRIPE_SECTORS; 6032 *skipped = 1; 6033 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 6034 } 6035 6036 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6037 6038 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 6039 if (sh == NULL) { 6040 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 6041 /* make sure we don't swamp the stripe cache if someone else 6042 * is trying to get access 6043 */ 6044 schedule_timeout_uninterruptible(1); 6045 } 6046 /* Need to check if array will still be degraded after recovery/resync 6047 * Note in case of > 1 drive failures it's possible we're rebuilding 6048 * one drive while leaving another faulty drive in array. 6049 */ 6050 rcu_read_lock(); 6051 for (i = 0; i < conf->raid_disks; i++) { 6052 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); 6053 6054 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6055 still_degraded = 1; 6056 } 6057 rcu_read_unlock(); 6058 6059 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 6060 6061 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6062 set_bit(STRIPE_HANDLE, &sh->state); 6063 6064 raid5_release_stripe(sh); 6065 6066 return STRIPE_SECTORS; 6067 } 6068 6069 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, 6070 unsigned int offset) 6071 { 6072 /* We may not be able to submit a whole bio at once as there 6073 * may not be enough stripe_heads available. 6074 * We cannot pre-allocate enough stripe_heads as we may need 6075 * more than exist in the cache (if we allow ever large chunks). 6076 * So we do one stripe head at a time and record in 6077 * ->bi_hw_segments how many have been done. 6078 * 6079 * We *know* that this entire raid_bio is in one chunk, so 6080 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 6081 */ 6082 struct stripe_head *sh; 6083 int dd_idx; 6084 sector_t sector, logical_sector, last_sector; 6085 int scnt = 0; 6086 int handled = 0; 6087 6088 logical_sector = raid_bio->bi_iter.bi_sector & 6089 ~((sector_t)STRIPE_SECTORS-1); 6090 sector = raid5_compute_sector(conf, logical_sector, 6091 0, &dd_idx, NULL); 6092 last_sector = bio_end_sector(raid_bio); 6093 6094 for (; logical_sector < last_sector; 6095 logical_sector += STRIPE_SECTORS, 6096 sector += STRIPE_SECTORS, 6097 scnt++) { 6098 6099 if (scnt < offset) 6100 /* already done this stripe */ 6101 continue; 6102 6103 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 6104 6105 if (!sh) { 6106 /* failed to get a stripe - must wait */ 6107 conf->retry_read_aligned = raid_bio; 6108 conf->retry_read_offset = scnt; 6109 return handled; 6110 } 6111 6112 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6113 raid5_release_stripe(sh); 6114 conf->retry_read_aligned = raid_bio; 6115 conf->retry_read_offset = scnt; 6116 return handled; 6117 } 6118 6119 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6120 handle_stripe(sh); 6121 raid5_release_stripe(sh); 6122 handled++; 6123 } 6124 6125 bio_endio(raid_bio); 6126 6127 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6128 wake_up(&conf->wait_for_quiescent); 6129 return handled; 6130 } 6131 6132 static int handle_active_stripes(struct r5conf *conf, int group, 6133 struct r5worker *worker, 6134 struct list_head *temp_inactive_list) 6135 { 6136 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6137 int i, batch_size = 0, hash; 6138 bool release_inactive = false; 6139 6140 while (batch_size < MAX_STRIPE_BATCH && 6141 (sh = __get_priority_stripe(conf, group)) != NULL) 6142 batch[batch_size++] = sh; 6143 6144 if (batch_size == 0) { 6145 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6146 if (!list_empty(temp_inactive_list + i)) 6147 break; 6148 if (i == NR_STRIPE_HASH_LOCKS) { 6149 spin_unlock_irq(&conf->device_lock); 6150 r5l_flush_stripe_to_raid(conf->log); 6151 spin_lock_irq(&conf->device_lock); 6152 return batch_size; 6153 } 6154 release_inactive = true; 6155 } 6156 spin_unlock_irq(&conf->device_lock); 6157 6158 release_inactive_stripe_list(conf, temp_inactive_list, 6159 NR_STRIPE_HASH_LOCKS); 6160 6161 r5l_flush_stripe_to_raid(conf->log); 6162 if (release_inactive) { 6163 spin_lock_irq(&conf->device_lock); 6164 return 0; 6165 } 6166 6167 for (i = 0; i < batch_size; i++) 6168 handle_stripe(batch[i]); 6169 log_write_stripe_run(conf); 6170 6171 cond_resched(); 6172 6173 spin_lock_irq(&conf->device_lock); 6174 for (i = 0; i < batch_size; i++) { 6175 hash = batch[i]->hash_lock_index; 6176 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6177 } 6178 return batch_size; 6179 } 6180 6181 static void raid5_do_work(struct work_struct *work) 6182 { 6183 struct r5worker *worker = container_of(work, struct r5worker, work); 6184 struct r5worker_group *group = worker->group; 6185 struct r5conf *conf = group->conf; 6186 struct mddev *mddev = conf->mddev; 6187 int group_id = group - conf->worker_groups; 6188 int handled; 6189 struct blk_plug plug; 6190 6191 pr_debug("+++ raid5worker active\n"); 6192 6193 blk_start_plug(&plug); 6194 handled = 0; 6195 spin_lock_irq(&conf->device_lock); 6196 while (1) { 6197 int batch_size, released; 6198 6199 released = release_stripe_list(conf, worker->temp_inactive_list); 6200 6201 batch_size = handle_active_stripes(conf, group_id, worker, 6202 worker->temp_inactive_list); 6203 worker->working = false; 6204 if (!batch_size && !released) 6205 break; 6206 handled += batch_size; 6207 wait_event_lock_irq(mddev->sb_wait, 6208 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6209 conf->device_lock); 6210 } 6211 pr_debug("%d stripes handled\n", handled); 6212 6213 spin_unlock_irq(&conf->device_lock); 6214 blk_finish_plug(&plug); 6215 6216 pr_debug("--- raid5worker inactive\n"); 6217 } 6218 6219 /* 6220 * This is our raid5 kernel thread. 6221 * 6222 * We scan the hash table for stripes which can be handled now. 6223 * During the scan, completed stripes are saved for us by the interrupt 6224 * handler, so that they will not have to wait for our next wakeup. 6225 */ 6226 static void raid5d(struct md_thread *thread) 6227 { 6228 struct mddev *mddev = thread->mddev; 6229 struct r5conf *conf = mddev->private; 6230 int handled; 6231 struct blk_plug plug; 6232 6233 pr_debug("+++ raid5d active\n"); 6234 6235 md_check_recovery(mddev); 6236 6237 blk_start_plug(&plug); 6238 handled = 0; 6239 spin_lock_irq(&conf->device_lock); 6240 while (1) { 6241 struct bio *bio; 6242 int batch_size, released; 6243 unsigned int offset; 6244 6245 released = release_stripe_list(conf, conf->temp_inactive_list); 6246 if (released) 6247 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6248 6249 if ( 6250 !list_empty(&conf->bitmap_list)) { 6251 /* Now is a good time to flush some bitmap updates */ 6252 conf->seq_flush++; 6253 spin_unlock_irq(&conf->device_lock); 6254 bitmap_unplug(mddev->bitmap); 6255 spin_lock_irq(&conf->device_lock); 6256 conf->seq_write = conf->seq_flush; 6257 activate_bit_delay(conf, conf->temp_inactive_list); 6258 } 6259 raid5_activate_delayed(conf); 6260 6261 while ((bio = remove_bio_from_retry(conf, &offset))) { 6262 int ok; 6263 spin_unlock_irq(&conf->device_lock); 6264 ok = retry_aligned_read(conf, bio, offset); 6265 spin_lock_irq(&conf->device_lock); 6266 if (!ok) 6267 break; 6268 handled++; 6269 } 6270 6271 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6272 conf->temp_inactive_list); 6273 if (!batch_size && !released) 6274 break; 6275 handled += batch_size; 6276 6277 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6278 spin_unlock_irq(&conf->device_lock); 6279 md_check_recovery(mddev); 6280 spin_lock_irq(&conf->device_lock); 6281 } 6282 } 6283 pr_debug("%d stripes handled\n", handled); 6284 6285 spin_unlock_irq(&conf->device_lock); 6286 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6287 mutex_trylock(&conf->cache_size_mutex)) { 6288 grow_one_stripe(conf, __GFP_NOWARN); 6289 /* Set flag even if allocation failed. This helps 6290 * slow down allocation requests when mem is short 6291 */ 6292 set_bit(R5_DID_ALLOC, &conf->cache_state); 6293 mutex_unlock(&conf->cache_size_mutex); 6294 } 6295 6296 flush_deferred_bios(conf); 6297 6298 r5l_flush_stripe_to_raid(conf->log); 6299 6300 async_tx_issue_pending_all(); 6301 blk_finish_plug(&plug); 6302 6303 pr_debug("--- raid5d inactive\n"); 6304 } 6305 6306 static ssize_t 6307 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6308 { 6309 struct r5conf *conf; 6310 int ret = 0; 6311 spin_lock(&mddev->lock); 6312 conf = mddev->private; 6313 if (conf) 6314 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6315 spin_unlock(&mddev->lock); 6316 return ret; 6317 } 6318 6319 int 6320 raid5_set_cache_size(struct mddev *mddev, int size) 6321 { 6322 struct r5conf *conf = mddev->private; 6323 int err; 6324 6325 if (size <= 16 || size > 32768) 6326 return -EINVAL; 6327 6328 conf->min_nr_stripes = size; 6329 mutex_lock(&conf->cache_size_mutex); 6330 while (size < conf->max_nr_stripes && 6331 drop_one_stripe(conf)) 6332 ; 6333 mutex_unlock(&conf->cache_size_mutex); 6334 6335 6336 err = md_allow_write(mddev); 6337 if (err) 6338 return err; 6339 6340 mutex_lock(&conf->cache_size_mutex); 6341 while (size > conf->max_nr_stripes) 6342 if (!grow_one_stripe(conf, GFP_KERNEL)) 6343 break; 6344 mutex_unlock(&conf->cache_size_mutex); 6345 6346 return 0; 6347 } 6348 EXPORT_SYMBOL(raid5_set_cache_size); 6349 6350 static ssize_t 6351 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6352 { 6353 struct r5conf *conf; 6354 unsigned long new; 6355 int err; 6356 6357 if (len >= PAGE_SIZE) 6358 return -EINVAL; 6359 if (kstrtoul(page, 10, &new)) 6360 return -EINVAL; 6361 err = mddev_lock(mddev); 6362 if (err) 6363 return err; 6364 conf = mddev->private; 6365 if (!conf) 6366 err = -ENODEV; 6367 else 6368 err = raid5_set_cache_size(mddev, new); 6369 mddev_unlock(mddev); 6370 6371 return err ?: len; 6372 } 6373 6374 static struct md_sysfs_entry 6375 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6376 raid5_show_stripe_cache_size, 6377 raid5_store_stripe_cache_size); 6378 6379 static ssize_t 6380 raid5_show_rmw_level(struct mddev *mddev, char *page) 6381 { 6382 struct r5conf *conf = mddev->private; 6383 if (conf) 6384 return sprintf(page, "%d\n", conf->rmw_level); 6385 else 6386 return 0; 6387 } 6388 6389 static ssize_t 6390 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6391 { 6392 struct r5conf *conf = mddev->private; 6393 unsigned long new; 6394 6395 if (!conf) 6396 return -ENODEV; 6397 6398 if (len >= PAGE_SIZE) 6399 return -EINVAL; 6400 6401 if (kstrtoul(page, 10, &new)) 6402 return -EINVAL; 6403 6404 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6405 return -EINVAL; 6406 6407 if (new != PARITY_DISABLE_RMW && 6408 new != PARITY_ENABLE_RMW && 6409 new != PARITY_PREFER_RMW) 6410 return -EINVAL; 6411 6412 conf->rmw_level = new; 6413 return len; 6414 } 6415 6416 static struct md_sysfs_entry 6417 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6418 raid5_show_rmw_level, 6419 raid5_store_rmw_level); 6420 6421 6422 static ssize_t 6423 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6424 { 6425 struct r5conf *conf; 6426 int ret = 0; 6427 spin_lock(&mddev->lock); 6428 conf = mddev->private; 6429 if (conf) 6430 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6431 spin_unlock(&mddev->lock); 6432 return ret; 6433 } 6434 6435 static ssize_t 6436 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6437 { 6438 struct r5conf *conf; 6439 unsigned long new; 6440 int err; 6441 6442 if (len >= PAGE_SIZE) 6443 return -EINVAL; 6444 if (kstrtoul(page, 10, &new)) 6445 return -EINVAL; 6446 6447 err = mddev_lock(mddev); 6448 if (err) 6449 return err; 6450 conf = mddev->private; 6451 if (!conf) 6452 err = -ENODEV; 6453 else if (new > conf->min_nr_stripes) 6454 err = -EINVAL; 6455 else 6456 conf->bypass_threshold = new; 6457 mddev_unlock(mddev); 6458 return err ?: len; 6459 } 6460 6461 static struct md_sysfs_entry 6462 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6463 S_IRUGO | S_IWUSR, 6464 raid5_show_preread_threshold, 6465 raid5_store_preread_threshold); 6466 6467 static ssize_t 6468 raid5_show_skip_copy(struct mddev *mddev, char *page) 6469 { 6470 struct r5conf *conf; 6471 int ret = 0; 6472 spin_lock(&mddev->lock); 6473 conf = mddev->private; 6474 if (conf) 6475 ret = sprintf(page, "%d\n", conf->skip_copy); 6476 spin_unlock(&mddev->lock); 6477 return ret; 6478 } 6479 6480 static ssize_t 6481 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6482 { 6483 struct r5conf *conf; 6484 unsigned long new; 6485 int err; 6486 6487 if (len >= PAGE_SIZE) 6488 return -EINVAL; 6489 if (kstrtoul(page, 10, &new)) 6490 return -EINVAL; 6491 new = !!new; 6492 6493 err = mddev_lock(mddev); 6494 if (err) 6495 return err; 6496 conf = mddev->private; 6497 if (!conf) 6498 err = -ENODEV; 6499 else if (new != conf->skip_copy) { 6500 mddev_suspend(mddev); 6501 conf->skip_copy = new; 6502 if (new) 6503 mddev->queue->backing_dev_info->capabilities |= 6504 BDI_CAP_STABLE_WRITES; 6505 else 6506 mddev->queue->backing_dev_info->capabilities &= 6507 ~BDI_CAP_STABLE_WRITES; 6508 mddev_resume(mddev); 6509 } 6510 mddev_unlock(mddev); 6511 return err ?: len; 6512 } 6513 6514 static struct md_sysfs_entry 6515 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6516 raid5_show_skip_copy, 6517 raid5_store_skip_copy); 6518 6519 static ssize_t 6520 stripe_cache_active_show(struct mddev *mddev, char *page) 6521 { 6522 struct r5conf *conf = mddev->private; 6523 if (conf) 6524 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6525 else 6526 return 0; 6527 } 6528 6529 static struct md_sysfs_entry 6530 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6531 6532 static ssize_t 6533 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6534 { 6535 struct r5conf *conf; 6536 int ret = 0; 6537 spin_lock(&mddev->lock); 6538 conf = mddev->private; 6539 if (conf) 6540 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6541 spin_unlock(&mddev->lock); 6542 return ret; 6543 } 6544 6545 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6546 int *group_cnt, 6547 int *worker_cnt_per_group, 6548 struct r5worker_group **worker_groups); 6549 static ssize_t 6550 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6551 { 6552 struct r5conf *conf; 6553 unsigned long new; 6554 int err; 6555 struct r5worker_group *new_groups, *old_groups; 6556 int group_cnt, worker_cnt_per_group; 6557 6558 if (len >= PAGE_SIZE) 6559 return -EINVAL; 6560 if (kstrtoul(page, 10, &new)) 6561 return -EINVAL; 6562 6563 err = mddev_lock(mddev); 6564 if (err) 6565 return err; 6566 conf = mddev->private; 6567 if (!conf) 6568 err = -ENODEV; 6569 else if (new != conf->worker_cnt_per_group) { 6570 mddev_suspend(mddev); 6571 6572 old_groups = conf->worker_groups; 6573 if (old_groups) 6574 flush_workqueue(raid5_wq); 6575 6576 err = alloc_thread_groups(conf, new, 6577 &group_cnt, &worker_cnt_per_group, 6578 &new_groups); 6579 if (!err) { 6580 spin_lock_irq(&conf->device_lock); 6581 conf->group_cnt = group_cnt; 6582 conf->worker_cnt_per_group = worker_cnt_per_group; 6583 conf->worker_groups = new_groups; 6584 spin_unlock_irq(&conf->device_lock); 6585 6586 if (old_groups) 6587 kfree(old_groups[0].workers); 6588 kfree(old_groups); 6589 } 6590 mddev_resume(mddev); 6591 } 6592 mddev_unlock(mddev); 6593 6594 return err ?: len; 6595 } 6596 6597 static struct md_sysfs_entry 6598 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6599 raid5_show_group_thread_cnt, 6600 raid5_store_group_thread_cnt); 6601 6602 static struct attribute *raid5_attrs[] = { 6603 &raid5_stripecache_size.attr, 6604 &raid5_stripecache_active.attr, 6605 &raid5_preread_bypass_threshold.attr, 6606 &raid5_group_thread_cnt.attr, 6607 &raid5_skip_copy.attr, 6608 &raid5_rmw_level.attr, 6609 &r5c_journal_mode.attr, 6610 NULL, 6611 }; 6612 static struct attribute_group raid5_attrs_group = { 6613 .name = NULL, 6614 .attrs = raid5_attrs, 6615 }; 6616 6617 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6618 int *group_cnt, 6619 int *worker_cnt_per_group, 6620 struct r5worker_group **worker_groups) 6621 { 6622 int i, j, k; 6623 ssize_t size; 6624 struct r5worker *workers; 6625 6626 *worker_cnt_per_group = cnt; 6627 if (cnt == 0) { 6628 *group_cnt = 0; 6629 *worker_groups = NULL; 6630 return 0; 6631 } 6632 *group_cnt = num_possible_nodes(); 6633 size = sizeof(struct r5worker) * cnt; 6634 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6635 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6636 *group_cnt, GFP_NOIO); 6637 if (!*worker_groups || !workers) { 6638 kfree(workers); 6639 kfree(*worker_groups); 6640 return -ENOMEM; 6641 } 6642 6643 for (i = 0; i < *group_cnt; i++) { 6644 struct r5worker_group *group; 6645 6646 group = &(*worker_groups)[i]; 6647 INIT_LIST_HEAD(&group->handle_list); 6648 INIT_LIST_HEAD(&group->loprio_list); 6649 group->conf = conf; 6650 group->workers = workers + i * cnt; 6651 6652 for (j = 0; j < cnt; j++) { 6653 struct r5worker *worker = group->workers + j; 6654 worker->group = group; 6655 INIT_WORK(&worker->work, raid5_do_work); 6656 6657 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6658 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6659 } 6660 } 6661 6662 return 0; 6663 } 6664 6665 static void free_thread_groups(struct r5conf *conf) 6666 { 6667 if (conf->worker_groups) 6668 kfree(conf->worker_groups[0].workers); 6669 kfree(conf->worker_groups); 6670 conf->worker_groups = NULL; 6671 } 6672 6673 static sector_t 6674 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6675 { 6676 struct r5conf *conf = mddev->private; 6677 6678 if (!sectors) 6679 sectors = mddev->dev_sectors; 6680 if (!raid_disks) 6681 /* size is defined by the smallest of previous and new size */ 6682 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6683 6684 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6685 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6686 return sectors * (raid_disks - conf->max_degraded); 6687 } 6688 6689 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6690 { 6691 safe_put_page(percpu->spare_page); 6692 if (percpu->scribble) 6693 flex_array_free(percpu->scribble); 6694 percpu->spare_page = NULL; 6695 percpu->scribble = NULL; 6696 } 6697 6698 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6699 { 6700 if (conf->level == 6 && !percpu->spare_page) 6701 percpu->spare_page = alloc_page(GFP_KERNEL); 6702 if (!percpu->scribble) 6703 percpu->scribble = scribble_alloc(max(conf->raid_disks, 6704 conf->previous_raid_disks), 6705 max(conf->chunk_sectors, 6706 conf->prev_chunk_sectors) 6707 / STRIPE_SECTORS, 6708 GFP_KERNEL); 6709 6710 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6711 free_scratch_buffer(conf, percpu); 6712 return -ENOMEM; 6713 } 6714 6715 return 0; 6716 } 6717 6718 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6719 { 6720 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6721 6722 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6723 return 0; 6724 } 6725 6726 static void raid5_free_percpu(struct r5conf *conf) 6727 { 6728 if (!conf->percpu) 6729 return; 6730 6731 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6732 free_percpu(conf->percpu); 6733 } 6734 6735 static void free_conf(struct r5conf *conf) 6736 { 6737 int i; 6738 6739 log_exit(conf); 6740 6741 if (conf->shrinker.nr_deferred) 6742 unregister_shrinker(&conf->shrinker); 6743 6744 free_thread_groups(conf); 6745 shrink_stripes(conf); 6746 raid5_free_percpu(conf); 6747 for (i = 0; i < conf->pool_size; i++) 6748 if (conf->disks[i].extra_page) 6749 put_page(conf->disks[i].extra_page); 6750 kfree(conf->disks); 6751 kfree(conf->stripe_hashtbl); 6752 kfree(conf->pending_data); 6753 kfree(conf); 6754 } 6755 6756 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6757 { 6758 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6759 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6760 6761 if (alloc_scratch_buffer(conf, percpu)) { 6762 pr_warn("%s: failed memory allocation for cpu%u\n", 6763 __func__, cpu); 6764 return -ENOMEM; 6765 } 6766 return 0; 6767 } 6768 6769 static int raid5_alloc_percpu(struct r5conf *conf) 6770 { 6771 int err = 0; 6772 6773 conf->percpu = alloc_percpu(struct raid5_percpu); 6774 if (!conf->percpu) 6775 return -ENOMEM; 6776 6777 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6778 if (!err) { 6779 conf->scribble_disks = max(conf->raid_disks, 6780 conf->previous_raid_disks); 6781 conf->scribble_sectors = max(conf->chunk_sectors, 6782 conf->prev_chunk_sectors); 6783 } 6784 return err; 6785 } 6786 6787 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6788 struct shrink_control *sc) 6789 { 6790 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6791 unsigned long ret = SHRINK_STOP; 6792 6793 if (mutex_trylock(&conf->cache_size_mutex)) { 6794 ret= 0; 6795 while (ret < sc->nr_to_scan && 6796 conf->max_nr_stripes > conf->min_nr_stripes) { 6797 if (drop_one_stripe(conf) == 0) { 6798 ret = SHRINK_STOP; 6799 break; 6800 } 6801 ret++; 6802 } 6803 mutex_unlock(&conf->cache_size_mutex); 6804 } 6805 return ret; 6806 } 6807 6808 static unsigned long raid5_cache_count(struct shrinker *shrink, 6809 struct shrink_control *sc) 6810 { 6811 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6812 6813 if (conf->max_nr_stripes < conf->min_nr_stripes) 6814 /* unlikely, but not impossible */ 6815 return 0; 6816 return conf->max_nr_stripes - conf->min_nr_stripes; 6817 } 6818 6819 static struct r5conf *setup_conf(struct mddev *mddev) 6820 { 6821 struct r5conf *conf; 6822 int raid_disk, memory, max_disks; 6823 struct md_rdev *rdev; 6824 struct disk_info *disk; 6825 char pers_name[6]; 6826 int i; 6827 int group_cnt, worker_cnt_per_group; 6828 struct r5worker_group *new_group; 6829 6830 if (mddev->new_level != 5 6831 && mddev->new_level != 4 6832 && mddev->new_level != 6) { 6833 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6834 mdname(mddev), mddev->new_level); 6835 return ERR_PTR(-EIO); 6836 } 6837 if ((mddev->new_level == 5 6838 && !algorithm_valid_raid5(mddev->new_layout)) || 6839 (mddev->new_level == 6 6840 && !algorithm_valid_raid6(mddev->new_layout))) { 6841 pr_warn("md/raid:%s: layout %d not supported\n", 6842 mdname(mddev), mddev->new_layout); 6843 return ERR_PTR(-EIO); 6844 } 6845 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6846 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6847 mdname(mddev), mddev->raid_disks); 6848 return ERR_PTR(-EINVAL); 6849 } 6850 6851 if (!mddev->new_chunk_sectors || 6852 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6853 !is_power_of_2(mddev->new_chunk_sectors)) { 6854 pr_warn("md/raid:%s: invalid chunk size %d\n", 6855 mdname(mddev), mddev->new_chunk_sectors << 9); 6856 return ERR_PTR(-EINVAL); 6857 } 6858 6859 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6860 if (conf == NULL) 6861 goto abort; 6862 INIT_LIST_HEAD(&conf->free_list); 6863 INIT_LIST_HEAD(&conf->pending_list); 6864 conf->pending_data = kzalloc(sizeof(struct r5pending_data) * 6865 PENDING_IO_MAX, GFP_KERNEL); 6866 if (!conf->pending_data) 6867 goto abort; 6868 for (i = 0; i < PENDING_IO_MAX; i++) 6869 list_add(&conf->pending_data[i].sibling, &conf->free_list); 6870 /* Don't enable multi-threading by default*/ 6871 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6872 &new_group)) { 6873 conf->group_cnt = group_cnt; 6874 conf->worker_cnt_per_group = worker_cnt_per_group; 6875 conf->worker_groups = new_group; 6876 } else 6877 goto abort; 6878 spin_lock_init(&conf->device_lock); 6879 seqcount_init(&conf->gen_lock); 6880 mutex_init(&conf->cache_size_mutex); 6881 init_waitqueue_head(&conf->wait_for_quiescent); 6882 init_waitqueue_head(&conf->wait_for_stripe); 6883 init_waitqueue_head(&conf->wait_for_overlap); 6884 INIT_LIST_HEAD(&conf->handle_list); 6885 INIT_LIST_HEAD(&conf->loprio_list); 6886 INIT_LIST_HEAD(&conf->hold_list); 6887 INIT_LIST_HEAD(&conf->delayed_list); 6888 INIT_LIST_HEAD(&conf->bitmap_list); 6889 init_llist_head(&conf->released_stripes); 6890 atomic_set(&conf->active_stripes, 0); 6891 atomic_set(&conf->preread_active_stripes, 0); 6892 atomic_set(&conf->active_aligned_reads, 0); 6893 spin_lock_init(&conf->pending_bios_lock); 6894 conf->batch_bio_dispatch = true; 6895 rdev_for_each(rdev, mddev) { 6896 if (test_bit(Journal, &rdev->flags)) 6897 continue; 6898 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 6899 conf->batch_bio_dispatch = false; 6900 break; 6901 } 6902 } 6903 6904 conf->bypass_threshold = BYPASS_THRESHOLD; 6905 conf->recovery_disabled = mddev->recovery_disabled - 1; 6906 6907 conf->raid_disks = mddev->raid_disks; 6908 if (mddev->reshape_position == MaxSector) 6909 conf->previous_raid_disks = mddev->raid_disks; 6910 else 6911 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6912 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6913 6914 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6915 GFP_KERNEL); 6916 6917 if (!conf->disks) 6918 goto abort; 6919 6920 for (i = 0; i < max_disks; i++) { 6921 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6922 if (!conf->disks[i].extra_page) 6923 goto abort; 6924 } 6925 6926 conf->mddev = mddev; 6927 6928 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6929 goto abort; 6930 6931 /* We init hash_locks[0] separately to that it can be used 6932 * as the reference lock in the spin_lock_nest_lock() call 6933 * in lock_all_device_hash_locks_irq in order to convince 6934 * lockdep that we know what we are doing. 6935 */ 6936 spin_lock_init(conf->hash_locks); 6937 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6938 spin_lock_init(conf->hash_locks + i); 6939 6940 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6941 INIT_LIST_HEAD(conf->inactive_list + i); 6942 6943 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6944 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6945 6946 atomic_set(&conf->r5c_cached_full_stripes, 0); 6947 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 6948 atomic_set(&conf->r5c_cached_partial_stripes, 0); 6949 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 6950 atomic_set(&conf->r5c_flushing_full_stripes, 0); 6951 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 6952 6953 conf->level = mddev->new_level; 6954 conf->chunk_sectors = mddev->new_chunk_sectors; 6955 if (raid5_alloc_percpu(conf) != 0) 6956 goto abort; 6957 6958 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 6959 6960 rdev_for_each(rdev, mddev) { 6961 raid_disk = rdev->raid_disk; 6962 if (raid_disk >= max_disks 6963 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 6964 continue; 6965 disk = conf->disks + raid_disk; 6966 6967 if (test_bit(Replacement, &rdev->flags)) { 6968 if (disk->replacement) 6969 goto abort; 6970 disk->replacement = rdev; 6971 } else { 6972 if (disk->rdev) 6973 goto abort; 6974 disk->rdev = rdev; 6975 } 6976 6977 if (test_bit(In_sync, &rdev->flags)) { 6978 char b[BDEVNAME_SIZE]; 6979 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 6980 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 6981 } else if (rdev->saved_raid_disk != raid_disk) 6982 /* Cannot rely on bitmap to complete recovery */ 6983 conf->fullsync = 1; 6984 } 6985 6986 conf->level = mddev->new_level; 6987 if (conf->level == 6) { 6988 conf->max_degraded = 2; 6989 if (raid6_call.xor_syndrome) 6990 conf->rmw_level = PARITY_ENABLE_RMW; 6991 else 6992 conf->rmw_level = PARITY_DISABLE_RMW; 6993 } else { 6994 conf->max_degraded = 1; 6995 conf->rmw_level = PARITY_ENABLE_RMW; 6996 } 6997 conf->algorithm = mddev->new_layout; 6998 conf->reshape_progress = mddev->reshape_position; 6999 if (conf->reshape_progress != MaxSector) { 7000 conf->prev_chunk_sectors = mddev->chunk_sectors; 7001 conf->prev_algo = mddev->layout; 7002 } else { 7003 conf->prev_chunk_sectors = conf->chunk_sectors; 7004 conf->prev_algo = conf->algorithm; 7005 } 7006 7007 conf->min_nr_stripes = NR_STRIPES; 7008 if (mddev->reshape_position != MaxSector) { 7009 int stripes = max_t(int, 7010 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 7011 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 7012 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7013 if (conf->min_nr_stripes != NR_STRIPES) 7014 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7015 mdname(mddev), conf->min_nr_stripes); 7016 } 7017 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7018 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7019 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7020 if (grow_stripes(conf, conf->min_nr_stripes)) { 7021 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 7022 mdname(mddev), memory); 7023 goto abort; 7024 } else 7025 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7026 /* 7027 * Losing a stripe head costs more than the time to refill it, 7028 * it reduces the queue depth and so can hurt throughput. 7029 * So set it rather large, scaled by number of devices. 7030 */ 7031 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7032 conf->shrinker.scan_objects = raid5_cache_scan; 7033 conf->shrinker.count_objects = raid5_cache_count; 7034 conf->shrinker.batch = 128; 7035 conf->shrinker.flags = 0; 7036 if (register_shrinker(&conf->shrinker)) { 7037 pr_warn("md/raid:%s: couldn't register shrinker.\n", 7038 mdname(mddev)); 7039 goto abort; 7040 } 7041 7042 sprintf(pers_name, "raid%d", mddev->new_level); 7043 conf->thread = md_register_thread(raid5d, mddev, pers_name); 7044 if (!conf->thread) { 7045 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7046 mdname(mddev)); 7047 goto abort; 7048 } 7049 7050 return conf; 7051 7052 abort: 7053 if (conf) { 7054 free_conf(conf); 7055 return ERR_PTR(-EIO); 7056 } else 7057 return ERR_PTR(-ENOMEM); 7058 } 7059 7060 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7061 { 7062 switch (algo) { 7063 case ALGORITHM_PARITY_0: 7064 if (raid_disk < max_degraded) 7065 return 1; 7066 break; 7067 case ALGORITHM_PARITY_N: 7068 if (raid_disk >= raid_disks - max_degraded) 7069 return 1; 7070 break; 7071 case ALGORITHM_PARITY_0_6: 7072 if (raid_disk == 0 || 7073 raid_disk == raid_disks - 1) 7074 return 1; 7075 break; 7076 case ALGORITHM_LEFT_ASYMMETRIC_6: 7077 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7078 case ALGORITHM_LEFT_SYMMETRIC_6: 7079 case ALGORITHM_RIGHT_SYMMETRIC_6: 7080 if (raid_disk == raid_disks - 1) 7081 return 1; 7082 } 7083 return 0; 7084 } 7085 7086 static int raid5_run(struct mddev *mddev) 7087 { 7088 struct r5conf *conf; 7089 int working_disks = 0; 7090 int dirty_parity_disks = 0; 7091 struct md_rdev *rdev; 7092 struct md_rdev *journal_dev = NULL; 7093 sector_t reshape_offset = 0; 7094 int i; 7095 long long min_offset_diff = 0; 7096 int first = 1; 7097 7098 if (mddev->recovery_cp != MaxSector) 7099 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7100 mdname(mddev)); 7101 7102 rdev_for_each(rdev, mddev) { 7103 long long diff; 7104 7105 if (test_bit(Journal, &rdev->flags)) { 7106 journal_dev = rdev; 7107 continue; 7108 } 7109 if (rdev->raid_disk < 0) 7110 continue; 7111 diff = (rdev->new_data_offset - rdev->data_offset); 7112 if (first) { 7113 min_offset_diff = diff; 7114 first = 0; 7115 } else if (mddev->reshape_backwards && 7116 diff < min_offset_diff) 7117 min_offset_diff = diff; 7118 else if (!mddev->reshape_backwards && 7119 diff > min_offset_diff) 7120 min_offset_diff = diff; 7121 } 7122 7123 if (mddev->reshape_position != MaxSector) { 7124 /* Check that we can continue the reshape. 7125 * Difficulties arise if the stripe we would write to 7126 * next is at or after the stripe we would read from next. 7127 * For a reshape that changes the number of devices, this 7128 * is only possible for a very short time, and mdadm makes 7129 * sure that time appears to have past before assembling 7130 * the array. So we fail if that time hasn't passed. 7131 * For a reshape that keeps the number of devices the same 7132 * mdadm must be monitoring the reshape can keeping the 7133 * critical areas read-only and backed up. It will start 7134 * the array in read-only mode, so we check for that. 7135 */ 7136 sector_t here_new, here_old; 7137 int old_disks; 7138 int max_degraded = (mddev->level == 6 ? 2 : 1); 7139 int chunk_sectors; 7140 int new_data_disks; 7141 7142 if (journal_dev) { 7143 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7144 mdname(mddev)); 7145 return -EINVAL; 7146 } 7147 7148 if (mddev->new_level != mddev->level) { 7149 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7150 mdname(mddev)); 7151 return -EINVAL; 7152 } 7153 old_disks = mddev->raid_disks - mddev->delta_disks; 7154 /* reshape_position must be on a new-stripe boundary, and one 7155 * further up in new geometry must map after here in old 7156 * geometry. 7157 * If the chunk sizes are different, then as we perform reshape 7158 * in units of the largest of the two, reshape_position needs 7159 * be a multiple of the largest chunk size times new data disks. 7160 */ 7161 here_new = mddev->reshape_position; 7162 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 7163 new_data_disks = mddev->raid_disks - max_degraded; 7164 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7165 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7166 mdname(mddev)); 7167 return -EINVAL; 7168 } 7169 reshape_offset = here_new * chunk_sectors; 7170 /* here_new is the stripe we will write to */ 7171 here_old = mddev->reshape_position; 7172 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 7173 /* here_old is the first stripe that we might need to read 7174 * from */ 7175 if (mddev->delta_disks == 0) { 7176 /* We cannot be sure it is safe to start an in-place 7177 * reshape. It is only safe if user-space is monitoring 7178 * and taking constant backups. 7179 * mdadm always starts a situation like this in 7180 * readonly mode so it can take control before 7181 * allowing any writes. So just check for that. 7182 */ 7183 if (abs(min_offset_diff) >= mddev->chunk_sectors && 7184 abs(min_offset_diff) >= mddev->new_chunk_sectors) 7185 /* not really in-place - so OK */; 7186 else if (mddev->ro == 0) { 7187 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7188 mdname(mddev)); 7189 return -EINVAL; 7190 } 7191 } else if (mddev->reshape_backwards 7192 ? (here_new * chunk_sectors + min_offset_diff <= 7193 here_old * chunk_sectors) 7194 : (here_new * chunk_sectors >= 7195 here_old * chunk_sectors + (-min_offset_diff))) { 7196 /* Reading from the same stripe as writing to - bad */ 7197 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7198 mdname(mddev)); 7199 return -EINVAL; 7200 } 7201 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7202 /* OK, we should be able to continue; */ 7203 } else { 7204 BUG_ON(mddev->level != mddev->new_level); 7205 BUG_ON(mddev->layout != mddev->new_layout); 7206 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7207 BUG_ON(mddev->delta_disks != 0); 7208 } 7209 7210 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 7211 test_bit(MD_HAS_PPL, &mddev->flags)) { 7212 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7213 mdname(mddev)); 7214 clear_bit(MD_HAS_PPL, &mddev->flags); 7215 } 7216 7217 if (mddev->private == NULL) 7218 conf = setup_conf(mddev); 7219 else 7220 conf = mddev->private; 7221 7222 if (IS_ERR(conf)) 7223 return PTR_ERR(conf); 7224 7225 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7226 if (!journal_dev) { 7227 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7228 mdname(mddev)); 7229 mddev->ro = 1; 7230 set_disk_ro(mddev->gendisk, 1); 7231 } else if (mddev->recovery_cp == MaxSector) 7232 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7233 } 7234 7235 conf->min_offset_diff = min_offset_diff; 7236 mddev->thread = conf->thread; 7237 conf->thread = NULL; 7238 mddev->private = conf; 7239 7240 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7241 i++) { 7242 rdev = conf->disks[i].rdev; 7243 if (!rdev && conf->disks[i].replacement) { 7244 /* The replacement is all we have yet */ 7245 rdev = conf->disks[i].replacement; 7246 conf->disks[i].replacement = NULL; 7247 clear_bit(Replacement, &rdev->flags); 7248 conf->disks[i].rdev = rdev; 7249 } 7250 if (!rdev) 7251 continue; 7252 if (conf->disks[i].replacement && 7253 conf->reshape_progress != MaxSector) { 7254 /* replacements and reshape simply do not mix. */ 7255 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7256 goto abort; 7257 } 7258 if (test_bit(In_sync, &rdev->flags)) { 7259 working_disks++; 7260 continue; 7261 } 7262 /* This disc is not fully in-sync. However if it 7263 * just stored parity (beyond the recovery_offset), 7264 * when we don't need to be concerned about the 7265 * array being dirty. 7266 * When reshape goes 'backwards', we never have 7267 * partially completed devices, so we only need 7268 * to worry about reshape going forwards. 7269 */ 7270 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7271 if (mddev->major_version == 0 && 7272 mddev->minor_version > 90) 7273 rdev->recovery_offset = reshape_offset; 7274 7275 if (rdev->recovery_offset < reshape_offset) { 7276 /* We need to check old and new layout */ 7277 if (!only_parity(rdev->raid_disk, 7278 conf->algorithm, 7279 conf->raid_disks, 7280 conf->max_degraded)) 7281 continue; 7282 } 7283 if (!only_parity(rdev->raid_disk, 7284 conf->prev_algo, 7285 conf->previous_raid_disks, 7286 conf->max_degraded)) 7287 continue; 7288 dirty_parity_disks++; 7289 } 7290 7291 /* 7292 * 0 for a fully functional array, 1 or 2 for a degraded array. 7293 */ 7294 mddev->degraded = raid5_calc_degraded(conf); 7295 7296 if (has_failed(conf)) { 7297 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7298 mdname(mddev), mddev->degraded, conf->raid_disks); 7299 goto abort; 7300 } 7301 7302 /* device size must be a multiple of chunk size */ 7303 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7304 mddev->resync_max_sectors = mddev->dev_sectors; 7305 7306 if (mddev->degraded > dirty_parity_disks && 7307 mddev->recovery_cp != MaxSector) { 7308 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7309 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7310 mdname(mddev)); 7311 else if (mddev->ok_start_degraded) 7312 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7313 mdname(mddev)); 7314 else { 7315 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7316 mdname(mddev)); 7317 goto abort; 7318 } 7319 } 7320 7321 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7322 mdname(mddev), conf->level, 7323 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7324 mddev->new_layout); 7325 7326 print_raid5_conf(conf); 7327 7328 if (conf->reshape_progress != MaxSector) { 7329 conf->reshape_safe = conf->reshape_progress; 7330 atomic_set(&conf->reshape_stripes, 0); 7331 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7332 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7333 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7334 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7335 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7336 "reshape"); 7337 } 7338 7339 /* Ok, everything is just fine now */ 7340 if (mddev->to_remove == &raid5_attrs_group) 7341 mddev->to_remove = NULL; 7342 else if (mddev->kobj.sd && 7343 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7344 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7345 mdname(mddev)); 7346 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7347 7348 if (mddev->queue) { 7349 int chunk_size; 7350 bool discard_supported = true; 7351 /* read-ahead size must cover two whole stripes, which 7352 * is 2 * (datadisks) * chunksize where 'n' is the 7353 * number of raid devices 7354 */ 7355 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7356 int stripe = data_disks * 7357 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7358 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7359 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7360 7361 chunk_size = mddev->chunk_sectors << 9; 7362 blk_queue_io_min(mddev->queue, chunk_size); 7363 blk_queue_io_opt(mddev->queue, chunk_size * 7364 (conf->raid_disks - conf->max_degraded)); 7365 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7366 /* 7367 * We can only discard a whole stripe. It doesn't make sense to 7368 * discard data disk but write parity disk 7369 */ 7370 stripe = stripe * PAGE_SIZE; 7371 /* Round up to power of 2, as discard handling 7372 * currently assumes that */ 7373 while ((stripe-1) & stripe) 7374 stripe = (stripe | (stripe-1)) + 1; 7375 mddev->queue->limits.discard_alignment = stripe; 7376 mddev->queue->limits.discard_granularity = stripe; 7377 /* 7378 * unaligned part of discard request will be ignored, so can't 7379 * guarantee discard_zeroes_data 7380 */ 7381 mddev->queue->limits.discard_zeroes_data = 0; 7382 7383 blk_queue_max_write_same_sectors(mddev->queue, 0); 7384 7385 rdev_for_each(rdev, mddev) { 7386 disk_stack_limits(mddev->gendisk, rdev->bdev, 7387 rdev->data_offset << 9); 7388 disk_stack_limits(mddev->gendisk, rdev->bdev, 7389 rdev->new_data_offset << 9); 7390 /* 7391 * discard_zeroes_data is required, otherwise data 7392 * could be lost. Consider a scenario: discard a stripe 7393 * (the stripe could be inconsistent if 7394 * discard_zeroes_data is 0); write one disk of the 7395 * stripe (the stripe could be inconsistent again 7396 * depending on which disks are used to calculate 7397 * parity); the disk is broken; The stripe data of this 7398 * disk is lost. 7399 */ 7400 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 7401 !bdev_get_queue(rdev->bdev)-> 7402 limits.discard_zeroes_data) 7403 discard_supported = false; 7404 /* Unfortunately, discard_zeroes_data is not currently 7405 * a guarantee - just a hint. So we only allow DISCARD 7406 * if the sysadmin has confirmed that only safe devices 7407 * are in use by setting a module parameter. 7408 */ 7409 if (!devices_handle_discard_safely) { 7410 if (discard_supported) { 7411 pr_info("md/raid456: discard support disabled due to uncertainty.\n"); 7412 pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); 7413 } 7414 discard_supported = false; 7415 } 7416 } 7417 7418 if (discard_supported && 7419 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7420 mddev->queue->limits.discard_granularity >= stripe) 7421 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 7422 mddev->queue); 7423 else 7424 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 7425 mddev->queue); 7426 7427 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7428 } 7429 7430 if (log_init(conf, journal_dev)) 7431 goto abort; 7432 7433 return 0; 7434 abort: 7435 md_unregister_thread(&mddev->thread); 7436 print_raid5_conf(conf); 7437 free_conf(conf); 7438 mddev->private = NULL; 7439 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7440 return -EIO; 7441 } 7442 7443 static void raid5_free(struct mddev *mddev, void *priv) 7444 { 7445 struct r5conf *conf = priv; 7446 7447 free_conf(conf); 7448 mddev->to_remove = &raid5_attrs_group; 7449 } 7450 7451 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7452 { 7453 struct r5conf *conf = mddev->private; 7454 int i; 7455 7456 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7457 conf->chunk_sectors / 2, mddev->layout); 7458 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7459 rcu_read_lock(); 7460 for (i = 0; i < conf->raid_disks; i++) { 7461 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7462 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 7463 } 7464 rcu_read_unlock(); 7465 seq_printf (seq, "]"); 7466 } 7467 7468 static void print_raid5_conf (struct r5conf *conf) 7469 { 7470 int i; 7471 struct disk_info *tmp; 7472 7473 pr_debug("RAID conf printout:\n"); 7474 if (!conf) { 7475 pr_debug("(conf==NULL)\n"); 7476 return; 7477 } 7478 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7479 conf->raid_disks, 7480 conf->raid_disks - conf->mddev->degraded); 7481 7482 for (i = 0; i < conf->raid_disks; i++) { 7483 char b[BDEVNAME_SIZE]; 7484 tmp = conf->disks + i; 7485 if (tmp->rdev) 7486 pr_debug(" disk %d, o:%d, dev:%s\n", 7487 i, !test_bit(Faulty, &tmp->rdev->flags), 7488 bdevname(tmp->rdev->bdev, b)); 7489 } 7490 } 7491 7492 static int raid5_spare_active(struct mddev *mddev) 7493 { 7494 int i; 7495 struct r5conf *conf = mddev->private; 7496 struct disk_info *tmp; 7497 int count = 0; 7498 unsigned long flags; 7499 7500 for (i = 0; i < conf->raid_disks; i++) { 7501 tmp = conf->disks + i; 7502 if (tmp->replacement 7503 && tmp->replacement->recovery_offset == MaxSector 7504 && !test_bit(Faulty, &tmp->replacement->flags) 7505 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7506 /* Replacement has just become active. */ 7507 if (!tmp->rdev 7508 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7509 count++; 7510 if (tmp->rdev) { 7511 /* Replaced device not technically faulty, 7512 * but we need to be sure it gets removed 7513 * and never re-added. 7514 */ 7515 set_bit(Faulty, &tmp->rdev->flags); 7516 sysfs_notify_dirent_safe( 7517 tmp->rdev->sysfs_state); 7518 } 7519 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7520 } else if (tmp->rdev 7521 && tmp->rdev->recovery_offset == MaxSector 7522 && !test_bit(Faulty, &tmp->rdev->flags) 7523 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7524 count++; 7525 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7526 } 7527 } 7528 spin_lock_irqsave(&conf->device_lock, flags); 7529 mddev->degraded = raid5_calc_degraded(conf); 7530 spin_unlock_irqrestore(&conf->device_lock, flags); 7531 print_raid5_conf(conf); 7532 return count; 7533 } 7534 7535 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7536 { 7537 struct r5conf *conf = mddev->private; 7538 int err = 0; 7539 int number = rdev->raid_disk; 7540 struct md_rdev **rdevp; 7541 struct disk_info *p = conf->disks + number; 7542 7543 print_raid5_conf(conf); 7544 if (test_bit(Journal, &rdev->flags) && conf->log) { 7545 /* 7546 * we can't wait pending write here, as this is called in 7547 * raid5d, wait will deadlock. 7548 * neilb: there is no locking about new writes here, 7549 * so this cannot be safe. 7550 */ 7551 if (atomic_read(&conf->active_stripes)) { 7552 return -EBUSY; 7553 } 7554 log_exit(conf); 7555 return 0; 7556 } 7557 if (rdev == p->rdev) 7558 rdevp = &p->rdev; 7559 else if (rdev == p->replacement) 7560 rdevp = &p->replacement; 7561 else 7562 return 0; 7563 7564 if (number >= conf->raid_disks && 7565 conf->reshape_progress == MaxSector) 7566 clear_bit(In_sync, &rdev->flags); 7567 7568 if (test_bit(In_sync, &rdev->flags) || 7569 atomic_read(&rdev->nr_pending)) { 7570 err = -EBUSY; 7571 goto abort; 7572 } 7573 /* Only remove non-faulty devices if recovery 7574 * isn't possible. 7575 */ 7576 if (!test_bit(Faulty, &rdev->flags) && 7577 mddev->recovery_disabled != conf->recovery_disabled && 7578 !has_failed(conf) && 7579 (!p->replacement || p->replacement == rdev) && 7580 number < conf->raid_disks) { 7581 err = -EBUSY; 7582 goto abort; 7583 } 7584 *rdevp = NULL; 7585 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7586 synchronize_rcu(); 7587 if (atomic_read(&rdev->nr_pending)) { 7588 /* lost the race, try later */ 7589 err = -EBUSY; 7590 *rdevp = rdev; 7591 } 7592 } 7593 if (!err) { 7594 err = log_modify(conf, rdev, false); 7595 if (err) 7596 goto abort; 7597 } 7598 if (p->replacement) { 7599 /* We must have just cleared 'rdev' */ 7600 p->rdev = p->replacement; 7601 clear_bit(Replacement, &p->replacement->flags); 7602 smp_mb(); /* Make sure other CPUs may see both as identical 7603 * but will never see neither - if they are careful 7604 */ 7605 p->replacement = NULL; 7606 clear_bit(WantReplacement, &rdev->flags); 7607 7608 if (!err) 7609 err = log_modify(conf, p->rdev, true); 7610 } else 7611 /* We might have just removed the Replacement as faulty- 7612 * clear the bit just in case 7613 */ 7614 clear_bit(WantReplacement, &rdev->flags); 7615 abort: 7616 7617 print_raid5_conf(conf); 7618 return err; 7619 } 7620 7621 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7622 { 7623 struct r5conf *conf = mddev->private; 7624 int err = -EEXIST; 7625 int disk; 7626 struct disk_info *p; 7627 int first = 0; 7628 int last = conf->raid_disks - 1; 7629 7630 if (test_bit(Journal, &rdev->flags)) { 7631 if (conf->log) 7632 return -EBUSY; 7633 7634 rdev->raid_disk = 0; 7635 /* 7636 * The array is in readonly mode if journal is missing, so no 7637 * write requests running. We should be safe 7638 */ 7639 log_init(conf, rdev); 7640 return 0; 7641 } 7642 if (mddev->recovery_disabled == conf->recovery_disabled) 7643 return -EBUSY; 7644 7645 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7646 /* no point adding a device */ 7647 return -EINVAL; 7648 7649 if (rdev->raid_disk >= 0) 7650 first = last = rdev->raid_disk; 7651 7652 /* 7653 * find the disk ... but prefer rdev->saved_raid_disk 7654 * if possible. 7655 */ 7656 if (rdev->saved_raid_disk >= 0 && 7657 rdev->saved_raid_disk >= first && 7658 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7659 first = rdev->saved_raid_disk; 7660 7661 for (disk = first; disk <= last; disk++) { 7662 p = conf->disks + disk; 7663 if (p->rdev == NULL) { 7664 clear_bit(In_sync, &rdev->flags); 7665 rdev->raid_disk = disk; 7666 if (rdev->saved_raid_disk != disk) 7667 conf->fullsync = 1; 7668 rcu_assign_pointer(p->rdev, rdev); 7669 7670 err = log_modify(conf, rdev, true); 7671 7672 goto out; 7673 } 7674 } 7675 for (disk = first; disk <= last; disk++) { 7676 p = conf->disks + disk; 7677 if (test_bit(WantReplacement, &p->rdev->flags) && 7678 p->replacement == NULL) { 7679 clear_bit(In_sync, &rdev->flags); 7680 set_bit(Replacement, &rdev->flags); 7681 rdev->raid_disk = disk; 7682 err = 0; 7683 conf->fullsync = 1; 7684 rcu_assign_pointer(p->replacement, rdev); 7685 break; 7686 } 7687 } 7688 out: 7689 print_raid5_conf(conf); 7690 return err; 7691 } 7692 7693 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7694 { 7695 /* no resync is happening, and there is enough space 7696 * on all devices, so we can resize. 7697 * We need to make sure resync covers any new space. 7698 * If the array is shrinking we should possibly wait until 7699 * any io in the removed space completes, but it hardly seems 7700 * worth it. 7701 */ 7702 sector_t newsize; 7703 struct r5conf *conf = mddev->private; 7704 7705 if (conf->log || raid5_has_ppl(conf)) 7706 return -EINVAL; 7707 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7708 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7709 if (mddev->external_size && 7710 mddev->array_sectors > newsize) 7711 return -EINVAL; 7712 if (mddev->bitmap) { 7713 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 7714 if (ret) 7715 return ret; 7716 } 7717 md_set_array_sectors(mddev, newsize); 7718 if (sectors > mddev->dev_sectors && 7719 mddev->recovery_cp > mddev->dev_sectors) { 7720 mddev->recovery_cp = mddev->dev_sectors; 7721 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7722 } 7723 mddev->dev_sectors = sectors; 7724 mddev->resync_max_sectors = sectors; 7725 return 0; 7726 } 7727 7728 static int check_stripe_cache(struct mddev *mddev) 7729 { 7730 /* Can only proceed if there are plenty of stripe_heads. 7731 * We need a minimum of one full stripe,, and for sensible progress 7732 * it is best to have about 4 times that. 7733 * If we require 4 times, then the default 256 4K stripe_heads will 7734 * allow for chunk sizes up to 256K, which is probably OK. 7735 * If the chunk size is greater, user-space should request more 7736 * stripe_heads first. 7737 */ 7738 struct r5conf *conf = mddev->private; 7739 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7740 > conf->min_nr_stripes || 7741 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7742 > conf->min_nr_stripes) { 7743 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7744 mdname(mddev), 7745 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7746 / STRIPE_SIZE)*4); 7747 return 0; 7748 } 7749 return 1; 7750 } 7751 7752 static int check_reshape(struct mddev *mddev) 7753 { 7754 struct r5conf *conf = mddev->private; 7755 7756 if (conf->log || raid5_has_ppl(conf)) 7757 return -EINVAL; 7758 if (mddev->delta_disks == 0 && 7759 mddev->new_layout == mddev->layout && 7760 mddev->new_chunk_sectors == mddev->chunk_sectors) 7761 return 0; /* nothing to do */ 7762 if (has_failed(conf)) 7763 return -EINVAL; 7764 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 7765 /* We might be able to shrink, but the devices must 7766 * be made bigger first. 7767 * For raid6, 4 is the minimum size. 7768 * Otherwise 2 is the minimum 7769 */ 7770 int min = 2; 7771 if (mddev->level == 6) 7772 min = 4; 7773 if (mddev->raid_disks + mddev->delta_disks < min) 7774 return -EINVAL; 7775 } 7776 7777 if (!check_stripe_cache(mddev)) 7778 return -ENOSPC; 7779 7780 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 7781 mddev->delta_disks > 0) 7782 if (resize_chunks(conf, 7783 conf->previous_raid_disks 7784 + max(0, mddev->delta_disks), 7785 max(mddev->new_chunk_sectors, 7786 mddev->chunk_sectors) 7787 ) < 0) 7788 return -ENOMEM; 7789 return resize_stripes(conf, (conf->previous_raid_disks 7790 + mddev->delta_disks)); 7791 } 7792 7793 static int raid5_start_reshape(struct mddev *mddev) 7794 { 7795 struct r5conf *conf = mddev->private; 7796 struct md_rdev *rdev; 7797 int spares = 0; 7798 unsigned long flags; 7799 7800 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7801 return -EBUSY; 7802 7803 if (!check_stripe_cache(mddev)) 7804 return -ENOSPC; 7805 7806 if (has_failed(conf)) 7807 return -EINVAL; 7808 7809 rdev_for_each(rdev, mddev) { 7810 if (!test_bit(In_sync, &rdev->flags) 7811 && !test_bit(Faulty, &rdev->flags)) 7812 spares++; 7813 } 7814 7815 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 7816 /* Not enough devices even to make a degraded array 7817 * of that size 7818 */ 7819 return -EINVAL; 7820 7821 /* Refuse to reduce size of the array. Any reductions in 7822 * array size must be through explicit setting of array_size 7823 * attribute. 7824 */ 7825 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7826 < mddev->array_sectors) { 7827 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 7828 mdname(mddev)); 7829 return -EINVAL; 7830 } 7831 7832 atomic_set(&conf->reshape_stripes, 0); 7833 spin_lock_irq(&conf->device_lock); 7834 write_seqcount_begin(&conf->gen_lock); 7835 conf->previous_raid_disks = conf->raid_disks; 7836 conf->raid_disks += mddev->delta_disks; 7837 conf->prev_chunk_sectors = conf->chunk_sectors; 7838 conf->chunk_sectors = mddev->new_chunk_sectors; 7839 conf->prev_algo = conf->algorithm; 7840 conf->algorithm = mddev->new_layout; 7841 conf->generation++; 7842 /* Code that selects data_offset needs to see the generation update 7843 * if reshape_progress has been set - so a memory barrier needed. 7844 */ 7845 smp_mb(); 7846 if (mddev->reshape_backwards) 7847 conf->reshape_progress = raid5_size(mddev, 0, 0); 7848 else 7849 conf->reshape_progress = 0; 7850 conf->reshape_safe = conf->reshape_progress; 7851 write_seqcount_end(&conf->gen_lock); 7852 spin_unlock_irq(&conf->device_lock); 7853 7854 /* Now make sure any requests that proceeded on the assumption 7855 * the reshape wasn't running - like Discard or Read - have 7856 * completed. 7857 */ 7858 mddev_suspend(mddev); 7859 mddev_resume(mddev); 7860 7861 /* Add some new drives, as many as will fit. 7862 * We know there are enough to make the newly sized array work. 7863 * Don't add devices if we are reducing the number of 7864 * devices in the array. This is because it is not possible 7865 * to correctly record the "partially reconstructed" state of 7866 * such devices during the reshape and confusion could result. 7867 */ 7868 if (mddev->delta_disks >= 0) { 7869 rdev_for_each(rdev, mddev) 7870 if (rdev->raid_disk < 0 && 7871 !test_bit(Faulty, &rdev->flags)) { 7872 if (raid5_add_disk(mddev, rdev) == 0) { 7873 if (rdev->raid_disk 7874 >= conf->previous_raid_disks) 7875 set_bit(In_sync, &rdev->flags); 7876 else 7877 rdev->recovery_offset = 0; 7878 7879 if (sysfs_link_rdev(mddev, rdev)) 7880 /* Failure here is OK */; 7881 } 7882 } else if (rdev->raid_disk >= conf->previous_raid_disks 7883 && !test_bit(Faulty, &rdev->flags)) { 7884 /* This is a spare that was manually added */ 7885 set_bit(In_sync, &rdev->flags); 7886 } 7887 7888 /* When a reshape changes the number of devices, 7889 * ->degraded is measured against the larger of the 7890 * pre and post number of devices. 7891 */ 7892 spin_lock_irqsave(&conf->device_lock, flags); 7893 mddev->degraded = raid5_calc_degraded(conf); 7894 spin_unlock_irqrestore(&conf->device_lock, flags); 7895 } 7896 mddev->raid_disks = conf->raid_disks; 7897 mddev->reshape_position = conf->reshape_progress; 7898 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7899 7900 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7901 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7902 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7903 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7904 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7905 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7906 "reshape"); 7907 if (!mddev->sync_thread) { 7908 mddev->recovery = 0; 7909 spin_lock_irq(&conf->device_lock); 7910 write_seqcount_begin(&conf->gen_lock); 7911 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 7912 mddev->new_chunk_sectors = 7913 conf->chunk_sectors = conf->prev_chunk_sectors; 7914 mddev->new_layout = conf->algorithm = conf->prev_algo; 7915 rdev_for_each(rdev, mddev) 7916 rdev->new_data_offset = rdev->data_offset; 7917 smp_wmb(); 7918 conf->generation --; 7919 conf->reshape_progress = MaxSector; 7920 mddev->reshape_position = MaxSector; 7921 write_seqcount_end(&conf->gen_lock); 7922 spin_unlock_irq(&conf->device_lock); 7923 return -EAGAIN; 7924 } 7925 conf->reshape_checkpoint = jiffies; 7926 md_wakeup_thread(mddev->sync_thread); 7927 md_new_event(mddev); 7928 return 0; 7929 } 7930 7931 /* This is called from the reshape thread and should make any 7932 * changes needed in 'conf' 7933 */ 7934 static void end_reshape(struct r5conf *conf) 7935 { 7936 7937 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 7938 struct md_rdev *rdev; 7939 7940 spin_lock_irq(&conf->device_lock); 7941 conf->previous_raid_disks = conf->raid_disks; 7942 rdev_for_each(rdev, conf->mddev) 7943 rdev->data_offset = rdev->new_data_offset; 7944 smp_wmb(); 7945 conf->reshape_progress = MaxSector; 7946 conf->mddev->reshape_position = MaxSector; 7947 spin_unlock_irq(&conf->device_lock); 7948 wake_up(&conf->wait_for_overlap); 7949 7950 /* read-ahead size must cover two whole stripes, which is 7951 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 7952 */ 7953 if (conf->mddev->queue) { 7954 int data_disks = conf->raid_disks - conf->max_degraded; 7955 int stripe = data_disks * ((conf->chunk_sectors << 9) 7956 / PAGE_SIZE); 7957 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7958 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7959 } 7960 } 7961 } 7962 7963 /* This is called from the raid5d thread with mddev_lock held. 7964 * It makes config changes to the device. 7965 */ 7966 static void raid5_finish_reshape(struct mddev *mddev) 7967 { 7968 struct r5conf *conf = mddev->private; 7969 7970 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7971 7972 if (mddev->delta_disks > 0) { 7973 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7974 if (mddev->queue) { 7975 set_capacity(mddev->gendisk, mddev->array_sectors); 7976 revalidate_disk(mddev->gendisk); 7977 } 7978 } else { 7979 int d; 7980 spin_lock_irq(&conf->device_lock); 7981 mddev->degraded = raid5_calc_degraded(conf); 7982 spin_unlock_irq(&conf->device_lock); 7983 for (d = conf->raid_disks ; 7984 d < conf->raid_disks - mddev->delta_disks; 7985 d++) { 7986 struct md_rdev *rdev = conf->disks[d].rdev; 7987 if (rdev) 7988 clear_bit(In_sync, &rdev->flags); 7989 rdev = conf->disks[d].replacement; 7990 if (rdev) 7991 clear_bit(In_sync, &rdev->flags); 7992 } 7993 } 7994 mddev->layout = conf->algorithm; 7995 mddev->chunk_sectors = conf->chunk_sectors; 7996 mddev->reshape_position = MaxSector; 7997 mddev->delta_disks = 0; 7998 mddev->reshape_backwards = 0; 7999 } 8000 } 8001 8002 static void raid5_quiesce(struct mddev *mddev, int state) 8003 { 8004 struct r5conf *conf = mddev->private; 8005 8006 switch(state) { 8007 case 2: /* resume for a suspend */ 8008 wake_up(&conf->wait_for_overlap); 8009 break; 8010 8011 case 1: /* stop all writes */ 8012 lock_all_device_hash_locks_irq(conf); 8013 /* '2' tells resync/reshape to pause so that all 8014 * active stripes can drain 8015 */ 8016 r5c_flush_cache(conf, INT_MAX); 8017 conf->quiesce = 2; 8018 wait_event_cmd(conf->wait_for_quiescent, 8019 atomic_read(&conf->active_stripes) == 0 && 8020 atomic_read(&conf->active_aligned_reads) == 0, 8021 unlock_all_device_hash_locks_irq(conf), 8022 lock_all_device_hash_locks_irq(conf)); 8023 conf->quiesce = 1; 8024 unlock_all_device_hash_locks_irq(conf); 8025 /* allow reshape to continue */ 8026 wake_up(&conf->wait_for_overlap); 8027 break; 8028 8029 case 0: /* re-enable writes */ 8030 lock_all_device_hash_locks_irq(conf); 8031 conf->quiesce = 0; 8032 wake_up(&conf->wait_for_quiescent); 8033 wake_up(&conf->wait_for_overlap); 8034 unlock_all_device_hash_locks_irq(conf); 8035 break; 8036 } 8037 r5l_quiesce(conf->log, state); 8038 } 8039 8040 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8041 { 8042 struct r0conf *raid0_conf = mddev->private; 8043 sector_t sectors; 8044 8045 /* for raid0 takeover only one zone is supported */ 8046 if (raid0_conf->nr_strip_zones > 1) { 8047 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8048 mdname(mddev)); 8049 return ERR_PTR(-EINVAL); 8050 } 8051 8052 sectors = raid0_conf->strip_zone[0].zone_end; 8053 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8054 mddev->dev_sectors = sectors; 8055 mddev->new_level = level; 8056 mddev->new_layout = ALGORITHM_PARITY_N; 8057 mddev->new_chunk_sectors = mddev->chunk_sectors; 8058 mddev->raid_disks += 1; 8059 mddev->delta_disks = 1; 8060 /* make sure it will be not marked as dirty */ 8061 mddev->recovery_cp = MaxSector; 8062 8063 return setup_conf(mddev); 8064 } 8065 8066 static void *raid5_takeover_raid1(struct mddev *mddev) 8067 { 8068 int chunksect; 8069 void *ret; 8070 8071 if (mddev->raid_disks != 2 || 8072 mddev->degraded > 1) 8073 return ERR_PTR(-EINVAL); 8074 8075 /* Should check if there are write-behind devices? */ 8076 8077 chunksect = 64*2; /* 64K by default */ 8078 8079 /* The array must be an exact multiple of chunksize */ 8080 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8081 chunksect >>= 1; 8082 8083 if ((chunksect<<9) < STRIPE_SIZE) 8084 /* array size does not allow a suitable chunk size */ 8085 return ERR_PTR(-EINVAL); 8086 8087 mddev->new_level = 5; 8088 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8089 mddev->new_chunk_sectors = chunksect; 8090 8091 ret = setup_conf(mddev); 8092 if (!IS_ERR(ret)) 8093 mddev_clear_unsupported_flags(mddev, 8094 UNSUPPORTED_MDDEV_FLAGS); 8095 return ret; 8096 } 8097 8098 static void *raid5_takeover_raid6(struct mddev *mddev) 8099 { 8100 int new_layout; 8101 8102 switch (mddev->layout) { 8103 case ALGORITHM_LEFT_ASYMMETRIC_6: 8104 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8105 break; 8106 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8107 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8108 break; 8109 case ALGORITHM_LEFT_SYMMETRIC_6: 8110 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8111 break; 8112 case ALGORITHM_RIGHT_SYMMETRIC_6: 8113 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8114 break; 8115 case ALGORITHM_PARITY_0_6: 8116 new_layout = ALGORITHM_PARITY_0; 8117 break; 8118 case ALGORITHM_PARITY_N: 8119 new_layout = ALGORITHM_PARITY_N; 8120 break; 8121 default: 8122 return ERR_PTR(-EINVAL); 8123 } 8124 mddev->new_level = 5; 8125 mddev->new_layout = new_layout; 8126 mddev->delta_disks = -1; 8127 mddev->raid_disks -= 1; 8128 return setup_conf(mddev); 8129 } 8130 8131 static int raid5_check_reshape(struct mddev *mddev) 8132 { 8133 /* For a 2-drive array, the layout and chunk size can be changed 8134 * immediately as not restriping is needed. 8135 * For larger arrays we record the new value - after validation 8136 * to be used by a reshape pass. 8137 */ 8138 struct r5conf *conf = mddev->private; 8139 int new_chunk = mddev->new_chunk_sectors; 8140 8141 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8142 return -EINVAL; 8143 if (new_chunk > 0) { 8144 if (!is_power_of_2(new_chunk)) 8145 return -EINVAL; 8146 if (new_chunk < (PAGE_SIZE>>9)) 8147 return -EINVAL; 8148 if (mddev->array_sectors & (new_chunk-1)) 8149 /* not factor of array size */ 8150 return -EINVAL; 8151 } 8152 8153 /* They look valid */ 8154 8155 if (mddev->raid_disks == 2) { 8156 /* can make the change immediately */ 8157 if (mddev->new_layout >= 0) { 8158 conf->algorithm = mddev->new_layout; 8159 mddev->layout = mddev->new_layout; 8160 } 8161 if (new_chunk > 0) { 8162 conf->chunk_sectors = new_chunk ; 8163 mddev->chunk_sectors = new_chunk; 8164 } 8165 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8166 md_wakeup_thread(mddev->thread); 8167 } 8168 return check_reshape(mddev); 8169 } 8170 8171 static int raid6_check_reshape(struct mddev *mddev) 8172 { 8173 int new_chunk = mddev->new_chunk_sectors; 8174 8175 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8176 return -EINVAL; 8177 if (new_chunk > 0) { 8178 if (!is_power_of_2(new_chunk)) 8179 return -EINVAL; 8180 if (new_chunk < (PAGE_SIZE >> 9)) 8181 return -EINVAL; 8182 if (mddev->array_sectors & (new_chunk-1)) 8183 /* not factor of array size */ 8184 return -EINVAL; 8185 } 8186 8187 /* They look valid */ 8188 return check_reshape(mddev); 8189 } 8190 8191 static void *raid5_takeover(struct mddev *mddev) 8192 { 8193 /* raid5 can take over: 8194 * raid0 - if there is only one strip zone - make it a raid4 layout 8195 * raid1 - if there are two drives. We need to know the chunk size 8196 * raid4 - trivial - just use a raid4 layout. 8197 * raid6 - Providing it is a *_6 layout 8198 */ 8199 if (mddev->level == 0) 8200 return raid45_takeover_raid0(mddev, 5); 8201 if (mddev->level == 1) 8202 return raid5_takeover_raid1(mddev); 8203 if (mddev->level == 4) { 8204 mddev->new_layout = ALGORITHM_PARITY_N; 8205 mddev->new_level = 5; 8206 return setup_conf(mddev); 8207 } 8208 if (mddev->level == 6) 8209 return raid5_takeover_raid6(mddev); 8210 8211 return ERR_PTR(-EINVAL); 8212 } 8213 8214 static void *raid4_takeover(struct mddev *mddev) 8215 { 8216 /* raid4 can take over: 8217 * raid0 - if there is only one strip zone 8218 * raid5 - if layout is right 8219 */ 8220 if (mddev->level == 0) 8221 return raid45_takeover_raid0(mddev, 4); 8222 if (mddev->level == 5 && 8223 mddev->layout == ALGORITHM_PARITY_N) { 8224 mddev->new_layout = 0; 8225 mddev->new_level = 4; 8226 return setup_conf(mddev); 8227 } 8228 return ERR_PTR(-EINVAL); 8229 } 8230 8231 static struct md_personality raid5_personality; 8232 8233 static void *raid6_takeover(struct mddev *mddev) 8234 { 8235 /* Currently can only take over a raid5. We map the 8236 * personality to an equivalent raid6 personality 8237 * with the Q block at the end. 8238 */ 8239 int new_layout; 8240 8241 if (mddev->pers != &raid5_personality) 8242 return ERR_PTR(-EINVAL); 8243 if (mddev->degraded > 1) 8244 return ERR_PTR(-EINVAL); 8245 if (mddev->raid_disks > 253) 8246 return ERR_PTR(-EINVAL); 8247 if (mddev->raid_disks < 3) 8248 return ERR_PTR(-EINVAL); 8249 8250 switch (mddev->layout) { 8251 case ALGORITHM_LEFT_ASYMMETRIC: 8252 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8253 break; 8254 case ALGORITHM_RIGHT_ASYMMETRIC: 8255 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8256 break; 8257 case ALGORITHM_LEFT_SYMMETRIC: 8258 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8259 break; 8260 case ALGORITHM_RIGHT_SYMMETRIC: 8261 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8262 break; 8263 case ALGORITHM_PARITY_0: 8264 new_layout = ALGORITHM_PARITY_0_6; 8265 break; 8266 case ALGORITHM_PARITY_N: 8267 new_layout = ALGORITHM_PARITY_N; 8268 break; 8269 default: 8270 return ERR_PTR(-EINVAL); 8271 } 8272 mddev->new_level = 6; 8273 mddev->new_layout = new_layout; 8274 mddev->delta_disks = 1; 8275 mddev->raid_disks += 1; 8276 return setup_conf(mddev); 8277 } 8278 8279 static void raid5_reset_stripe_cache(struct mddev *mddev) 8280 { 8281 struct r5conf *conf = mddev->private; 8282 8283 mutex_lock(&conf->cache_size_mutex); 8284 while (conf->max_nr_stripes && 8285 drop_one_stripe(conf)) 8286 ; 8287 while (conf->min_nr_stripes > conf->max_nr_stripes && 8288 grow_one_stripe(conf, GFP_KERNEL)) 8289 ; 8290 mutex_unlock(&conf->cache_size_mutex); 8291 } 8292 8293 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) 8294 { 8295 struct r5conf *conf; 8296 int err; 8297 8298 err = mddev_lock(mddev); 8299 if (err) 8300 return err; 8301 conf = mddev->private; 8302 if (!conf) { 8303 mddev_unlock(mddev); 8304 return -ENODEV; 8305 } 8306 8307 if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) { 8308 /* ppl only works with RAID 5 */ 8309 if (conf->level == 5) { 8310 mddev_suspend(mddev); 8311 set_bit(MD_HAS_PPL, &mddev->flags); 8312 err = log_init(conf, NULL); 8313 if (!err) 8314 raid5_reset_stripe_cache(mddev); 8315 mddev_resume(mddev); 8316 } else 8317 err = -EINVAL; 8318 } else if (strncmp(buf, "resync", 6) == 0) { 8319 if (raid5_has_ppl(conf)) { 8320 mddev_suspend(mddev); 8321 log_exit(conf); 8322 raid5_reset_stripe_cache(mddev); 8323 mddev_resume(mddev); 8324 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && 8325 r5l_log_disk_error(conf)) { 8326 bool journal_dev_exists = false; 8327 struct md_rdev *rdev; 8328 8329 rdev_for_each(rdev, mddev) 8330 if (test_bit(Journal, &rdev->flags)) { 8331 journal_dev_exists = true; 8332 break; 8333 } 8334 8335 if (!journal_dev_exists) { 8336 mddev_suspend(mddev); 8337 clear_bit(MD_HAS_JOURNAL, &mddev->flags); 8338 mddev_resume(mddev); 8339 } else /* need remove journal device first */ 8340 err = -EBUSY; 8341 } else 8342 err = -EINVAL; 8343 } else { 8344 err = -EINVAL; 8345 } 8346 8347 if (!err) 8348 md_update_sb(mddev, 1); 8349 8350 mddev_unlock(mddev); 8351 8352 return err; 8353 } 8354 8355 static struct md_personality raid6_personality = 8356 { 8357 .name = "raid6", 8358 .level = 6, 8359 .owner = THIS_MODULE, 8360 .make_request = raid5_make_request, 8361 .run = raid5_run, 8362 .free = raid5_free, 8363 .status = raid5_status, 8364 .error_handler = raid5_error, 8365 .hot_add_disk = raid5_add_disk, 8366 .hot_remove_disk= raid5_remove_disk, 8367 .spare_active = raid5_spare_active, 8368 .sync_request = raid5_sync_request, 8369 .resize = raid5_resize, 8370 .size = raid5_size, 8371 .check_reshape = raid6_check_reshape, 8372 .start_reshape = raid5_start_reshape, 8373 .finish_reshape = raid5_finish_reshape, 8374 .quiesce = raid5_quiesce, 8375 .takeover = raid6_takeover, 8376 .congested = raid5_congested, 8377 .change_consistency_policy = raid5_change_consistency_policy, 8378 }; 8379 static struct md_personality raid5_personality = 8380 { 8381 .name = "raid5", 8382 .level = 5, 8383 .owner = THIS_MODULE, 8384 .make_request = raid5_make_request, 8385 .run = raid5_run, 8386 .free = raid5_free, 8387 .status = raid5_status, 8388 .error_handler = raid5_error, 8389 .hot_add_disk = raid5_add_disk, 8390 .hot_remove_disk= raid5_remove_disk, 8391 .spare_active = raid5_spare_active, 8392 .sync_request = raid5_sync_request, 8393 .resize = raid5_resize, 8394 .size = raid5_size, 8395 .check_reshape = raid5_check_reshape, 8396 .start_reshape = raid5_start_reshape, 8397 .finish_reshape = raid5_finish_reshape, 8398 .quiesce = raid5_quiesce, 8399 .takeover = raid5_takeover, 8400 .congested = raid5_congested, 8401 .change_consistency_policy = raid5_change_consistency_policy, 8402 }; 8403 8404 static struct md_personality raid4_personality = 8405 { 8406 .name = "raid4", 8407 .level = 4, 8408 .owner = THIS_MODULE, 8409 .make_request = raid5_make_request, 8410 .run = raid5_run, 8411 .free = raid5_free, 8412 .status = raid5_status, 8413 .error_handler = raid5_error, 8414 .hot_add_disk = raid5_add_disk, 8415 .hot_remove_disk= raid5_remove_disk, 8416 .spare_active = raid5_spare_active, 8417 .sync_request = raid5_sync_request, 8418 .resize = raid5_resize, 8419 .size = raid5_size, 8420 .check_reshape = raid5_check_reshape, 8421 .start_reshape = raid5_start_reshape, 8422 .finish_reshape = raid5_finish_reshape, 8423 .quiesce = raid5_quiesce, 8424 .takeover = raid4_takeover, 8425 .congested = raid5_congested, 8426 .change_consistency_policy = raid5_change_consistency_policy, 8427 }; 8428 8429 static int __init raid5_init(void) 8430 { 8431 int ret; 8432 8433 raid5_wq = alloc_workqueue("raid5wq", 8434 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 8435 if (!raid5_wq) 8436 return -ENOMEM; 8437 8438 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 8439 "md/raid5:prepare", 8440 raid456_cpu_up_prepare, 8441 raid456_cpu_dead); 8442 if (ret) { 8443 destroy_workqueue(raid5_wq); 8444 return ret; 8445 } 8446 register_md_personality(&raid6_personality); 8447 register_md_personality(&raid5_personality); 8448 register_md_personality(&raid4_personality); 8449 return 0; 8450 } 8451 8452 static void raid5_exit(void) 8453 { 8454 unregister_md_personality(&raid6_personality); 8455 unregister_md_personality(&raid5_personality); 8456 unregister_md_personality(&raid4_personality); 8457 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 8458 destroy_workqueue(raid5_wq); 8459 } 8460 8461 module_init(raid5_init); 8462 module_exit(raid5_exit); 8463 MODULE_LICENSE("GPL"); 8464 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 8465 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 8466 MODULE_ALIAS("md-raid5"); 8467 MODULE_ALIAS("md-raid4"); 8468 MODULE_ALIAS("md-level-5"); 8469 MODULE_ALIAS("md-level-4"); 8470 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 8471 MODULE_ALIAS("md-raid6"); 8472 MODULE_ALIAS("md-level-6"); 8473 8474 /* This used to be two separate modules, they were: */ 8475 MODULE_ALIAS("raid5"); 8476 MODULE_ALIAS("raid6"); 8477