/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in.  This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */
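/*
 * Illustrative sketch (not part of the driver): a hypothetical helper that
 * models the batch sequencing described above with plain integers standing
 * in for sh->bm_seq and conf->seq_write.  A stripe recorded in batch bm_seq
 * may only have its bitmap bits considered stable once that batch has been
 * written out; this is the same "sh->bm_seq - conf->seq_write > 0" test that
 * do_release_stripe() below uses to park a stripe on conf->bitmap_list.
 */
static inline int bitmap_batch_example_must_wait(int bm_seq, int seq_write)
{
	/* true while the stripe's batch has not yet reached stable storage */
	return bm_seq - seq_write > 0;
}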
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_lock_irq(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
		spin_unlock(conf->hash_locks + i);
	spin_unlock_irq(conf->hash_locks);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}
/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}
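/*
 * Illustrative worked example (not part of the driver): for the common md
 * layout (ddf_layout == 0) with 6 devices, pd_idx == 4 and qd_idx == 5,
 * walking the devices from raid6_d0() yields slots 0..3 for the data
 * devices, slot 4 (== syndrome_disks) for P and slot 5 for Q.  The
 * hypothetical helper below reproduces just that non-ddf branch of
 * raid6_idx_to_slot() with plain integers, purely as a reading aid.
 */
static inline int raid6_slot_example(int idx, int pd_idx, int qd_idx,
				     int syndrome_disks, int *count)
{
	if (idx == pd_idx)
		return syndrome_disks;		/* P parity slot */
	if (idx == qd_idx)
		return syndrome_disks + 1;	/* Q syndrome slot */
	return (*count)++;			/* next data slot in walk order */
}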
static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static bool stripe_is_lowprio(struct stripe_head *sh)
{
	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		if (stripe_is_lowprio(sh))
			list_add_tail(&sh->lru, &group->loprio_list);
		else
			list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * In the following cases, the stripe cannot be released to cached
	 * lists. Therefore, we make the stripe write out and set
	 * STRIPE_HANDLE:
	 * 1. when the array is quiesced in r5c write back;
	 * 2. when resync is requested for the stripe.
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				if (stripe_is_lowprio(sh))
					list_add_tail(&sh->lru,
							&conf->loprio_list);
				else
					list_add_tail(&sh->lru,
							&conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}
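/*
 * Illustrative sketch (not part of the driver): the r5c write-back branch of
 * do_release_stripe() above effectively classifies an idle stripe by how many
 * of its data pages carry R5_InJournal, where data_disks stands in for
 * conf->raid_disks - conf->max_degraded.  A hypothetical helper making that
 * three-way choice explicit:
 */
enum r5c_example_list { R5C_EX_INACTIVE, R5C_EX_FULL, R5C_EX_PARTIAL };

static inline enum r5c_example_list
r5c_release_list_example(int injournal, int data_disks)
{
	if (injournal == 0)
		return R5C_EX_INACTIVE;	/* nothing cached: stripe can be reused */
	if (injournal == data_disks)
		return R5C_EX_FULL;	/* every data page cached: full-stripe list */
	return R5C_EX_PARTIAL;		/* some pages cached: partial-stripe list */
}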
/*
 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
 *
 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 * a given time. Adding stripes only takes the device lock, while deleting
 * stripes only takes the hash lock.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, raid5_get_active_stripe() might
		 * remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (atomic_read(&conf->active_stripes) == 0)
			wake_up(&conf->wait_for_quiescent);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh, *t;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	llist_for_each_entry_safe(sh, t, head, release_list) {
		int hash;

		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry the bit is set here, because if the bit is set
		 * again, the count is always > 1. This is true for
		 * STRIPE_ON_UNPLUG_LIST bit too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}

void raid5_release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	/* Avoid release_list until the last reference.
	 */
	if (atomic_add_unless(&sh->count, -1, 1))
		return;

	if (unlikely(!conf->mddev->thread) ||
	    test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock_irqrestore(&conf->device_lock, flags);
		release_inactive_stripe_list(conf, &list, hash);
	}
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}

/* find an idle stripe, make sure it is unhashed, and return it.
*/ 440 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 441 { 442 struct stripe_head *sh = NULL; 443 struct list_head *first; 444 445 if (list_empty(conf->inactive_list + hash)) 446 goto out; 447 first = (conf->inactive_list + hash)->next; 448 sh = list_entry(first, struct stripe_head, lru); 449 list_del_init(first); 450 remove_hash(sh); 451 atomic_inc(&conf->active_stripes); 452 BUG_ON(hash != sh->hash_lock_index); 453 if (list_empty(conf->inactive_list + hash)) 454 atomic_inc(&conf->empty_inactive_list_nr); 455 out: 456 return sh; 457 } 458 459 static void shrink_buffers(struct stripe_head *sh) 460 { 461 struct page *p; 462 int i; 463 int num = sh->raid_conf->pool_size; 464 465 for (i = 0; i < num ; i++) { 466 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 467 p = sh->dev[i].page; 468 if (!p) 469 continue; 470 sh->dev[i].page = NULL; 471 put_page(p); 472 } 473 } 474 475 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 476 { 477 int i; 478 int num = sh->raid_conf->pool_size; 479 480 for (i = 0; i < num; i++) { 481 struct page *page; 482 483 if (!(page = alloc_page(gfp))) { 484 return 1; 485 } 486 sh->dev[i].page = page; 487 sh->dev[i].orig_page = page; 488 } 489 490 return 0; 491 } 492 493 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 494 struct stripe_head *sh); 495 496 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 497 { 498 struct r5conf *conf = sh->raid_conf; 499 int i, seq; 500 501 BUG_ON(atomic_read(&sh->count) != 0); 502 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 503 BUG_ON(stripe_operations_active(sh)); 504 BUG_ON(sh->batch_head); 505 506 pr_debug("init_stripe called, stripe %llu\n", 507 (unsigned long long)sector); 508 retry: 509 seq = read_seqcount_begin(&conf->gen_lock); 510 sh->generation = conf->generation - previous; 511 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 512 sh->sector = sector; 513 stripe_set_idx(sector, conf, previous, sh); 514 sh->state = 0; 515 516 for (i = sh->disks; i--; ) { 517 struct r5dev *dev = &sh->dev[i]; 518 519 if (dev->toread || dev->read || dev->towrite || dev->written || 520 test_bit(R5_LOCKED, &dev->flags)) { 521 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 522 (unsigned long long)sh->sector, i, dev->toread, 523 dev->read, dev->towrite, dev->written, 524 test_bit(R5_LOCKED, &dev->flags)); 525 WARN_ON(1); 526 } 527 dev->flags = 0; 528 dev->sector = raid5_compute_blocknr(sh, i, previous); 529 } 530 if (read_seqcount_retry(&conf->gen_lock, seq)) 531 goto retry; 532 sh->overwrite_disks = 0; 533 insert_hash(conf, sh); 534 sh->cpu = smp_processor_id(); 535 set_bit(STRIPE_BATCH_READY, &sh->state); 536 } 537 538 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 539 short generation) 540 { 541 struct stripe_head *sh; 542 543 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 544 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 545 if (sh->sector == sector && sh->generation == generation) 546 return sh; 547 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 548 return NULL; 549 } 550 551 /* 552 * Need to check if array has failed when deciding whether to: 553 * - start an array 554 * - remove non-faulty devices 555 * - add a spare 556 * - allow a reshape 557 * This determination is simple when no reshape is happening. 558 * However if there is a reshape, we need to carefully check 559 * both the before and after sections. 
560 * This is because some failed devices may only affect one 561 * of the two sections, and some non-in_sync devices may 562 * be insync in the section most affected by failed devices. 563 */ 564 int raid5_calc_degraded(struct r5conf *conf) 565 { 566 int degraded, degraded2; 567 int i; 568 569 rcu_read_lock(); 570 degraded = 0; 571 for (i = 0; i < conf->previous_raid_disks; i++) { 572 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 573 if (rdev && test_bit(Faulty, &rdev->flags)) 574 rdev = rcu_dereference(conf->disks[i].replacement); 575 if (!rdev || test_bit(Faulty, &rdev->flags)) 576 degraded++; 577 else if (test_bit(In_sync, &rdev->flags)) 578 ; 579 else 580 /* not in-sync or faulty. 581 * If the reshape increases the number of devices, 582 * this is being recovered by the reshape, so 583 * this 'previous' section is not in_sync. 584 * If the number of devices is being reduced however, 585 * the device can only be part of the array if 586 * we are reverting a reshape, so this section will 587 * be in-sync. 588 */ 589 if (conf->raid_disks >= conf->previous_raid_disks) 590 degraded++; 591 } 592 rcu_read_unlock(); 593 if (conf->raid_disks == conf->previous_raid_disks) 594 return degraded; 595 rcu_read_lock(); 596 degraded2 = 0; 597 for (i = 0; i < conf->raid_disks; i++) { 598 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 599 if (rdev && test_bit(Faulty, &rdev->flags)) 600 rdev = rcu_dereference(conf->disks[i].replacement); 601 if (!rdev || test_bit(Faulty, &rdev->flags)) 602 degraded2++; 603 else if (test_bit(In_sync, &rdev->flags)) 604 ; 605 else 606 /* not in-sync or faulty. 607 * If reshape increases the number of devices, this 608 * section has already been recovered, else it 609 * almost certainly hasn't. 610 */ 611 if (conf->raid_disks <= conf->previous_raid_disks) 612 degraded2++; 613 } 614 rcu_read_unlock(); 615 if (degraded2 > degraded) 616 return degraded2; 617 return degraded; 618 } 619 620 static int has_failed(struct r5conf *conf) 621 { 622 int degraded; 623 624 if (conf->mddev->reshape_position == MaxSector) 625 return conf->mddev->degraded > conf->max_degraded; 626 627 degraded = raid5_calc_degraded(conf); 628 if (degraded > conf->max_degraded) 629 return 1; 630 return 0; 631 } 632 633 struct stripe_head * 634 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 635 int previous, int noblock, int noquiesce) 636 { 637 struct stripe_head *sh; 638 int hash = stripe_hash_locks_hash(sector); 639 int inc_empty_inactive_list_flag; 640 641 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 642 643 spin_lock_irq(conf->hash_locks + hash); 644 645 do { 646 wait_event_lock_irq(conf->wait_for_quiescent, 647 conf->quiesce == 0 || noquiesce, 648 *(conf->hash_locks + hash)); 649 sh = __find_stripe(conf, sector, conf->generation - previous); 650 if (!sh) { 651 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 652 sh = get_free_stripe(conf, hash); 653 if (!sh && !test_bit(R5_DID_ALLOC, 654 &conf->cache_state)) 655 set_bit(R5_ALLOC_MORE, 656 &conf->cache_state); 657 } 658 if (noblock && sh == NULL) 659 break; 660 661 r5c_check_stripe_cache_usage(conf); 662 if (!sh) { 663 set_bit(R5_INACTIVE_BLOCKED, 664 &conf->cache_state); 665 r5l_wake_reclaim(conf->log, 0); 666 wait_event_lock_irq( 667 conf->wait_for_stripe, 668 !list_empty(conf->inactive_list + hash) && 669 (atomic_read(&conf->active_stripes) 670 < (conf->max_nr_stripes * 3 / 4) 671 || !test_bit(R5_INACTIVE_BLOCKED, 672 &conf->cache_state)), 673 *(conf->hash_locks + hash)); 674 
clear_bit(R5_INACTIVE_BLOCKED, 675 &conf->cache_state); 676 } else { 677 init_stripe(sh, sector, previous); 678 atomic_inc(&sh->count); 679 } 680 } else if (!atomic_inc_not_zero(&sh->count)) { 681 spin_lock(&conf->device_lock); 682 if (!atomic_read(&sh->count)) { 683 if (!test_bit(STRIPE_HANDLE, &sh->state)) 684 atomic_inc(&conf->active_stripes); 685 BUG_ON(list_empty(&sh->lru) && 686 !test_bit(STRIPE_EXPANDING, &sh->state)); 687 inc_empty_inactive_list_flag = 0; 688 if (!list_empty(conf->inactive_list + hash)) 689 inc_empty_inactive_list_flag = 1; 690 list_del_init(&sh->lru); 691 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 692 atomic_inc(&conf->empty_inactive_list_nr); 693 if (sh->group) { 694 sh->group->stripes_cnt--; 695 sh->group = NULL; 696 } 697 } 698 atomic_inc(&sh->count); 699 spin_unlock(&conf->device_lock); 700 } 701 } while (sh == NULL); 702 703 spin_unlock_irq(conf->hash_locks + hash); 704 return sh; 705 } 706 707 static bool is_full_stripe_write(struct stripe_head *sh) 708 { 709 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 710 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 711 } 712 713 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 714 { 715 if (sh1 > sh2) { 716 spin_lock_irq(&sh2->stripe_lock); 717 spin_lock_nested(&sh1->stripe_lock, 1); 718 } else { 719 spin_lock_irq(&sh1->stripe_lock); 720 spin_lock_nested(&sh2->stripe_lock, 1); 721 } 722 } 723 724 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 725 { 726 spin_unlock(&sh1->stripe_lock); 727 spin_unlock_irq(&sh2->stripe_lock); 728 } 729 730 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 731 static bool stripe_can_batch(struct stripe_head *sh) 732 { 733 struct r5conf *conf = sh->raid_conf; 734 735 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 736 return false; 737 return test_bit(STRIPE_BATCH_READY, &sh->state) && 738 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 739 is_full_stripe_write(sh); 740 } 741 742 /* we only do back search */ 743 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 744 { 745 struct stripe_head *head; 746 sector_t head_sector, tmp_sec; 747 int hash; 748 int dd_idx; 749 int inc_empty_inactive_list_flag; 750 751 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 752 tmp_sec = sh->sector; 753 if (!sector_div(tmp_sec, conf->chunk_sectors)) 754 return; 755 head_sector = sh->sector - STRIPE_SECTORS; 756 757 hash = stripe_hash_locks_hash(head_sector); 758 spin_lock_irq(conf->hash_locks + hash); 759 head = __find_stripe(conf, head_sector, conf->generation); 760 if (head && !atomic_inc_not_zero(&head->count)) { 761 spin_lock(&conf->device_lock); 762 if (!atomic_read(&head->count)) { 763 if (!test_bit(STRIPE_HANDLE, &head->state)) 764 atomic_inc(&conf->active_stripes); 765 BUG_ON(list_empty(&head->lru) && 766 !test_bit(STRIPE_EXPANDING, &head->state)); 767 inc_empty_inactive_list_flag = 0; 768 if (!list_empty(conf->inactive_list + hash)) 769 inc_empty_inactive_list_flag = 1; 770 list_del_init(&head->lru); 771 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 772 atomic_inc(&conf->empty_inactive_list_nr); 773 if (head->group) { 774 head->group->stripes_cnt--; 775 head->group = NULL; 776 } 777 } 778 atomic_inc(&head->count); 779 spin_unlock(&conf->device_lock); 780 } 781 spin_unlock_irq(conf->hash_locks + hash); 782 783 if (!head) 784 return; 785 if 
(!stripe_can_batch(head)) 786 goto out; 787 788 lock_two_stripes(head, sh); 789 /* clear_batch_ready clear the flag */ 790 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 791 goto unlock_out; 792 793 if (sh->batch_head) 794 goto unlock_out; 795 796 dd_idx = 0; 797 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 798 dd_idx++; 799 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 800 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 801 goto unlock_out; 802 803 if (head->batch_head) { 804 spin_lock(&head->batch_head->batch_lock); 805 /* This batch list is already running */ 806 if (!stripe_can_batch(head)) { 807 spin_unlock(&head->batch_head->batch_lock); 808 goto unlock_out; 809 } 810 /* 811 * We must assign batch_head of this stripe within the 812 * batch_lock, otherwise clear_batch_ready of batch head 813 * stripe could clear BATCH_READY bit of this stripe and 814 * this stripe->batch_head doesn't get assigned, which 815 * could confuse clear_batch_ready for this stripe 816 */ 817 sh->batch_head = head->batch_head; 818 819 /* 820 * at this point, head's BATCH_READY could be cleared, but we 821 * can still add the stripe to batch list 822 */ 823 list_add(&sh->batch_list, &head->batch_list); 824 spin_unlock(&head->batch_head->batch_lock); 825 } else { 826 head->batch_head = head; 827 sh->batch_head = head->batch_head; 828 spin_lock(&head->batch_lock); 829 list_add_tail(&sh->batch_list, &head->batch_list); 830 spin_unlock(&head->batch_lock); 831 } 832 833 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 834 if (atomic_dec_return(&conf->preread_active_stripes) 835 < IO_THRESHOLD) 836 md_wakeup_thread(conf->mddev->thread); 837 838 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 839 int seq = sh->bm_seq; 840 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 841 sh->batch_head->bm_seq > seq) 842 seq = sh->batch_head->bm_seq; 843 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 844 sh->batch_head->bm_seq = seq; 845 } 846 847 atomic_inc(&sh->count); 848 unlock_out: 849 unlock_two_stripes(head, sh); 850 out: 851 raid5_release_stripe(head); 852 } 853 854 /* Determine if 'data_offset' or 'new_data_offset' should be used 855 * in this stripe_head. 856 */ 857 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 858 { 859 sector_t progress = conf->reshape_progress; 860 /* Need a memory barrier to make sure we see the value 861 * of conf->generation, or ->data_offset that was set before 862 * reshape_progress was updated. 863 */ 864 smp_rmb(); 865 if (progress == MaxSector) 866 return 0; 867 if (sh->generation == conf->generation - 1) 868 return 0; 869 /* We are in a reshape, and this is a new-generation stripe, 870 * so use new_data_offset. 
871 */ 872 return 1; 873 } 874 875 static void dispatch_bio_list(struct bio_list *tmp) 876 { 877 struct bio *bio; 878 879 while ((bio = bio_list_pop(tmp))) 880 generic_make_request(bio); 881 } 882 883 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) 884 { 885 const struct r5pending_data *da = list_entry(a, 886 struct r5pending_data, sibling); 887 const struct r5pending_data *db = list_entry(b, 888 struct r5pending_data, sibling); 889 if (da->sector > db->sector) 890 return 1; 891 if (da->sector < db->sector) 892 return -1; 893 return 0; 894 } 895 896 static void dispatch_defer_bios(struct r5conf *conf, int target, 897 struct bio_list *list) 898 { 899 struct r5pending_data *data; 900 struct list_head *first, *next = NULL; 901 int cnt = 0; 902 903 if (conf->pending_data_cnt == 0) 904 return; 905 906 list_sort(NULL, &conf->pending_list, cmp_stripe); 907 908 first = conf->pending_list.next; 909 910 /* temporarily move the head */ 911 if (conf->next_pending_data) 912 list_move_tail(&conf->pending_list, 913 &conf->next_pending_data->sibling); 914 915 while (!list_empty(&conf->pending_list)) { 916 data = list_first_entry(&conf->pending_list, 917 struct r5pending_data, sibling); 918 if (&data->sibling == first) 919 first = data->sibling.next; 920 next = data->sibling.next; 921 922 bio_list_merge(list, &data->bios); 923 list_move(&data->sibling, &conf->free_list); 924 cnt++; 925 if (cnt >= target) 926 break; 927 } 928 conf->pending_data_cnt -= cnt; 929 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 930 931 if (next != &conf->pending_list) 932 conf->next_pending_data = list_entry(next, 933 struct r5pending_data, sibling); 934 else 935 conf->next_pending_data = NULL; 936 /* list isn't empty */ 937 if (first != &conf->pending_list) 938 list_move_tail(&conf->pending_list, first); 939 } 940 941 static void flush_deferred_bios(struct r5conf *conf) 942 { 943 struct bio_list tmp = BIO_EMPTY_LIST; 944 945 if (conf->pending_data_cnt == 0) 946 return; 947 948 spin_lock(&conf->pending_bios_lock); 949 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 950 BUG_ON(conf->pending_data_cnt != 0); 951 spin_unlock(&conf->pending_bios_lock); 952 953 dispatch_bio_list(&tmp); 954 } 955 956 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 957 struct bio_list *bios) 958 { 959 struct bio_list tmp = BIO_EMPTY_LIST; 960 struct r5pending_data *ent; 961 962 spin_lock(&conf->pending_bios_lock); 963 ent = list_first_entry(&conf->free_list, struct r5pending_data, 964 sibling); 965 list_move_tail(&ent->sibling, &conf->pending_list); 966 ent->sector = sector; 967 bio_list_init(&ent->bios); 968 bio_list_merge(&ent->bios, bios); 969 conf->pending_data_cnt++; 970 if (conf->pending_data_cnt >= PENDING_IO_MAX) 971 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); 972 973 spin_unlock(&conf->pending_bios_lock); 974 975 dispatch_bio_list(&tmp); 976 } 977 978 static void 979 raid5_end_read_request(struct bio *bi); 980 static void 981 raid5_end_write_request(struct bio *bi); 982 983 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 984 { 985 struct r5conf *conf = sh->raid_conf; 986 int i, disks = sh->disks; 987 struct stripe_head *head_sh = sh; 988 struct bio_list pending_bios = BIO_EMPTY_LIST; 989 bool should_defer; 990 991 might_sleep(); 992 993 if (log_stripe(sh, s) == 0) 994 return; 995 996 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 997 998 for (i = disks; i--; ) { 999 int op, op_flags = 0; 1000 int replace_only = 0; 1001 struct bio 
*bi, *rbi; 1002 struct md_rdev *rdev, *rrdev = NULL; 1003 1004 sh = head_sh; 1005 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1006 op = REQ_OP_WRITE; 1007 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1008 op_flags = REQ_FUA; 1009 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1010 op = REQ_OP_DISCARD; 1011 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1012 op = REQ_OP_READ; 1013 else if (test_and_clear_bit(R5_WantReplace, 1014 &sh->dev[i].flags)) { 1015 op = REQ_OP_WRITE; 1016 replace_only = 1; 1017 } else 1018 continue; 1019 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1020 op_flags |= REQ_SYNC; 1021 1022 again: 1023 bi = &sh->dev[i].req; 1024 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 1025 1026 rcu_read_lock(); 1027 rrdev = rcu_dereference(conf->disks[i].replacement); 1028 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 1029 rdev = rcu_dereference(conf->disks[i].rdev); 1030 if (!rdev) { 1031 rdev = rrdev; 1032 rrdev = NULL; 1033 } 1034 if (op_is_write(op)) { 1035 if (replace_only) 1036 rdev = NULL; 1037 if (rdev == rrdev) 1038 /* We raced and saw duplicates */ 1039 rrdev = NULL; 1040 } else { 1041 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1042 rdev = rrdev; 1043 rrdev = NULL; 1044 } 1045 1046 if (rdev && test_bit(Faulty, &rdev->flags)) 1047 rdev = NULL; 1048 if (rdev) 1049 atomic_inc(&rdev->nr_pending); 1050 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1051 rrdev = NULL; 1052 if (rrdev) 1053 atomic_inc(&rrdev->nr_pending); 1054 rcu_read_unlock(); 1055 1056 /* We have already checked bad blocks for reads. Now 1057 * need to check for writes. We never accept write errors 1058 * on the replacement, so we don't to check rrdev. 1059 */ 1060 while (op_is_write(op) && rdev && 1061 test_bit(WriteErrorSeen, &rdev->flags)) { 1062 sector_t first_bad; 1063 int bad_sectors; 1064 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 1065 &first_bad, &bad_sectors); 1066 if (!bad) 1067 break; 1068 1069 if (bad < 0) { 1070 set_bit(BlockedBadBlocks, &rdev->flags); 1071 if (!conf->mddev->external && 1072 conf->mddev->sb_flags) { 1073 /* It is very unlikely, but we might 1074 * still need to write out the 1075 * bad block log - better give it 1076 * a chance*/ 1077 md_check_recovery(conf->mddev); 1078 } 1079 /* 1080 * Because md_wait_for_blocked_rdev 1081 * will dec nr_pending, we must 1082 * increment it first. 1083 */ 1084 atomic_inc(&rdev->nr_pending); 1085 md_wait_for_blocked_rdev(rdev, conf->mddev); 1086 } else { 1087 /* Acknowledged bad block - skip the write */ 1088 rdev_dec_pending(rdev, conf->mddev); 1089 rdev = NULL; 1090 } 1091 } 1092 1093 if (rdev) { 1094 if (s->syncing || s->expanding || s->expanded 1095 || s->replacing) 1096 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1097 1098 set_bit(STRIPE_IO_STARTED, &sh->state); 1099 1100 bio_set_dev(bi, rdev->bdev); 1101 bio_set_op_attrs(bi, op, op_flags); 1102 bi->bi_end_io = op_is_write(op) 1103 ? 
raid5_end_write_request 1104 : raid5_end_read_request; 1105 bi->bi_private = sh; 1106 1107 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1108 __func__, (unsigned long long)sh->sector, 1109 bi->bi_opf, i); 1110 atomic_inc(&sh->count); 1111 if (sh != head_sh) 1112 atomic_inc(&head_sh->count); 1113 if (use_new_offset(conf, sh)) 1114 bi->bi_iter.bi_sector = (sh->sector 1115 + rdev->new_data_offset); 1116 else 1117 bi->bi_iter.bi_sector = (sh->sector 1118 + rdev->data_offset); 1119 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1120 bi->bi_opf |= REQ_NOMERGE; 1121 1122 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1123 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1124 1125 if (!op_is_write(op) && 1126 test_bit(R5_InJournal, &sh->dev[i].flags)) 1127 /* 1128 * issuing read for a page in journal, this 1129 * must be preparing for prexor in rmw; read 1130 * the data into orig_page 1131 */ 1132 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1133 else 1134 sh->dev[i].vec.bv_page = sh->dev[i].page; 1135 bi->bi_vcnt = 1; 1136 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1137 bi->bi_io_vec[0].bv_offset = 0; 1138 bi->bi_iter.bi_size = STRIPE_SIZE; 1139 bi->bi_write_hint = sh->dev[i].write_hint; 1140 if (!rrdev) 1141 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; 1142 /* 1143 * If this is discard request, set bi_vcnt 0. We don't 1144 * want to confuse SCSI because SCSI will replace payload 1145 */ 1146 if (op == REQ_OP_DISCARD) 1147 bi->bi_vcnt = 0; 1148 if (rrdev) 1149 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1150 1151 if (conf->mddev->gendisk) 1152 trace_block_bio_remap(bi->bi_disk->queue, 1153 bi, disk_devt(conf->mddev->gendisk), 1154 sh->dev[i].sector); 1155 if (should_defer && op_is_write(op)) 1156 bio_list_add(&pending_bios, bi); 1157 else 1158 generic_make_request(bi); 1159 } 1160 if (rrdev) { 1161 if (s->syncing || s->expanding || s->expanded 1162 || s->replacing) 1163 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1164 1165 set_bit(STRIPE_IO_STARTED, &sh->state); 1166 1167 bio_set_dev(rbi, rrdev->bdev); 1168 bio_set_op_attrs(rbi, op, op_flags); 1169 BUG_ON(!op_is_write(op)); 1170 rbi->bi_end_io = raid5_end_write_request; 1171 rbi->bi_private = sh; 1172 1173 pr_debug("%s: for %llu schedule op %d on " 1174 "replacement disc %d\n", 1175 __func__, (unsigned long long)sh->sector, 1176 rbi->bi_opf, i); 1177 atomic_inc(&sh->count); 1178 if (sh != head_sh) 1179 atomic_inc(&head_sh->count); 1180 if (use_new_offset(conf, sh)) 1181 rbi->bi_iter.bi_sector = (sh->sector 1182 + rrdev->new_data_offset); 1183 else 1184 rbi->bi_iter.bi_sector = (sh->sector 1185 + rrdev->data_offset); 1186 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1187 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1188 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1189 rbi->bi_vcnt = 1; 1190 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1191 rbi->bi_io_vec[0].bv_offset = 0; 1192 rbi->bi_iter.bi_size = STRIPE_SIZE; 1193 rbi->bi_write_hint = sh->dev[i].write_hint; 1194 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; 1195 /* 1196 * If this is discard request, set bi_vcnt 0. 
We don't 1197 * want to confuse SCSI because SCSI will replace payload 1198 */ 1199 if (op == REQ_OP_DISCARD) 1200 rbi->bi_vcnt = 0; 1201 if (conf->mddev->gendisk) 1202 trace_block_bio_remap(rbi->bi_disk->queue, 1203 rbi, disk_devt(conf->mddev->gendisk), 1204 sh->dev[i].sector); 1205 if (should_defer && op_is_write(op)) 1206 bio_list_add(&pending_bios, rbi); 1207 else 1208 generic_make_request(rbi); 1209 } 1210 if (!rdev && !rrdev) { 1211 if (op_is_write(op)) 1212 set_bit(STRIPE_DEGRADED, &sh->state); 1213 pr_debug("skip op %d on disc %d for sector %llu\n", 1214 bi->bi_opf, i, (unsigned long long)sh->sector); 1215 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1216 set_bit(STRIPE_HANDLE, &sh->state); 1217 } 1218 1219 if (!head_sh->batch_head) 1220 continue; 1221 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1222 batch_list); 1223 if (sh != head_sh) 1224 goto again; 1225 } 1226 1227 if (should_defer && !bio_list_empty(&pending_bios)) 1228 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1229 } 1230 1231 static struct dma_async_tx_descriptor * 1232 async_copy_data(int frombio, struct bio *bio, struct page **page, 1233 sector_t sector, struct dma_async_tx_descriptor *tx, 1234 struct stripe_head *sh, int no_skipcopy) 1235 { 1236 struct bio_vec bvl; 1237 struct bvec_iter iter; 1238 struct page *bio_page; 1239 int page_offset; 1240 struct async_submit_ctl submit; 1241 enum async_tx_flags flags = 0; 1242 1243 if (bio->bi_iter.bi_sector >= sector) 1244 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1245 else 1246 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1247 1248 if (frombio) 1249 flags |= ASYNC_TX_FENCE; 1250 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1251 1252 bio_for_each_segment(bvl, bio, iter) { 1253 int len = bvl.bv_len; 1254 int clen; 1255 int b_offset = 0; 1256 1257 if (page_offset < 0) { 1258 b_offset = -page_offset; 1259 page_offset += b_offset; 1260 len -= b_offset; 1261 } 1262 1263 if (len > 0 && page_offset + len > STRIPE_SIZE) 1264 clen = STRIPE_SIZE - page_offset; 1265 else 1266 clen = len; 1267 1268 if (clen > 0) { 1269 b_offset += bvl.bv_offset; 1270 bio_page = bvl.bv_page; 1271 if (frombio) { 1272 if (sh->raid_conf->skip_copy && 1273 b_offset == 0 && page_offset == 0 && 1274 clen == STRIPE_SIZE && 1275 !no_skipcopy) 1276 *page = bio_page; 1277 else 1278 tx = async_memcpy(*page, bio_page, page_offset, 1279 b_offset, clen, &submit); 1280 } else 1281 tx = async_memcpy(bio_page, *page, b_offset, 1282 page_offset, clen, &submit); 1283 } 1284 /* chain the operations */ 1285 submit.depend_tx = tx; 1286 1287 if (clen < len) /* hit end of page */ 1288 break; 1289 page_offset += len; 1290 } 1291 1292 return tx; 1293 } 1294 1295 static void ops_complete_biofill(void *stripe_head_ref) 1296 { 1297 struct stripe_head *sh = stripe_head_ref; 1298 int i; 1299 1300 pr_debug("%s: stripe %llu\n", __func__, 1301 (unsigned long long)sh->sector); 1302 1303 /* clear completed biofills */ 1304 for (i = sh->disks; i--; ) { 1305 struct r5dev *dev = &sh->dev[i]; 1306 1307 /* acknowledge completion of a biofill operation */ 1308 /* and check if we need to reply to a read request, 1309 * new R5_Wantfill requests are held off until 1310 * !STRIPE_BIOFILL_RUN 1311 */ 1312 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1313 struct bio *rbi, *rbi2; 1314 1315 BUG_ON(!dev->read); 1316 rbi = dev->read; 1317 dev->read = NULL; 1318 while (rbi && rbi->bi_iter.bi_sector < 1319 dev->sector + STRIPE_SECTORS) { 1320 rbi2 = r5_next_bio(rbi, 
dev->sector); 1321 bio_endio(rbi); 1322 rbi = rbi2; 1323 } 1324 } 1325 } 1326 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1327 1328 set_bit(STRIPE_HANDLE, &sh->state); 1329 raid5_release_stripe(sh); 1330 } 1331 1332 static void ops_run_biofill(struct stripe_head *sh) 1333 { 1334 struct dma_async_tx_descriptor *tx = NULL; 1335 struct async_submit_ctl submit; 1336 int i; 1337 1338 BUG_ON(sh->batch_head); 1339 pr_debug("%s: stripe %llu\n", __func__, 1340 (unsigned long long)sh->sector); 1341 1342 for (i = sh->disks; i--; ) { 1343 struct r5dev *dev = &sh->dev[i]; 1344 if (test_bit(R5_Wantfill, &dev->flags)) { 1345 struct bio *rbi; 1346 spin_lock_irq(&sh->stripe_lock); 1347 dev->read = rbi = dev->toread; 1348 dev->toread = NULL; 1349 spin_unlock_irq(&sh->stripe_lock); 1350 while (rbi && rbi->bi_iter.bi_sector < 1351 dev->sector + STRIPE_SECTORS) { 1352 tx = async_copy_data(0, rbi, &dev->page, 1353 dev->sector, tx, sh, 0); 1354 rbi = r5_next_bio(rbi, dev->sector); 1355 } 1356 } 1357 } 1358 1359 atomic_inc(&sh->count); 1360 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1361 async_trigger_callback(&submit); 1362 } 1363 1364 static void mark_target_uptodate(struct stripe_head *sh, int target) 1365 { 1366 struct r5dev *tgt; 1367 1368 if (target < 0) 1369 return; 1370 1371 tgt = &sh->dev[target]; 1372 set_bit(R5_UPTODATE, &tgt->flags); 1373 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1374 clear_bit(R5_Wantcompute, &tgt->flags); 1375 } 1376 1377 static void ops_complete_compute(void *stripe_head_ref) 1378 { 1379 struct stripe_head *sh = stripe_head_ref; 1380 1381 pr_debug("%s: stripe %llu\n", __func__, 1382 (unsigned long long)sh->sector); 1383 1384 /* mark the computed target(s) as uptodate */ 1385 mark_target_uptodate(sh, sh->ops.target); 1386 mark_target_uptodate(sh, sh->ops.target2); 1387 1388 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1389 if (sh->check_state == check_state_compute_run) 1390 sh->check_state = check_state_compute_result; 1391 set_bit(STRIPE_HANDLE, &sh->state); 1392 raid5_release_stripe(sh); 1393 } 1394 1395 /* return a pointer to the address conversion region of the scribble buffer */ 1396 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1397 { 1398 return percpu->scribble + i * percpu->scribble_obj_size; 1399 } 1400 1401 /* return a pointer to the address conversion region of the scribble buffer */ 1402 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1403 struct raid5_percpu *percpu, int i) 1404 { 1405 return (void *) (to_addr_page(percpu, i) + sh->disks + 2); 1406 } 1407 1408 static struct dma_async_tx_descriptor * 1409 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1410 { 1411 int disks = sh->disks; 1412 struct page **xor_srcs = to_addr_page(percpu, 0); 1413 int target = sh->ops.target; 1414 struct r5dev *tgt = &sh->dev[target]; 1415 struct page *xor_dest = tgt->page; 1416 int count = 0; 1417 struct dma_async_tx_descriptor *tx; 1418 struct async_submit_ctl submit; 1419 int i; 1420 1421 BUG_ON(sh->batch_head); 1422 1423 pr_debug("%s: stripe %llu block: %d\n", 1424 __func__, (unsigned long long)sh->sector, target); 1425 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1426 1427 for (i = disks; i--; ) 1428 if (i != target) 1429 xor_srcs[count++] = sh->dev[i].page; 1430 1431 atomic_inc(&sh->count); 1432 1433 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1434 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1435 if (unlikely(count == 1)) 1436 tx = 
async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1437 else 1438 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1439 1440 return tx; 1441 } 1442 1443 /* set_syndrome_sources - populate source buffers for gen_syndrome 1444 * @srcs - (struct page *) array of size sh->disks 1445 * @sh - stripe_head to parse 1446 * 1447 * Populates srcs in proper layout order for the stripe and returns the 1448 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1449 * destination buffer is recorded in srcs[count] and the Q destination 1450 * is recorded in srcs[count+1]]. 1451 */ 1452 static int set_syndrome_sources(struct page **srcs, 1453 struct stripe_head *sh, 1454 int srctype) 1455 { 1456 int disks = sh->disks; 1457 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1458 int d0_idx = raid6_d0(sh); 1459 int count; 1460 int i; 1461 1462 for (i = 0; i < disks; i++) 1463 srcs[i] = NULL; 1464 1465 count = 0; 1466 i = d0_idx; 1467 do { 1468 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1469 struct r5dev *dev = &sh->dev[i]; 1470 1471 if (i == sh->qd_idx || i == sh->pd_idx || 1472 (srctype == SYNDROME_SRC_ALL) || 1473 (srctype == SYNDROME_SRC_WANT_DRAIN && 1474 (test_bit(R5_Wantdrain, &dev->flags) || 1475 test_bit(R5_InJournal, &dev->flags))) || 1476 (srctype == SYNDROME_SRC_WRITTEN && 1477 (dev->written || 1478 test_bit(R5_InJournal, &dev->flags)))) { 1479 if (test_bit(R5_InJournal, &dev->flags)) 1480 srcs[slot] = sh->dev[i].orig_page; 1481 else 1482 srcs[slot] = sh->dev[i].page; 1483 } 1484 i = raid6_next_disk(i, disks); 1485 } while (i != d0_idx); 1486 1487 return syndrome_disks; 1488 } 1489 1490 static struct dma_async_tx_descriptor * 1491 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1492 { 1493 int disks = sh->disks; 1494 struct page **blocks = to_addr_page(percpu, 0); 1495 int target; 1496 int qd_idx = sh->qd_idx; 1497 struct dma_async_tx_descriptor *tx; 1498 struct async_submit_ctl submit; 1499 struct r5dev *tgt; 1500 struct page *dest; 1501 int i; 1502 int count; 1503 1504 BUG_ON(sh->batch_head); 1505 if (sh->ops.target < 0) 1506 target = sh->ops.target2; 1507 else if (sh->ops.target2 < 0) 1508 target = sh->ops.target; 1509 else 1510 /* we should only have one valid target */ 1511 BUG(); 1512 BUG_ON(target < 0); 1513 pr_debug("%s: stripe %llu block: %d\n", 1514 __func__, (unsigned long long)sh->sector, target); 1515 1516 tgt = &sh->dev[target]; 1517 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1518 dest = tgt->page; 1519 1520 atomic_inc(&sh->count); 1521 1522 if (target == qd_idx) { 1523 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1524 blocks[count] = NULL; /* regenerating p is not necessary */ 1525 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1526 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1527 ops_complete_compute, sh, 1528 to_addr_conv(sh, percpu, 0)); 1529 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1530 } else { 1531 /* Compute any data- or p-drive using XOR */ 1532 count = 0; 1533 for (i = disks; i-- ; ) { 1534 if (i == target || i == qd_idx) 1535 continue; 1536 blocks[count++] = sh->dev[i].page; 1537 } 1538 1539 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1540 NULL, ops_complete_compute, sh, 1541 to_addr_conv(sh, percpu, 0)); 1542 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1543 } 1544 1545 return tx; 1546 } 1547 1548 static struct dma_async_tx_descriptor * 1549 ops_run_compute6_2(struct 
stripe_head *sh, struct raid5_percpu *percpu) 1550 { 1551 int i, count, disks = sh->disks; 1552 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1553 int d0_idx = raid6_d0(sh); 1554 int faila = -1, failb = -1; 1555 int target = sh->ops.target; 1556 int target2 = sh->ops.target2; 1557 struct r5dev *tgt = &sh->dev[target]; 1558 struct r5dev *tgt2 = &sh->dev[target2]; 1559 struct dma_async_tx_descriptor *tx; 1560 struct page **blocks = to_addr_page(percpu, 0); 1561 struct async_submit_ctl submit; 1562 1563 BUG_ON(sh->batch_head); 1564 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1565 __func__, (unsigned long long)sh->sector, target, target2); 1566 BUG_ON(target < 0 || target2 < 0); 1567 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1568 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1569 1570 /* we need to open-code set_syndrome_sources to handle the 1571 * slot number conversion for 'faila' and 'failb' 1572 */ 1573 for (i = 0; i < disks ; i++) 1574 blocks[i] = NULL; 1575 count = 0; 1576 i = d0_idx; 1577 do { 1578 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1579 1580 blocks[slot] = sh->dev[i].page; 1581 1582 if (i == target) 1583 faila = slot; 1584 if (i == target2) 1585 failb = slot; 1586 i = raid6_next_disk(i, disks); 1587 } while (i != d0_idx); 1588 1589 BUG_ON(faila == failb); 1590 if (failb < faila) 1591 swap(faila, failb); 1592 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1593 __func__, (unsigned long long)sh->sector, faila, failb); 1594 1595 atomic_inc(&sh->count); 1596 1597 if (failb == syndrome_disks+1) { 1598 /* Q disk is one of the missing disks */ 1599 if (faila == syndrome_disks) { 1600 /* Missing P+Q, just recompute */ 1601 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1602 ops_complete_compute, sh, 1603 to_addr_conv(sh, percpu, 0)); 1604 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1605 STRIPE_SIZE, &submit); 1606 } else { 1607 struct page *dest; 1608 int data_target; 1609 int qd_idx = sh->qd_idx; 1610 1611 /* Missing D+Q: recompute D from P, then recompute Q */ 1612 if (target == qd_idx) 1613 data_target = target2; 1614 else 1615 data_target = target; 1616 1617 count = 0; 1618 for (i = disks; i-- ; ) { 1619 if (i == data_target || i == qd_idx) 1620 continue; 1621 blocks[count++] = sh->dev[i].page; 1622 } 1623 dest = sh->dev[data_target].page; 1624 init_async_submit(&submit, 1625 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1626 NULL, NULL, NULL, 1627 to_addr_conv(sh, percpu, 0)); 1628 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1629 &submit); 1630 1631 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1632 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1633 ops_complete_compute, sh, 1634 to_addr_conv(sh, percpu, 0)); 1635 return async_gen_syndrome(blocks, 0, count+2, 1636 STRIPE_SIZE, &submit); 1637 } 1638 } else { 1639 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1640 ops_complete_compute, sh, 1641 to_addr_conv(sh, percpu, 0)); 1642 if (failb == syndrome_disks) { 1643 /* We're missing D+P. */ 1644 return async_raid6_datap_recov(syndrome_disks+2, 1645 STRIPE_SIZE, faila, 1646 blocks, &submit); 1647 } else { 1648 /* We're missing D+D. 
*/ 1649 return async_raid6_2data_recov(syndrome_disks+2, 1650 STRIPE_SIZE, faila, failb, 1651 blocks, &submit); 1652 } 1653 } 1654 } 1655 1656 static void ops_complete_prexor(void *stripe_head_ref) 1657 { 1658 struct stripe_head *sh = stripe_head_ref; 1659 1660 pr_debug("%s: stripe %llu\n", __func__, 1661 (unsigned long long)sh->sector); 1662 1663 if (r5c_is_writeback(sh->raid_conf->log)) 1664 /* 1665 * raid5-cache write back uses orig_page during prexor. 1666 * After prexor, it is time to free orig_page 1667 */ 1668 r5c_release_extra_page(sh); 1669 } 1670 1671 static struct dma_async_tx_descriptor * 1672 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1673 struct dma_async_tx_descriptor *tx) 1674 { 1675 int disks = sh->disks; 1676 struct page **xor_srcs = to_addr_page(percpu, 0); 1677 int count = 0, pd_idx = sh->pd_idx, i; 1678 struct async_submit_ctl submit; 1679 1680 /* existing parity data subtracted */ 1681 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1682 1683 BUG_ON(sh->batch_head); 1684 pr_debug("%s: stripe %llu\n", __func__, 1685 (unsigned long long)sh->sector); 1686 1687 for (i = disks; i--; ) { 1688 struct r5dev *dev = &sh->dev[i]; 1689 /* Only process blocks that are known to be uptodate */ 1690 if (test_bit(R5_InJournal, &dev->flags)) 1691 xor_srcs[count++] = dev->orig_page; 1692 else if (test_bit(R5_Wantdrain, &dev->flags)) 1693 xor_srcs[count++] = dev->page; 1694 } 1695 1696 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1697 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1698 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1699 1700 return tx; 1701 } 1702 1703 static struct dma_async_tx_descriptor * 1704 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1705 struct dma_async_tx_descriptor *tx) 1706 { 1707 struct page **blocks = to_addr_page(percpu, 0); 1708 int count; 1709 struct async_submit_ctl submit; 1710 1711 pr_debug("%s: stripe %llu\n", __func__, 1712 (unsigned long long)sh->sector); 1713 1714 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1715 1716 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1717 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1718 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1719 1720 return tx; 1721 } 1722 1723 static struct dma_async_tx_descriptor * 1724 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1725 { 1726 struct r5conf *conf = sh->raid_conf; 1727 int disks = sh->disks; 1728 int i; 1729 struct stripe_head *head_sh = sh; 1730 1731 pr_debug("%s: stripe %llu\n", __func__, 1732 (unsigned long long)sh->sector); 1733 1734 for (i = disks; i--; ) { 1735 struct r5dev *dev; 1736 struct bio *chosen; 1737 1738 sh = head_sh; 1739 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1740 struct bio *wbi; 1741 1742 again: 1743 dev = &sh->dev[i]; 1744 /* 1745 * clear R5_InJournal, so when rewriting a page in 1746 * journal, it is not skipped by r5l_log_stripe() 1747 */ 1748 clear_bit(R5_InJournal, &dev->flags); 1749 spin_lock_irq(&sh->stripe_lock); 1750 chosen = dev->towrite; 1751 dev->towrite = NULL; 1752 sh->overwrite_disks = 0; 1753 BUG_ON(dev->written); 1754 wbi = dev->written = chosen; 1755 spin_unlock_irq(&sh->stripe_lock); 1756 WARN_ON(dev->page != dev->orig_page); 1757 1758 while (wbi && wbi->bi_iter.bi_sector < 1759 dev->sector + STRIPE_SECTORS) { 1760 if (wbi->bi_opf & REQ_FUA) 1761 set_bit(R5_WantFUA, &dev->flags); 1762 if 
(wbi->bi_opf & REQ_SYNC) 1763 set_bit(R5_SyncIO, &dev->flags); 1764 if (bio_op(wbi) == REQ_OP_DISCARD) 1765 set_bit(R5_Discard, &dev->flags); 1766 else { 1767 tx = async_copy_data(1, wbi, &dev->page, 1768 dev->sector, tx, sh, 1769 r5c_is_writeback(conf->log)); 1770 if (dev->page != dev->orig_page && 1771 !r5c_is_writeback(conf->log)) { 1772 set_bit(R5_SkipCopy, &dev->flags); 1773 clear_bit(R5_UPTODATE, &dev->flags); 1774 clear_bit(R5_OVERWRITE, &dev->flags); 1775 } 1776 } 1777 wbi = r5_next_bio(wbi, dev->sector); 1778 } 1779 1780 if (head_sh->batch_head) { 1781 sh = list_first_entry(&sh->batch_list, 1782 struct stripe_head, 1783 batch_list); 1784 if (sh == head_sh) 1785 continue; 1786 goto again; 1787 } 1788 } 1789 } 1790 1791 return tx; 1792 } 1793 1794 static void ops_complete_reconstruct(void *stripe_head_ref) 1795 { 1796 struct stripe_head *sh = stripe_head_ref; 1797 int disks = sh->disks; 1798 int pd_idx = sh->pd_idx; 1799 int qd_idx = sh->qd_idx; 1800 int i; 1801 bool fua = false, sync = false, discard = false; 1802 1803 pr_debug("%s: stripe %llu\n", __func__, 1804 (unsigned long long)sh->sector); 1805 1806 for (i = disks; i--; ) { 1807 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1808 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1809 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1810 } 1811 1812 for (i = disks; i--; ) { 1813 struct r5dev *dev = &sh->dev[i]; 1814 1815 if (dev->written || i == pd_idx || i == qd_idx) { 1816 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { 1817 set_bit(R5_UPTODATE, &dev->flags); 1818 if (test_bit(STRIPE_EXPAND_READY, &sh->state)) 1819 set_bit(R5_Expanded, &dev->flags); 1820 } 1821 if (fua) 1822 set_bit(R5_WantFUA, &dev->flags); 1823 if (sync) 1824 set_bit(R5_SyncIO, &dev->flags); 1825 } 1826 } 1827 1828 if (sh->reconstruct_state == reconstruct_state_drain_run) 1829 sh->reconstruct_state = reconstruct_state_drain_result; 1830 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1831 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1832 else { 1833 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1834 sh->reconstruct_state = reconstruct_state_result; 1835 } 1836 1837 set_bit(STRIPE_HANDLE, &sh->state); 1838 raid5_release_stripe(sh); 1839 } 1840 1841 static void 1842 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1843 struct dma_async_tx_descriptor *tx) 1844 { 1845 int disks = sh->disks; 1846 struct page **xor_srcs; 1847 struct async_submit_ctl submit; 1848 int count, pd_idx = sh->pd_idx, i; 1849 struct page *xor_dest; 1850 int prexor = 0; 1851 unsigned long flags; 1852 int j = 0; 1853 struct stripe_head *head_sh = sh; 1854 int last_stripe; 1855 1856 pr_debug("%s: stripe %llu\n", __func__, 1857 (unsigned long long)sh->sector); 1858 1859 for (i = 0; i < sh->disks; i++) { 1860 if (pd_idx == i) 1861 continue; 1862 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1863 break; 1864 } 1865 if (i >= sh->disks) { 1866 atomic_inc(&sh->count); 1867 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1868 ops_complete_reconstruct(sh); 1869 return; 1870 } 1871 again: 1872 count = 0; 1873 xor_srcs = to_addr_page(percpu, j); 1874 /* check if prexor is active which means only process blocks 1875 * that are part of a read-modify-write (written) 1876 */ 1877 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1878 prexor = 1; 1879 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1880 for (i = disks; i--; ) { 1881 struct r5dev *dev = &sh->dev[i]; 1882 if (head_sh->dev[i].written 
|| 1883 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1884 xor_srcs[count++] = dev->page; 1885 } 1886 } else { 1887 xor_dest = sh->dev[pd_idx].page; 1888 for (i = disks; i--; ) { 1889 struct r5dev *dev = &sh->dev[i]; 1890 if (i != pd_idx) 1891 xor_srcs[count++] = dev->page; 1892 } 1893 } 1894 1895 /* 1/ if we prexor'd then the dest is reused as a source 1896 * 2/ if we did not prexor then we are redoing the parity 1897 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1898 * for the synchronous xor case 1899 */ 1900 last_stripe = !head_sh->batch_head || 1901 list_first_entry(&sh->batch_list, 1902 struct stripe_head, batch_list) == head_sh; 1903 if (last_stripe) { 1904 flags = ASYNC_TX_ACK | 1905 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1906 1907 atomic_inc(&head_sh->count); 1908 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1909 to_addr_conv(sh, percpu, j)); 1910 } else { 1911 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1912 init_async_submit(&submit, flags, tx, NULL, NULL, 1913 to_addr_conv(sh, percpu, j)); 1914 } 1915 1916 if (unlikely(count == 1)) 1917 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1918 else 1919 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1920 if (!last_stripe) { 1921 j++; 1922 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1923 batch_list); 1924 goto again; 1925 } 1926 } 1927 1928 static void 1929 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1930 struct dma_async_tx_descriptor *tx) 1931 { 1932 struct async_submit_ctl submit; 1933 struct page **blocks; 1934 int count, i, j = 0; 1935 struct stripe_head *head_sh = sh; 1936 int last_stripe; 1937 int synflags; 1938 unsigned long txflags; 1939 1940 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1941 1942 for (i = 0; i < sh->disks; i++) { 1943 if (sh->pd_idx == i || sh->qd_idx == i) 1944 continue; 1945 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1946 break; 1947 } 1948 if (i >= sh->disks) { 1949 atomic_inc(&sh->count); 1950 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1951 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1952 ops_complete_reconstruct(sh); 1953 return; 1954 } 1955 1956 again: 1957 blocks = to_addr_page(percpu, j); 1958 1959 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1960 synflags = SYNDROME_SRC_WRITTEN; 1961 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1962 } else { 1963 synflags = SYNDROME_SRC_ALL; 1964 txflags = ASYNC_TX_ACK; 1965 } 1966 1967 count = set_syndrome_sources(blocks, sh, synflags); 1968 last_stripe = !head_sh->batch_head || 1969 list_first_entry(&sh->batch_list, 1970 struct stripe_head, batch_list) == head_sh; 1971 1972 if (last_stripe) { 1973 atomic_inc(&head_sh->count); 1974 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1975 head_sh, to_addr_conv(sh, percpu, j)); 1976 } else 1977 init_async_submit(&submit, 0, tx, NULL, NULL, 1978 to_addr_conv(sh, percpu, j)); 1979 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1980 if (!last_stripe) { 1981 j++; 1982 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1983 batch_list); 1984 goto again; 1985 } 1986 } 1987 1988 static void ops_complete_check(void *stripe_head_ref) 1989 { 1990 struct stripe_head *sh = stripe_head_ref; 1991 1992 pr_debug("%s: stripe %llu\n", __func__, 1993 (unsigned long long)sh->sector); 1994 1995 sh->check_state = check_state_check_result; 1996 set_bit(STRIPE_HANDLE, 
&sh->state); 1997 raid5_release_stripe(sh); 1998 } 1999 2000 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 2001 { 2002 int disks = sh->disks; 2003 int pd_idx = sh->pd_idx; 2004 int qd_idx = sh->qd_idx; 2005 struct page *xor_dest; 2006 struct page **xor_srcs = to_addr_page(percpu, 0); 2007 struct dma_async_tx_descriptor *tx; 2008 struct async_submit_ctl submit; 2009 int count; 2010 int i; 2011 2012 pr_debug("%s: stripe %llu\n", __func__, 2013 (unsigned long long)sh->sector); 2014 2015 BUG_ON(sh->batch_head); 2016 count = 0; 2017 xor_dest = sh->dev[pd_idx].page; 2018 xor_srcs[count++] = xor_dest; 2019 for (i = disks; i--; ) { 2020 if (i == pd_idx || i == qd_idx) 2021 continue; 2022 xor_srcs[count++] = sh->dev[i].page; 2023 } 2024 2025 init_async_submit(&submit, 0, NULL, NULL, NULL, 2026 to_addr_conv(sh, percpu, 0)); 2027 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 2028 &sh->ops.zero_sum_result, &submit); 2029 2030 atomic_inc(&sh->count); 2031 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2032 tx = async_trigger_callback(&submit); 2033 } 2034 2035 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2036 { 2037 struct page **srcs = to_addr_page(percpu, 0); 2038 struct async_submit_ctl submit; 2039 int count; 2040 2041 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2042 (unsigned long long)sh->sector, checkp); 2043 2044 BUG_ON(sh->batch_head); 2045 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 2046 if (!checkp) 2047 srcs[count] = NULL; 2048 2049 atomic_inc(&sh->count); 2050 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2051 sh, to_addr_conv(sh, percpu, 0)); 2052 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 2053 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 2054 } 2055 2056 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2057 { 2058 int overlap_clear = 0, i, disks = sh->disks; 2059 struct dma_async_tx_descriptor *tx = NULL; 2060 struct r5conf *conf = sh->raid_conf; 2061 int level = conf->level; 2062 struct raid5_percpu *percpu; 2063 unsigned long cpu; 2064 2065 cpu = get_cpu(); 2066 percpu = per_cpu_ptr(conf->percpu, cpu); 2067 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2068 ops_run_biofill(sh); 2069 overlap_clear++; 2070 } 2071 2072 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2073 if (level < 6) 2074 tx = ops_run_compute5(sh, percpu); 2075 else { 2076 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2077 tx = ops_run_compute6_1(sh, percpu); 2078 else 2079 tx = ops_run_compute6_2(sh, percpu); 2080 } 2081 /* terminate the chain if reconstruct is not set to be run */ 2082 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2083 async_tx_ack(tx); 2084 } 2085 2086 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2087 if (level < 6) 2088 tx = ops_run_prexor5(sh, percpu, tx); 2089 else 2090 tx = ops_run_prexor6(sh, percpu, tx); 2091 } 2092 2093 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2094 tx = ops_run_partial_parity(sh, percpu, tx); 2095 2096 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2097 tx = ops_run_biodrain(sh, tx); 2098 overlap_clear++; 2099 } 2100 2101 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2102 if (level < 6) 2103 ops_run_reconstruct5(sh, percpu, tx); 2104 else 2105 ops_run_reconstruct6(sh, percpu, tx); 2106 } 2107 2108 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2109 if (sh->check_state == check_state_run) 2110 ops_run_check_p(sh, 
percpu); 2111 else if (sh->check_state == check_state_run_q) 2112 ops_run_check_pq(sh, percpu, 0); 2113 else if (sh->check_state == check_state_run_pq) 2114 ops_run_check_pq(sh, percpu, 1); 2115 else 2116 BUG(); 2117 } 2118 2119 if (overlap_clear && !sh->batch_head) 2120 for (i = disks; i--; ) { 2121 struct r5dev *dev = &sh->dev[i]; 2122 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2123 wake_up(&sh->raid_conf->wait_for_overlap); 2124 } 2125 put_cpu(); 2126 } 2127 2128 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) 2129 { 2130 if (sh->ppl_page) 2131 __free_page(sh->ppl_page); 2132 kmem_cache_free(sc, sh); 2133 } 2134 2135 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2136 int disks, struct r5conf *conf) 2137 { 2138 struct stripe_head *sh; 2139 int i; 2140 2141 sh = kmem_cache_zalloc(sc, gfp); 2142 if (sh) { 2143 spin_lock_init(&sh->stripe_lock); 2144 spin_lock_init(&sh->batch_lock); 2145 INIT_LIST_HEAD(&sh->batch_list); 2146 INIT_LIST_HEAD(&sh->lru); 2147 INIT_LIST_HEAD(&sh->r5c); 2148 INIT_LIST_HEAD(&sh->log_list); 2149 atomic_set(&sh->count, 1); 2150 sh->raid_conf = conf; 2151 sh->log_start = MaxSector; 2152 for (i = 0; i < disks; i++) { 2153 struct r5dev *dev = &sh->dev[i]; 2154 2155 bio_init(&dev->req, &dev->vec, 1); 2156 bio_init(&dev->rreq, &dev->rvec, 1); 2157 } 2158 2159 if (raid5_has_ppl(conf)) { 2160 sh->ppl_page = alloc_page(gfp); 2161 if (!sh->ppl_page) { 2162 free_stripe(sc, sh); 2163 sh = NULL; 2164 } 2165 } 2166 } 2167 return sh; 2168 } 2169 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2170 { 2171 struct stripe_head *sh; 2172 2173 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); 2174 if (!sh) 2175 return 0; 2176 2177 if (grow_buffers(sh, gfp)) { 2178 shrink_buffers(sh); 2179 free_stripe(conf->slab_cache, sh); 2180 return 0; 2181 } 2182 sh->hash_lock_index = 2183 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2184 /* we just created an active stripe so... */ 2185 atomic_inc(&conf->active_stripes); 2186 2187 raid5_release_stripe(sh); 2188 conf->max_nr_stripes++; 2189 return 1; 2190 } 2191 2192 static int grow_stripes(struct r5conf *conf, int num) 2193 { 2194 struct kmem_cache *sc; 2195 size_t namelen = sizeof(conf->cache_name[0]); 2196 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2197 2198 if (conf->mddev->gendisk) 2199 snprintf(conf->cache_name[0], namelen, 2200 "raid%d-%s", conf->level, mdname(conf->mddev)); 2201 else 2202 snprintf(conf->cache_name[0], namelen, 2203 "raid%d-%p", conf->level, conf->mddev); 2204 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); 2205 2206 conf->active_name = 0; 2207 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2208 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2209 0, 0, NULL); 2210 if (!sc) 2211 return 1; 2212 conf->slab_cache = sc; 2213 conf->pool_size = devs; 2214 while (num--) 2215 if (!grow_one_stripe(conf, GFP_KERNEL)) 2216 return 1; 2217 2218 return 0; 2219 } 2220 2221 /** 2222 * scribble_len - return the required size of the scribble region 2223 * @num - total number of disks in the array 2224 * 2225 * The size must be enough to contain: 2226 * 1/ a struct page pointer for each device in the array +2 2227 * 2/ room to convert each entry in (1) to its corresponding dma 2228 * (dma_map_page()) or page (page_address()) address. 
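 *
 * Illustrative sizing example (not from the original source; sizes are
 * assumed): on a 64-bit build where a struct page pointer and an
 * addr_conv_t are both 8 bytes, an array with num == 8 devices needs
 * (8 + 2) * 8 + (8 + 2) * 8 = 160 bytes per scribble object, where the
 * +2 is explained in the note below; scribble_alloc() then obtains cnt
 * such objects with a single kvmalloc_array(cnt, obj_size, flags) call.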
2229 * 2230 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2231 * calculate over all devices (not just the data blocks), using zeros in place 2232 * of the P and Q blocks. 2233 */ 2234 static int scribble_alloc(struct raid5_percpu *percpu, 2235 int num, int cnt, gfp_t flags) 2236 { 2237 size_t obj_size = 2238 sizeof(struct page *) * (num+2) + 2239 sizeof(addr_conv_t) * (num+2); 2240 void *scribble; 2241 2242 scribble = kvmalloc_array(cnt, obj_size, flags); 2243 if (!scribble) 2244 return -ENOMEM; 2245 2246 kvfree(percpu->scribble); 2247 2248 percpu->scribble = scribble; 2249 percpu->scribble_obj_size = obj_size; 2250 return 0; 2251 } 2252 2253 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2254 { 2255 unsigned long cpu; 2256 int err = 0; 2257 2258 /* 2259 * Never shrink. And mddev_suspend() could deadlock if this is called 2260 * from raid5d. In that case, scribble_disks and scribble_sectors 2261 * should equal to new_disks and new_sectors 2262 */ 2263 if (conf->scribble_disks >= new_disks && 2264 conf->scribble_sectors >= new_sectors) 2265 return 0; 2266 mddev_suspend(conf->mddev); 2267 get_online_cpus(); 2268 2269 for_each_present_cpu(cpu) { 2270 struct raid5_percpu *percpu; 2271 2272 percpu = per_cpu_ptr(conf->percpu, cpu); 2273 err = scribble_alloc(percpu, new_disks, 2274 new_sectors / STRIPE_SECTORS, 2275 GFP_NOIO); 2276 if (err) 2277 break; 2278 } 2279 2280 put_online_cpus(); 2281 mddev_resume(conf->mddev); 2282 if (!err) { 2283 conf->scribble_disks = new_disks; 2284 conf->scribble_sectors = new_sectors; 2285 } 2286 return err; 2287 } 2288 2289 static int resize_stripes(struct r5conf *conf, int newsize) 2290 { 2291 /* Make all the stripes able to hold 'newsize' devices. 2292 * New slots in each stripe get 'page' set to a new page. 2293 * 2294 * This happens in stages: 2295 * 1/ create a new kmem_cache and allocate the required number of 2296 * stripe_heads. 2297 * 2/ gather all the old stripe_heads and transfer the pages across 2298 * to the new stripe_heads. This will have the side effect of 2299 * freezing the array as once all stripe_heads have been collected, 2300 * no IO will be possible. Old stripe heads are freed once their 2301 * pages have been transferred over, and the old kmem_cache is 2302 * freed when all stripes are done. 2303 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 2304 * we simple return a failure status - no need to clean anything up. 2305 * 4/ allocate new pages for the new slots in the new stripe_heads. 2306 * If this fails, we don't bother trying the shrink the 2307 * stripe_heads down again, we just leave them as they are. 2308 * As each stripe_head is processed the new one is released into 2309 * active service. 2310 * 2311 * Once step2 is started, we cannot afford to wait for a write, 2312 * so we use GFP_NOIO allocations. 
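 *
 * Illustrative walk-through (numbers invented, not from the original
 * source): growing pool_size from 4 to 5 devices with 256 cached
 * stripes means step 1 allocates 256 five-device stripe_heads from the
 * alternate slab cache, step 2 waits for each old stripe_head to go
 * inactive, moves its 4 pages across and frees it (stalling the array
 * once every old stripe is held), step 3 reallocates conf->disks with
 * 5 entries, and step 4 gives each new stripe_head a freshly allocated
 * page for slot 4 and releases it back into service.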
2313 */ 2314 struct stripe_head *osh, *nsh; 2315 LIST_HEAD(newstripes); 2316 struct disk_info *ndisks; 2317 int err = 0; 2318 struct kmem_cache *sc; 2319 int i; 2320 int hash, cnt; 2321 2322 md_allow_write(conf->mddev); 2323 2324 /* Step 1 */ 2325 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2326 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2327 0, 0, NULL); 2328 if (!sc) 2329 return -ENOMEM; 2330 2331 /* Need to ensure auto-resizing doesn't interfere */ 2332 mutex_lock(&conf->cache_size_mutex); 2333 2334 for (i = conf->max_nr_stripes; i; i--) { 2335 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); 2336 if (!nsh) 2337 break; 2338 2339 list_add(&nsh->lru, &newstripes); 2340 } 2341 if (i) { 2342 /* didn't get enough, give up */ 2343 while (!list_empty(&newstripes)) { 2344 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2345 list_del(&nsh->lru); 2346 free_stripe(sc, nsh); 2347 } 2348 kmem_cache_destroy(sc); 2349 mutex_unlock(&conf->cache_size_mutex); 2350 return -ENOMEM; 2351 } 2352 /* Step 2 - Must use GFP_NOIO now. 2353 * OK, we have enough stripes, start collecting inactive 2354 * stripes and copying them over 2355 */ 2356 hash = 0; 2357 cnt = 0; 2358 list_for_each_entry(nsh, &newstripes, lru) { 2359 lock_device_hash_lock(conf, hash); 2360 wait_event_cmd(conf->wait_for_stripe, 2361 !list_empty(conf->inactive_list + hash), 2362 unlock_device_hash_lock(conf, hash), 2363 lock_device_hash_lock(conf, hash)); 2364 osh = get_free_stripe(conf, hash); 2365 unlock_device_hash_lock(conf, hash); 2366 2367 for(i=0; i<conf->pool_size; i++) { 2368 nsh->dev[i].page = osh->dev[i].page; 2369 nsh->dev[i].orig_page = osh->dev[i].page; 2370 } 2371 nsh->hash_lock_index = hash; 2372 free_stripe(conf->slab_cache, osh); 2373 cnt++; 2374 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2375 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2376 hash++; 2377 cnt = 0; 2378 } 2379 } 2380 kmem_cache_destroy(conf->slab_cache); 2381 2382 /* Step 3. 
2383 * At this point, we are holding all the stripes so the array 2384 * is completely stalled, so now is a good time to resize 2385 * conf->disks and the scribble region 2386 */ 2387 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO); 2388 if (ndisks) { 2389 for (i = 0; i < conf->pool_size; i++) 2390 ndisks[i] = conf->disks[i]; 2391 2392 for (i = conf->pool_size; i < newsize; i++) { 2393 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2394 if (!ndisks[i].extra_page) 2395 err = -ENOMEM; 2396 } 2397 2398 if (err) { 2399 for (i = conf->pool_size; i < newsize; i++) 2400 if (ndisks[i].extra_page) 2401 put_page(ndisks[i].extra_page); 2402 kfree(ndisks); 2403 } else { 2404 kfree(conf->disks); 2405 conf->disks = ndisks; 2406 } 2407 } else 2408 err = -ENOMEM; 2409 2410 mutex_unlock(&conf->cache_size_mutex); 2411 2412 conf->slab_cache = sc; 2413 conf->active_name = 1-conf->active_name; 2414 2415 /* Step 4, return new stripes to service */ 2416 while(!list_empty(&newstripes)) { 2417 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2418 list_del_init(&nsh->lru); 2419 2420 for (i=conf->raid_disks; i < newsize; i++) 2421 if (nsh->dev[i].page == NULL) { 2422 struct page *p = alloc_page(GFP_NOIO); 2423 nsh->dev[i].page = p; 2424 nsh->dev[i].orig_page = p; 2425 if (!p) 2426 err = -ENOMEM; 2427 } 2428 raid5_release_stripe(nsh); 2429 } 2430 /* critical section pass, GFP_NOIO no longer needed */ 2431 2432 if (!err) 2433 conf->pool_size = newsize; 2434 return err; 2435 } 2436 2437 static int drop_one_stripe(struct r5conf *conf) 2438 { 2439 struct stripe_head *sh; 2440 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2441 2442 spin_lock_irq(conf->hash_locks + hash); 2443 sh = get_free_stripe(conf, hash); 2444 spin_unlock_irq(conf->hash_locks + hash); 2445 if (!sh) 2446 return 0; 2447 BUG_ON(atomic_read(&sh->count)); 2448 shrink_buffers(sh); 2449 free_stripe(conf->slab_cache, sh); 2450 atomic_dec(&conf->active_stripes); 2451 conf->max_nr_stripes--; 2452 return 1; 2453 } 2454 2455 static void shrink_stripes(struct r5conf *conf) 2456 { 2457 while (conf->max_nr_stripes && 2458 drop_one_stripe(conf)) 2459 ; 2460 2461 kmem_cache_destroy(conf->slab_cache); 2462 conf->slab_cache = NULL; 2463 } 2464 2465 static void raid5_end_read_request(struct bio * bi) 2466 { 2467 struct stripe_head *sh = bi->bi_private; 2468 struct r5conf *conf = sh->raid_conf; 2469 int disks = sh->disks, i; 2470 char b[BDEVNAME_SIZE]; 2471 struct md_rdev *rdev = NULL; 2472 sector_t s; 2473 2474 for (i=0 ; i<disks; i++) 2475 if (bi == &sh->dev[i].req) 2476 break; 2477 2478 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2479 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2480 bi->bi_status); 2481 if (i == disks) { 2482 bio_reset(bi); 2483 BUG(); 2484 return; 2485 } 2486 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2487 /* If replacement finished while this request was outstanding, 2488 * 'replacement' might be NULL already. 2489 * In that case it moved down to 'rdev'. 2490 * rdev is not removed until all requests are finished. 2491 */ 2492 rdev = conf->disks[i].replacement; 2493 if (!rdev) 2494 rdev = conf->disks[i].rdev; 2495 2496 if (use_new_offset(conf, sh)) 2497 s = sh->sector + rdev->new_data_offset; 2498 else 2499 s = sh->sector + rdev->data_offset; 2500 if (!bi->bi_status) { 2501 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2502 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2503 /* Note that this cannot happen on a 2504 * replacement device. 
We just fail those on 2505 * any error 2506 */ 2507 pr_info_ratelimited( 2508 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2509 mdname(conf->mddev), STRIPE_SECTORS, 2510 (unsigned long long)s, 2511 bdevname(rdev->bdev, b)); 2512 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2513 clear_bit(R5_ReadError, &sh->dev[i].flags); 2514 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2515 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2516 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2517 2518 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2519 /* 2520 * end read for a page in journal, this 2521 * must be preparing for prexor in rmw 2522 */ 2523 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2524 2525 if (atomic_read(&rdev->read_errors)) 2526 atomic_set(&rdev->read_errors, 0); 2527 } else { 2528 const char *bdn = bdevname(rdev->bdev, b); 2529 int retry = 0; 2530 int set_bad = 0; 2531 2532 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2533 atomic_inc(&rdev->read_errors); 2534 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2535 pr_warn_ratelimited( 2536 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2537 mdname(conf->mddev), 2538 (unsigned long long)s, 2539 bdn); 2540 else if (conf->mddev->degraded >= conf->max_degraded) { 2541 set_bad = 1; 2542 pr_warn_ratelimited( 2543 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2544 mdname(conf->mddev), 2545 (unsigned long long)s, 2546 bdn); 2547 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2548 /* Oh, no!!! */ 2549 set_bad = 1; 2550 pr_warn_ratelimited( 2551 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", 2552 mdname(conf->mddev), 2553 (unsigned long long)s, 2554 bdn); 2555 } else if (atomic_read(&rdev->read_errors) 2556 > conf->max_nr_stripes) 2557 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2558 mdname(conf->mddev), bdn); 2559 else 2560 retry = 1; 2561 if (set_bad && test_bit(In_sync, &rdev->flags) 2562 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2563 retry = 1; 2564 if (retry) 2565 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2566 set_bit(R5_ReadError, &sh->dev[i].flags); 2567 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2568 } else 2569 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2570 else { 2571 clear_bit(R5_ReadError, &sh->dev[i].flags); 2572 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2573 if (!(set_bad 2574 && test_bit(In_sync, &rdev->flags) 2575 && rdev_set_badblocks( 2576 rdev, sh->sector, STRIPE_SECTORS, 0))) 2577 md_error(conf->mddev, rdev); 2578 } 2579 } 2580 rdev_dec_pending(rdev, conf->mddev); 2581 bio_reset(bi); 2582 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2583 set_bit(STRIPE_HANDLE, &sh->state); 2584 raid5_release_stripe(sh); 2585 } 2586 2587 static void raid5_end_write_request(struct bio *bi) 2588 { 2589 struct stripe_head *sh = bi->bi_private; 2590 struct r5conf *conf = sh->raid_conf; 2591 int disks = sh->disks, i; 2592 struct md_rdev *uninitialized_var(rdev); 2593 sector_t first_bad; 2594 int bad_sectors; 2595 int replacement = 0; 2596 2597 for (i = 0 ; i < disks; i++) { 2598 if (bi == &sh->dev[i].req) { 2599 rdev = conf->disks[i].rdev; 2600 break; 2601 } 2602 if (bi == &sh->dev[i].rreq) { 2603 rdev = conf->disks[i].replacement; 2604 if (rdev) 2605 replacement = 1; 2606 else 2607 /* rdev was removed and 'replacement' 2608 * replaced it. rdev is not removed 2609 * until all requests are finished. 
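 * (Illustration, not from the original source: if the original drive
 * was removed and its replacement promoted while this write was in
 * flight, conf->disks[i].replacement is already NULL here, so the
 * completion is accounted against conf->disks[i].rdev, which now
 * points at the promoted device.)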
2610 */ 2611 rdev = conf->disks[i].rdev; 2612 break; 2613 } 2614 } 2615 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2616 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2617 bi->bi_status); 2618 if (i == disks) { 2619 bio_reset(bi); 2620 BUG(); 2621 return; 2622 } 2623 2624 if (replacement) { 2625 if (bi->bi_status) 2626 md_error(conf->mddev, rdev); 2627 else if (is_badblock(rdev, sh->sector, 2628 STRIPE_SECTORS, 2629 &first_bad, &bad_sectors)) 2630 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2631 } else { 2632 if (bi->bi_status) { 2633 set_bit(STRIPE_DEGRADED, &sh->state); 2634 set_bit(WriteErrorSeen, &rdev->flags); 2635 set_bit(R5_WriteError, &sh->dev[i].flags); 2636 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2637 set_bit(MD_RECOVERY_NEEDED, 2638 &rdev->mddev->recovery); 2639 } else if (is_badblock(rdev, sh->sector, 2640 STRIPE_SECTORS, 2641 &first_bad, &bad_sectors)) { 2642 set_bit(R5_MadeGood, &sh->dev[i].flags); 2643 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2644 /* That was a successful write so make 2645 * sure it looks like we already did 2646 * a re-write. 2647 */ 2648 set_bit(R5_ReWrite, &sh->dev[i].flags); 2649 } 2650 } 2651 rdev_dec_pending(rdev, conf->mddev); 2652 2653 if (sh->batch_head && bi->bi_status && !replacement) 2654 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2655 2656 bio_reset(bi); 2657 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2658 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2659 set_bit(STRIPE_HANDLE, &sh->state); 2660 raid5_release_stripe(sh); 2661 2662 if (sh->batch_head && sh != sh->batch_head) 2663 raid5_release_stripe(sh->batch_head); 2664 } 2665 2666 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2667 { 2668 char b[BDEVNAME_SIZE]; 2669 struct r5conf *conf = mddev->private; 2670 unsigned long flags; 2671 pr_debug("raid456: error called\n"); 2672 2673 spin_lock_irqsave(&conf->device_lock, flags); 2674 2675 if (test_bit(In_sync, &rdev->flags) && 2676 mddev->degraded == conf->max_degraded) { 2677 /* 2678 * Don't allow to achieve failed state 2679 * Don't try to recover this device 2680 */ 2681 conf->recovery_disabled = mddev->recovery_disabled; 2682 spin_unlock_irqrestore(&conf->device_lock, flags); 2683 return; 2684 } 2685 2686 set_bit(Faulty, &rdev->flags); 2687 clear_bit(In_sync, &rdev->flags); 2688 mddev->degraded = raid5_calc_degraded(conf); 2689 spin_unlock_irqrestore(&conf->device_lock, flags); 2690 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2691 2692 set_bit(Blocked, &rdev->flags); 2693 set_mask_bits(&mddev->sb_flags, 0, 2694 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2695 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2696 "md/raid:%s: Operation continuing on %d devices.\n", 2697 mdname(mddev), 2698 bdevname(rdev->bdev, b), 2699 mdname(mddev), 2700 conf->raid_disks - mddev->degraded); 2701 r5c_update_on_rdev_error(mddev, rdev); 2702 } 2703 2704 /* 2705 * Input: a 'big' sector number, 2706 * Output: index of the data and parity disk, and the sector # in them. 2707 */ 2708 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2709 int previous, int *dd_idx, 2710 struct stripe_head *sh) 2711 { 2712 sector_t stripe, stripe2; 2713 sector_t chunk_number; 2714 unsigned int chunk_offset; 2715 int pd_idx, qd_idx; 2716 int ddf_layout = 0; 2717 sector_t new_sector; 2718 int algorithm = previous ? conf->prev_algo 2719 : conf->algorithm; 2720 int sectors_per_chunk = previous ? 
conf->prev_chunk_sectors 2721 : conf->chunk_sectors; 2722 int raid_disks = previous ? conf->previous_raid_disks 2723 : conf->raid_disks; 2724 int data_disks = raid_disks - conf->max_degraded; 2725 2726 /* First compute the information on this sector */ 2727 2728 /* 2729 * Compute the chunk number and the sector offset inside the chunk 2730 */ 2731 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2732 chunk_number = r_sector; 2733 2734 /* 2735 * Compute the stripe number 2736 */ 2737 stripe = chunk_number; 2738 *dd_idx = sector_div(stripe, data_disks); 2739 stripe2 = stripe; 2740 /* 2741 * Select the parity disk based on the user selected algorithm. 2742 */ 2743 pd_idx = qd_idx = -1; 2744 switch(conf->level) { 2745 case 4: 2746 pd_idx = data_disks; 2747 break; 2748 case 5: 2749 switch (algorithm) { 2750 case ALGORITHM_LEFT_ASYMMETRIC: 2751 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2752 if (*dd_idx >= pd_idx) 2753 (*dd_idx)++; 2754 break; 2755 case ALGORITHM_RIGHT_ASYMMETRIC: 2756 pd_idx = sector_div(stripe2, raid_disks); 2757 if (*dd_idx >= pd_idx) 2758 (*dd_idx)++; 2759 break; 2760 case ALGORITHM_LEFT_SYMMETRIC: 2761 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2762 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2763 break; 2764 case ALGORITHM_RIGHT_SYMMETRIC: 2765 pd_idx = sector_div(stripe2, raid_disks); 2766 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2767 break; 2768 case ALGORITHM_PARITY_0: 2769 pd_idx = 0; 2770 (*dd_idx)++; 2771 break; 2772 case ALGORITHM_PARITY_N: 2773 pd_idx = data_disks; 2774 break; 2775 default: 2776 BUG(); 2777 } 2778 break; 2779 case 6: 2780 2781 switch (algorithm) { 2782 case ALGORITHM_LEFT_ASYMMETRIC: 2783 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2784 qd_idx = pd_idx + 1; 2785 if (pd_idx == raid_disks-1) { 2786 (*dd_idx)++; /* Q D D D P */ 2787 qd_idx = 0; 2788 } else if (*dd_idx >= pd_idx) 2789 (*dd_idx) += 2; /* D D P Q D */ 2790 break; 2791 case ALGORITHM_RIGHT_ASYMMETRIC: 2792 pd_idx = sector_div(stripe2, raid_disks); 2793 qd_idx = pd_idx + 1; 2794 if (pd_idx == raid_disks-1) { 2795 (*dd_idx)++; /* Q D D D P */ 2796 qd_idx = 0; 2797 } else if (*dd_idx >= pd_idx) 2798 (*dd_idx) += 2; /* D D P Q D */ 2799 break; 2800 case ALGORITHM_LEFT_SYMMETRIC: 2801 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2802 qd_idx = (pd_idx + 1) % raid_disks; 2803 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2804 break; 2805 case ALGORITHM_RIGHT_SYMMETRIC: 2806 pd_idx = sector_div(stripe2, raid_disks); 2807 qd_idx = (pd_idx + 1) % raid_disks; 2808 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2809 break; 2810 2811 case ALGORITHM_PARITY_0: 2812 pd_idx = 0; 2813 qd_idx = 1; 2814 (*dd_idx) += 2; 2815 break; 2816 case ALGORITHM_PARITY_N: 2817 pd_idx = data_disks; 2818 qd_idx = data_disks + 1; 2819 break; 2820 2821 case ALGORITHM_ROTATING_ZERO_RESTART: 2822 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2823 * of blocks for computing Q is different. 
2824 */ 2825 pd_idx = sector_div(stripe2, raid_disks); 2826 qd_idx = pd_idx + 1; 2827 if (pd_idx == raid_disks-1) { 2828 (*dd_idx)++; /* Q D D D P */ 2829 qd_idx = 0; 2830 } else if (*dd_idx >= pd_idx) 2831 (*dd_idx) += 2; /* D D P Q D */ 2832 ddf_layout = 1; 2833 break; 2834 2835 case ALGORITHM_ROTATING_N_RESTART: 2836 /* Same a left_asymmetric, by first stripe is 2837 * D D D P Q rather than 2838 * Q D D D P 2839 */ 2840 stripe2 += 1; 2841 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2842 qd_idx = pd_idx + 1; 2843 if (pd_idx == raid_disks-1) { 2844 (*dd_idx)++; /* Q D D D P */ 2845 qd_idx = 0; 2846 } else if (*dd_idx >= pd_idx) 2847 (*dd_idx) += 2; /* D D P Q D */ 2848 ddf_layout = 1; 2849 break; 2850 2851 case ALGORITHM_ROTATING_N_CONTINUE: 2852 /* Same as left_symmetric but Q is before P */ 2853 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2854 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2855 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2856 ddf_layout = 1; 2857 break; 2858 2859 case ALGORITHM_LEFT_ASYMMETRIC_6: 2860 /* RAID5 left_asymmetric, with Q on last device */ 2861 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2862 if (*dd_idx >= pd_idx) 2863 (*dd_idx)++; 2864 qd_idx = raid_disks - 1; 2865 break; 2866 2867 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2868 pd_idx = sector_div(stripe2, raid_disks-1); 2869 if (*dd_idx >= pd_idx) 2870 (*dd_idx)++; 2871 qd_idx = raid_disks - 1; 2872 break; 2873 2874 case ALGORITHM_LEFT_SYMMETRIC_6: 2875 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2876 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2877 qd_idx = raid_disks - 1; 2878 break; 2879 2880 case ALGORITHM_RIGHT_SYMMETRIC_6: 2881 pd_idx = sector_div(stripe2, raid_disks-1); 2882 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2883 qd_idx = raid_disks - 1; 2884 break; 2885 2886 case ALGORITHM_PARITY_0_6: 2887 pd_idx = 0; 2888 (*dd_idx)++; 2889 qd_idx = raid_disks - 1; 2890 break; 2891 2892 default: 2893 BUG(); 2894 } 2895 break; 2896 } 2897 2898 if (sh) { 2899 sh->pd_idx = pd_idx; 2900 sh->qd_idx = qd_idx; 2901 sh->ddf_layout = ddf_layout; 2902 } 2903 /* 2904 * Finally, compute the new sector number 2905 */ 2906 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2907 return new_sector; 2908 } 2909 2910 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2911 { 2912 struct r5conf *conf = sh->raid_conf; 2913 int raid_disks = sh->disks; 2914 int data_disks = raid_disks - conf->max_degraded; 2915 sector_t new_sector = sh->sector, check; 2916 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2917 : conf->chunk_sectors; 2918 int algorithm = previous ? 
conf->prev_algo 2919 : conf->algorithm; 2920 sector_t stripe; 2921 int chunk_offset; 2922 sector_t chunk_number; 2923 int dummy1, dd_idx = i; 2924 sector_t r_sector; 2925 struct stripe_head sh2; 2926 2927 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2928 stripe = new_sector; 2929 2930 if (i == sh->pd_idx) 2931 return 0; 2932 switch(conf->level) { 2933 case 4: break; 2934 case 5: 2935 switch (algorithm) { 2936 case ALGORITHM_LEFT_ASYMMETRIC: 2937 case ALGORITHM_RIGHT_ASYMMETRIC: 2938 if (i > sh->pd_idx) 2939 i--; 2940 break; 2941 case ALGORITHM_LEFT_SYMMETRIC: 2942 case ALGORITHM_RIGHT_SYMMETRIC: 2943 if (i < sh->pd_idx) 2944 i += raid_disks; 2945 i -= (sh->pd_idx + 1); 2946 break; 2947 case ALGORITHM_PARITY_0: 2948 i -= 1; 2949 break; 2950 case ALGORITHM_PARITY_N: 2951 break; 2952 default: 2953 BUG(); 2954 } 2955 break; 2956 case 6: 2957 if (i == sh->qd_idx) 2958 return 0; /* It is the Q disk */ 2959 switch (algorithm) { 2960 case ALGORITHM_LEFT_ASYMMETRIC: 2961 case ALGORITHM_RIGHT_ASYMMETRIC: 2962 case ALGORITHM_ROTATING_ZERO_RESTART: 2963 case ALGORITHM_ROTATING_N_RESTART: 2964 if (sh->pd_idx == raid_disks-1) 2965 i--; /* Q D D D P */ 2966 else if (i > sh->pd_idx) 2967 i -= 2; /* D D P Q D */ 2968 break; 2969 case ALGORITHM_LEFT_SYMMETRIC: 2970 case ALGORITHM_RIGHT_SYMMETRIC: 2971 if (sh->pd_idx == raid_disks-1) 2972 i--; /* Q D D D P */ 2973 else { 2974 /* D D P Q D */ 2975 if (i < sh->pd_idx) 2976 i += raid_disks; 2977 i -= (sh->pd_idx + 2); 2978 } 2979 break; 2980 case ALGORITHM_PARITY_0: 2981 i -= 2; 2982 break; 2983 case ALGORITHM_PARITY_N: 2984 break; 2985 case ALGORITHM_ROTATING_N_CONTINUE: 2986 /* Like left_symmetric, but P is before Q */ 2987 if (sh->pd_idx == 0) 2988 i--; /* P D D D Q */ 2989 else { 2990 /* D D Q P D */ 2991 if (i < sh->pd_idx) 2992 i += raid_disks; 2993 i -= (sh->pd_idx + 1); 2994 } 2995 break; 2996 case ALGORITHM_LEFT_ASYMMETRIC_6: 2997 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2998 if (i > sh->pd_idx) 2999 i--; 3000 break; 3001 case ALGORITHM_LEFT_SYMMETRIC_6: 3002 case ALGORITHM_RIGHT_SYMMETRIC_6: 3003 if (i < sh->pd_idx) 3004 i += data_disks + 1; 3005 i -= (sh->pd_idx + 1); 3006 break; 3007 case ALGORITHM_PARITY_0_6: 3008 i -= 1; 3009 break; 3010 default: 3011 BUG(); 3012 } 3013 break; 3014 } 3015 3016 chunk_number = stripe * data_disks + i; 3017 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3018 3019 check = raid5_compute_sector(conf, r_sector, 3020 previous, &dummy1, &sh2); 3021 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3022 || sh2.qd_idx != sh->qd_idx) { 3023 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3024 mdname(conf->mddev)); 3025 return 0; 3026 } 3027 return r_sector; 3028 } 3029 3030 /* 3031 * There are cases where we want handle_stripe_dirtying() and 3032 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3033 * 3034 * This function checks whether we want to delay the towrite. Specifically, 3035 * we delay the towrite when: 3036 * 3037 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3038 * stripe has data in journal (for other devices). 3039 * 3040 * In this case, when reading data for the non-overwrite dev, it is 3041 * necessary to handle complex rmw of write back cache (prexor with 3042 * orig_page, and xor with page). To keep read path simple, we would 3043 * like to flush data in journal to RAID disks first, so complex rmw 3044 * is handled in the write patch (handle_stripe_dirtying). 3045 * 3046 * 2. 
when journal space is critical (R5C_LOG_CRITICAL=1) 3047 * 3048 * It is important to be able to flush all stripes in raid5-cache. 3049 * Therefore, we need to reserve some space on the journal device for 3050 * these flushes. If the flush operation includes pending writes to the 3051 * stripe, we need to reserve (conf->raid_disks + 1) pages per stripe 3052 * for the flush out. If we exclude these pending writes from the flush 3053 * operation, we only need (conf->max_degraded + 1) pages per stripe. 3054 * Therefore, excluding pending writes in these cases enables more 3055 * efficient use of the journal device. 3056 * 3057 * Note: To make sure the stripe makes progress, we only delay 3058 * towrite for stripes with data already in journal (injournal > 0). 3059 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 3060 * the no_space_stripes list. 3061 * 3062 * 3. during journal failure 3063 * On journal failure, we try to flush all cached data to raid disks 3064 * based on data in the stripe cache. The array is read-only to upper 3065 * layers, so we skip all pending writes. 3066 * 3067 */ 3068 static inline bool delay_towrite(struct r5conf *conf, 3069 struct r5dev *dev, 3070 struct stripe_head_state *s) 3071 { 3072 /* case 1 above */ 3073 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3074 !test_bit(R5_Insync, &dev->flags) && s->injournal) 3075 return true; 3076 /* case 2 above */ 3077 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 3078 s->injournal > 0) 3079 return true; 3080 /* case 3 above */ 3081 if (s->log_failed && s->injournal) 3082 return true; 3083 return false; 3084 } 3085 3086 static void 3087 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 3088 int rcw, int expand) 3089 { 3090 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 3091 struct r5conf *conf = sh->raid_conf; 3092 int level = conf->level; 3093 3094 if (rcw) { 3095 /* 3096 * In some cases, handle_stripe_dirtying initially decided to 3097 * run rmw and allocated an extra page for prexor. However, rcw is 3098 * cheaper later on. We need to free the extra page now, 3099 * because we won't be able to do that in ops_complete_prexor().
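 *
 * (Cost sketch for illustration, not from the original source: on a
 * 6-drive RAID5, updating a single data block by rmw reads the old
 * data block plus the old parity, 2 reads in total, while rcw reads
 * the other 4 data blocks; once the whole stripe ends up being
 * overwritten, rcw needs no reads at all, which is why the earlier
 * rmw decision can be revisited here.)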
3100 */ 3101 r5c_release_extra_page(sh); 3102 3103 for (i = disks; i--; ) { 3104 struct r5dev *dev = &sh->dev[i]; 3105 3106 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3107 set_bit(R5_LOCKED, &dev->flags); 3108 set_bit(R5_Wantdrain, &dev->flags); 3109 if (!expand) 3110 clear_bit(R5_UPTODATE, &dev->flags); 3111 s->locked++; 3112 } else if (test_bit(R5_InJournal, &dev->flags)) { 3113 set_bit(R5_LOCKED, &dev->flags); 3114 s->locked++; 3115 } 3116 } 3117 /* if we are not expanding this is a proper write request, and 3118 * there will be bios with new data to be drained into the 3119 * stripe cache 3120 */ 3121 if (!expand) { 3122 if (!s->locked) 3123 /* False alarm, nothing to do */ 3124 return; 3125 sh->reconstruct_state = reconstruct_state_drain_run; 3126 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3127 } else 3128 sh->reconstruct_state = reconstruct_state_run; 3129 3130 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3131 3132 if (s->locked + conf->max_degraded == disks) 3133 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3134 atomic_inc(&conf->pending_full_writes); 3135 } else { 3136 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3137 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3138 BUG_ON(level == 6 && 3139 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3140 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3141 3142 for (i = disks; i--; ) { 3143 struct r5dev *dev = &sh->dev[i]; 3144 if (i == pd_idx || i == qd_idx) 3145 continue; 3146 3147 if (dev->towrite && 3148 (test_bit(R5_UPTODATE, &dev->flags) || 3149 test_bit(R5_Wantcompute, &dev->flags))) { 3150 set_bit(R5_Wantdrain, &dev->flags); 3151 set_bit(R5_LOCKED, &dev->flags); 3152 clear_bit(R5_UPTODATE, &dev->flags); 3153 s->locked++; 3154 } else if (test_bit(R5_InJournal, &dev->flags)) { 3155 set_bit(R5_LOCKED, &dev->flags); 3156 s->locked++; 3157 } 3158 } 3159 if (!s->locked) 3160 /* False alarm - nothing to do */ 3161 return; 3162 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3163 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3164 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3165 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3166 } 3167 3168 /* keep the parity disk(s) locked while asynchronous operations 3169 * are in flight 3170 */ 3171 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3172 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3173 s->locked++; 3174 3175 if (level == 6) { 3176 int qd_idx = sh->qd_idx; 3177 struct r5dev *dev = &sh->dev[qd_idx]; 3178 3179 set_bit(R5_LOCKED, &dev->flags); 3180 clear_bit(R5_UPTODATE, &dev->flags); 3181 s->locked++; 3182 } 3183 3184 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && 3185 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3186 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3187 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3188 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3189 3190 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3191 __func__, (unsigned long long)sh->sector, 3192 s->locked, s->ops_request); 3193 } 3194 3195 /* 3196 * Each stripe/dev can have one or more bion attached. 3197 * toread/towrite point to the first in a chain. 3198 * The bi_next chain must be in order. 
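 * For illustration (sector offsets invented): if towrite already
 * chains bios covering stripe sectors 0-7 and 16-23, a new bio for
 * sectors 8-15 is linked between them, while a bio for sectors 4-11
 * overlaps the first entry and is rejected; add_stripe_bio() then
 * sets R5_Overlap so the submitter can wait on wait_for_overlap and
 * retry.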
3199 */ 3200 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3201 int forwrite, int previous) 3202 { 3203 struct bio **bip; 3204 struct r5conf *conf = sh->raid_conf; 3205 int firstwrite=0; 3206 3207 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3208 (unsigned long long)bi->bi_iter.bi_sector, 3209 (unsigned long long)sh->sector); 3210 3211 spin_lock_irq(&sh->stripe_lock); 3212 sh->dev[dd_idx].write_hint = bi->bi_write_hint; 3213 /* Don't allow new IO added to stripes in batch list */ 3214 if (sh->batch_head) 3215 goto overlap; 3216 if (forwrite) { 3217 bip = &sh->dev[dd_idx].towrite; 3218 if (*bip == NULL) 3219 firstwrite = 1; 3220 } else 3221 bip = &sh->dev[dd_idx].toread; 3222 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3223 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3224 goto overlap; 3225 bip = & (*bip)->bi_next; 3226 } 3227 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3228 goto overlap; 3229 3230 if (forwrite && raid5_has_ppl(conf)) { 3231 /* 3232 * With PPL only writes to consecutive data chunks within a 3233 * stripe are allowed because for a single stripe_head we can 3234 * only have one PPL entry at a time, which describes one data 3235 * range. Not really an overlap, but wait_for_overlap can be 3236 * used to handle this. 3237 */ 3238 sector_t sector; 3239 sector_t first = 0; 3240 sector_t last = 0; 3241 int count = 0; 3242 int i; 3243 3244 for (i = 0; i < sh->disks; i++) { 3245 if (i != sh->pd_idx && 3246 (i == dd_idx || sh->dev[i].towrite)) { 3247 sector = sh->dev[i].sector; 3248 if (count == 0 || sector < first) 3249 first = sector; 3250 if (sector > last) 3251 last = sector; 3252 count++; 3253 } 3254 } 3255 3256 if (first + conf->chunk_sectors * (count - 1) != last) 3257 goto overlap; 3258 } 3259 3260 if (!forwrite || previous) 3261 clear_bit(STRIPE_BATCH_READY, &sh->state); 3262 3263 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3264 if (*bip) 3265 bi->bi_next = *bip; 3266 *bip = bi; 3267 bio_inc_remaining(bi); 3268 md_write_inc(conf->mddev, bi); 3269 3270 if (forwrite) { 3271 /* check if page is covered */ 3272 sector_t sector = sh->dev[dd_idx].sector; 3273 for (bi=sh->dev[dd_idx].towrite; 3274 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3275 bi && bi->bi_iter.bi_sector <= sector; 3276 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3277 if (bio_end_sector(bi) >= sector) 3278 sector = bio_end_sector(bi); 3279 } 3280 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3281 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3282 sh->overwrite_disks++; 3283 } 3284 3285 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3286 (unsigned long long)(*bip)->bi_iter.bi_sector, 3287 (unsigned long long)sh->sector, dd_idx); 3288 3289 if (conf->mddev->bitmap && firstwrite) { 3290 /* Cannot hold spinlock over bitmap_startwrite, 3291 * but must ensure this isn't added to a batch until 3292 * we have added to the bitmap and set bm_seq. 3293 * So set STRIPE_BITMAP_PENDING to prevent 3294 * batching. 3295 * If multiple add_stripe_bio() calls race here they 3296 * much all set STRIPE_BITMAP_PENDING. So only the first one 3297 * to complete "bitmap_startwrite" gets to set 3298 * STRIPE_BIT_DELAY. This is important as once a stripe 3299 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3300 * any more. 
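 * (Ordering sketch for illustration, not from the original source:
 * set STRIPE_BITMAP_PENDING, drop stripe_lock, call
 * md_bitmap_startwrite(), retake stripe_lock, clear
 * STRIPE_BITMAP_PENDING, and only if the stripe is still unbatched
 * record bm_seq and set STRIPE_BIT_DELAY.)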
3301 */ 3302 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3303 spin_unlock_irq(&sh->stripe_lock); 3304 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3305 STRIPE_SECTORS, 0); 3306 spin_lock_irq(&sh->stripe_lock); 3307 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3308 if (!sh->batch_head) { 3309 sh->bm_seq = conf->seq_flush+1; 3310 set_bit(STRIPE_BIT_DELAY, &sh->state); 3311 } 3312 } 3313 spin_unlock_irq(&sh->stripe_lock); 3314 3315 if (stripe_can_batch(sh)) 3316 stripe_add_to_batch_list(conf, sh); 3317 return 1; 3318 3319 overlap: 3320 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3321 spin_unlock_irq(&sh->stripe_lock); 3322 return 0; 3323 } 3324 3325 static void end_reshape(struct r5conf *conf); 3326 3327 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3328 struct stripe_head *sh) 3329 { 3330 int sectors_per_chunk = 3331 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3332 int dd_idx; 3333 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3334 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3335 3336 raid5_compute_sector(conf, 3337 stripe * (disks - conf->max_degraded) 3338 *sectors_per_chunk + chunk_offset, 3339 previous, 3340 &dd_idx, sh); 3341 } 3342 3343 static void 3344 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3345 struct stripe_head_state *s, int disks) 3346 { 3347 int i; 3348 BUG_ON(sh->batch_head); 3349 for (i = disks; i--; ) { 3350 struct bio *bi; 3351 int bitmap_end = 0; 3352 3353 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3354 struct md_rdev *rdev; 3355 rcu_read_lock(); 3356 rdev = rcu_dereference(conf->disks[i].rdev); 3357 if (rdev && test_bit(In_sync, &rdev->flags) && 3358 !test_bit(Faulty, &rdev->flags)) 3359 atomic_inc(&rdev->nr_pending); 3360 else 3361 rdev = NULL; 3362 rcu_read_unlock(); 3363 if (rdev) { 3364 if (!rdev_set_badblocks( 3365 rdev, 3366 sh->sector, 3367 STRIPE_SECTORS, 0)) 3368 md_error(conf->mddev, rdev); 3369 rdev_dec_pending(rdev, conf->mddev); 3370 } 3371 } 3372 spin_lock_irq(&sh->stripe_lock); 3373 /* fail all writes first */ 3374 bi = sh->dev[i].towrite; 3375 sh->dev[i].towrite = NULL; 3376 sh->overwrite_disks = 0; 3377 spin_unlock_irq(&sh->stripe_lock); 3378 if (bi) 3379 bitmap_end = 1; 3380 3381 log_stripe_write_finished(sh); 3382 3383 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3384 wake_up(&conf->wait_for_overlap); 3385 3386 while (bi && bi->bi_iter.bi_sector < 3387 sh->dev[i].sector + STRIPE_SECTORS) { 3388 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3389 3390 md_write_end(conf->mddev); 3391 bio_io_error(bi); 3392 bi = nextbi; 3393 } 3394 if (bitmap_end) 3395 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3396 STRIPE_SECTORS, 0, 0); 3397 bitmap_end = 0; 3398 /* and fail all 'written' */ 3399 bi = sh->dev[i].written; 3400 sh->dev[i].written = NULL; 3401 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3402 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3403 sh->dev[i].page = sh->dev[i].orig_page; 3404 } 3405 3406 if (bi) bitmap_end = 1; 3407 while (bi && bi->bi_iter.bi_sector < 3408 sh->dev[i].sector + STRIPE_SECTORS) { 3409 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3410 3411 md_write_end(conf->mddev); 3412 bio_io_error(bi); 3413 bi = bi2; 3414 } 3415 3416 /* fail any reads if this device is non-operational and 3417 * the data has not reached the cache yet. 
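 * Clarifying note (not from the original source): the queued toread
 * bios are only errored when more devices have failed than
 * max_degraded allows, this device is not being filled from the cache
 * (no R5_Wantfill), and it is either not insync or has seen a read
 * error.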
3418 */ 3419 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3420 s->failed > conf->max_degraded && 3421 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3422 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3423 spin_lock_irq(&sh->stripe_lock); 3424 bi = sh->dev[i].toread; 3425 sh->dev[i].toread = NULL; 3426 spin_unlock_irq(&sh->stripe_lock); 3427 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3428 wake_up(&conf->wait_for_overlap); 3429 if (bi) 3430 s->to_read--; 3431 while (bi && bi->bi_iter.bi_sector < 3432 sh->dev[i].sector + STRIPE_SECTORS) { 3433 struct bio *nextbi = 3434 r5_next_bio(bi, sh->dev[i].sector); 3435 3436 bio_io_error(bi); 3437 bi = nextbi; 3438 } 3439 } 3440 if (bitmap_end) 3441 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3442 STRIPE_SECTORS, 0, 0); 3443 /* If we were in the middle of a write the parity block might 3444 * still be locked - so just clear all R5_LOCKED flags 3445 */ 3446 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3447 } 3448 s->to_write = 0; 3449 s->written = 0; 3450 3451 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3452 if (atomic_dec_and_test(&conf->pending_full_writes)) 3453 md_wakeup_thread(conf->mddev->thread); 3454 } 3455 3456 static void 3457 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3458 struct stripe_head_state *s) 3459 { 3460 int abort = 0; 3461 int i; 3462 3463 BUG_ON(sh->batch_head); 3464 clear_bit(STRIPE_SYNCING, &sh->state); 3465 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3466 wake_up(&conf->wait_for_overlap); 3467 s->syncing = 0; 3468 s->replacing = 0; 3469 /* There is nothing more to do for sync/check/repair. 3470 * Don't even need to abort as that is handled elsewhere 3471 * if needed, and not always wanted e.g. if there is a known 3472 * bad block here. 
3473 * For recover/replace we need to record a bad block on all 3474 * non-sync devices, or abort the recovery 3475 */ 3476 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3477 /* During recovery devices cannot be removed, so 3478 * locking and refcounting of rdevs is not needed 3479 */ 3480 rcu_read_lock(); 3481 for (i = 0; i < conf->raid_disks; i++) { 3482 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3483 if (rdev 3484 && !test_bit(Faulty, &rdev->flags) 3485 && !test_bit(In_sync, &rdev->flags) 3486 && !rdev_set_badblocks(rdev, sh->sector, 3487 STRIPE_SECTORS, 0)) 3488 abort = 1; 3489 rdev = rcu_dereference(conf->disks[i].replacement); 3490 if (rdev 3491 && !test_bit(Faulty, &rdev->flags) 3492 && !test_bit(In_sync, &rdev->flags) 3493 && !rdev_set_badblocks(rdev, sh->sector, 3494 STRIPE_SECTORS, 0)) 3495 abort = 1; 3496 } 3497 rcu_read_unlock(); 3498 if (abort) 3499 conf->recovery_disabled = 3500 conf->mddev->recovery_disabled; 3501 } 3502 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3503 } 3504 3505 static int want_replace(struct stripe_head *sh, int disk_idx) 3506 { 3507 struct md_rdev *rdev; 3508 int rv = 0; 3509 3510 rcu_read_lock(); 3511 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3512 if (rdev 3513 && !test_bit(Faulty, &rdev->flags) 3514 && !test_bit(In_sync, &rdev->flags) 3515 && (rdev->recovery_offset <= sh->sector 3516 || rdev->mddev->recovery_cp <= sh->sector)) 3517 rv = 1; 3518 rcu_read_unlock(); 3519 return rv; 3520 } 3521 3522 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3523 int disk_idx, int disks) 3524 { 3525 struct r5dev *dev = &sh->dev[disk_idx]; 3526 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3527 &sh->dev[s->failed_num[1]] }; 3528 int i; 3529 3530 3531 if (test_bit(R5_LOCKED, &dev->flags) || 3532 test_bit(R5_UPTODATE, &dev->flags)) 3533 /* No point reading this as we already have it or have 3534 * decided to get it. 3535 */ 3536 return 0; 3537 3538 if (dev->toread || 3539 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3540 /* We need this block to directly satisfy a request */ 3541 return 1; 3542 3543 if (s->syncing || s->expanding || 3544 (s->replacing && want_replace(sh, disk_idx))) 3545 /* When syncing, or expanding we read everything. 3546 * When replacing, we need the replaced block. 3547 */ 3548 return 1; 3549 3550 if ((s->failed >= 1 && fdev[0]->toread) || 3551 (s->failed >= 2 && fdev[1]->toread)) 3552 /* If we want to read from a failed device, then 3553 * we need to actually read every other device. 3554 */ 3555 return 1; 3556 3557 /* Sometimes neither read-modify-write nor reconstruct-write 3558 * cycles can work. In those cases we read every block we 3559 * can. Then the parity-update is certain to have enough to 3560 * work with. 3561 * This can only be a problem when we need to write something, 3562 * and some device has failed. If either of those tests 3563 * fail we need look no further. 3564 */ 3565 if (!s->failed || !s->to_write) 3566 return 0; 3567 3568 if (test_bit(R5_Insync, &dev->flags) && 3569 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3570 /* Pre-reads at not permitted until after short delay 3571 * to gather multiple requests. However if this 3572 * device is no Insync, the block could only be computed 3573 * and there is no need to delay that. 
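 * An illustrative case of the "read every block we can" rule above
 * (not from the original source): on a degraded 4-drive RAID5 with a
 * partial write queued for the failed drive, neither rmw (the old
 * data on the failed drive cannot be read) nor a plain
 * reconstruct-write has all of its inputs, so every surviving block
 * is read and the parity update is then certain to have enough to
 * work with.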
3574 */ 3575 return 0; 3576 3577 for (i = 0; i < s->failed && i < 2; i++) { 3578 if (fdev[i]->towrite && 3579 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3580 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3581 /* If we have a partial write to a failed 3582 * device, then we will need to reconstruct 3583 * the content of that device, so all other 3584 * devices must be read. 3585 */ 3586 return 1; 3587 } 3588 3589 /* If we are forced to do a reconstruct-write, either because 3590 * the current RAID6 implementation only supports that, or 3591 * because parity cannot be trusted and we are currently 3592 * recovering it, there is extra need to be careful. 3593 * If one of the devices that we would need to read, because 3594 * it is not being overwritten (and maybe not written at all) 3595 * is missing/faulty, then we need to read everything we can. 3596 */ 3597 if (sh->raid_conf->level != 6 && 3598 sh->sector < sh->raid_conf->mddev->recovery_cp) 3599 /* reconstruct-write isn't being forced */ 3600 return 0; 3601 for (i = 0; i < s->failed && i < 2; i++) { 3602 if (s->failed_num[i] != sh->pd_idx && 3603 s->failed_num[i] != sh->qd_idx && 3604 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3605 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3606 return 1; 3607 } 3608 3609 return 0; 3610 } 3611 3612 /* fetch_block - checks the given member device to see if its data needs 3613 * to be read or computed to satisfy a request. 3614 * 3615 * Returns 1 when no more member devices need to be checked, otherwise returns 3616 * 0 to tell the loop in handle_stripe_fill to continue 3617 */ 3618 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3619 int disk_idx, int disks) 3620 { 3621 struct r5dev *dev = &sh->dev[disk_idx]; 3622 3623 /* is the data in this block needed, and can we get it? */ 3624 if (need_this_block(sh, s, disk_idx, disks)) { 3625 /* we would like to get this block, possibly by computing it, 3626 * otherwise read it if the backing disk is insync 3627 */ 3628 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3629 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3630 BUG_ON(sh->batch_head); 3631 3632 /* 3633 * In the raid6 case if the only non-uptodate disk is P 3634 * then we already trusted P to compute the other failed 3635 * drives. It is safe to compute rather than re-read P. 3636 * In other cases we only compute blocks from failed 3637 * devices, otherwise check/repair might fail to detect 3638 * a real inconsistency. 3639 */ 3640 3641 if ((s->uptodate == disks - 1) && 3642 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || 3643 (s->failed && (disk_idx == s->failed_num[0] || 3644 disk_idx == s->failed_num[1])))) { 3645 /* have disk failed, and we're requested to fetch it; 3646 * do compute it 3647 */ 3648 pr_debug("Computing stripe %llu block %d\n", 3649 (unsigned long long)sh->sector, disk_idx); 3650 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3651 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3652 set_bit(R5_Wantcompute, &dev->flags); 3653 sh->ops.target = disk_idx; 3654 sh->ops.target2 = -1; /* no 2nd target */ 3655 s->req_compute = 1; 3656 /* Careful: from this point on 'uptodate' is in the eye 3657 * of raid_run_ops which services 'compute' operations 3658 * before writes. R5_Wantcompute flags a block that will 3659 * be R5_UPTODATE by the time it is needed for a 3660 * subsequent operation. 
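 * Worked example (not from the original source): with disks == 4 and
 * one failed device, once the three surviving blocks are UPTODATE,
 * s->uptodate equals disks - 1, so the missing block is flagged
 * R5_Wantcompute and counted as uptodate immediately below, and the
 * reconstruct that follows sees a fully populated stripe.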
3661 */ 3662 s->uptodate++; 3663 return 1; 3664 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3665 /* Computing 2-failure is *very* expensive; only 3666 * do it if failed >= 2 3667 */ 3668 int other; 3669 for (other = disks; other--; ) { 3670 if (other == disk_idx) 3671 continue; 3672 if (!test_bit(R5_UPTODATE, 3673 &sh->dev[other].flags)) 3674 break; 3675 } 3676 BUG_ON(other < 0); 3677 pr_debug("Computing stripe %llu blocks %d,%d\n", 3678 (unsigned long long)sh->sector, 3679 disk_idx, other); 3680 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3681 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3682 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3683 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3684 sh->ops.target = disk_idx; 3685 sh->ops.target2 = other; 3686 s->uptodate += 2; 3687 s->req_compute = 1; 3688 return 1; 3689 } else if (test_bit(R5_Insync, &dev->flags)) { 3690 set_bit(R5_LOCKED, &dev->flags); 3691 set_bit(R5_Wantread, &dev->flags); 3692 s->locked++; 3693 pr_debug("Reading block %d (sync=%d)\n", 3694 disk_idx, s->syncing); 3695 } 3696 } 3697 3698 return 0; 3699 } 3700 3701 /** 3702 * handle_stripe_fill - read or compute data to satisfy pending requests. 3703 */ 3704 static void handle_stripe_fill(struct stripe_head *sh, 3705 struct stripe_head_state *s, 3706 int disks) 3707 { 3708 int i; 3709 3710 /* look for blocks to read/compute, skip this if a compute 3711 * is already in flight, or if the stripe contents are in the 3712 * midst of changing due to a write 3713 */ 3714 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3715 !sh->reconstruct_state) { 3716 3717 /* 3718 * For degraded stripe with data in journal, do not handle 3719 * read requests yet, instead, flush the stripe to raid 3720 * disks first, this avoids handling complex rmw of write 3721 * back cache (prexor with orig_page, and then xor with 3722 * page) in the read path 3723 */ 3724 if (s->injournal && s->failed) { 3725 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3726 r5c_make_stripe_write_out(sh); 3727 goto out; 3728 } 3729 3730 for (i = disks; i--; ) 3731 if (fetch_block(sh, s, i, disks)) 3732 break; 3733 } 3734 out: 3735 set_bit(STRIPE_HANDLE, &sh->state); 3736 } 3737 3738 static void break_stripe_batch_list(struct stripe_head *head_sh, 3739 unsigned long handle_flags); 3740 /* handle_stripe_clean_event 3741 * any written block on an uptodate or failed drive can be returned. 3742 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3743 * never LOCKED, so we don't need to test 'failed' directly. 
3744 */ 3745 static void handle_stripe_clean_event(struct r5conf *conf, 3746 struct stripe_head *sh, int disks) 3747 { 3748 int i; 3749 struct r5dev *dev; 3750 int discard_pending = 0; 3751 struct stripe_head *head_sh = sh; 3752 bool do_endio = false; 3753 3754 for (i = disks; i--; ) 3755 if (sh->dev[i].written) { 3756 dev = &sh->dev[i]; 3757 if (!test_bit(R5_LOCKED, &dev->flags) && 3758 (test_bit(R5_UPTODATE, &dev->flags) || 3759 test_bit(R5_Discard, &dev->flags) || 3760 test_bit(R5_SkipCopy, &dev->flags))) { 3761 /* We can return any write requests */ 3762 struct bio *wbi, *wbi2; 3763 pr_debug("Return write for disc %d\n", i); 3764 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3765 clear_bit(R5_UPTODATE, &dev->flags); 3766 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3767 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3768 } 3769 do_endio = true; 3770 3771 returnbi: 3772 dev->page = dev->orig_page; 3773 wbi = dev->written; 3774 dev->written = NULL; 3775 while (wbi && wbi->bi_iter.bi_sector < 3776 dev->sector + STRIPE_SECTORS) { 3777 wbi2 = r5_next_bio(wbi, dev->sector); 3778 md_write_end(conf->mddev); 3779 bio_endio(wbi); 3780 wbi = wbi2; 3781 } 3782 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3783 STRIPE_SECTORS, 3784 !test_bit(STRIPE_DEGRADED, &sh->state), 3785 0); 3786 if (head_sh->batch_head) { 3787 sh = list_first_entry(&sh->batch_list, 3788 struct stripe_head, 3789 batch_list); 3790 if (sh != head_sh) { 3791 dev = &sh->dev[i]; 3792 goto returnbi; 3793 } 3794 } 3795 sh = head_sh; 3796 dev = &sh->dev[i]; 3797 } else if (test_bit(R5_Discard, &dev->flags)) 3798 discard_pending = 1; 3799 } 3800 3801 log_stripe_write_finished(sh); 3802 3803 if (!discard_pending && 3804 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3805 int hash; 3806 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3807 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3808 if (sh->qd_idx >= 0) { 3809 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3810 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3811 } 3812 /* now that discard is done we can proceed with any sync */ 3813 clear_bit(STRIPE_DISCARD, &sh->state); 3814 /* 3815 * SCSI discard will change some bio fields and the stripe has 3816 * no updated data, so remove it from hash list and the stripe 3817 * will be reinitialized 3818 */ 3819 unhash: 3820 hash = sh->hash_lock_index; 3821 spin_lock_irq(conf->hash_locks + hash); 3822 remove_hash(sh); 3823 spin_unlock_irq(conf->hash_locks + hash); 3824 if (head_sh->batch_head) { 3825 sh = list_first_entry(&sh->batch_list, 3826 struct stripe_head, batch_list); 3827 if (sh != head_sh) 3828 goto unhash; 3829 } 3830 sh = head_sh; 3831 3832 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3833 set_bit(STRIPE_HANDLE, &sh->state); 3834 3835 } 3836 3837 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3838 if (atomic_dec_and_test(&conf->pending_full_writes)) 3839 md_wakeup_thread(conf->mddev->thread); 3840 3841 if (head_sh->batch_head && do_endio) 3842 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3843 } 3844 3845 /* 3846 * For RMW in write back cache, we need extra page in prexor to store the 3847 * old data. This page is stored in dev->orig_page. 3848 * 3849 * This function checks whether we have data for prexor. 
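 * (i.e. whether the old, on-disk contents of this block are already available in memory).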
The exact logic 3850 * is: 3851 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3852 */ 3853 static inline bool uptodate_for_rmw(struct r5dev *dev) 3854 { 3855 return (test_bit(R5_UPTODATE, &dev->flags)) && 3856 (!test_bit(R5_InJournal, &dev->flags) || 3857 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3858 } 3859 3860 static int handle_stripe_dirtying(struct r5conf *conf, 3861 struct stripe_head *sh, 3862 struct stripe_head_state *s, 3863 int disks) 3864 { 3865 int rmw = 0, rcw = 0, i; 3866 sector_t recovery_cp = conf->mddev->recovery_cp; 3867 3868 /* Check whether resync is now happening or should start. 3869 * If yes, then the array is dirty (after unclean shutdown or 3870 * initial creation), so parity in some stripes might be inconsistent. 3871 * In this case, we need to always do reconstruct-write, to ensure 3872 * that in case of drive failure or read-error correction, we 3873 * generate correct data from the parity. 3874 */ 3875 if (conf->rmw_level == PARITY_DISABLE_RMW || 3876 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3877 s->failed == 0)) { 3878 /* Calculate the real rcw later - for now make it 3879 * look like rcw is cheaper 3880 */ 3881 rcw = 1; rmw = 2; 3882 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3883 conf->rmw_level, (unsigned long long)recovery_cp, 3884 (unsigned long long)sh->sector); 3885 } else for (i = disks; i--; ) { 3886 /* would I have to read this buffer for read_modify_write */ 3887 struct r5dev *dev = &sh->dev[i]; 3888 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3889 i == sh->pd_idx || i == sh->qd_idx || 3890 test_bit(R5_InJournal, &dev->flags)) && 3891 !test_bit(R5_LOCKED, &dev->flags) && 3892 !(uptodate_for_rmw(dev) || 3893 test_bit(R5_Wantcompute, &dev->flags))) { 3894 if (test_bit(R5_Insync, &dev->flags)) 3895 rmw++; 3896 else 3897 rmw += 2*disks; /* cannot read it */ 3898 } 3899 /* Would I have to read this buffer for reconstruct_write */ 3900 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3901 i != sh->pd_idx && i != sh->qd_idx && 3902 !test_bit(R5_LOCKED, &dev->flags) && 3903 !(test_bit(R5_UPTODATE, &dev->flags) || 3904 test_bit(R5_Wantcompute, &dev->flags))) { 3905 if (test_bit(R5_Insync, &dev->flags)) 3906 rcw++; 3907 else 3908 rcw += 2*disks; 3909 } 3910 } 3911 3912 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 3913 (unsigned long long)sh->sector, sh->state, rmw, rcw); 3914 set_bit(STRIPE_HANDLE, &sh->state); 3915 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3916 /* prefer read-modify-write, but need to get some data */ 3917 if (conf->mddev->queue) 3918 blk_add_trace_msg(conf->mddev->queue, 3919 "raid5 rmw %llu %d", 3920 (unsigned long long)sh->sector, rmw); 3921 for (i = disks; i--; ) { 3922 struct r5dev *dev = &sh->dev[i]; 3923 if (test_bit(R5_InJournal, &dev->flags) && 3924 dev->page == dev->orig_page && 3925 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3926 /* alloc page for prexor */ 3927 struct page *p = alloc_page(GFP_NOIO); 3928 3929 if (p) { 3930 dev->orig_page = p; 3931 continue; 3932 } 3933 3934 /* 3935 * alloc_page() failed, try use 3936 * disk_info->extra_page 3937 */ 3938 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3939 &conf->cache_state)) { 3940 r5c_use_extra_page(sh); 3941 break; 3942 } 3943 3944 /* extra_page in use, add to delayed_list */ 3945 set_bit(STRIPE_DELAYED, &sh->state); 3946 s->waiting_extra_page = 1; 3947 return -EAGAIN; 3948 } 3949 } 3950 3951 for (i = disks; i--; ) { 3952 struct r5dev *dev = 
&sh->dev[i]; 3953 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3954 i == sh->pd_idx || i == sh->qd_idx || 3955 test_bit(R5_InJournal, &dev->flags)) && 3956 !test_bit(R5_LOCKED, &dev->flags) && 3957 !(uptodate_for_rmw(dev) || 3958 test_bit(R5_Wantcompute, &dev->flags)) && 3959 test_bit(R5_Insync, &dev->flags)) { 3960 if (test_bit(STRIPE_PREREAD_ACTIVE, 3961 &sh->state)) { 3962 pr_debug("Read_old block %d for r-m-w\n", 3963 i); 3964 set_bit(R5_LOCKED, &dev->flags); 3965 set_bit(R5_Wantread, &dev->flags); 3966 s->locked++; 3967 } else { 3968 set_bit(STRIPE_DELAYED, &sh->state); 3969 set_bit(STRIPE_HANDLE, &sh->state); 3970 } 3971 } 3972 } 3973 } 3974 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3975 /* want reconstruct write, but need to get some data */ 3976 int qread =0; 3977 rcw = 0; 3978 for (i = disks; i--; ) { 3979 struct r5dev *dev = &sh->dev[i]; 3980 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3981 i != sh->pd_idx && i != sh->qd_idx && 3982 !test_bit(R5_LOCKED, &dev->flags) && 3983 !(test_bit(R5_UPTODATE, &dev->flags) || 3984 test_bit(R5_Wantcompute, &dev->flags))) { 3985 rcw++; 3986 if (test_bit(R5_Insync, &dev->flags) && 3987 test_bit(STRIPE_PREREAD_ACTIVE, 3988 &sh->state)) { 3989 pr_debug("Read_old block " 3990 "%d for Reconstruct\n", i); 3991 set_bit(R5_LOCKED, &dev->flags); 3992 set_bit(R5_Wantread, &dev->flags); 3993 s->locked++; 3994 qread++; 3995 } else { 3996 set_bit(STRIPE_DELAYED, &sh->state); 3997 set_bit(STRIPE_HANDLE, &sh->state); 3998 } 3999 } 4000 } 4001 if (rcw && conf->mddev->queue) 4002 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 4003 (unsigned long long)sh->sector, 4004 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 4005 } 4006 4007 if (rcw > disks && rmw > disks && 4008 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4009 set_bit(STRIPE_DELAYED, &sh->state); 4010 4011 /* now if nothing is locked, and if we have enough data, 4012 * we can start a write request 4013 */ 4014 /* since handle_stripe can be called at any time we need to handle the 4015 * case where a compute block operation has been submitted and then a 4016 * subsequent call wants to start a write request. raid_run_ops only 4017 * handles the case where compute block and reconstruct are requested 4018 * simultaneously. If this is not the case then new writes need to be 4019 * held off until the compute completes. 
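 * Hence the check below: only schedule when we issued the compute in this pass (req_compute) or no compute is currently running.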
4020 */ 4021 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4022 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4023 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4024 schedule_reconstruction(sh, s, rcw == 0, 0); 4025 return 0; 4026 } 4027 4028 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4029 struct stripe_head_state *s, int disks) 4030 { 4031 struct r5dev *dev = NULL; 4032 4033 BUG_ON(sh->batch_head); 4034 set_bit(STRIPE_HANDLE, &sh->state); 4035 4036 switch (sh->check_state) { 4037 case check_state_idle: 4038 /* start a new check operation if there are no failures */ 4039 if (s->failed == 0) { 4040 BUG_ON(s->uptodate != disks); 4041 sh->check_state = check_state_run; 4042 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4043 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4044 s->uptodate--; 4045 break; 4046 } 4047 dev = &sh->dev[s->failed_num[0]]; 4048 /* fall through */ 4049 case check_state_compute_result: 4050 sh->check_state = check_state_idle; 4051 if (!dev) 4052 dev = &sh->dev[sh->pd_idx]; 4053 4054 /* check that a write has not made the stripe insync */ 4055 if (test_bit(STRIPE_INSYNC, &sh->state)) 4056 break; 4057 4058 /* either failed parity check, or recovery is happening */ 4059 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4060 BUG_ON(s->uptodate != disks); 4061 4062 set_bit(R5_LOCKED, &dev->flags); 4063 s->locked++; 4064 set_bit(R5_Wantwrite, &dev->flags); 4065 4066 clear_bit(STRIPE_DEGRADED, &sh->state); 4067 set_bit(STRIPE_INSYNC, &sh->state); 4068 break; 4069 case check_state_run: 4070 break; /* we will be called again upon completion */ 4071 case check_state_check_result: 4072 sh->check_state = check_state_idle; 4073 4074 /* if a failure occurred during the check operation, leave 4075 * STRIPE_INSYNC not set and let the stripe be handled again 4076 */ 4077 if (s->failed) 4078 break; 4079 4080 /* handle a successful check operation, if parity is correct 4081 * we are done. Otherwise update the mismatch count and repair 4082 * parity if !MD_RECOVERY_CHECK 4083 */ 4084 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4085 /* parity is correct (on disc, 4086 * not in buffer any more) 4087 */ 4088 set_bit(STRIPE_INSYNC, &sh->state); 4089 else { 4090 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4091 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4092 /* don't try to repair!! 
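 * A 'check' pass only counts mismatches; parity is rewritten only when MD_RECOVERY_CHECK is clear.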
*/ 4093 set_bit(STRIPE_INSYNC, &sh->state); 4094 pr_warn_ratelimited("%s: mismatch sector in range " 4095 "%llu-%llu\n", mdname(conf->mddev), 4096 (unsigned long long) sh->sector, 4097 (unsigned long long) sh->sector + 4098 STRIPE_SECTORS); 4099 } else { 4100 sh->check_state = check_state_compute_run; 4101 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4102 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4103 set_bit(R5_Wantcompute, 4104 &sh->dev[sh->pd_idx].flags); 4105 sh->ops.target = sh->pd_idx; 4106 sh->ops.target2 = -1; 4107 s->uptodate++; 4108 } 4109 } 4110 break; 4111 case check_state_compute_run: 4112 break; 4113 default: 4114 pr_err("%s: unknown check_state: %d sector: %llu\n", 4115 __func__, sh->check_state, 4116 (unsigned long long) sh->sector); 4117 BUG(); 4118 } 4119 } 4120 4121 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4122 struct stripe_head_state *s, 4123 int disks) 4124 { 4125 int pd_idx = sh->pd_idx; 4126 int qd_idx = sh->qd_idx; 4127 struct r5dev *dev; 4128 4129 BUG_ON(sh->batch_head); 4130 set_bit(STRIPE_HANDLE, &sh->state); 4131 4132 BUG_ON(s->failed > 2); 4133 4134 /* Want to check and possibly repair P and Q. 4135 * However there could be one 'failed' device, in which 4136 * case we can only check one of them, possibly using the 4137 * other to generate missing data 4138 */ 4139 4140 switch (sh->check_state) { 4141 case check_state_idle: 4142 /* start a new check operation if there are < 2 failures */ 4143 if (s->failed == s->q_failed) { 4144 /* The only possible failed device holds Q, so it 4145 * makes sense to check P (If anything else were failed, 4146 * we would have used P to recreate it). 4147 */ 4148 sh->check_state = check_state_run; 4149 } 4150 if (!s->q_failed && s->failed < 2) { 4151 /* Q is not failed, and we didn't use it to generate 4152 * anything, so it makes sense to check it 4153 */ 4154 if (sh->check_state == check_state_run) 4155 sh->check_state = check_state_run_pq; 4156 else 4157 sh->check_state = check_state_run_q; 4158 } 4159 4160 /* discard potentially stale zero_sum_result */ 4161 sh->ops.zero_sum_result = 0; 4162 4163 if (sh->check_state == check_state_run) { 4164 /* async_xor_zero_sum destroys the contents of P */ 4165 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4166 s->uptodate--; 4167 } 4168 if (sh->check_state >= check_state_run && 4169 sh->check_state <= check_state_run_pq) { 4170 /* async_syndrome_zero_sum preserves P and Q, so 4171 * no need to mark them !uptodate here 4172 */ 4173 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4174 break; 4175 } 4176 4177 /* we have 2-disk failure */ 4178 BUG_ON(s->failed != 2); 4179 /* fall through */ 4180 case check_state_compute_result: 4181 sh->check_state = check_state_idle; 4182 4183 /* check that a write has not made the stripe insync */ 4184 if (test_bit(STRIPE_INSYNC, &sh->state)) 4185 break; 4186 4187 /* now write out any block on a failed drive, 4188 * or P or Q if they were recomputed 4189 */ 4190 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 4191 if (s->failed == 2) { 4192 dev = &sh->dev[s->failed_num[1]]; 4193 s->locked++; 4194 set_bit(R5_LOCKED, &dev->flags); 4195 set_bit(R5_Wantwrite, &dev->flags); 4196 } 4197 if (s->failed >= 1) { 4198 dev = &sh->dev[s->failed_num[0]]; 4199 s->locked++; 4200 set_bit(R5_LOCKED, &dev->flags); 4201 set_bit(R5_Wantwrite, &dev->flags); 4202 } 4203 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4204 dev = &sh->dev[pd_idx]; 4205 s->locked++; 4206 set_bit(R5_LOCKED, &dev->flags); 4207 
set_bit(R5_Wantwrite, &dev->flags); 4208 } 4209 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4210 dev = &sh->dev[qd_idx]; 4211 s->locked++; 4212 set_bit(R5_LOCKED, &dev->flags); 4213 set_bit(R5_Wantwrite, &dev->flags); 4214 } 4215 clear_bit(STRIPE_DEGRADED, &sh->state); 4216 4217 set_bit(STRIPE_INSYNC, &sh->state); 4218 break; 4219 case check_state_run: 4220 case check_state_run_q: 4221 case check_state_run_pq: 4222 break; /* we will be called again upon completion */ 4223 case check_state_check_result: 4224 sh->check_state = check_state_idle; 4225 4226 /* handle a successful check operation, if parity is correct 4227 * we are done. Otherwise update the mismatch count and repair 4228 * parity if !MD_RECOVERY_CHECK 4229 */ 4230 if (sh->ops.zero_sum_result == 0) { 4231 /* both parities are correct */ 4232 if (!s->failed) 4233 set_bit(STRIPE_INSYNC, &sh->state); 4234 else { 4235 /* in contrast to the raid5 case we can validate 4236 * parity, but still have a failure to write 4237 * back 4238 */ 4239 sh->check_state = check_state_compute_result; 4240 /* Returning at this point means that we may go 4241 * off and bring p and/or q uptodate again so 4242 * we make sure to check zero_sum_result again 4243 * to verify if p or q need writeback 4244 */ 4245 } 4246 } else { 4247 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4248 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4249 /* don't try to repair!! */ 4250 set_bit(STRIPE_INSYNC, &sh->state); 4251 pr_warn_ratelimited("%s: mismatch sector in range " 4252 "%llu-%llu\n", mdname(conf->mddev), 4253 (unsigned long long) sh->sector, 4254 (unsigned long long) sh->sector + 4255 STRIPE_SECTORS); 4256 } else { 4257 int *target = &sh->ops.target; 4258 4259 sh->ops.target = -1; 4260 sh->ops.target2 = -1; 4261 sh->check_state = check_state_compute_run; 4262 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4263 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4264 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4265 set_bit(R5_Wantcompute, 4266 &sh->dev[pd_idx].flags); 4267 *target = pd_idx; 4268 target = &sh->ops.target2; 4269 s->uptodate++; 4270 } 4271 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4272 set_bit(R5_Wantcompute, 4273 &sh->dev[qd_idx].flags); 4274 *target = qd_idx; 4275 s->uptodate++; 4276 } 4277 } 4278 } 4279 break; 4280 case check_state_compute_run: 4281 break; 4282 default: 4283 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4284 __func__, sh->check_state, 4285 (unsigned long long) sh->sector); 4286 BUG(); 4287 } 4288 } 4289 4290 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4291 { 4292 int i; 4293 4294 /* We have read all the blocks in this stripe and now we need to 4295 * copy some of them into a target stripe for expand. 4296 */ 4297 struct dma_async_tx_descriptor *tx = NULL; 4298 BUG_ON(sh->batch_head); 4299 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4300 for (i = 0; i < sh->disks; i++) 4301 if (i != sh->pd_idx && i != sh->qd_idx) { 4302 int dd_idx, j; 4303 struct stripe_head *sh2; 4304 struct async_submit_ctl submit; 4305 4306 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4307 sector_t s = raid5_compute_sector(conf, bn, 0, 4308 &dd_idx, NULL); 4309 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4310 if (sh2 == NULL) 4311 /* so far only the early blocks of this stripe 4312 * have been requested. 
When later blocks 4313 * get requested, we will try again 4314 */ 4315 continue; 4316 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4317 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4318 /* must have already done this block */ 4319 raid5_release_stripe(sh2); 4320 continue; 4321 } 4322 4323 /* place all the copies on one channel */ 4324 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4325 tx = async_memcpy(sh2->dev[dd_idx].page, 4326 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4327 &submit); 4328 4329 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4330 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4331 for (j = 0; j < conf->raid_disks; j++) 4332 if (j != sh2->pd_idx && 4333 j != sh2->qd_idx && 4334 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4335 break; 4336 if (j == conf->raid_disks) { 4337 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4338 set_bit(STRIPE_HANDLE, &sh2->state); 4339 } 4340 raid5_release_stripe(sh2); 4341 4342 } 4343 /* done submitting copies, wait for them to complete */ 4344 async_tx_quiesce(&tx); 4345 } 4346 4347 /* 4348 * handle_stripe - do things to a stripe. 4349 * 4350 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4351 * state of various bits to see what needs to be done. 4352 * Possible results: 4353 * return some read requests which now have data 4354 * return some write requests which are safely on storage 4355 * schedule a read on some buffers 4356 * schedule a write of some buffers 4357 * return confirmation of parity correctness 4358 * 4359 */ 4360 4361 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4362 { 4363 struct r5conf *conf = sh->raid_conf; 4364 int disks = sh->disks; 4365 struct r5dev *dev; 4366 int i; 4367 int do_recovery = 0; 4368 4369 memset(s, 0, sizeof(*s)); 4370 4371 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4372 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4373 s->failed_num[0] = -1; 4374 s->failed_num[1] = -1; 4375 s->log_failed = r5l_log_disk_error(conf); 4376 4377 /* Now to look around and see what can be done */ 4378 rcu_read_lock(); 4379 for (i=disks; i--; ) { 4380 struct md_rdev *rdev; 4381 sector_t first_bad; 4382 int bad_sectors; 4383 int is_bad = 0; 4384 4385 dev = &sh->dev[i]; 4386 4387 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4388 i, dev->flags, 4389 dev->toread, dev->towrite, dev->written); 4390 /* maybe we can reply to a read 4391 * 4392 * new wantfill requests are only permitted while 4393 * ops_complete_biofill is guaranteed to be inactive 4394 */ 4395 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4396 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4397 set_bit(R5_Wantfill, &dev->flags); 4398 4399 /* now count some things */ 4400 if (test_bit(R5_LOCKED, &dev->flags)) 4401 s->locked++; 4402 if (test_bit(R5_UPTODATE, &dev->flags)) 4403 s->uptodate++; 4404 if (test_bit(R5_Wantcompute, &dev->flags)) { 4405 s->compute++; 4406 BUG_ON(s->compute > 2); 4407 } 4408 4409 if (test_bit(R5_Wantfill, &dev->flags)) 4410 s->to_fill++; 4411 else if (dev->toread) 4412 s->to_read++; 4413 if (dev->towrite) { 4414 s->to_write++; 4415 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4416 s->non_overwrite++; 4417 } 4418 if (dev->written) 4419 s->written++; 4420 /* Prefer to use the replacement for reads, but only 4421 * if it is recovered enough and has no bad blocks. 
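 * Otherwise fall back to the main rdev, noting R5_NeedReplace when a working replacement exists.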
4422 */ 4423 rdev = rcu_dereference(conf->disks[i].replacement); 4424 if (rdev && !test_bit(Faulty, &rdev->flags) && 4425 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4426 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4427 &first_bad, &bad_sectors)) 4428 set_bit(R5_ReadRepl, &dev->flags); 4429 else { 4430 if (rdev && !test_bit(Faulty, &rdev->flags)) 4431 set_bit(R5_NeedReplace, &dev->flags); 4432 else 4433 clear_bit(R5_NeedReplace, &dev->flags); 4434 rdev = rcu_dereference(conf->disks[i].rdev); 4435 clear_bit(R5_ReadRepl, &dev->flags); 4436 } 4437 if (rdev && test_bit(Faulty, &rdev->flags)) 4438 rdev = NULL; 4439 if (rdev) { 4440 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4441 &first_bad, &bad_sectors); 4442 if (s->blocked_rdev == NULL 4443 && (test_bit(Blocked, &rdev->flags) 4444 || is_bad < 0)) { 4445 if (is_bad < 0) 4446 set_bit(BlockedBadBlocks, 4447 &rdev->flags); 4448 s->blocked_rdev = rdev; 4449 atomic_inc(&rdev->nr_pending); 4450 } 4451 } 4452 clear_bit(R5_Insync, &dev->flags); 4453 if (!rdev) 4454 /* Not in-sync */; 4455 else if (is_bad) { 4456 /* also not in-sync */ 4457 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4458 test_bit(R5_UPTODATE, &dev->flags)) { 4459 /* treat as in-sync, but with a read error 4460 * which we can now try to correct 4461 */ 4462 set_bit(R5_Insync, &dev->flags); 4463 set_bit(R5_ReadError, &dev->flags); 4464 } 4465 } else if (test_bit(In_sync, &rdev->flags)) 4466 set_bit(R5_Insync, &dev->flags); 4467 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4468 /* in sync if before recovery_offset */ 4469 set_bit(R5_Insync, &dev->flags); 4470 else if (test_bit(R5_UPTODATE, &dev->flags) && 4471 test_bit(R5_Expanded, &dev->flags)) 4472 /* If we've reshaped into here, we assume it is Insync. 4473 * We will shortly update recovery_offset to make 4474 * it official. 
4475 */ 4476 set_bit(R5_Insync, &dev->flags); 4477 4478 if (test_bit(R5_WriteError, &dev->flags)) { 4479 /* This flag does not apply to '.replacement' 4480 * only to .rdev, so make sure to check that*/ 4481 struct md_rdev *rdev2 = rcu_dereference( 4482 conf->disks[i].rdev); 4483 if (rdev2 == rdev) 4484 clear_bit(R5_Insync, &dev->flags); 4485 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4486 s->handle_bad_blocks = 1; 4487 atomic_inc(&rdev2->nr_pending); 4488 } else 4489 clear_bit(R5_WriteError, &dev->flags); 4490 } 4491 if (test_bit(R5_MadeGood, &dev->flags)) { 4492 /* This flag does not apply to '.replacement' 4493 * only to .rdev, so make sure to check that*/ 4494 struct md_rdev *rdev2 = rcu_dereference( 4495 conf->disks[i].rdev); 4496 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4497 s->handle_bad_blocks = 1; 4498 atomic_inc(&rdev2->nr_pending); 4499 } else 4500 clear_bit(R5_MadeGood, &dev->flags); 4501 } 4502 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4503 struct md_rdev *rdev2 = rcu_dereference( 4504 conf->disks[i].replacement); 4505 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4506 s->handle_bad_blocks = 1; 4507 atomic_inc(&rdev2->nr_pending); 4508 } else 4509 clear_bit(R5_MadeGoodRepl, &dev->flags); 4510 } 4511 if (!test_bit(R5_Insync, &dev->flags)) { 4512 /* The ReadError flag will just be confusing now */ 4513 clear_bit(R5_ReadError, &dev->flags); 4514 clear_bit(R5_ReWrite, &dev->flags); 4515 } 4516 if (test_bit(R5_ReadError, &dev->flags)) 4517 clear_bit(R5_Insync, &dev->flags); 4518 if (!test_bit(R5_Insync, &dev->flags)) { 4519 if (s->failed < 2) 4520 s->failed_num[s->failed] = i; 4521 s->failed++; 4522 if (rdev && !test_bit(Faulty, &rdev->flags)) 4523 do_recovery = 1; 4524 else if (!rdev) { 4525 rdev = rcu_dereference( 4526 conf->disks[i].replacement); 4527 if (rdev && !test_bit(Faulty, &rdev->flags)) 4528 do_recovery = 1; 4529 } 4530 } 4531 4532 if (test_bit(R5_InJournal, &dev->flags)) 4533 s->injournal++; 4534 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4535 s->just_cached++; 4536 } 4537 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4538 /* If there is a failed device being replaced, 4539 * we must be recovering. 4540 * else if we are after recovery_cp, we must be syncing 4541 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4542 * else we can only be replacing 4543 * sync and recovery both need to read all devices, and so 4544 * use the same flag. 4545 */ 4546 if (do_recovery || 4547 sh->sector >= conf->mddev->recovery_cp || 4548 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4549 s->syncing = 1; 4550 else 4551 s->replacing = 1; 4552 } 4553 rcu_read_unlock(); 4554 } 4555 4556 static int clear_batch_ready(struct stripe_head *sh) 4557 { 4558 /* Return '1' if this is a member of batch, or 4559 * '0' if it is a lone stripe or a head which can now be 4560 * handled. 
4561 */ 4562 struct stripe_head *tmp; 4563 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4564 return (sh->batch_head && sh->batch_head != sh); 4565 spin_lock(&sh->stripe_lock); 4566 if (!sh->batch_head) { 4567 spin_unlock(&sh->stripe_lock); 4568 return 0; 4569 } 4570 4571 /* 4572 * this stripe could be added to a batch list before we check 4573 * BATCH_READY, skips it 4574 */ 4575 if (sh->batch_head != sh) { 4576 spin_unlock(&sh->stripe_lock); 4577 return 1; 4578 } 4579 spin_lock(&sh->batch_lock); 4580 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4581 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4582 spin_unlock(&sh->batch_lock); 4583 spin_unlock(&sh->stripe_lock); 4584 4585 /* 4586 * BATCH_READY is cleared, no new stripes can be added. 4587 * batch_list can be accessed without lock 4588 */ 4589 return 0; 4590 } 4591 4592 static void break_stripe_batch_list(struct stripe_head *head_sh, 4593 unsigned long handle_flags) 4594 { 4595 struct stripe_head *sh, *next; 4596 int i; 4597 int do_wakeup = 0; 4598 4599 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4600 4601 list_del_init(&sh->batch_list); 4602 4603 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4604 (1 << STRIPE_SYNCING) | 4605 (1 << STRIPE_REPLACED) | 4606 (1 << STRIPE_DELAYED) | 4607 (1 << STRIPE_BIT_DELAY) | 4608 (1 << STRIPE_FULL_WRITE) | 4609 (1 << STRIPE_BIOFILL_RUN) | 4610 (1 << STRIPE_COMPUTE_RUN) | 4611 (1 << STRIPE_OPS_REQ_PENDING) | 4612 (1 << STRIPE_DISCARD) | 4613 (1 << STRIPE_BATCH_READY) | 4614 (1 << STRIPE_BATCH_ERR) | 4615 (1 << STRIPE_BITMAP_PENDING)), 4616 "stripe state: %lx\n", sh->state); 4617 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4618 (1 << STRIPE_REPLACED)), 4619 "head stripe state: %lx\n", head_sh->state); 4620 4621 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4622 (1 << STRIPE_PREREAD_ACTIVE) | 4623 (1 << STRIPE_DEGRADED) | 4624 (1 << STRIPE_ON_UNPLUG_LIST)), 4625 head_sh->state & (1 << STRIPE_INSYNC)); 4626 4627 sh->check_state = head_sh->check_state; 4628 sh->reconstruct_state = head_sh->reconstruct_state; 4629 spin_lock_irq(&sh->stripe_lock); 4630 sh->batch_head = NULL; 4631 spin_unlock_irq(&sh->stripe_lock); 4632 for (i = 0; i < sh->disks; i++) { 4633 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4634 do_wakeup = 1; 4635 sh->dev[i].flags = head_sh->dev[i].flags & 4636 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4637 } 4638 if (handle_flags == 0 || 4639 sh->state & handle_flags) 4640 set_bit(STRIPE_HANDLE, &sh->state); 4641 raid5_release_stripe(sh); 4642 } 4643 spin_lock_irq(&head_sh->stripe_lock); 4644 head_sh->batch_head = NULL; 4645 spin_unlock_irq(&head_sh->stripe_lock); 4646 for (i = 0; i < head_sh->disks; i++) 4647 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4648 do_wakeup = 1; 4649 if (head_sh->state & handle_flags) 4650 set_bit(STRIPE_HANDLE, &head_sh->state); 4651 4652 if (do_wakeup) 4653 wake_up(&head_sh->raid_conf->wait_for_overlap); 4654 } 4655 4656 static void handle_stripe(struct stripe_head *sh) 4657 { 4658 struct stripe_head_state s; 4659 struct r5conf *conf = sh->raid_conf; 4660 int i; 4661 int prexor; 4662 int disks = sh->disks; 4663 struct r5dev *pdev, *qdev; 4664 4665 clear_bit(STRIPE_HANDLE, &sh->state); 4666 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4667 /* already being handled, ensure it gets handled 4668 * again when current action finishes */ 4669 set_bit(STRIPE_HANDLE, &sh->state); 4670 return; 4671 } 4672 4673 if (clear_batch_ready(sh) ) { 4674 
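/* batch member: the batch head will handle this stripe, just drop STRIPE_ACTIVE */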
clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4675 return; 4676 } 4677 4678 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4679 break_stripe_batch_list(sh, 0); 4680 4681 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4682 spin_lock(&sh->stripe_lock); 4683 /* 4684 * Cannot process 'sync' concurrently with 'discard'. 4685 * Flush data in r5cache before 'sync'. 4686 */ 4687 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 4688 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && 4689 !test_bit(STRIPE_DISCARD, &sh->state) && 4690 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4691 set_bit(STRIPE_SYNCING, &sh->state); 4692 clear_bit(STRIPE_INSYNC, &sh->state); 4693 clear_bit(STRIPE_REPLACED, &sh->state); 4694 } 4695 spin_unlock(&sh->stripe_lock); 4696 } 4697 clear_bit(STRIPE_DELAYED, &sh->state); 4698 4699 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4700 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n", 4701 (unsigned long long)sh->sector, sh->state, 4702 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4703 sh->check_state, sh->reconstruct_state); 4704 4705 analyse_stripe(sh, &s); 4706 4707 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4708 goto finish; 4709 4710 if (s.handle_bad_blocks || 4711 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4712 set_bit(STRIPE_HANDLE, &sh->state); 4713 goto finish; 4714 } 4715 4716 if (unlikely(s.blocked_rdev)) { 4717 if (s.syncing || s.expanding || s.expanded || 4718 s.replacing || s.to_write || s.written) { 4719 set_bit(STRIPE_HANDLE, &sh->state); 4720 goto finish; 4721 } 4722 /* There is nothing for the blocked_rdev to block */ 4723 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4724 s.blocked_rdev = NULL; 4725 } 4726 4727 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4728 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4729 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4730 } 4731 4732 pr_debug("locked=%d uptodate=%d to_read=%d" 4733 " to_write=%d failed=%d failed_num=%d,%d\n", 4734 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4735 s.failed_num[0], s.failed_num[1]); 4736 /* 4737 * check if the array has lost more than max_degraded devices and, 4738 * if so, some requests might need to be failed.
4739 * 4740 * When journal device failed (log_failed), we will only process 4741 * the stripe if there is data need write to raid disks 4742 */ 4743 if (s.failed > conf->max_degraded || 4744 (s.log_failed && s.injournal == 0)) { 4745 sh->check_state = 0; 4746 sh->reconstruct_state = 0; 4747 break_stripe_batch_list(sh, 0); 4748 if (s.to_read+s.to_write+s.written) 4749 handle_failed_stripe(conf, sh, &s, disks); 4750 if (s.syncing + s.replacing) 4751 handle_failed_sync(conf, sh, &s); 4752 } 4753 4754 /* Now we check to see if any write operations have recently 4755 * completed 4756 */ 4757 prexor = 0; 4758 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4759 prexor = 1; 4760 if (sh->reconstruct_state == reconstruct_state_drain_result || 4761 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4762 sh->reconstruct_state = reconstruct_state_idle; 4763 4764 /* All the 'written' buffers and the parity block are ready to 4765 * be written back to disk 4766 */ 4767 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4768 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4769 BUG_ON(sh->qd_idx >= 0 && 4770 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4771 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4772 for (i = disks; i--; ) { 4773 struct r5dev *dev = &sh->dev[i]; 4774 if (test_bit(R5_LOCKED, &dev->flags) && 4775 (i == sh->pd_idx || i == sh->qd_idx || 4776 dev->written || test_bit(R5_InJournal, 4777 &dev->flags))) { 4778 pr_debug("Writing block %d\n", i); 4779 set_bit(R5_Wantwrite, &dev->flags); 4780 if (prexor) 4781 continue; 4782 if (s.failed > 1) 4783 continue; 4784 if (!test_bit(R5_Insync, &dev->flags) || 4785 ((i == sh->pd_idx || i == sh->qd_idx) && 4786 s.failed == 0)) 4787 set_bit(STRIPE_INSYNC, &sh->state); 4788 } 4789 } 4790 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4791 s.dec_preread_active = 1; 4792 } 4793 4794 /* 4795 * might be able to return some write requests if the parity blocks 4796 * are safe, or on a failed drive 4797 */ 4798 pdev = &sh->dev[sh->pd_idx]; 4799 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4800 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4801 qdev = &sh->dev[sh->qd_idx]; 4802 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4803 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4804 || conf->level < 6; 4805 4806 if (s.written && 4807 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4808 && !test_bit(R5_LOCKED, &pdev->flags) 4809 && (test_bit(R5_UPTODATE, &pdev->flags) || 4810 test_bit(R5_Discard, &pdev->flags))))) && 4811 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4812 && !test_bit(R5_LOCKED, &qdev->flags) 4813 && (test_bit(R5_UPTODATE, &qdev->flags) || 4814 test_bit(R5_Discard, &qdev->flags)))))) 4815 handle_stripe_clean_event(conf, sh, disks); 4816 4817 if (s.just_cached) 4818 r5c_handle_cached_data_endio(conf, sh, disks); 4819 log_stripe_write_finished(sh); 4820 4821 /* Now we might consider reading some blocks, either to check/generate 4822 * parity, or to satisfy requests 4823 * or to load a block that is being partially written. 
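 * handle_stripe_fill() below walks each device and decides whether a needed block should be read or computed.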
4824 */ 4825 if (s.to_read || s.non_overwrite 4826 || (conf->level == 6 && s.to_write && s.failed) 4827 || (s.syncing && (s.uptodate + s.compute < disks)) 4828 || s.replacing 4829 || s.expanding) 4830 handle_stripe_fill(sh, &s, disks); 4831 4832 /* 4833 * When the stripe finishes full journal write cycle (write to journal 4834 * and raid disk), this is the clean up procedure so it is ready for 4835 * next operation. 4836 */ 4837 r5c_finish_stripe_write_out(conf, sh, &s); 4838 4839 /* 4840 * Now to consider new write requests, cache write back and what else, 4841 * if anything should be read. We do not handle new writes when: 4842 * 1/ A 'write' operation (copy+xor) is already in flight. 4843 * 2/ A 'check' operation is in flight, as it may clobber the parity 4844 * block. 4845 * 3/ A r5c cache log write is in flight. 4846 */ 4847 4848 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4849 if (!r5c_is_writeback(conf->log)) { 4850 if (s.to_write) 4851 handle_stripe_dirtying(conf, sh, &s, disks); 4852 } else { /* write back cache */ 4853 int ret = 0; 4854 4855 /* First, try handle writes in caching phase */ 4856 if (s.to_write) 4857 ret = r5c_try_caching_write(conf, sh, &s, 4858 disks); 4859 /* 4860 * If caching phase failed: ret == -EAGAIN 4861 * OR 4862 * stripe under reclaim: !caching && injournal 4863 * 4864 * fall back to handle_stripe_dirtying() 4865 */ 4866 if (ret == -EAGAIN || 4867 /* stripe under reclaim: !caching && injournal */ 4868 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4869 s.injournal > 0)) { 4870 ret = handle_stripe_dirtying(conf, sh, &s, 4871 disks); 4872 if (ret == -EAGAIN) 4873 goto finish; 4874 } 4875 } 4876 } 4877 4878 /* maybe we need to check and possibly fix the parity for this stripe 4879 * Any reads will already have been scheduled, so we just see if enough 4880 * data is available. The parity check is held off while parity 4881 * dependent operations are in flight. 
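 * (i.e. while blocks are still locked or STRIPE_COMPUTE_RUN is set).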
4882 */ 4883 if (sh->check_state || 4884 (s.syncing && s.locked == 0 && 4885 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4886 !test_bit(STRIPE_INSYNC, &sh->state))) { 4887 if (conf->level == 6) 4888 handle_parity_checks6(conf, sh, &s, disks); 4889 else 4890 handle_parity_checks5(conf, sh, &s, disks); 4891 } 4892 4893 if ((s.replacing || s.syncing) && s.locked == 0 4894 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4895 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4896 /* Write out to replacement devices where possible */ 4897 for (i = 0; i < conf->raid_disks; i++) 4898 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4899 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4900 set_bit(R5_WantReplace, &sh->dev[i].flags); 4901 set_bit(R5_LOCKED, &sh->dev[i].flags); 4902 s.locked++; 4903 } 4904 if (s.replacing) 4905 set_bit(STRIPE_INSYNC, &sh->state); 4906 set_bit(STRIPE_REPLACED, &sh->state); 4907 } 4908 if ((s.syncing || s.replacing) && s.locked == 0 && 4909 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4910 test_bit(STRIPE_INSYNC, &sh->state)) { 4911 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4912 clear_bit(STRIPE_SYNCING, &sh->state); 4913 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4914 wake_up(&conf->wait_for_overlap); 4915 } 4916 4917 /* If the failed drives are just a ReadError, then we might need 4918 * to progress the repair/check process 4919 */ 4920 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4921 for (i = 0; i < s.failed; i++) { 4922 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4923 if (test_bit(R5_ReadError, &dev->flags) 4924 && !test_bit(R5_LOCKED, &dev->flags) 4925 && test_bit(R5_UPTODATE, &dev->flags) 4926 ) { 4927 if (!test_bit(R5_ReWrite, &dev->flags)) { 4928 set_bit(R5_Wantwrite, &dev->flags); 4929 set_bit(R5_ReWrite, &dev->flags); 4930 set_bit(R5_LOCKED, &dev->flags); 4931 s.locked++; 4932 } else { 4933 /* let's read it back */ 4934 set_bit(R5_Wantread, &dev->flags); 4935 set_bit(R5_LOCKED, &dev->flags); 4936 s.locked++; 4937 } 4938 } 4939 } 4940 4941 /* Finish reconstruct operations initiated by the expansion process */ 4942 if (sh->reconstruct_state == reconstruct_state_result) { 4943 struct stripe_head *sh_src 4944 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4945 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4946 /* sh cannot be written until sh_src has been read. 
4947 * so arrange for sh to be delayed a little 4948 */ 4949 set_bit(STRIPE_DELAYED, &sh->state); 4950 set_bit(STRIPE_HANDLE, &sh->state); 4951 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4952 &sh_src->state)) 4953 atomic_inc(&conf->preread_active_stripes); 4954 raid5_release_stripe(sh_src); 4955 goto finish; 4956 } 4957 if (sh_src) 4958 raid5_release_stripe(sh_src); 4959 4960 sh->reconstruct_state = reconstruct_state_idle; 4961 clear_bit(STRIPE_EXPANDING, &sh->state); 4962 for (i = conf->raid_disks; i--; ) { 4963 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4964 set_bit(R5_LOCKED, &sh->dev[i].flags); 4965 s.locked++; 4966 } 4967 } 4968 4969 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4970 !sh->reconstruct_state) { 4971 /* Need to write out all blocks after computing parity */ 4972 sh->disks = conf->raid_disks; 4973 stripe_set_idx(sh->sector, conf, 0, sh); 4974 schedule_reconstruction(sh, &s, 1, 1); 4975 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4976 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4977 atomic_dec(&conf->reshape_stripes); 4978 wake_up(&conf->wait_for_overlap); 4979 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4980 } 4981 4982 if (s.expanding && s.locked == 0 && 4983 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4984 handle_stripe_expansion(conf, sh); 4985 4986 finish: 4987 /* wait for this device to become unblocked */ 4988 if (unlikely(s.blocked_rdev)) { 4989 if (conf->mddev->external) 4990 md_wait_for_blocked_rdev(s.blocked_rdev, 4991 conf->mddev); 4992 else 4993 /* Internal metadata will immediately 4994 * be written by raid5d, so we don't 4995 * need to wait here. 4996 */ 4997 rdev_dec_pending(s.blocked_rdev, 4998 conf->mddev); 4999 } 5000 5001 if (s.handle_bad_blocks) 5002 for (i = disks; i--; ) { 5003 struct md_rdev *rdev; 5004 struct r5dev *dev = &sh->dev[i]; 5005 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 5006 /* We own a safe reference to the rdev */ 5007 rdev = conf->disks[i].rdev; 5008 if (!rdev_set_badblocks(rdev, sh->sector, 5009 STRIPE_SECTORS, 0)) 5010 md_error(conf->mddev, rdev); 5011 rdev_dec_pending(rdev, conf->mddev); 5012 } 5013 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 5014 rdev = conf->disks[i].rdev; 5015 rdev_clear_badblocks(rdev, sh->sector, 5016 STRIPE_SECTORS, 0); 5017 rdev_dec_pending(rdev, conf->mddev); 5018 } 5019 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 5020 rdev = conf->disks[i].replacement; 5021 if (!rdev) 5022 /* rdev have been moved down */ 5023 rdev = conf->disks[i].rdev; 5024 rdev_clear_badblocks(rdev, sh->sector, 5025 STRIPE_SECTORS, 0); 5026 rdev_dec_pending(rdev, conf->mddev); 5027 } 5028 } 5029 5030 if (s.ops_request) 5031 raid_run_ops(sh, s.ops_request); 5032 5033 ops_run_io(sh, &s); 5034 5035 if (s.dec_preread_active) { 5036 /* We delay this until after ops_run_io so that if make_request 5037 * is waiting on a flush, it won't continue until the writes 5038 * have actually been submitted. 
5039 */ 5040 atomic_dec(&conf->preread_active_stripes); 5041 if (atomic_read(&conf->preread_active_stripes) < 5042 IO_THRESHOLD) 5043 md_wakeup_thread(conf->mddev->thread); 5044 } 5045 5046 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5047 } 5048 5049 static void raid5_activate_delayed(struct r5conf *conf) 5050 { 5051 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5052 while (!list_empty(&conf->delayed_list)) { 5053 struct list_head *l = conf->delayed_list.next; 5054 struct stripe_head *sh; 5055 sh = list_entry(l, struct stripe_head, lru); 5056 list_del_init(l); 5057 clear_bit(STRIPE_DELAYED, &sh->state); 5058 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5059 atomic_inc(&conf->preread_active_stripes); 5060 list_add_tail(&sh->lru, &conf->hold_list); 5061 raid5_wakeup_stripe_thread(sh); 5062 } 5063 } 5064 } 5065 5066 static void activate_bit_delay(struct r5conf *conf, 5067 struct list_head *temp_inactive_list) 5068 { 5069 /* device_lock is held */ 5070 struct list_head head; 5071 list_add(&head, &conf->bitmap_list); 5072 list_del_init(&conf->bitmap_list); 5073 while (!list_empty(&head)) { 5074 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5075 int hash; 5076 list_del_init(&sh->lru); 5077 atomic_inc(&sh->count); 5078 hash = sh->hash_lock_index; 5079 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5080 } 5081 } 5082 5083 static int raid5_congested(struct mddev *mddev, int bits) 5084 { 5085 struct r5conf *conf = mddev->private; 5086 5087 /* No difference between reads and writes. Just check 5088 * how busy the stripe_cache is 5089 */ 5090 5091 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 5092 return 1; 5093 5094 /* Also checks whether there is pressure on r5cache log space */ 5095 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 5096 return 1; 5097 if (conf->quiesce) 5098 return 1; 5099 if (atomic_read(&conf->empty_inactive_list_nr)) 5100 return 1; 5101 5102 return 0; 5103 } 5104 5105 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5106 { 5107 struct r5conf *conf = mddev->private; 5108 sector_t sector = bio->bi_iter.bi_sector; 5109 unsigned int chunk_sectors; 5110 unsigned int bio_sectors = bio_sectors(bio); 5111 5112 WARN_ON_ONCE(bio->bi_partno); 5113 5114 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5115 return chunk_sectors >= 5116 ((sector & (chunk_sectors - 1)) + bio_sectors); 5117 } 5118 5119 /* 5120 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5121 * later sampled by raid5d. 
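 * The list is singly linked through bi_next, so pushing at the head only needs device_lock.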
5122 */ 5123 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 5124 { 5125 unsigned long flags; 5126 5127 spin_lock_irqsave(&conf->device_lock, flags); 5128 5129 bi->bi_next = conf->retry_read_aligned_list; 5130 conf->retry_read_aligned_list = bi; 5131 5132 spin_unlock_irqrestore(&conf->device_lock, flags); 5133 md_wakeup_thread(conf->mddev->thread); 5134 } 5135 5136 static struct bio *remove_bio_from_retry(struct r5conf *conf, 5137 unsigned int *offset) 5138 { 5139 struct bio *bi; 5140 5141 bi = conf->retry_read_aligned; 5142 if (bi) { 5143 *offset = conf->retry_read_offset; 5144 conf->retry_read_aligned = NULL; 5145 return bi; 5146 } 5147 bi = conf->retry_read_aligned_list; 5148 if(bi) { 5149 conf->retry_read_aligned_list = bi->bi_next; 5150 bi->bi_next = NULL; 5151 *offset = 0; 5152 } 5153 5154 return bi; 5155 } 5156 5157 /* 5158 * The "raid5_align_endio" should check if the read succeeded and if it 5159 * did, call bio_endio on the original bio (having bio_put the new bio 5160 * first). 5161 * If the read failed.. 5162 */ 5163 static void raid5_align_endio(struct bio *bi) 5164 { 5165 struct bio* raid_bi = bi->bi_private; 5166 struct mddev *mddev; 5167 struct r5conf *conf; 5168 struct md_rdev *rdev; 5169 blk_status_t error = bi->bi_status; 5170 5171 bio_put(bi); 5172 5173 rdev = (void*)raid_bi->bi_next; 5174 raid_bi->bi_next = NULL; 5175 mddev = rdev->mddev; 5176 conf = mddev->private; 5177 5178 rdev_dec_pending(rdev, conf->mddev); 5179 5180 if (!error) { 5181 bio_endio(raid_bi); 5182 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5183 wake_up(&conf->wait_for_quiescent); 5184 return; 5185 } 5186 5187 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5188 5189 add_bio_to_retry(raid_bi, conf); 5190 } 5191 5192 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5193 { 5194 struct r5conf *conf = mddev->private; 5195 int dd_idx; 5196 struct bio* align_bi; 5197 struct md_rdev *rdev; 5198 sector_t end_sector; 5199 5200 if (!in_chunk_boundary(mddev, raid_bio)) { 5201 pr_debug("%s: non aligned\n", __func__); 5202 return 0; 5203 } 5204 /* 5205 * use bio_clone_fast to make a copy of the bio 5206 */ 5207 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); 5208 if (!align_bi) 5209 return 0; 5210 /* 5211 * set bi_end_io to a new function, and set bi_private to the 5212 * original bio. 
5213 */ 5214 align_bi->bi_end_io = raid5_align_endio; 5215 align_bi->bi_private = raid_bio; 5216 /* 5217 * compute position 5218 */ 5219 align_bi->bi_iter.bi_sector = 5220 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5221 0, &dd_idx, NULL); 5222 5223 end_sector = bio_end_sector(align_bi); 5224 rcu_read_lock(); 5225 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5226 if (!rdev || test_bit(Faulty, &rdev->flags) || 5227 rdev->recovery_offset < end_sector) { 5228 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5229 if (rdev && 5230 (test_bit(Faulty, &rdev->flags) || 5231 !(test_bit(In_sync, &rdev->flags) || 5232 rdev->recovery_offset >= end_sector))) 5233 rdev = NULL; 5234 } 5235 5236 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { 5237 rcu_read_unlock(); 5238 bio_put(align_bi); 5239 return 0; 5240 } 5241 5242 if (rdev) { 5243 sector_t first_bad; 5244 int bad_sectors; 5245 5246 atomic_inc(&rdev->nr_pending); 5247 rcu_read_unlock(); 5248 raid_bio->bi_next = (void*)rdev; 5249 bio_set_dev(align_bi, rdev->bdev); 5250 bio_clear_flag(align_bi, BIO_SEG_VALID); 5251 5252 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5253 bio_sectors(align_bi), 5254 &first_bad, &bad_sectors)) { 5255 bio_put(align_bi); 5256 rdev_dec_pending(rdev, mddev); 5257 return 0; 5258 } 5259 5260 /* No reshape active, so we can trust rdev->data_offset */ 5261 align_bi->bi_iter.bi_sector += rdev->data_offset; 5262 5263 spin_lock_irq(&conf->device_lock); 5264 wait_event_lock_irq(conf->wait_for_quiescent, 5265 conf->quiesce == 0, 5266 conf->device_lock); 5267 atomic_inc(&conf->active_aligned_reads); 5268 spin_unlock_irq(&conf->device_lock); 5269 5270 if (mddev->gendisk) 5271 trace_block_bio_remap(align_bi->bi_disk->queue, 5272 align_bi, disk_devt(mddev->gendisk), 5273 raid_bio->bi_iter.bi_sector); 5274 generic_make_request(align_bi); 5275 return 1; 5276 } else { 5277 rcu_read_unlock(); 5278 bio_put(align_bi); 5279 return 0; 5280 } 5281 } 5282 5283 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5284 { 5285 struct bio *split; 5286 sector_t sector = raid_bio->bi_iter.bi_sector; 5287 unsigned chunk_sects = mddev->chunk_sectors; 5288 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5289 5290 if (sectors < bio_sectors(raid_bio)) { 5291 struct r5conf *conf = mddev->private; 5292 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); 5293 bio_chain(split, raid_bio); 5294 generic_make_request(raid_bio); 5295 raid_bio = split; 5296 } 5297 5298 if (!raid5_read_one_chunk(mddev, raid_bio)) 5299 return raid_bio; 5300 5301 return NULL; 5302 } 5303 5304 /* __get_priority_stripe - get the next stripe to process 5305 * 5306 * Full stripe writes are allowed to pass preread active stripes up until 5307 * the bypass_threshold is exceeded. In general the bypass_count 5308 * increments when the handle_list is handled before the hold_list; however, it 5309 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5310 * stripe with in flight i/o. The bypass_count will be reset when the 5311 * head of the hold_list has changed, i.e. the head was promoted to the 5312 * handle_list. 
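 * A stripe is taken from the hold_list instead once bypass_count exceeds bypass_threshold, or when there are no pending full-stripe writes.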
5313 */ 5314 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5315 { 5316 struct stripe_head *sh, *tmp; 5317 struct list_head *handle_list = NULL; 5318 struct r5worker_group *wg; 5319 bool second_try = !r5c_is_writeback(conf->log) && 5320 !r5l_log_disk_error(conf); 5321 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || 5322 r5l_log_disk_error(conf); 5323 5324 again: 5325 wg = NULL; 5326 sh = NULL; 5327 if (conf->worker_cnt_per_group == 0) { 5328 handle_list = try_loprio ? &conf->loprio_list : 5329 &conf->handle_list; 5330 } else if (group != ANY_GROUP) { 5331 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5332 &conf->worker_groups[group].handle_list; 5333 wg = &conf->worker_groups[group]; 5334 } else { 5335 int i; 5336 for (i = 0; i < conf->group_cnt; i++) { 5337 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5338 &conf->worker_groups[i].handle_list; 5339 wg = &conf->worker_groups[i]; 5340 if (!list_empty(handle_list)) 5341 break; 5342 } 5343 } 5344 5345 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5346 __func__, 5347 list_empty(handle_list) ? "empty" : "busy", 5348 list_empty(&conf->hold_list) ? "empty" : "busy", 5349 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5350 5351 if (!list_empty(handle_list)) { 5352 sh = list_entry(handle_list->next, typeof(*sh), lru); 5353 5354 if (list_empty(&conf->hold_list)) 5355 conf->bypass_count = 0; 5356 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5357 if (conf->hold_list.next == conf->last_hold) 5358 conf->bypass_count++; 5359 else { 5360 conf->last_hold = conf->hold_list.next; 5361 conf->bypass_count -= conf->bypass_threshold; 5362 if (conf->bypass_count < 0) 5363 conf->bypass_count = 0; 5364 } 5365 } 5366 } else if (!list_empty(&conf->hold_list) && 5367 ((conf->bypass_threshold && 5368 conf->bypass_count > conf->bypass_threshold) || 5369 atomic_read(&conf->pending_full_writes) == 0)) { 5370 5371 list_for_each_entry(tmp, &conf->hold_list, lru) { 5372 if (conf->worker_cnt_per_group == 0 || 5373 group == ANY_GROUP || 5374 !cpu_online(tmp->cpu) || 5375 cpu_to_group(tmp->cpu) == group) { 5376 sh = tmp; 5377 break; 5378 } 5379 } 5380 5381 if (sh) { 5382 conf->bypass_count -= conf->bypass_threshold; 5383 if (conf->bypass_count < 0) 5384 conf->bypass_count = 0; 5385 } 5386 wg = NULL; 5387 } 5388 5389 if (!sh) { 5390 if (second_try) 5391 return NULL; 5392 second_try = true; 5393 try_loprio = !try_loprio; 5394 goto again; 5395 } 5396 5397 if (wg) { 5398 wg->stripes_cnt--; 5399 sh->group = NULL; 5400 } 5401 list_del_init(&sh->lru); 5402 BUG_ON(atomic_inc_return(&sh->count) != 1); 5403 return sh; 5404 } 5405 5406 struct raid5_plug_cb { 5407 struct blk_plug_cb cb; 5408 struct list_head list; 5409 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5410 }; 5411 5412 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5413 { 5414 struct raid5_plug_cb *cb = container_of( 5415 blk_cb, struct raid5_plug_cb, cb); 5416 struct stripe_head *sh; 5417 struct mddev *mddev = cb->cb.data; 5418 struct r5conf *conf = mddev->private; 5419 int cnt = 0; 5420 int hash; 5421 5422 if (cb->list.next && !list_empty(&cb->list)) { 5423 spin_lock_irq(&conf->device_lock); 5424 while (!list_empty(&cb->list)) { 5425 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5426 list_del_init(&sh->lru); 5427 /* 5428 * avoid race release_stripe_plug() sees 5429 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5430 * is still in our 
list 5431 */ 5432 smp_mb__before_atomic(); 5433 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5434 /* 5435 * STRIPE_ON_RELEASE_LIST could be set here. In that 5436 * case, the count is always > 1 here 5437 */ 5438 hash = sh->hash_lock_index; 5439 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5440 cnt++; 5441 } 5442 spin_unlock_irq(&conf->device_lock); 5443 } 5444 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5445 NR_STRIPE_HASH_LOCKS); 5446 if (mddev->queue) 5447 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5448 kfree(cb); 5449 } 5450 5451 static void release_stripe_plug(struct mddev *mddev, 5452 struct stripe_head *sh) 5453 { 5454 struct blk_plug_cb *blk_cb = blk_check_plugged( 5455 raid5_unplug, mddev, 5456 sizeof(struct raid5_plug_cb)); 5457 struct raid5_plug_cb *cb; 5458 5459 if (!blk_cb) { 5460 raid5_release_stripe(sh); 5461 return; 5462 } 5463 5464 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5465 5466 if (cb->list.next == NULL) { 5467 int i; 5468 INIT_LIST_HEAD(&cb->list); 5469 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5470 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5471 } 5472 5473 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5474 list_add_tail(&sh->lru, &cb->list); 5475 else 5476 raid5_release_stripe(sh); 5477 } 5478 5479 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5480 { 5481 struct r5conf *conf = mddev->private; 5482 sector_t logical_sector, last_sector; 5483 struct stripe_head *sh; 5484 int stripe_sectors; 5485 5486 if (mddev->reshape_position != MaxSector) 5487 /* Skip discard while reshape is happening */ 5488 return; 5489 5490 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5491 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5492 5493 bi->bi_next = NULL; 5494 5495 stripe_sectors = conf->chunk_sectors * 5496 (conf->raid_disks - conf->max_degraded); 5497 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5498 stripe_sectors); 5499 sector_div(last_sector, stripe_sectors); 5500 5501 logical_sector *= conf->chunk_sectors; 5502 last_sector *= conf->chunk_sectors; 5503 5504 for (; logical_sector < last_sector; 5505 logical_sector += STRIPE_SECTORS) { 5506 DEFINE_WAIT(w); 5507 int d; 5508 again: 5509 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5510 prepare_to_wait(&conf->wait_for_overlap, &w, 5511 TASK_UNINTERRUPTIBLE); 5512 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5513 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5514 raid5_release_stripe(sh); 5515 schedule(); 5516 goto again; 5517 } 5518 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5519 spin_lock_irq(&sh->stripe_lock); 5520 for (d = 0; d < conf->raid_disks; d++) { 5521 if (d == sh->pd_idx || d == sh->qd_idx) 5522 continue; 5523 if (sh->dev[d].towrite || sh->dev[d].toread) { 5524 set_bit(R5_Overlap, &sh->dev[d].flags); 5525 spin_unlock_irq(&sh->stripe_lock); 5526 raid5_release_stripe(sh); 5527 schedule(); 5528 goto again; 5529 } 5530 } 5531 set_bit(STRIPE_DISCARD, &sh->state); 5532 finish_wait(&conf->wait_for_overlap, &w); 5533 sh->overwrite_disks = 0; 5534 for (d = 0; d < conf->raid_disks; d++) { 5535 if (d == sh->pd_idx || d == sh->qd_idx) 5536 continue; 5537 sh->dev[d].towrite = bi; 5538 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5539 bio_inc_remaining(bi); 5540 md_write_inc(mddev, bi); 5541 sh->overwrite_disks++; 5542 } 5543 spin_unlock_irq(&sh->stripe_lock); 5544 if (conf->mddev->bitmap) { 5545 for (d = 0; 5546 d < conf->raid_disks - conf->max_degraded; 5547 d++) 5548 
md_bitmap_startwrite(mddev->bitmap, 5549 sh->sector, 5550 STRIPE_SECTORS, 5551 0); 5552 sh->bm_seq = conf->seq_flush + 1; 5553 set_bit(STRIPE_BIT_DELAY, &sh->state); 5554 } 5555 5556 set_bit(STRIPE_HANDLE, &sh->state); 5557 clear_bit(STRIPE_DELAYED, &sh->state); 5558 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5559 atomic_inc(&conf->preread_active_stripes); 5560 release_stripe_plug(mddev, sh); 5561 } 5562 5563 bio_endio(bi); 5564 } 5565 5566 static bool raid5_make_request(struct mddev *mddev, struct bio * bi) 5567 { 5568 struct r5conf *conf = mddev->private; 5569 int dd_idx; 5570 sector_t new_sector; 5571 sector_t logical_sector, last_sector; 5572 struct stripe_head *sh; 5573 const int rw = bio_data_dir(bi); 5574 DEFINE_WAIT(w); 5575 bool do_prepare; 5576 bool do_flush = false; 5577 5578 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5579 int ret = log_handle_flush_request(conf, bi); 5580 5581 if (ret == 0) 5582 return true; 5583 if (ret == -ENODEV) { 5584 md_flush_request(mddev, bi); 5585 return true; 5586 } 5587 /* ret == -EAGAIN, fallback */ 5588 /* 5589 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5590 * we need to flush journal device 5591 */ 5592 do_flush = bi->bi_opf & REQ_PREFLUSH; 5593 } 5594 5595 if (!md_write_start(mddev, bi)) 5596 return false; 5597 /* 5598 * If array is degraded, better not do chunk aligned read because 5599 * later we might have to read it again in order to reconstruct 5600 * data on failed drives. 5601 */ 5602 if (rw == READ && mddev->degraded == 0 && 5603 mddev->reshape_position == MaxSector) { 5604 bi = chunk_aligned_read(mddev, bi); 5605 if (!bi) 5606 return true; 5607 } 5608 5609 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5610 make_discard_request(mddev, bi); 5611 md_write_end(mddev); 5612 return true; 5613 } 5614 5615 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5616 last_sector = bio_end_sector(bi); 5617 bi->bi_next = NULL; 5618 5619 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5620 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5621 int previous; 5622 int seq; 5623 5624 do_prepare = false; 5625 retry: 5626 seq = read_seqcount_begin(&conf->gen_lock); 5627 previous = 0; 5628 if (do_prepare) 5629 prepare_to_wait(&conf->wait_for_overlap, &w, 5630 TASK_UNINTERRUPTIBLE); 5631 if (unlikely(conf->reshape_progress != MaxSector)) { 5632 /* spinlock is needed as reshape_progress may be 5633 * 64bit on a 32bit platform, and so it might be 5634 * possible to see a half-updated value 5635 * Of course reshape_progress could change after 5636 * the lock is dropped, so once we get a reference 5637 * to the stripe that we think it is, we will have 5638 * to check again. 5639 */ 5640 spin_lock_irq(&conf->device_lock); 5641 if (mddev->reshape_backwards 5642 ? logical_sector < conf->reshape_progress 5643 : logical_sector >= conf->reshape_progress) { 5644 previous = 1; 5645 } else { 5646 if (mddev->reshape_backwards 5647 ? 
logical_sector < conf->reshape_safe 5648 : logical_sector >= conf->reshape_safe) { 5649 spin_unlock_irq(&conf->device_lock); 5650 schedule(); 5651 do_prepare = true; 5652 goto retry; 5653 } 5654 } 5655 spin_unlock_irq(&conf->device_lock); 5656 } 5657 5658 new_sector = raid5_compute_sector(conf, logical_sector, 5659 previous, 5660 &dd_idx, NULL); 5661 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5662 (unsigned long long)new_sector, 5663 (unsigned long long)logical_sector); 5664 5665 sh = raid5_get_active_stripe(conf, new_sector, previous, 5666 (bi->bi_opf & REQ_RAHEAD), 0); 5667 if (sh) { 5668 if (unlikely(previous)) { 5669 /* expansion might have moved on while waiting for a 5670 * stripe, so we must do the range check again. 5671 * Expansion could still move past after this 5672 * test, but as we are holding a reference to 5673 * 'sh', we know that if that happens, 5674 * STRIPE_EXPANDING will get set and the expansion 5675 * won't proceed until we finish with the stripe. 5676 */ 5677 int must_retry = 0; 5678 spin_lock_irq(&conf->device_lock); 5679 if (mddev->reshape_backwards 5680 ? logical_sector >= conf->reshape_progress 5681 : logical_sector < conf->reshape_progress) 5682 /* mismatch, need to try again */ 5683 must_retry = 1; 5684 spin_unlock_irq(&conf->device_lock); 5685 if (must_retry) { 5686 raid5_release_stripe(sh); 5687 schedule(); 5688 do_prepare = true; 5689 goto retry; 5690 } 5691 } 5692 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5693 /* Might have got the wrong stripe_head 5694 * by accident 5695 */ 5696 raid5_release_stripe(sh); 5697 goto retry; 5698 } 5699 5700 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5701 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5702 /* Stripe is busy expanding or 5703 * add failed due to overlap. Flush everything 5704 * and wait a while 5705 */ 5706 md_wakeup_thread(mddev->thread); 5707 raid5_release_stripe(sh); 5708 schedule(); 5709 do_prepare = true; 5710 goto retry; 5711 } 5712 if (do_flush) { 5713 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5714 /* we only need flush for one stripe */ 5715 do_flush = false; 5716 } 5717 5718 set_bit(STRIPE_HANDLE, &sh->state); 5719 clear_bit(STRIPE_DELAYED, &sh->state); 5720 if ((!sh->batch_head || sh == sh->batch_head) && 5721 (bi->bi_opf & REQ_SYNC) && 5722 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5723 atomic_inc(&conf->preread_active_stripes); 5724 release_stripe_plug(mddev, sh); 5725 } else { 5726 /* cannot get stripe for read-ahead, just give-up */ 5727 bi->bi_status = BLK_STS_IOERR; 5728 break; 5729 } 5730 } 5731 finish_wait(&conf->wait_for_overlap, &w); 5732 5733 if (rw == WRITE) 5734 md_write_end(mddev); 5735 bio_endio(bi); 5736 return true; 5737 } 5738 5739 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5740 5741 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5742 { 5743 /* reshaping is quite different to recovery/resync so it is 5744 * handled quite separately ... here. 5745 * 5746 * On each call to sync_request, we gather one chunk worth of 5747 * destination stripes and flag them as expanding. 5748 * Then we find all the source stripes and request reads. 5749 * As the reads complete, handle_stripe will copy the data 5750 * into the destination stripe and release that stripe. 
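 * As a rough illustration (example numbers only, not taken from any
 * particular array): growing a 4-drive RAID5 (3 data disks) to 5 drives
 * (4 data disks) with 1024-sector chunks gives reshape_sectors == 1024,
 * so each call marks one chunk's worth of destination stripes
 * STRIPE_EXPANDING, schedules reads from the matching range of the old
 * 3-data-disk layout, and advances reshape_progress by
 * reshape_sectors * new_data_disks == 4096 array sectors.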
5751 */ 5752 struct r5conf *conf = mddev->private; 5753 struct stripe_head *sh; 5754 struct md_rdev *rdev; 5755 sector_t first_sector, last_sector; 5756 int raid_disks = conf->previous_raid_disks; 5757 int data_disks = raid_disks - conf->max_degraded; 5758 int new_data_disks = conf->raid_disks - conf->max_degraded; 5759 int i; 5760 int dd_idx; 5761 sector_t writepos, readpos, safepos; 5762 sector_t stripe_addr; 5763 int reshape_sectors; 5764 struct list_head stripes; 5765 sector_t retn; 5766 5767 if (sector_nr == 0) { 5768 /* If restarting in the middle, skip the initial sectors */ 5769 if (mddev->reshape_backwards && 5770 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5771 sector_nr = raid5_size(mddev, 0, 0) 5772 - conf->reshape_progress; 5773 } else if (mddev->reshape_backwards && 5774 conf->reshape_progress == MaxSector) { 5775 /* shouldn't happen, but just in case, finish up.*/ 5776 sector_nr = MaxSector; 5777 } else if (!mddev->reshape_backwards && 5778 conf->reshape_progress > 0) 5779 sector_nr = conf->reshape_progress; 5780 sector_div(sector_nr, new_data_disks); 5781 if (sector_nr) { 5782 mddev->curr_resync_completed = sector_nr; 5783 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5784 *skipped = 1; 5785 retn = sector_nr; 5786 goto finish; 5787 } 5788 } 5789 5790 /* We need to process a full chunk at a time. 5791 * If old and new chunk sizes differ, we need to process the 5792 * largest of these 5793 */ 5794 5795 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5796 5797 /* We update the metadata at least every 10 seconds, or when 5798 * the data about to be copied would over-write the source of 5799 * the data at the front of the range. i.e. one new_stripe 5800 * along from reshape_progress new_maps to after where 5801 * reshape_safe old_maps to 5802 */ 5803 writepos = conf->reshape_progress; 5804 sector_div(writepos, new_data_disks); 5805 readpos = conf->reshape_progress; 5806 sector_div(readpos, data_disks); 5807 safepos = conf->reshape_safe; 5808 sector_div(safepos, data_disks); 5809 if (mddev->reshape_backwards) { 5810 BUG_ON(writepos < reshape_sectors); 5811 writepos -= reshape_sectors; 5812 readpos += reshape_sectors; 5813 safepos += reshape_sectors; 5814 } else { 5815 writepos += reshape_sectors; 5816 /* readpos and safepos are worst-case calculations. 5817 * A negative number is overly pessimistic, and causes 5818 * obvious problems for unsigned storage. So clip to 0. 5819 */ 5820 readpos -= min_t(sector_t, reshape_sectors, readpos); 5821 safepos -= min_t(sector_t, reshape_sectors, safepos); 5822 } 5823 5824 /* Having calculated the 'writepos' possibly use it 5825 * to set 'stripe_addr' which is where we will write to. 5826 */ 5827 if (mddev->reshape_backwards) { 5828 BUG_ON(conf->reshape_progress == 0); 5829 stripe_addr = writepos; 5830 BUG_ON((mddev->dev_sectors & 5831 ~((sector_t)reshape_sectors - 1)) 5832 - reshape_sectors - stripe_addr 5833 != sector_nr); 5834 } else { 5835 BUG_ON(writepos != sector_nr + reshape_sectors); 5836 stripe_addr = sector_nr; 5837 } 5838 5839 /* 'writepos' is the most advanced device address we might write. 5840 * 'readpos' is the least advanced device address we might read. 5841 * 'safepos' is the least address recorded in the metadata as having 5842 * been reshaped. 5843 * If there is a min_offset_diff, these are adjusted either by 5844 * increasing the safepos/readpos if diff is negative, or 5845 * increasing writepos if diff is positive. 
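 * A worked example of the arithmetic above (illustrative numbers only,
 * with min_offset_diff taken as 0): growing a 4-drive RAID5 (3 data disks)
 * to 5 drives (4 data disks) with 1024-sector chunks, at
 * reshape_progress == reshape_safe == 12288 array sectors we get
 * writepos == 12288/4 + 1024 == 4096, readpos == 12288/3 - 1024 == 3072
 * and safepos == 12288/3 - 1024 == 3072.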
5846 * If 'readpos' is then behind 'writepos', there is no way that we can 5847 * ensure safety in the face of a crash - that must be done by userspace 5848 * making a backup of the data. So in that case there is no particular 5849 * rush to update metadata. 5850 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5851 * update the metadata to advance 'safepos' to match 'readpos' so that 5852 * we can be safe in the event of a crash. 5853 * So we insist on updating metadata if safepos is behind writepos and 5854 * readpos is beyond writepos. 5855 * In any case, update the metadata every 10 seconds. 5856 * Maybe that number should be configurable, but I'm not sure it is 5857 * worth it.... maybe it could be a multiple of safemode_delay??? 5858 */ 5859 if (conf->min_offset_diff < 0) { 5860 safepos += -conf->min_offset_diff; 5861 readpos += -conf->min_offset_diff; 5862 } else 5863 writepos += conf->min_offset_diff; 5864 5865 if ((mddev->reshape_backwards 5866 ? (safepos > writepos && readpos < writepos) 5867 : (safepos < writepos && readpos > writepos)) || 5868 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5869 /* Cannot proceed until we've updated the superblock... */ 5870 wait_event(conf->wait_for_overlap, 5871 atomic_read(&conf->reshape_stripes)==0 5872 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5873 if (atomic_read(&conf->reshape_stripes) != 0) 5874 return 0; 5875 mddev->reshape_position = conf->reshape_progress; 5876 mddev->curr_resync_completed = sector_nr; 5877 if (!mddev->reshape_backwards) 5878 /* Can update recovery_offset */ 5879 rdev_for_each(rdev, mddev) 5880 if (rdev->raid_disk >= 0 && 5881 !test_bit(Journal, &rdev->flags) && 5882 !test_bit(In_sync, &rdev->flags) && 5883 rdev->recovery_offset < sector_nr) 5884 rdev->recovery_offset = sector_nr; 5885 5886 conf->reshape_checkpoint = jiffies; 5887 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5888 md_wakeup_thread(mddev->thread); 5889 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5890 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5891 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5892 return 0; 5893 spin_lock_irq(&conf->device_lock); 5894 conf->reshape_safe = mddev->reshape_position; 5895 spin_unlock_irq(&conf->device_lock); 5896 wake_up(&conf->wait_for_overlap); 5897 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5898 } 5899 5900 INIT_LIST_HEAD(&stripes); 5901 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5902 int j; 5903 int skipped_disk = 0; 5904 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5905 set_bit(STRIPE_EXPANDING, &sh->state); 5906 atomic_inc(&conf->reshape_stripes); 5907 /* If any of this stripe is beyond the end of the old 5908 * array, then we need to zero those blocks 5909 */ 5910 for (j=sh->disks; j--;) { 5911 sector_t s; 5912 if (j == sh->pd_idx) 5913 continue; 5914 if (conf->level == 6 && 5915 j == sh->qd_idx) 5916 continue; 5917 s = raid5_compute_blocknr(sh, j, 0); 5918 if (s < raid5_size(mddev, 0, 0)) { 5919 skipped_disk = 1; 5920 continue; 5921 } 5922 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5923 set_bit(R5_Expanded, &sh->dev[j].flags); 5924 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5925 } 5926 if (!skipped_disk) { 5927 set_bit(STRIPE_EXPAND_READY, &sh->state); 5928 set_bit(STRIPE_HANDLE, &sh->state); 5929 } 5930 list_add(&sh->lru, &stripes); 5931 } 5932 spin_lock_irq(&conf->device_lock); 5933 if (mddev->reshape_backwards) 5934 conf->reshape_progress -= reshape_sectors * new_data_disks; 5935 else 5936 conf->reshape_progress 
+= reshape_sectors * new_data_disks; 5937 spin_unlock_irq(&conf->device_lock); 5938 /* Ok, those stripe are ready. We can start scheduling 5939 * reads on the source stripes. 5940 * The source stripes are determined by mapping the first and last 5941 * block on the destination stripes. 5942 */ 5943 first_sector = 5944 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5945 1, &dd_idx, NULL); 5946 last_sector = 5947 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5948 * new_data_disks - 1), 5949 1, &dd_idx, NULL); 5950 if (last_sector >= mddev->dev_sectors) 5951 last_sector = mddev->dev_sectors - 1; 5952 while (first_sector <= last_sector) { 5953 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5954 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5955 set_bit(STRIPE_HANDLE, &sh->state); 5956 raid5_release_stripe(sh); 5957 first_sector += STRIPE_SECTORS; 5958 } 5959 /* Now that the sources are clearly marked, we can release 5960 * the destination stripes 5961 */ 5962 while (!list_empty(&stripes)) { 5963 sh = list_entry(stripes.next, struct stripe_head, lru); 5964 list_del_init(&sh->lru); 5965 raid5_release_stripe(sh); 5966 } 5967 /* If this takes us to the resync_max point where we have to pause, 5968 * then we need to write out the superblock. 5969 */ 5970 sector_nr += reshape_sectors; 5971 retn = reshape_sectors; 5972 finish: 5973 if (mddev->curr_resync_completed > mddev->resync_max || 5974 (sector_nr - mddev->curr_resync_completed) * 2 5975 >= mddev->resync_max - mddev->curr_resync_completed) { 5976 /* Cannot proceed until we've updated the superblock... */ 5977 wait_event(conf->wait_for_overlap, 5978 atomic_read(&conf->reshape_stripes) == 0 5979 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5980 if (atomic_read(&conf->reshape_stripes) != 0) 5981 goto ret; 5982 mddev->reshape_position = conf->reshape_progress; 5983 mddev->curr_resync_completed = sector_nr; 5984 if (!mddev->reshape_backwards) 5985 /* Can update recovery_offset */ 5986 rdev_for_each(rdev, mddev) 5987 if (rdev->raid_disk >= 0 && 5988 !test_bit(Journal, &rdev->flags) && 5989 !test_bit(In_sync, &rdev->flags) && 5990 rdev->recovery_offset < sector_nr) 5991 rdev->recovery_offset = sector_nr; 5992 conf->reshape_checkpoint = jiffies; 5993 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5994 md_wakeup_thread(mddev->thread); 5995 wait_event(mddev->sb_wait, 5996 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 5997 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5998 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5999 goto ret; 6000 spin_lock_irq(&conf->device_lock); 6001 conf->reshape_safe = mddev->reshape_position; 6002 spin_unlock_irq(&conf->device_lock); 6003 wake_up(&conf->wait_for_overlap); 6004 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6005 } 6006 ret: 6007 return retn; 6008 } 6009 6010 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 6011 int *skipped) 6012 { 6013 struct r5conf *conf = mddev->private; 6014 struct stripe_head *sh; 6015 sector_t max_sector = mddev->dev_sectors; 6016 sector_t sync_blocks; 6017 int still_degraded = 0; 6018 int i; 6019 6020 if (sector_nr >= max_sector) { 6021 /* just being told to finish up .. 
nothing much to do */ 6022 6023 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 6024 end_reshape(conf); 6025 return 0; 6026 } 6027 6028 if (mddev->curr_resync < max_sector) /* aborted */ 6029 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 6030 &sync_blocks, 1); 6031 else /* completed sync */ 6032 conf->fullsync = 0; 6033 md_bitmap_close_sync(mddev->bitmap); 6034 6035 return 0; 6036 } 6037 6038 /* Allow raid5_quiesce to complete */ 6039 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 6040 6041 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6042 return reshape_request(mddev, sector_nr, skipped); 6043 6044 /* No need to check resync_max as we never do more than one 6045 * stripe, and as resync_max will always be on a chunk boundary, 6046 * if the check in md_do_sync didn't fire, there is no chance 6047 * of overstepping resync_max here 6048 */ 6049 6050 /* if there are too many failed drives and we are trying 6051 * to resync, then assert that we are finished, because there is 6052 * nothing we can do. 6053 */ 6054 if (mddev->degraded >= conf->max_degraded && 6055 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6056 sector_t rv = mddev->dev_sectors - sector_nr; 6057 *skipped = 1; 6058 return rv; 6059 } 6060 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6061 !conf->fullsync && 6062 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 6063 sync_blocks >= STRIPE_SECTORS) { 6064 /* we can skip this block, and probably more */ 6065 sync_blocks /= STRIPE_SECTORS; 6066 *skipped = 1; 6067 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 6068 } 6069 6070 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6071 6072 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 6073 if (sh == NULL) { 6074 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 6075 /* make sure we don't swamp the stripe cache if someone else 6076 * is trying to get access 6077 */ 6078 schedule_timeout_uninterruptible(1); 6079 } 6080 /* Need to check if array will still be degraded after recovery/resync 6081 * Note in case of > 1 drive failures it's possible we're rebuilding 6082 * one drive while leaving another faulty drive in the array. 6083 */ 6084 rcu_read_lock(); 6085 for (i = 0; i < conf->raid_disks; i++) { 6086 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); 6087 6088 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6089 still_degraded = 1; 6090 } 6091 rcu_read_unlock(); 6092 6093 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 6094 6095 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6096 set_bit(STRIPE_HANDLE, &sh->state); 6097 6098 raid5_release_stripe(sh); 6099 6100 return STRIPE_SECTORS; 6101 } 6102 6103 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, 6104 unsigned int offset) 6105 { 6106 /* We may not be able to submit a whole bio at once as there 6107 * may not be enough stripe_heads available. 6108 * We cannot pre-allocate enough stripe_heads as we may need 6109 * more than exist in the cache (if we allow ever larger chunks). 6110 * So we do one stripe head at a time and record in 6111 * conf->retry_read_offset how many have been done. 6112 * 6113 * We *know* that this entire raid_bio is in one chunk, so 6114 * it will use only one 'dd_idx' and need only one call to raid5_compute_sector.
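 * As an illustration (assuming 4 KiB pages, so STRIPE_SECTORS == 8, and a
 * 1 MiB chunk): the bio can span up to 256 stripe heads; if only the first
 * 40 can be obtained, scnt == 40 is saved in conf->retry_read_offset and
 * the next retry skips straight to stripe 40.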
6115 */ 6116 struct stripe_head *sh; 6117 int dd_idx; 6118 sector_t sector, logical_sector, last_sector; 6119 int scnt = 0; 6120 int handled = 0; 6121 6122 logical_sector = raid_bio->bi_iter.bi_sector & 6123 ~((sector_t)STRIPE_SECTORS-1); 6124 sector = raid5_compute_sector(conf, logical_sector, 6125 0, &dd_idx, NULL); 6126 last_sector = bio_end_sector(raid_bio); 6127 6128 for (; logical_sector < last_sector; 6129 logical_sector += STRIPE_SECTORS, 6130 sector += STRIPE_SECTORS, 6131 scnt++) { 6132 6133 if (scnt < offset) 6134 /* already done this stripe */ 6135 continue; 6136 6137 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 6138 6139 if (!sh) { 6140 /* failed to get a stripe - must wait */ 6141 conf->retry_read_aligned = raid_bio; 6142 conf->retry_read_offset = scnt; 6143 return handled; 6144 } 6145 6146 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6147 raid5_release_stripe(sh); 6148 conf->retry_read_aligned = raid_bio; 6149 conf->retry_read_offset = scnt; 6150 return handled; 6151 } 6152 6153 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6154 handle_stripe(sh); 6155 raid5_release_stripe(sh); 6156 handled++; 6157 } 6158 6159 bio_endio(raid_bio); 6160 6161 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6162 wake_up(&conf->wait_for_quiescent); 6163 return handled; 6164 } 6165 6166 static int handle_active_stripes(struct r5conf *conf, int group, 6167 struct r5worker *worker, 6168 struct list_head *temp_inactive_list) 6169 { 6170 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6171 int i, batch_size = 0, hash; 6172 bool release_inactive = false; 6173 6174 while (batch_size < MAX_STRIPE_BATCH && 6175 (sh = __get_priority_stripe(conf, group)) != NULL) 6176 batch[batch_size++] = sh; 6177 6178 if (batch_size == 0) { 6179 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6180 if (!list_empty(temp_inactive_list + i)) 6181 break; 6182 if (i == NR_STRIPE_HASH_LOCKS) { 6183 spin_unlock_irq(&conf->device_lock); 6184 log_flush_stripe_to_raid(conf); 6185 spin_lock_irq(&conf->device_lock); 6186 return batch_size; 6187 } 6188 release_inactive = true; 6189 } 6190 spin_unlock_irq(&conf->device_lock); 6191 6192 release_inactive_stripe_list(conf, temp_inactive_list, 6193 NR_STRIPE_HASH_LOCKS); 6194 6195 r5l_flush_stripe_to_raid(conf->log); 6196 if (release_inactive) { 6197 spin_lock_irq(&conf->device_lock); 6198 return 0; 6199 } 6200 6201 for (i = 0; i < batch_size; i++) 6202 handle_stripe(batch[i]); 6203 log_write_stripe_run(conf); 6204 6205 cond_resched(); 6206 6207 spin_lock_irq(&conf->device_lock); 6208 for (i = 0; i < batch_size; i++) { 6209 hash = batch[i]->hash_lock_index; 6210 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6211 } 6212 return batch_size; 6213 } 6214 6215 static void raid5_do_work(struct work_struct *work) 6216 { 6217 struct r5worker *worker = container_of(work, struct r5worker, work); 6218 struct r5worker_group *group = worker->group; 6219 struct r5conf *conf = group->conf; 6220 struct mddev *mddev = conf->mddev; 6221 int group_id = group - conf->worker_groups; 6222 int handled; 6223 struct blk_plug plug; 6224 6225 pr_debug("+++ raid5worker active\n"); 6226 6227 blk_start_plug(&plug); 6228 handled = 0; 6229 spin_lock_irq(&conf->device_lock); 6230 while (1) { 6231 int batch_size, released; 6232 6233 released = release_stripe_list(conf, worker->temp_inactive_list); 6234 6235 batch_size = handle_active_stripes(conf, group_id, worker, 6236 worker->temp_inactive_list); 6237 worker->working = false; 6238 if (!batch_size && !released) 6239 break; 6240 handled += 
batch_size; 6241 wait_event_lock_irq(mddev->sb_wait, 6242 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6243 conf->device_lock); 6244 } 6245 pr_debug("%d stripes handled\n", handled); 6246 6247 spin_unlock_irq(&conf->device_lock); 6248 6249 flush_deferred_bios(conf); 6250 6251 r5l_flush_stripe_to_raid(conf->log); 6252 6253 async_tx_issue_pending_all(); 6254 blk_finish_plug(&plug); 6255 6256 pr_debug("--- raid5worker inactive\n"); 6257 } 6258 6259 /* 6260 * This is our raid5 kernel thread. 6261 * 6262 * We scan the hash table for stripes which can be handled now. 6263 * During the scan, completed stripes are saved for us by the interrupt 6264 * handler, so that they will not have to wait for our next wakeup. 6265 */ 6266 static void raid5d(struct md_thread *thread) 6267 { 6268 struct mddev *mddev = thread->mddev; 6269 struct r5conf *conf = mddev->private; 6270 int handled; 6271 struct blk_plug plug; 6272 6273 pr_debug("+++ raid5d active\n"); 6274 6275 md_check_recovery(mddev); 6276 6277 blk_start_plug(&plug); 6278 handled = 0; 6279 spin_lock_irq(&conf->device_lock); 6280 while (1) { 6281 struct bio *bio; 6282 int batch_size, released; 6283 unsigned int offset; 6284 6285 released = release_stripe_list(conf, conf->temp_inactive_list); 6286 if (released) 6287 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6288 6289 if ( 6290 !list_empty(&conf->bitmap_list)) { 6291 /* Now is a good time to flush some bitmap updates */ 6292 conf->seq_flush++; 6293 spin_unlock_irq(&conf->device_lock); 6294 md_bitmap_unplug(mddev->bitmap); 6295 spin_lock_irq(&conf->device_lock); 6296 conf->seq_write = conf->seq_flush; 6297 activate_bit_delay(conf, conf->temp_inactive_list); 6298 } 6299 raid5_activate_delayed(conf); 6300 6301 while ((bio = remove_bio_from_retry(conf, &offset))) { 6302 int ok; 6303 spin_unlock_irq(&conf->device_lock); 6304 ok = retry_aligned_read(conf, bio, offset); 6305 spin_lock_irq(&conf->device_lock); 6306 if (!ok) 6307 break; 6308 handled++; 6309 } 6310 6311 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6312 conf->temp_inactive_list); 6313 if (!batch_size && !released) 6314 break; 6315 handled += batch_size; 6316 6317 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6318 spin_unlock_irq(&conf->device_lock); 6319 md_check_recovery(mddev); 6320 spin_lock_irq(&conf->device_lock); 6321 } 6322 } 6323 pr_debug("%d stripes handled\n", handled); 6324 6325 spin_unlock_irq(&conf->device_lock); 6326 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6327 mutex_trylock(&conf->cache_size_mutex)) { 6328 grow_one_stripe(conf, __GFP_NOWARN); 6329 /* Set flag even if allocation failed. 
This helps 6330 * slow down allocation requests when mem is short 6331 */ 6332 set_bit(R5_DID_ALLOC, &conf->cache_state); 6333 mutex_unlock(&conf->cache_size_mutex); 6334 } 6335 6336 flush_deferred_bios(conf); 6337 6338 r5l_flush_stripe_to_raid(conf->log); 6339 6340 async_tx_issue_pending_all(); 6341 blk_finish_plug(&plug); 6342 6343 pr_debug("--- raid5d inactive\n"); 6344 } 6345 6346 static ssize_t 6347 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6348 { 6349 struct r5conf *conf; 6350 int ret = 0; 6351 spin_lock(&mddev->lock); 6352 conf = mddev->private; 6353 if (conf) 6354 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6355 spin_unlock(&mddev->lock); 6356 return ret; 6357 } 6358 6359 int 6360 raid5_set_cache_size(struct mddev *mddev, int size) 6361 { 6362 int result = 0; 6363 struct r5conf *conf = mddev->private; 6364 6365 if (size <= 16 || size > 32768) 6366 return -EINVAL; 6367 6368 conf->min_nr_stripes = size; 6369 mutex_lock(&conf->cache_size_mutex); 6370 while (size < conf->max_nr_stripes && 6371 drop_one_stripe(conf)) 6372 ; 6373 mutex_unlock(&conf->cache_size_mutex); 6374 6375 md_allow_write(mddev); 6376 6377 mutex_lock(&conf->cache_size_mutex); 6378 while (size > conf->max_nr_stripes) 6379 if (!grow_one_stripe(conf, GFP_KERNEL)) { 6380 conf->min_nr_stripes = conf->max_nr_stripes; 6381 result = -ENOMEM; 6382 break; 6383 } 6384 mutex_unlock(&conf->cache_size_mutex); 6385 6386 return result; 6387 } 6388 EXPORT_SYMBOL(raid5_set_cache_size); 6389 6390 static ssize_t 6391 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6392 { 6393 struct r5conf *conf; 6394 unsigned long new; 6395 int err; 6396 6397 if (len >= PAGE_SIZE) 6398 return -EINVAL; 6399 if (kstrtoul(page, 10, &new)) 6400 return -EINVAL; 6401 err = mddev_lock(mddev); 6402 if (err) 6403 return err; 6404 conf = mddev->private; 6405 if (!conf) 6406 err = -ENODEV; 6407 else 6408 err = raid5_set_cache_size(mddev, new); 6409 mddev_unlock(mddev); 6410 6411 return err ?: len; 6412 } 6413 6414 static struct md_sysfs_entry 6415 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6416 raid5_show_stripe_cache_size, 6417 raid5_store_stripe_cache_size); 6418 6419 static ssize_t 6420 raid5_show_rmw_level(struct mddev *mddev, char *page) 6421 { 6422 struct r5conf *conf = mddev->private; 6423 if (conf) 6424 return sprintf(page, "%d\n", conf->rmw_level); 6425 else 6426 return 0; 6427 } 6428 6429 static ssize_t 6430 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6431 { 6432 struct r5conf *conf = mddev->private; 6433 unsigned long new; 6434 6435 if (!conf) 6436 return -ENODEV; 6437 6438 if (len >= PAGE_SIZE) 6439 return -EINVAL; 6440 6441 if (kstrtoul(page, 10, &new)) 6442 return -EINVAL; 6443 6444 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6445 return -EINVAL; 6446 6447 if (new != PARITY_DISABLE_RMW && 6448 new != PARITY_ENABLE_RMW && 6449 new != PARITY_PREFER_RMW) 6450 return -EINVAL; 6451 6452 conf->rmw_level = new; 6453 return len; 6454 } 6455 6456 static struct md_sysfs_entry 6457 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6458 raid5_show_rmw_level, 6459 raid5_store_rmw_level); 6460 6461 6462 static ssize_t 6463 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6464 { 6465 struct r5conf *conf; 6466 int ret = 0; 6467 spin_lock(&mddev->lock); 6468 conf = mddev->private; 6469 if (conf) 6470 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6471 spin_unlock(&mddev->lock); 6472 return ret; 6473 } 6474 6475 
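/*
 * Illustrative tuning from userspace (the sysfs paths assume the usual md
 * layout for an array named md0; the values are examples only):
 *
 *   echo 4096 > /sys/block/md0/md/stripe_cache_size
 *   cat /sys/block/md0/md/stripe_cache_active
 *   echo 8 > /sys/block/md0/md/preread_bypass_threshold
 *
 * raid5_set_cache_size() rejects sizes of 16 or less and anything above
 * 32768, and preread_bypass_threshold may not exceed stripe_cache_size.
 */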
static ssize_t 6476 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6477 { 6478 struct r5conf *conf; 6479 unsigned long new; 6480 int err; 6481 6482 if (len >= PAGE_SIZE) 6483 return -EINVAL; 6484 if (kstrtoul(page, 10, &new)) 6485 return -EINVAL; 6486 6487 err = mddev_lock(mddev); 6488 if (err) 6489 return err; 6490 conf = mddev->private; 6491 if (!conf) 6492 err = -ENODEV; 6493 else if (new > conf->min_nr_stripes) 6494 err = -EINVAL; 6495 else 6496 conf->bypass_threshold = new; 6497 mddev_unlock(mddev); 6498 return err ?: len; 6499 } 6500 6501 static struct md_sysfs_entry 6502 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6503 S_IRUGO | S_IWUSR, 6504 raid5_show_preread_threshold, 6505 raid5_store_preread_threshold); 6506 6507 static ssize_t 6508 raid5_show_skip_copy(struct mddev *mddev, char *page) 6509 { 6510 struct r5conf *conf; 6511 int ret = 0; 6512 spin_lock(&mddev->lock); 6513 conf = mddev->private; 6514 if (conf) 6515 ret = sprintf(page, "%d\n", conf->skip_copy); 6516 spin_unlock(&mddev->lock); 6517 return ret; 6518 } 6519 6520 static ssize_t 6521 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6522 { 6523 struct r5conf *conf; 6524 unsigned long new; 6525 int err; 6526 6527 if (len >= PAGE_SIZE) 6528 return -EINVAL; 6529 if (kstrtoul(page, 10, &new)) 6530 return -EINVAL; 6531 new = !!new; 6532 6533 err = mddev_lock(mddev); 6534 if (err) 6535 return err; 6536 conf = mddev->private; 6537 if (!conf) 6538 err = -ENODEV; 6539 else if (new != conf->skip_copy) { 6540 mddev_suspend(mddev); 6541 conf->skip_copy = new; 6542 if (new) 6543 mddev->queue->backing_dev_info->capabilities |= 6544 BDI_CAP_STABLE_WRITES; 6545 else 6546 mddev->queue->backing_dev_info->capabilities &= 6547 ~BDI_CAP_STABLE_WRITES; 6548 mddev_resume(mddev); 6549 } 6550 mddev_unlock(mddev); 6551 return err ?: len; 6552 } 6553 6554 static struct md_sysfs_entry 6555 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6556 raid5_show_skip_copy, 6557 raid5_store_skip_copy); 6558 6559 static ssize_t 6560 stripe_cache_active_show(struct mddev *mddev, char *page) 6561 { 6562 struct r5conf *conf = mddev->private; 6563 if (conf) 6564 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6565 else 6566 return 0; 6567 } 6568 6569 static struct md_sysfs_entry 6570 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6571 6572 static ssize_t 6573 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6574 { 6575 struct r5conf *conf; 6576 int ret = 0; 6577 spin_lock(&mddev->lock); 6578 conf = mddev->private; 6579 if (conf) 6580 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6581 spin_unlock(&mddev->lock); 6582 return ret; 6583 } 6584 6585 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6586 int *group_cnt, 6587 int *worker_cnt_per_group, 6588 struct r5worker_group **worker_groups); 6589 static ssize_t 6590 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6591 { 6592 struct r5conf *conf; 6593 unsigned int new; 6594 int err; 6595 struct r5worker_group *new_groups, *old_groups; 6596 int group_cnt, worker_cnt_per_group; 6597 6598 if (len >= PAGE_SIZE) 6599 return -EINVAL; 6600 if (kstrtouint(page, 10, &new)) 6601 return -EINVAL; 6602 /* 8192 should be big enough */ 6603 if (new > 8192) 6604 return -EINVAL; 6605 6606 err = mddev_lock(mddev); 6607 if (err) 6608 return err; 6609 conf = mddev->private; 6610 if (!conf) 6611 err = -ENODEV; 6612 else if (new != conf->worker_cnt_per_group) 
{ 6613 mddev_suspend(mddev); 6614 6615 old_groups = conf->worker_groups; 6616 if (old_groups) 6617 flush_workqueue(raid5_wq); 6618 6619 err = alloc_thread_groups(conf, new, 6620 &group_cnt, &worker_cnt_per_group, 6621 &new_groups); 6622 if (!err) { 6623 spin_lock_irq(&conf->device_lock); 6624 conf->group_cnt = group_cnt; 6625 conf->worker_cnt_per_group = worker_cnt_per_group; 6626 conf->worker_groups = new_groups; 6627 spin_unlock_irq(&conf->device_lock); 6628 6629 if (old_groups) 6630 kfree(old_groups[0].workers); 6631 kfree(old_groups); 6632 } 6633 mddev_resume(mddev); 6634 } 6635 mddev_unlock(mddev); 6636 6637 return err ?: len; 6638 } 6639 6640 static struct md_sysfs_entry 6641 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6642 raid5_show_group_thread_cnt, 6643 raid5_store_group_thread_cnt); 6644 6645 static struct attribute *raid5_attrs[] = { 6646 &raid5_stripecache_size.attr, 6647 &raid5_stripecache_active.attr, 6648 &raid5_preread_bypass_threshold.attr, 6649 &raid5_group_thread_cnt.attr, 6650 &raid5_skip_copy.attr, 6651 &raid5_rmw_level.attr, 6652 &r5c_journal_mode.attr, 6653 &ppl_write_hint.attr, 6654 NULL, 6655 }; 6656 static struct attribute_group raid5_attrs_group = { 6657 .name = NULL, 6658 .attrs = raid5_attrs, 6659 }; 6660 6661 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6662 int *group_cnt, 6663 int *worker_cnt_per_group, 6664 struct r5worker_group **worker_groups) 6665 { 6666 int i, j, k; 6667 ssize_t size; 6668 struct r5worker *workers; 6669 6670 *worker_cnt_per_group = cnt; 6671 if (cnt == 0) { 6672 *group_cnt = 0; 6673 *worker_groups = NULL; 6674 return 0; 6675 } 6676 *group_cnt = num_possible_nodes(); 6677 size = sizeof(struct r5worker) * cnt; 6678 workers = kcalloc(size, *group_cnt, GFP_NOIO); 6679 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group), 6680 GFP_NOIO); 6681 if (!*worker_groups || !workers) { 6682 kfree(workers); 6683 kfree(*worker_groups); 6684 return -ENOMEM; 6685 } 6686 6687 for (i = 0; i < *group_cnt; i++) { 6688 struct r5worker_group *group; 6689 6690 group = &(*worker_groups)[i]; 6691 INIT_LIST_HEAD(&group->handle_list); 6692 INIT_LIST_HEAD(&group->loprio_list); 6693 group->conf = conf; 6694 group->workers = workers + i * cnt; 6695 6696 for (j = 0; j < cnt; j++) { 6697 struct r5worker *worker = group->workers + j; 6698 worker->group = group; 6699 INIT_WORK(&worker->work, raid5_do_work); 6700 6701 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6702 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6703 } 6704 } 6705 6706 return 0; 6707 } 6708 6709 static void free_thread_groups(struct r5conf *conf) 6710 { 6711 if (conf->worker_groups) 6712 kfree(conf->worker_groups[0].workers); 6713 kfree(conf->worker_groups); 6714 conf->worker_groups = NULL; 6715 } 6716 6717 static sector_t 6718 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6719 { 6720 struct r5conf *conf = mddev->private; 6721 6722 if (!sectors) 6723 sectors = mddev->dev_sectors; 6724 if (!raid_disks) 6725 /* size is defined by the smallest of previous and new size */ 6726 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6727 6728 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6729 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6730 return sectors * (raid_disks - conf->max_degraded); 6731 } 6732 6733 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6734 { 6735 safe_put_page(percpu->spare_page); 6736 percpu->spare_page = NULL; 6737 kvfree(percpu->scribble); 6738 
percpu->scribble = NULL; 6739 } 6740 6741 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6742 { 6743 if (conf->level == 6 && !percpu->spare_page) { 6744 percpu->spare_page = alloc_page(GFP_KERNEL); 6745 if (!percpu->spare_page) 6746 return -ENOMEM; 6747 } 6748 6749 if (scribble_alloc(percpu, 6750 max(conf->raid_disks, 6751 conf->previous_raid_disks), 6752 max(conf->chunk_sectors, 6753 conf->prev_chunk_sectors) 6754 / STRIPE_SECTORS, 6755 GFP_KERNEL)) { 6756 free_scratch_buffer(conf, percpu); 6757 return -ENOMEM; 6758 } 6759 6760 return 0; 6761 } 6762 6763 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6764 { 6765 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6766 6767 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6768 return 0; 6769 } 6770 6771 static void raid5_free_percpu(struct r5conf *conf) 6772 { 6773 if (!conf->percpu) 6774 return; 6775 6776 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6777 free_percpu(conf->percpu); 6778 } 6779 6780 static void free_conf(struct r5conf *conf) 6781 { 6782 int i; 6783 6784 log_exit(conf); 6785 6786 unregister_shrinker(&conf->shrinker); 6787 free_thread_groups(conf); 6788 shrink_stripes(conf); 6789 raid5_free_percpu(conf); 6790 for (i = 0; i < conf->pool_size; i++) 6791 if (conf->disks[i].extra_page) 6792 put_page(conf->disks[i].extra_page); 6793 kfree(conf->disks); 6794 bioset_exit(&conf->bio_split); 6795 kfree(conf->stripe_hashtbl); 6796 kfree(conf->pending_data); 6797 kfree(conf); 6798 } 6799 6800 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6801 { 6802 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6803 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6804 6805 if (alloc_scratch_buffer(conf, percpu)) { 6806 pr_warn("%s: failed memory allocation for cpu%u\n", 6807 __func__, cpu); 6808 return -ENOMEM; 6809 } 6810 return 0; 6811 } 6812 6813 static int raid5_alloc_percpu(struct r5conf *conf) 6814 { 6815 int err = 0; 6816 6817 conf->percpu = alloc_percpu(struct raid5_percpu); 6818 if (!conf->percpu) 6819 return -ENOMEM; 6820 6821 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6822 if (!err) { 6823 conf->scribble_disks = max(conf->raid_disks, 6824 conf->previous_raid_disks); 6825 conf->scribble_sectors = max(conf->chunk_sectors, 6826 conf->prev_chunk_sectors); 6827 } 6828 return err; 6829 } 6830 6831 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6832 struct shrink_control *sc) 6833 { 6834 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6835 unsigned long ret = SHRINK_STOP; 6836 6837 if (mutex_trylock(&conf->cache_size_mutex)) { 6838 ret= 0; 6839 while (ret < sc->nr_to_scan && 6840 conf->max_nr_stripes > conf->min_nr_stripes) { 6841 if (drop_one_stripe(conf) == 0) { 6842 ret = SHRINK_STOP; 6843 break; 6844 } 6845 ret++; 6846 } 6847 mutex_unlock(&conf->cache_size_mutex); 6848 } 6849 return ret; 6850 } 6851 6852 static unsigned long raid5_cache_count(struct shrinker *shrink, 6853 struct shrink_control *sc) 6854 { 6855 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6856 6857 if (conf->max_nr_stripes < conf->min_nr_stripes) 6858 /* unlikely, but not impossible */ 6859 return 0; 6860 return conf->max_nr_stripes - conf->min_nr_stripes; 6861 } 6862 6863 static struct r5conf *setup_conf(struct mddev *mddev) 6864 { 6865 struct r5conf *conf; 6866 int raid_disk, memory, max_disks; 6867 struct md_rdev *rdev; 
6868 struct disk_info *disk; 6869 char pers_name[6]; 6870 int i; 6871 int group_cnt, worker_cnt_per_group; 6872 struct r5worker_group *new_group; 6873 int ret; 6874 6875 if (mddev->new_level != 5 6876 && mddev->new_level != 4 6877 && mddev->new_level != 6) { 6878 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6879 mdname(mddev), mddev->new_level); 6880 return ERR_PTR(-EIO); 6881 } 6882 if ((mddev->new_level == 5 6883 && !algorithm_valid_raid5(mddev->new_layout)) || 6884 (mddev->new_level == 6 6885 && !algorithm_valid_raid6(mddev->new_layout))) { 6886 pr_warn("md/raid:%s: layout %d not supported\n", 6887 mdname(mddev), mddev->new_layout); 6888 return ERR_PTR(-EIO); 6889 } 6890 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6891 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6892 mdname(mddev), mddev->raid_disks); 6893 return ERR_PTR(-EINVAL); 6894 } 6895 6896 if (!mddev->new_chunk_sectors || 6897 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6898 !is_power_of_2(mddev->new_chunk_sectors)) { 6899 pr_warn("md/raid:%s: invalid chunk size %d\n", 6900 mdname(mddev), mddev->new_chunk_sectors << 9); 6901 return ERR_PTR(-EINVAL); 6902 } 6903 6904 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6905 if (conf == NULL) 6906 goto abort; 6907 INIT_LIST_HEAD(&conf->free_list); 6908 INIT_LIST_HEAD(&conf->pending_list); 6909 conf->pending_data = kcalloc(PENDING_IO_MAX, 6910 sizeof(struct r5pending_data), 6911 GFP_KERNEL); 6912 if (!conf->pending_data) 6913 goto abort; 6914 for (i = 0; i < PENDING_IO_MAX; i++) 6915 list_add(&conf->pending_data[i].sibling, &conf->free_list); 6916 /* Don't enable multi-threading by default*/ 6917 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6918 &new_group)) { 6919 conf->group_cnt = group_cnt; 6920 conf->worker_cnt_per_group = worker_cnt_per_group; 6921 conf->worker_groups = new_group; 6922 } else 6923 goto abort; 6924 spin_lock_init(&conf->device_lock); 6925 seqcount_init(&conf->gen_lock); 6926 mutex_init(&conf->cache_size_mutex); 6927 init_waitqueue_head(&conf->wait_for_quiescent); 6928 init_waitqueue_head(&conf->wait_for_stripe); 6929 init_waitqueue_head(&conf->wait_for_overlap); 6930 INIT_LIST_HEAD(&conf->handle_list); 6931 INIT_LIST_HEAD(&conf->loprio_list); 6932 INIT_LIST_HEAD(&conf->hold_list); 6933 INIT_LIST_HEAD(&conf->delayed_list); 6934 INIT_LIST_HEAD(&conf->bitmap_list); 6935 init_llist_head(&conf->released_stripes); 6936 atomic_set(&conf->active_stripes, 0); 6937 atomic_set(&conf->preread_active_stripes, 0); 6938 atomic_set(&conf->active_aligned_reads, 0); 6939 spin_lock_init(&conf->pending_bios_lock); 6940 conf->batch_bio_dispatch = true; 6941 rdev_for_each(rdev, mddev) { 6942 if (test_bit(Journal, &rdev->flags)) 6943 continue; 6944 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 6945 conf->batch_bio_dispatch = false; 6946 break; 6947 } 6948 } 6949 6950 conf->bypass_threshold = BYPASS_THRESHOLD; 6951 conf->recovery_disabled = mddev->recovery_disabled - 1; 6952 6953 conf->raid_disks = mddev->raid_disks; 6954 if (mddev->reshape_position == MaxSector) 6955 conf->previous_raid_disks = mddev->raid_disks; 6956 else 6957 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6958 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6959 6960 conf->disks = kcalloc(max_disks, sizeof(struct disk_info), 6961 GFP_KERNEL); 6962 6963 if (!conf->disks) 6964 goto abort; 6965 6966 for (i = 0; i < max_disks; i++) { 6967 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6968 if 
(!conf->disks[i].extra_page) 6969 goto abort; 6970 } 6971 6972 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 6973 if (ret) 6974 goto abort; 6975 conf->mddev = mddev; 6976 6977 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6978 goto abort; 6979 6980 /* We init hash_locks[0] separately to that it can be used 6981 * as the reference lock in the spin_lock_nest_lock() call 6982 * in lock_all_device_hash_locks_irq in order to convince 6983 * lockdep that we know what we are doing. 6984 */ 6985 spin_lock_init(conf->hash_locks); 6986 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6987 spin_lock_init(conf->hash_locks + i); 6988 6989 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6990 INIT_LIST_HEAD(conf->inactive_list + i); 6991 6992 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6993 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6994 6995 atomic_set(&conf->r5c_cached_full_stripes, 0); 6996 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 6997 atomic_set(&conf->r5c_cached_partial_stripes, 0); 6998 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 6999 atomic_set(&conf->r5c_flushing_full_stripes, 0); 7000 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 7001 7002 conf->level = mddev->new_level; 7003 conf->chunk_sectors = mddev->new_chunk_sectors; 7004 if (raid5_alloc_percpu(conf) != 0) 7005 goto abort; 7006 7007 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 7008 7009 rdev_for_each(rdev, mddev) { 7010 raid_disk = rdev->raid_disk; 7011 if (raid_disk >= max_disks 7012 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 7013 continue; 7014 disk = conf->disks + raid_disk; 7015 7016 if (test_bit(Replacement, &rdev->flags)) { 7017 if (disk->replacement) 7018 goto abort; 7019 disk->replacement = rdev; 7020 } else { 7021 if (disk->rdev) 7022 goto abort; 7023 disk->rdev = rdev; 7024 } 7025 7026 if (test_bit(In_sync, &rdev->flags)) { 7027 char b[BDEVNAME_SIZE]; 7028 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 7029 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 7030 } else if (rdev->saved_raid_disk != raid_disk) 7031 /* Cannot rely on bitmap to complete recovery */ 7032 conf->fullsync = 1; 7033 } 7034 7035 conf->level = mddev->new_level; 7036 if (conf->level == 6) { 7037 conf->max_degraded = 2; 7038 if (raid6_call.xor_syndrome) 7039 conf->rmw_level = PARITY_ENABLE_RMW; 7040 else 7041 conf->rmw_level = PARITY_DISABLE_RMW; 7042 } else { 7043 conf->max_degraded = 1; 7044 conf->rmw_level = PARITY_ENABLE_RMW; 7045 } 7046 conf->algorithm = mddev->new_layout; 7047 conf->reshape_progress = mddev->reshape_position; 7048 if (conf->reshape_progress != MaxSector) { 7049 conf->prev_chunk_sectors = mddev->chunk_sectors; 7050 conf->prev_algo = mddev->layout; 7051 } else { 7052 conf->prev_chunk_sectors = conf->chunk_sectors; 7053 conf->prev_algo = conf->algorithm; 7054 } 7055 7056 conf->min_nr_stripes = NR_STRIPES; 7057 if (mddev->reshape_position != MaxSector) { 7058 int stripes = max_t(int, 7059 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 7060 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 7061 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7062 if (conf->min_nr_stripes != NR_STRIPES) 7063 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7064 mdname(mddev), conf->min_nr_stripes); 7065 } 7066 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7067 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7068 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7069 if (grow_stripes(conf, conf->min_nr_stripes)) { 7070 
pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 7071 mdname(mddev), memory); 7072 goto abort; 7073 } else 7074 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7075 /* 7076 * Losing a stripe head costs more than the time to refill it, 7077 * it reduces the queue depth and so can hurt throughput. 7078 * So set it rather large, scaled by number of devices. 7079 */ 7080 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7081 conf->shrinker.scan_objects = raid5_cache_scan; 7082 conf->shrinker.count_objects = raid5_cache_count; 7083 conf->shrinker.batch = 128; 7084 conf->shrinker.flags = 0; 7085 if (register_shrinker(&conf->shrinker)) { 7086 pr_warn("md/raid:%s: couldn't register shrinker.\n", 7087 mdname(mddev)); 7088 goto abort; 7089 } 7090 7091 sprintf(pers_name, "raid%d", mddev->new_level); 7092 conf->thread = md_register_thread(raid5d, mddev, pers_name); 7093 if (!conf->thread) { 7094 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7095 mdname(mddev)); 7096 goto abort; 7097 } 7098 7099 return conf; 7100 7101 abort: 7102 if (conf) { 7103 free_conf(conf); 7104 return ERR_PTR(-EIO); 7105 } else 7106 return ERR_PTR(-ENOMEM); 7107 } 7108 7109 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7110 { 7111 switch (algo) { 7112 case ALGORITHM_PARITY_0: 7113 if (raid_disk < max_degraded) 7114 return 1; 7115 break; 7116 case ALGORITHM_PARITY_N: 7117 if (raid_disk >= raid_disks - max_degraded) 7118 return 1; 7119 break; 7120 case ALGORITHM_PARITY_0_6: 7121 if (raid_disk == 0 || 7122 raid_disk == raid_disks - 1) 7123 return 1; 7124 break; 7125 case ALGORITHM_LEFT_ASYMMETRIC_6: 7126 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7127 case ALGORITHM_LEFT_SYMMETRIC_6: 7128 case ALGORITHM_RIGHT_SYMMETRIC_6: 7129 if (raid_disk == raid_disks - 1) 7130 return 1; 7131 } 7132 return 0; 7133 } 7134 7135 static int raid5_run(struct mddev *mddev) 7136 { 7137 struct r5conf *conf; 7138 int working_disks = 0; 7139 int dirty_parity_disks = 0; 7140 struct md_rdev *rdev; 7141 struct md_rdev *journal_dev = NULL; 7142 sector_t reshape_offset = 0; 7143 int i; 7144 long long min_offset_diff = 0; 7145 int first = 1; 7146 7147 if (mddev_init_writes_pending(mddev) < 0) 7148 return -ENOMEM; 7149 7150 if (mddev->recovery_cp != MaxSector) 7151 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7152 mdname(mddev)); 7153 7154 rdev_for_each(rdev, mddev) { 7155 long long diff; 7156 7157 if (test_bit(Journal, &rdev->flags)) { 7158 journal_dev = rdev; 7159 continue; 7160 } 7161 if (rdev->raid_disk < 0) 7162 continue; 7163 diff = (rdev->new_data_offset - rdev->data_offset); 7164 if (first) { 7165 min_offset_diff = diff; 7166 first = 0; 7167 } else if (mddev->reshape_backwards && 7168 diff < min_offset_diff) 7169 min_offset_diff = diff; 7170 else if (!mddev->reshape_backwards && 7171 diff > min_offset_diff) 7172 min_offset_diff = diff; 7173 } 7174 7175 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && 7176 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { 7177 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", 7178 mdname(mddev)); 7179 return -EINVAL; 7180 } 7181 7182 if (mddev->reshape_position != MaxSector) { 7183 /* Check that we can continue the reshape. 7184 * Difficulties arise if the stripe we would write to 7185 * next is at or after the stripe we would read from next. 
7186 * For a reshape that changes the number of devices, this 7187 * is only possible for a very short time, and mdadm makes 7188 * sure that time appears to have passed before assembling 7189 * the array. So we fail if that time hasn't passed. 7190 * For a reshape that keeps the number of devices the same 7191 * mdadm must be monitoring the reshape and keeping the 7192 * critical areas read-only and backed up. It will start 7193 * the array in read-only mode, so we check for that. 7194 */ 7195 sector_t here_new, here_old; 7196 int old_disks; 7197 int max_degraded = (mddev->level == 6 ? 2 : 1); 7198 int chunk_sectors; 7199 int new_data_disks; 7200 7201 if (journal_dev) { 7202 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7203 mdname(mddev)); 7204 return -EINVAL; 7205 } 7206 7207 if (mddev->new_level != mddev->level) { 7208 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7209 mdname(mddev)); 7210 return -EINVAL; 7211 } 7212 old_disks = mddev->raid_disks - mddev->delta_disks; 7213 /* reshape_position must be on a new-stripe boundary, and one 7214 * further up in new geometry must map after here in old 7215 * geometry. 7216 * If the chunk sizes are different, then as we perform reshape 7217 * in units of the largest of the two, reshape_position needs 7218 * to be a multiple of the largest chunk size times new data disks. 7219 */ 7220 here_new = mddev->reshape_position; 7221 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 7222 new_data_disks = mddev->raid_disks - max_degraded; 7223 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7224 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7225 mdname(mddev)); 7226 return -EINVAL; 7227 } 7228 reshape_offset = here_new * chunk_sectors; 7229 /* here_new is the stripe we will write to */ 7230 here_old = mddev->reshape_position; 7231 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 7232 /* here_old is the first stripe that we might need to read 7233 * from */ 7234 if (mddev->delta_disks == 0) { 7235 /* We cannot be sure it is safe to start an in-place 7236 * reshape. It is only safe if user-space is monitoring 7237 * and taking constant backups. 7238 * mdadm always starts a situation like this in 7239 * readonly mode so it can take control before 7240 * allowing any writes. So just check for that. 7241 */ 7242 if (abs(min_offset_diff) >= mddev->chunk_sectors && 7243 abs(min_offset_diff) >= mddev->new_chunk_sectors) 7244 /* not really in-place - so OK */; 7245 else if (mddev->ro == 0) { 7246 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7247 mdname(mddev)); 7248 return -EINVAL; 7249 } 7250 } else if (mddev->reshape_backwards 7251 ?
(here_new * chunk_sectors + min_offset_diff <= 7252 here_old * chunk_sectors) 7253 : (here_new * chunk_sectors >= 7254 here_old * chunk_sectors + (-min_offset_diff))) { 7255 /* Reading from the same stripe as writing to - bad */ 7256 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7257 mdname(mddev)); 7258 return -EINVAL; 7259 } 7260 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7261 /* OK, we should be able to continue; */ 7262 } else { 7263 BUG_ON(mddev->level != mddev->new_level); 7264 BUG_ON(mddev->layout != mddev->new_layout); 7265 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7266 BUG_ON(mddev->delta_disks != 0); 7267 } 7268 7269 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 7270 test_bit(MD_HAS_PPL, &mddev->flags)) { 7271 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7272 mdname(mddev)); 7273 clear_bit(MD_HAS_PPL, &mddev->flags); 7274 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); 7275 } 7276 7277 if (mddev->private == NULL) 7278 conf = setup_conf(mddev); 7279 else 7280 conf = mddev->private; 7281 7282 if (IS_ERR(conf)) 7283 return PTR_ERR(conf); 7284 7285 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7286 if (!journal_dev) { 7287 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7288 mdname(mddev)); 7289 mddev->ro = 1; 7290 set_disk_ro(mddev->gendisk, 1); 7291 } else if (mddev->recovery_cp == MaxSector) 7292 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7293 } 7294 7295 conf->min_offset_diff = min_offset_diff; 7296 mddev->thread = conf->thread; 7297 conf->thread = NULL; 7298 mddev->private = conf; 7299 7300 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7301 i++) { 7302 rdev = conf->disks[i].rdev; 7303 if (!rdev && conf->disks[i].replacement) { 7304 /* The replacement is all we have yet */ 7305 rdev = conf->disks[i].replacement; 7306 conf->disks[i].replacement = NULL; 7307 clear_bit(Replacement, &rdev->flags); 7308 conf->disks[i].rdev = rdev; 7309 } 7310 if (!rdev) 7311 continue; 7312 if (conf->disks[i].replacement && 7313 conf->reshape_progress != MaxSector) { 7314 /* replacements and reshape simply do not mix. */ 7315 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7316 goto abort; 7317 } 7318 if (test_bit(In_sync, &rdev->flags)) { 7319 working_disks++; 7320 continue; 7321 } 7322 /* This disc is not fully in-sync. However if it 7323 * just stored parity (beyond the recovery_offset), 7324 * when we don't need to be concerned about the 7325 * array being dirty. 7326 * When reshape goes 'backwards', we never have 7327 * partially completed devices, so we only need 7328 * to worry about reshape going forwards. 7329 */ 7330 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7331 if (mddev->major_version == 0 && 7332 mddev->minor_version > 90) 7333 rdev->recovery_offset = reshape_offset; 7334 7335 if (rdev->recovery_offset < reshape_offset) { 7336 /* We need to check old and new layout */ 7337 if (!only_parity(rdev->raid_disk, 7338 conf->algorithm, 7339 conf->raid_disks, 7340 conf->max_degraded)) 7341 continue; 7342 } 7343 if (!only_parity(rdev->raid_disk, 7344 conf->prev_algo, 7345 conf->previous_raid_disks, 7346 conf->max_degraded)) 7347 continue; 7348 dirty_parity_disks++; 7349 } 7350 7351 /* 7352 * 0 for a fully functional array, 1 or 2 for a degraded array. 
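 * For instance, a 6-drive RAID6 that has lost one member reports
 * degraded == 1 and, with max_degraded == 2, can still tolerate a second
 * failure.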
7353 */ 7354 mddev->degraded = raid5_calc_degraded(conf); 7355 7356 if (has_failed(conf)) { 7357 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7358 mdname(mddev), mddev->degraded, conf->raid_disks); 7359 goto abort; 7360 } 7361 7362 /* device size must be a multiple of chunk size */ 7363 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7364 mddev->resync_max_sectors = mddev->dev_sectors; 7365 7366 if (mddev->degraded > dirty_parity_disks && 7367 mddev->recovery_cp != MaxSector) { 7368 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7369 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7370 mdname(mddev)); 7371 else if (mddev->ok_start_degraded) 7372 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7373 mdname(mddev)); 7374 else { 7375 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7376 mdname(mddev)); 7377 goto abort; 7378 } 7379 } 7380 7381 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7382 mdname(mddev), conf->level, 7383 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7384 mddev->new_layout); 7385 7386 print_raid5_conf(conf); 7387 7388 if (conf->reshape_progress != MaxSector) { 7389 conf->reshape_safe = conf->reshape_progress; 7390 atomic_set(&conf->reshape_stripes, 0); 7391 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7392 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7393 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7394 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7395 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7396 "reshape"); 7397 if (!mddev->sync_thread) 7398 goto abort; 7399 } 7400 7401 /* Ok, everything is just fine now */ 7402 if (mddev->to_remove == &raid5_attrs_group) 7403 mddev->to_remove = NULL; 7404 else if (mddev->kobj.sd && 7405 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7406 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7407 mdname(mddev)); 7408 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7409 7410 if (mddev->queue) { 7411 int chunk_size; 7412 /* read-ahead size must cover two whole stripes, which 7413 * is 2 * (datadisks) * chunksize where 'n' is the 7414 * number of raid devices 7415 */ 7416 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7417 int stripe = data_disks * 7418 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7419 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7420 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7421 7422 chunk_size = mddev->chunk_sectors << 9; 7423 blk_queue_io_min(mddev->queue, chunk_size); 7424 blk_queue_io_opt(mddev->queue, chunk_size * 7425 (conf->raid_disks - conf->max_degraded)); 7426 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7427 /* 7428 * We can only discard a whole stripe. 
It doesn't make sense to 7429 * discard data disk but write parity disk 7430 */ 7431 stripe = stripe * PAGE_SIZE; 7432 /* Round up to power of 2, as discard handling 7433 * currently assumes that */ 7434 while ((stripe-1) & stripe) 7435 stripe = (stripe | (stripe-1)) + 1; 7436 mddev->queue->limits.discard_alignment = stripe; 7437 mddev->queue->limits.discard_granularity = stripe; 7438 7439 blk_queue_max_write_same_sectors(mddev->queue, 0); 7440 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 7441 7442 rdev_for_each(rdev, mddev) { 7443 disk_stack_limits(mddev->gendisk, rdev->bdev, 7444 rdev->data_offset << 9); 7445 disk_stack_limits(mddev->gendisk, rdev->bdev, 7446 rdev->new_data_offset << 9); 7447 } 7448 7449 /* 7450 * zeroing is required, otherwise data 7451 * could be lost. Consider a scenario: discard a stripe 7452 * (the stripe could be inconsistent if 7453 * discard_zeroes_data is 0); write one disk of the 7454 * stripe (the stripe could be inconsistent again 7455 * depending on which disks are used to calculate 7456 * parity); the disk is broken; The stripe data of this 7457 * disk is lost. 7458 * 7459 * We only allow DISCARD if the sysadmin has confirmed that 7460 * only safe devices are in use by setting a module parameter. 7461 * A better idea might be to turn DISCARD into WRITE_ZEROES 7462 * requests, as that is required to be safe. 7463 */ 7464 if (devices_handle_discard_safely && 7465 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7466 mddev->queue->limits.discard_granularity >= stripe) 7467 blk_queue_flag_set(QUEUE_FLAG_DISCARD, 7468 mddev->queue); 7469 else 7470 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, 7471 mddev->queue); 7472 7473 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7474 } 7475 7476 if (log_init(conf, journal_dev, raid5_has_ppl(conf))) 7477 goto abort; 7478 7479 return 0; 7480 abort: 7481 md_unregister_thread(&mddev->thread); 7482 print_raid5_conf(conf); 7483 free_conf(conf); 7484 mddev->private = NULL; 7485 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7486 return -EIO; 7487 } 7488 7489 static void raid5_free(struct mddev *mddev, void *priv) 7490 { 7491 struct r5conf *conf = priv; 7492 7493 free_conf(conf); 7494 mddev->to_remove = &raid5_attrs_group; 7495 } 7496 7497 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7498 { 7499 struct r5conf *conf = mddev->private; 7500 int i; 7501 7502 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7503 conf->chunk_sectors / 2, mddev->layout); 7504 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7505 rcu_read_lock(); 7506 for (i = 0; i < conf->raid_disks; i++) { 7507 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7508 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? 
"U" : "_"); 7509 } 7510 rcu_read_unlock(); 7511 seq_printf (seq, "]"); 7512 } 7513 7514 static void print_raid5_conf (struct r5conf *conf) 7515 { 7516 int i; 7517 struct disk_info *tmp; 7518 7519 pr_debug("RAID conf printout:\n"); 7520 if (!conf) { 7521 pr_debug("(conf==NULL)\n"); 7522 return; 7523 } 7524 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7525 conf->raid_disks, 7526 conf->raid_disks - conf->mddev->degraded); 7527 7528 for (i = 0; i < conf->raid_disks; i++) { 7529 char b[BDEVNAME_SIZE]; 7530 tmp = conf->disks + i; 7531 if (tmp->rdev) 7532 pr_debug(" disk %d, o:%d, dev:%s\n", 7533 i, !test_bit(Faulty, &tmp->rdev->flags), 7534 bdevname(tmp->rdev->bdev, b)); 7535 } 7536 } 7537 7538 static int raid5_spare_active(struct mddev *mddev) 7539 { 7540 int i; 7541 struct r5conf *conf = mddev->private; 7542 struct disk_info *tmp; 7543 int count = 0; 7544 unsigned long flags; 7545 7546 for (i = 0; i < conf->raid_disks; i++) { 7547 tmp = conf->disks + i; 7548 if (tmp->replacement 7549 && tmp->replacement->recovery_offset == MaxSector 7550 && !test_bit(Faulty, &tmp->replacement->flags) 7551 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7552 /* Replacement has just become active. */ 7553 if (!tmp->rdev 7554 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7555 count++; 7556 if (tmp->rdev) { 7557 /* Replaced device not technically faulty, 7558 * but we need to be sure it gets removed 7559 * and never re-added. 7560 */ 7561 set_bit(Faulty, &tmp->rdev->flags); 7562 sysfs_notify_dirent_safe( 7563 tmp->rdev->sysfs_state); 7564 } 7565 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7566 } else if (tmp->rdev 7567 && tmp->rdev->recovery_offset == MaxSector 7568 && !test_bit(Faulty, &tmp->rdev->flags) 7569 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7570 count++; 7571 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7572 } 7573 } 7574 spin_lock_irqsave(&conf->device_lock, flags); 7575 mddev->degraded = raid5_calc_degraded(conf); 7576 spin_unlock_irqrestore(&conf->device_lock, flags); 7577 print_raid5_conf(conf); 7578 return count; 7579 } 7580 7581 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7582 { 7583 struct r5conf *conf = mddev->private; 7584 int err = 0; 7585 int number = rdev->raid_disk; 7586 struct md_rdev **rdevp; 7587 struct disk_info *p = conf->disks + number; 7588 7589 print_raid5_conf(conf); 7590 if (test_bit(Journal, &rdev->flags) && conf->log) { 7591 /* 7592 * we can't wait pending write here, as this is called in 7593 * raid5d, wait will deadlock. 7594 * neilb: there is no locking about new writes here, 7595 * so this cannot be safe. 7596 */ 7597 if (atomic_read(&conf->active_stripes) || 7598 atomic_read(&conf->r5c_cached_full_stripes) || 7599 atomic_read(&conf->r5c_cached_partial_stripes)) { 7600 return -EBUSY; 7601 } 7602 log_exit(conf); 7603 return 0; 7604 } 7605 if (rdev == p->rdev) 7606 rdevp = &p->rdev; 7607 else if (rdev == p->replacement) 7608 rdevp = &p->replacement; 7609 else 7610 return 0; 7611 7612 if (number >= conf->raid_disks && 7613 conf->reshape_progress == MaxSector) 7614 clear_bit(In_sync, &rdev->flags); 7615 7616 if (test_bit(In_sync, &rdev->flags) || 7617 atomic_read(&rdev->nr_pending)) { 7618 err = -EBUSY; 7619 goto abort; 7620 } 7621 /* Only remove non-faulty devices if recovery 7622 * isn't possible. 
7623 */ 7624 if (!test_bit(Faulty, &rdev->flags) && 7625 mddev->recovery_disabled != conf->recovery_disabled && 7626 !has_failed(conf) && 7627 (!p->replacement || p->replacement == rdev) && 7628 number < conf->raid_disks) { 7629 err = -EBUSY; 7630 goto abort; 7631 } 7632 *rdevp = NULL; 7633 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7634 synchronize_rcu(); 7635 if (atomic_read(&rdev->nr_pending)) { 7636 /* lost the race, try later */ 7637 err = -EBUSY; 7638 *rdevp = rdev; 7639 } 7640 } 7641 if (!err) { 7642 err = log_modify(conf, rdev, false); 7643 if (err) 7644 goto abort; 7645 } 7646 if (p->replacement) { 7647 /* We must have just cleared 'rdev' */ 7648 p->rdev = p->replacement; 7649 clear_bit(Replacement, &p->replacement->flags); 7650 smp_mb(); /* Make sure other CPUs may see both as identical 7651 * but will never see neither - if they are careful 7652 */ 7653 p->replacement = NULL; 7654 7655 if (!err) 7656 err = log_modify(conf, p->rdev, true); 7657 } 7658 7659 clear_bit(WantReplacement, &rdev->flags); 7660 abort: 7661 7662 print_raid5_conf(conf); 7663 return err; 7664 } 7665 7666 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7667 { 7668 struct r5conf *conf = mddev->private; 7669 int err = -EEXIST; 7670 int disk; 7671 struct disk_info *p; 7672 int first = 0; 7673 int last = conf->raid_disks - 1; 7674 7675 if (test_bit(Journal, &rdev->flags)) { 7676 if (conf->log) 7677 return -EBUSY; 7678 7679 rdev->raid_disk = 0; 7680 /* 7681 * The array is in readonly mode if journal is missing, so no 7682 * write requests running. We should be safe 7683 */ 7684 log_init(conf, rdev, false); 7685 return 0; 7686 } 7687 if (mddev->recovery_disabled == conf->recovery_disabled) 7688 return -EBUSY; 7689 7690 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7691 /* no point adding a device */ 7692 return -EINVAL; 7693 7694 if (rdev->raid_disk >= 0) 7695 first = last = rdev->raid_disk; 7696 7697 /* 7698 * find the disk ... but prefer rdev->saved_raid_disk 7699 * if possible. 7700 */ 7701 if (rdev->saved_raid_disk >= 0 && 7702 rdev->saved_raid_disk >= first && 7703 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7704 first = rdev->saved_raid_disk; 7705 7706 for (disk = first; disk <= last; disk++) { 7707 p = conf->disks + disk; 7708 if (p->rdev == NULL) { 7709 clear_bit(In_sync, &rdev->flags); 7710 rdev->raid_disk = disk; 7711 if (rdev->saved_raid_disk != disk) 7712 conf->fullsync = 1; 7713 rcu_assign_pointer(p->rdev, rdev); 7714 7715 err = log_modify(conf, rdev, true); 7716 7717 goto out; 7718 } 7719 } 7720 for (disk = first; disk <= last; disk++) { 7721 p = conf->disks + disk; 7722 if (test_bit(WantReplacement, &p->rdev->flags) && 7723 p->replacement == NULL) { 7724 clear_bit(In_sync, &rdev->flags); 7725 set_bit(Replacement, &rdev->flags); 7726 rdev->raid_disk = disk; 7727 err = 0; 7728 conf->fullsync = 1; 7729 rcu_assign_pointer(p->replacement, rdev); 7730 break; 7731 } 7732 } 7733 out: 7734 print_raid5_conf(conf); 7735 return err; 7736 } 7737 7738 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7739 { 7740 /* no resync is happening, and there is enough space 7741 * on all devices, so we can resize. 7742 * We need to make sure resync covers any new space. 7743 * If the array is shrinking we should possibly wait until 7744 * any io in the removed space completes, but it hardly seems 7745 * worth it. 
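 * (Illustrative note, not from the original source: the code below first
 * rounds the requested per-device size down to a whole number of chunks -
 * e.g. with 512 KiB chunks (chunk_sectors == 1024) a request of 10000
 * sectors becomes 9216 - and then lets raid5_size() turn that per-device
 * size into the new array size.)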
7746 */ 7747 sector_t newsize; 7748 struct r5conf *conf = mddev->private; 7749 7750 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 7751 return -EINVAL; 7752 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7753 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7754 if (mddev->external_size && 7755 mddev->array_sectors > newsize) 7756 return -EINVAL; 7757 if (mddev->bitmap) { 7758 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); 7759 if (ret) 7760 return ret; 7761 } 7762 md_set_array_sectors(mddev, newsize); 7763 if (sectors > mddev->dev_sectors && 7764 mddev->recovery_cp > mddev->dev_sectors) { 7765 mddev->recovery_cp = mddev->dev_sectors; 7766 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7767 } 7768 mddev->dev_sectors = sectors; 7769 mddev->resync_max_sectors = sectors; 7770 return 0; 7771 } 7772 7773 static int check_stripe_cache(struct mddev *mddev) 7774 { 7775 /* Can only proceed if there are plenty of stripe_heads. 7776 * We need a minimum of one full stripe, and for sensible progress 7777 * it is best to have about 4 times that. 7778 * If we require 4 times, then the default 256 4K stripe_heads will 7779 * allow for chunk sizes up to 256K, which is probably OK. 7780 * If the chunk size is greater, user-space should request more 7781 * stripe_heads first. 7782 */ 7783 struct r5conf *conf = mddev->private; 7784 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7785 > conf->min_nr_stripes || 7786 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7787 > conf->min_nr_stripes) { 7788 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7789 mdname(mddev), 7790 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7791 / STRIPE_SIZE)*4); 7792 return 0; 7793 } 7794 return 1; 7795 } 7796 7797 static int check_reshape(struct mddev *mddev) 7798 { 7799 struct r5conf *conf = mddev->private; 7800 7801 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 7802 return -EINVAL; 7803 if (mddev->delta_disks == 0 && 7804 mddev->new_layout == mddev->layout && 7805 mddev->new_chunk_sectors == mddev->chunk_sectors) 7806 return 0; /* nothing to do */ 7807 if (has_failed(conf)) 7808 return -EINVAL; 7809 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 7810 /* We might be able to shrink, but the devices must 7811 * be made bigger first. 7812 * For raid6, 4 is the minimum size.
7813 * Otherwise 2 is the minimum 7814 */ 7815 int min = 2; 7816 if (mddev->level == 6) 7817 min = 4; 7818 if (mddev->raid_disks + mddev->delta_disks < min) 7819 return -EINVAL; 7820 } 7821 7822 if (!check_stripe_cache(mddev)) 7823 return -ENOSPC; 7824 7825 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 7826 mddev->delta_disks > 0) 7827 if (resize_chunks(conf, 7828 conf->previous_raid_disks 7829 + max(0, mddev->delta_disks), 7830 max(mddev->new_chunk_sectors, 7831 mddev->chunk_sectors) 7832 ) < 0) 7833 return -ENOMEM; 7834 7835 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) 7836 return 0; /* never bother to shrink */ 7837 return resize_stripes(conf, (conf->previous_raid_disks 7838 + mddev->delta_disks)); 7839 } 7840 7841 static int raid5_start_reshape(struct mddev *mddev) 7842 { 7843 struct r5conf *conf = mddev->private; 7844 struct md_rdev *rdev; 7845 int spares = 0; 7846 unsigned long flags; 7847 7848 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7849 return -EBUSY; 7850 7851 if (!check_stripe_cache(mddev)) 7852 return -ENOSPC; 7853 7854 if (has_failed(conf)) 7855 return -EINVAL; 7856 7857 rdev_for_each(rdev, mddev) { 7858 if (!test_bit(In_sync, &rdev->flags) 7859 && !test_bit(Faulty, &rdev->flags)) 7860 spares++; 7861 } 7862 7863 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 7864 /* Not enough devices even to make a degraded array 7865 * of that size 7866 */ 7867 return -EINVAL; 7868 7869 /* Refuse to reduce size of the array. Any reductions in 7870 * array size must be through explicit setting of array_size 7871 * attribute. 7872 */ 7873 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7874 < mddev->array_sectors) { 7875 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 7876 mdname(mddev)); 7877 return -EINVAL; 7878 } 7879 7880 atomic_set(&conf->reshape_stripes, 0); 7881 spin_lock_irq(&conf->device_lock); 7882 write_seqcount_begin(&conf->gen_lock); 7883 conf->previous_raid_disks = conf->raid_disks; 7884 conf->raid_disks += mddev->delta_disks; 7885 conf->prev_chunk_sectors = conf->chunk_sectors; 7886 conf->chunk_sectors = mddev->new_chunk_sectors; 7887 conf->prev_algo = conf->algorithm; 7888 conf->algorithm = mddev->new_layout; 7889 conf->generation++; 7890 /* Code that selects data_offset needs to see the generation update 7891 * if reshape_progress has been set - so a memory barrier needed. 7892 */ 7893 smp_mb(); 7894 if (mddev->reshape_backwards) 7895 conf->reshape_progress = raid5_size(mddev, 0, 0); 7896 else 7897 conf->reshape_progress = 0; 7898 conf->reshape_safe = conf->reshape_progress; 7899 write_seqcount_end(&conf->gen_lock); 7900 spin_unlock_irq(&conf->device_lock); 7901 7902 /* Now make sure any requests that proceeded on the assumption 7903 * the reshape wasn't running - like Discard or Read - have 7904 * completed. 7905 */ 7906 mddev_suspend(mddev); 7907 mddev_resume(mddev); 7908 7909 /* Add some new drives, as many as will fit. 7910 * We know there are enough to make the newly sized array work. 7911 * Don't add devices if we are reducing the number of 7912 * devices in the array. This is because it is not possible 7913 * to correctly record the "partially reconstructed" state of 7914 * such devices during the reshape and confusion could result. 
7915 */ 7916 if (mddev->delta_disks >= 0) { 7917 rdev_for_each(rdev, mddev) 7918 if (rdev->raid_disk < 0 && 7919 !test_bit(Faulty, &rdev->flags)) { 7920 if (raid5_add_disk(mddev, rdev) == 0) { 7921 if (rdev->raid_disk 7922 >= conf->previous_raid_disks) 7923 set_bit(In_sync, &rdev->flags); 7924 else 7925 rdev->recovery_offset = 0; 7926 7927 if (sysfs_link_rdev(mddev, rdev)) 7928 /* Failure here is OK */; 7929 } 7930 } else if (rdev->raid_disk >= conf->previous_raid_disks 7931 && !test_bit(Faulty, &rdev->flags)) { 7932 /* This is a spare that was manually added */ 7933 set_bit(In_sync, &rdev->flags); 7934 } 7935 7936 /* When a reshape changes the number of devices, 7937 * ->degraded is measured against the larger of the 7938 * pre and post number of devices. 7939 */ 7940 spin_lock_irqsave(&conf->device_lock, flags); 7941 mddev->degraded = raid5_calc_degraded(conf); 7942 spin_unlock_irqrestore(&conf->device_lock, flags); 7943 } 7944 mddev->raid_disks = conf->raid_disks; 7945 mddev->reshape_position = conf->reshape_progress; 7946 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7947 7948 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7949 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7950 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7951 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7952 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7953 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7954 "reshape"); 7955 if (!mddev->sync_thread) { 7956 mddev->recovery = 0; 7957 spin_lock_irq(&conf->device_lock); 7958 write_seqcount_begin(&conf->gen_lock); 7959 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 7960 mddev->new_chunk_sectors = 7961 conf->chunk_sectors = conf->prev_chunk_sectors; 7962 mddev->new_layout = conf->algorithm = conf->prev_algo; 7963 rdev_for_each(rdev, mddev) 7964 rdev->new_data_offset = rdev->data_offset; 7965 smp_wmb(); 7966 conf->generation --; 7967 conf->reshape_progress = MaxSector; 7968 mddev->reshape_position = MaxSector; 7969 write_seqcount_end(&conf->gen_lock); 7970 spin_unlock_irq(&conf->device_lock); 7971 return -EAGAIN; 7972 } 7973 conf->reshape_checkpoint = jiffies; 7974 md_wakeup_thread(mddev->sync_thread); 7975 md_new_event(mddev); 7976 return 0; 7977 } 7978 7979 /* This is called from the reshape thread and should make any 7980 * changes needed in 'conf' 7981 */ 7982 static void end_reshape(struct r5conf *conf) 7983 { 7984 7985 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 7986 struct md_rdev *rdev; 7987 7988 spin_lock_irq(&conf->device_lock); 7989 conf->previous_raid_disks = conf->raid_disks; 7990 md_finish_reshape(conf->mddev); 7991 smp_wmb(); 7992 conf->reshape_progress = MaxSector; 7993 conf->mddev->reshape_position = MaxSector; 7994 rdev_for_each(rdev, conf->mddev) 7995 if (rdev->raid_disk >= 0 && 7996 !test_bit(Journal, &rdev->flags) && 7997 !test_bit(In_sync, &rdev->flags)) 7998 rdev->recovery_offset = MaxSector; 7999 spin_unlock_irq(&conf->device_lock); 8000 wake_up(&conf->wait_for_overlap); 8001 8002 /* read-ahead size must cover two whole stripes, which is 8003 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 8004 */ 8005 if (conf->mddev->queue) { 8006 int data_disks = conf->raid_disks - conf->max_degraded; 8007 int stripe = data_disks * ((conf->chunk_sectors << 9) 8008 / PAGE_SIZE); 8009 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 8010 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 8011 } 8012 } 8013 } 8014 8015 /* This is called from the raid5d thread with 
mddev_lock held. 8016 * It makes config changes to the device. 8017 */ 8018 static void raid5_finish_reshape(struct mddev *mddev) 8019 { 8020 struct r5conf *conf = mddev->private; 8021 8022 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8023 8024 if (mddev->delta_disks <= 0) { 8025 int d; 8026 spin_lock_irq(&conf->device_lock); 8027 mddev->degraded = raid5_calc_degraded(conf); 8028 spin_unlock_irq(&conf->device_lock); 8029 for (d = conf->raid_disks ; 8030 d < conf->raid_disks - mddev->delta_disks; 8031 d++) { 8032 struct md_rdev *rdev = conf->disks[d].rdev; 8033 if (rdev) 8034 clear_bit(In_sync, &rdev->flags); 8035 rdev = conf->disks[d].replacement; 8036 if (rdev) 8037 clear_bit(In_sync, &rdev->flags); 8038 } 8039 } 8040 mddev->layout = conf->algorithm; 8041 mddev->chunk_sectors = conf->chunk_sectors; 8042 mddev->reshape_position = MaxSector; 8043 mddev->delta_disks = 0; 8044 mddev->reshape_backwards = 0; 8045 } 8046 } 8047 8048 static void raid5_quiesce(struct mddev *mddev, int quiesce) 8049 { 8050 struct r5conf *conf = mddev->private; 8051 8052 if (quiesce) { 8053 /* stop all writes */ 8054 lock_all_device_hash_locks_irq(conf); 8055 /* '2' tells resync/reshape to pause so that all 8056 * active stripes can drain 8057 */ 8058 r5c_flush_cache(conf, INT_MAX); 8059 conf->quiesce = 2; 8060 wait_event_cmd(conf->wait_for_quiescent, 8061 atomic_read(&conf->active_stripes) == 0 && 8062 atomic_read(&conf->active_aligned_reads) == 0, 8063 unlock_all_device_hash_locks_irq(conf), 8064 lock_all_device_hash_locks_irq(conf)); 8065 conf->quiesce = 1; 8066 unlock_all_device_hash_locks_irq(conf); 8067 /* allow reshape to continue */ 8068 wake_up(&conf->wait_for_overlap); 8069 } else { 8070 /* re-enable writes */ 8071 lock_all_device_hash_locks_irq(conf); 8072 conf->quiesce = 0; 8073 wake_up(&conf->wait_for_quiescent); 8074 wake_up(&conf->wait_for_overlap); 8075 unlock_all_device_hash_locks_irq(conf); 8076 } 8077 log_quiesce(conf, quiesce); 8078 } 8079 8080 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8081 { 8082 struct r0conf *raid0_conf = mddev->private; 8083 sector_t sectors; 8084 8085 /* for raid0 takeover only one zone is supported */ 8086 if (raid0_conf->nr_strip_zones > 1) { 8087 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8088 mdname(mddev)); 8089 return ERR_PTR(-EINVAL); 8090 } 8091 8092 sectors = raid0_conf->strip_zone[0].zone_end; 8093 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8094 mddev->dev_sectors = sectors; 8095 mddev->new_level = level; 8096 mddev->new_layout = ALGORITHM_PARITY_N; 8097 mddev->new_chunk_sectors = mddev->chunk_sectors; 8098 mddev->raid_disks += 1; 8099 mddev->delta_disks = 1; 8100 /* make sure it will be not marked as dirty */ 8101 mddev->recovery_cp = MaxSector; 8102 8103 return setup_conf(mddev); 8104 } 8105 8106 static void *raid5_takeover_raid1(struct mddev *mddev) 8107 { 8108 int chunksect; 8109 void *ret; 8110 8111 if (mddev->raid_disks != 2 || 8112 mddev->degraded > 1) 8113 return ERR_PTR(-EINVAL); 8114 8115 /* Should check if there are write-behind devices? 
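 * (Illustrative note, not from the original source: the loop below starts
 * at 64 KiB (128 sectors) and halves the candidate chunk size until it
 * evenly divides array_sectors. For an array of 100000 sectors, 128 and
 * 64 do not divide it but 32 does, giving a 16 KiB chunk; if no candidate
 * of at least STRIPE_SIZE fits, the takeover is rejected.)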
*/ 8116 8117 chunksect = 64*2; /* 64K by default */ 8118 8119 /* The array must be an exact multiple of chunksize */ 8120 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8121 chunksect >>= 1; 8122 8123 if ((chunksect<<9) < STRIPE_SIZE) 8124 /* array size does not allow a suitable chunk size */ 8125 return ERR_PTR(-EINVAL); 8126 8127 mddev->new_level = 5; 8128 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8129 mddev->new_chunk_sectors = chunksect; 8130 8131 ret = setup_conf(mddev); 8132 if (!IS_ERR(ret)) 8133 mddev_clear_unsupported_flags(mddev, 8134 UNSUPPORTED_MDDEV_FLAGS); 8135 return ret; 8136 } 8137 8138 static void *raid5_takeover_raid6(struct mddev *mddev) 8139 { 8140 int new_layout; 8141 8142 switch (mddev->layout) { 8143 case ALGORITHM_LEFT_ASYMMETRIC_6: 8144 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8145 break; 8146 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8147 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8148 break; 8149 case ALGORITHM_LEFT_SYMMETRIC_6: 8150 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8151 break; 8152 case ALGORITHM_RIGHT_SYMMETRIC_6: 8153 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8154 break; 8155 case ALGORITHM_PARITY_0_6: 8156 new_layout = ALGORITHM_PARITY_0; 8157 break; 8158 case ALGORITHM_PARITY_N: 8159 new_layout = ALGORITHM_PARITY_N; 8160 break; 8161 default: 8162 return ERR_PTR(-EINVAL); 8163 } 8164 mddev->new_level = 5; 8165 mddev->new_layout = new_layout; 8166 mddev->delta_disks = -1; 8167 mddev->raid_disks -= 1; 8168 return setup_conf(mddev); 8169 } 8170 8171 static int raid5_check_reshape(struct mddev *mddev) 8172 { 8173 /* For a 2-drive array, the layout and chunk size can be changed 8174 * immediately as not restriping is needed. 8175 * For larger arrays we record the new value - after validation 8176 * to be used by a reshape pass. 8177 */ 8178 struct r5conf *conf = mddev->private; 8179 int new_chunk = mddev->new_chunk_sectors; 8180 8181 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8182 return -EINVAL; 8183 if (new_chunk > 0) { 8184 if (!is_power_of_2(new_chunk)) 8185 return -EINVAL; 8186 if (new_chunk < (PAGE_SIZE>>9)) 8187 return -EINVAL; 8188 if (mddev->array_sectors & (new_chunk-1)) 8189 /* not factor of array size */ 8190 return -EINVAL; 8191 } 8192 8193 /* They look valid */ 8194 8195 if (mddev->raid_disks == 2) { 8196 /* can make the change immediately */ 8197 if (mddev->new_layout >= 0) { 8198 conf->algorithm = mddev->new_layout; 8199 mddev->layout = mddev->new_layout; 8200 } 8201 if (new_chunk > 0) { 8202 conf->chunk_sectors = new_chunk ; 8203 mddev->chunk_sectors = new_chunk; 8204 } 8205 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8206 md_wakeup_thread(mddev->thread); 8207 } 8208 return check_reshape(mddev); 8209 } 8210 8211 static int raid6_check_reshape(struct mddev *mddev) 8212 { 8213 int new_chunk = mddev->new_chunk_sectors; 8214 8215 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8216 return -EINVAL; 8217 if (new_chunk > 0) { 8218 if (!is_power_of_2(new_chunk)) 8219 return -EINVAL; 8220 if (new_chunk < (PAGE_SIZE >> 9)) 8221 return -EINVAL; 8222 if (mddev->array_sectors & (new_chunk-1)) 8223 /* not factor of array size */ 8224 return -EINVAL; 8225 } 8226 8227 /* They look valid */ 8228 return check_reshape(mddev); 8229 } 8230 8231 static void *raid5_takeover(struct mddev *mddev) 8232 { 8233 /* raid5 can take over: 8234 * raid0 - if there is only one strip zone - make it a raid4 layout 8235 * raid1 - if there are two drives. 
We need to know the chunk size 8236 * raid4 - trivial - just use a raid4 layout. 8237 * raid6 - Providing it is a *_6 layout 8238 */ 8239 if (mddev->level == 0) 8240 return raid45_takeover_raid0(mddev, 5); 8241 if (mddev->level == 1) 8242 return raid5_takeover_raid1(mddev); 8243 if (mddev->level == 4) { 8244 mddev->new_layout = ALGORITHM_PARITY_N; 8245 mddev->new_level = 5; 8246 return setup_conf(mddev); 8247 } 8248 if (mddev->level == 6) 8249 return raid5_takeover_raid6(mddev); 8250 8251 return ERR_PTR(-EINVAL); 8252 } 8253 8254 static void *raid4_takeover(struct mddev *mddev) 8255 { 8256 /* raid4 can take over: 8257 * raid0 - if there is only one strip zone 8258 * raid5 - if layout is right 8259 */ 8260 if (mddev->level == 0) 8261 return raid45_takeover_raid0(mddev, 4); 8262 if (mddev->level == 5 && 8263 mddev->layout == ALGORITHM_PARITY_N) { 8264 mddev->new_layout = 0; 8265 mddev->new_level = 4; 8266 return setup_conf(mddev); 8267 } 8268 return ERR_PTR(-EINVAL); 8269 } 8270 8271 static struct md_personality raid5_personality; 8272 8273 static void *raid6_takeover(struct mddev *mddev) 8274 { 8275 /* Currently can only take over a raid5. We map the 8276 * personality to an equivalent raid6 personality 8277 * with the Q block at the end. 8278 */ 8279 int new_layout; 8280 8281 if (mddev->pers != &raid5_personality) 8282 return ERR_PTR(-EINVAL); 8283 if (mddev->degraded > 1) 8284 return ERR_PTR(-EINVAL); 8285 if (mddev->raid_disks > 253) 8286 return ERR_PTR(-EINVAL); 8287 if (mddev->raid_disks < 3) 8288 return ERR_PTR(-EINVAL); 8289 8290 switch (mddev->layout) { 8291 case ALGORITHM_LEFT_ASYMMETRIC: 8292 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8293 break; 8294 case ALGORITHM_RIGHT_ASYMMETRIC: 8295 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8296 break; 8297 case ALGORITHM_LEFT_SYMMETRIC: 8298 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8299 break; 8300 case ALGORITHM_RIGHT_SYMMETRIC: 8301 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8302 break; 8303 case ALGORITHM_PARITY_0: 8304 new_layout = ALGORITHM_PARITY_0_6; 8305 break; 8306 case ALGORITHM_PARITY_N: 8307 new_layout = ALGORITHM_PARITY_N; 8308 break; 8309 default: 8310 return ERR_PTR(-EINVAL); 8311 } 8312 mddev->new_level = 6; 8313 mddev->new_layout = new_layout; 8314 mddev->delta_disks = 1; 8315 mddev->raid_disks += 1; 8316 return setup_conf(mddev); 8317 } 8318 8319 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) 8320 { 8321 struct r5conf *conf; 8322 int err; 8323 8324 err = mddev_lock(mddev); 8325 if (err) 8326 return err; 8327 conf = mddev->private; 8328 if (!conf) { 8329 mddev_unlock(mddev); 8330 return -ENODEV; 8331 } 8332 8333 if (strncmp(buf, "ppl", 3) == 0) { 8334 /* ppl only works with RAID 5 */ 8335 if (!raid5_has_ppl(conf) && conf->level == 5) { 8336 err = log_init(conf, NULL, true); 8337 if (!err) { 8338 err = resize_stripes(conf, conf->pool_size); 8339 if (err) 8340 log_exit(conf); 8341 } 8342 } else 8343 err = -EINVAL; 8344 } else if (strncmp(buf, "resync", 6) == 0) { 8345 if (raid5_has_ppl(conf)) { 8346 mddev_suspend(mddev); 8347 log_exit(conf); 8348 mddev_resume(mddev); 8349 err = resize_stripes(conf, conf->pool_size); 8350 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && 8351 r5l_log_disk_error(conf)) { 8352 bool journal_dev_exists = false; 8353 struct md_rdev *rdev; 8354 8355 rdev_for_each(rdev, mddev) 8356 if (test_bit(Journal, &rdev->flags)) { 8357 journal_dev_exists = true; 8358 break; 8359 } 8360 8361 if (!journal_dev_exists) { 8362 mddev_suspend(mddev); 8363 
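/* No journal rdev is present, so drop the journal flag while the
 * array is suspended.
 */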
clear_bit(MD_HAS_JOURNAL, &mddev->flags); 8364 mddev_resume(mddev); 8365 } else /* need remove journal device first */ 8366 err = -EBUSY; 8367 } else 8368 err = -EINVAL; 8369 } else { 8370 err = -EINVAL; 8371 } 8372 8373 if (!err) 8374 md_update_sb(mddev, 1); 8375 8376 mddev_unlock(mddev); 8377 8378 return err; 8379 } 8380 8381 static int raid5_start(struct mddev *mddev) 8382 { 8383 struct r5conf *conf = mddev->private; 8384 8385 return r5l_start(conf->log); 8386 } 8387 8388 static struct md_personality raid6_personality = 8389 { 8390 .name = "raid6", 8391 .level = 6, 8392 .owner = THIS_MODULE, 8393 .make_request = raid5_make_request, 8394 .run = raid5_run, 8395 .start = raid5_start, 8396 .free = raid5_free, 8397 .status = raid5_status, 8398 .error_handler = raid5_error, 8399 .hot_add_disk = raid5_add_disk, 8400 .hot_remove_disk= raid5_remove_disk, 8401 .spare_active = raid5_spare_active, 8402 .sync_request = raid5_sync_request, 8403 .resize = raid5_resize, 8404 .size = raid5_size, 8405 .check_reshape = raid6_check_reshape, 8406 .start_reshape = raid5_start_reshape, 8407 .finish_reshape = raid5_finish_reshape, 8408 .quiesce = raid5_quiesce, 8409 .takeover = raid6_takeover, 8410 .congested = raid5_congested, 8411 .change_consistency_policy = raid5_change_consistency_policy, 8412 }; 8413 static struct md_personality raid5_personality = 8414 { 8415 .name = "raid5", 8416 .level = 5, 8417 .owner = THIS_MODULE, 8418 .make_request = raid5_make_request, 8419 .run = raid5_run, 8420 .start = raid5_start, 8421 .free = raid5_free, 8422 .status = raid5_status, 8423 .error_handler = raid5_error, 8424 .hot_add_disk = raid5_add_disk, 8425 .hot_remove_disk= raid5_remove_disk, 8426 .spare_active = raid5_spare_active, 8427 .sync_request = raid5_sync_request, 8428 .resize = raid5_resize, 8429 .size = raid5_size, 8430 .check_reshape = raid5_check_reshape, 8431 .start_reshape = raid5_start_reshape, 8432 .finish_reshape = raid5_finish_reshape, 8433 .quiesce = raid5_quiesce, 8434 .takeover = raid5_takeover, 8435 .congested = raid5_congested, 8436 .change_consistency_policy = raid5_change_consistency_policy, 8437 }; 8438 8439 static struct md_personality raid4_personality = 8440 { 8441 .name = "raid4", 8442 .level = 4, 8443 .owner = THIS_MODULE, 8444 .make_request = raid5_make_request, 8445 .run = raid5_run, 8446 .start = raid5_start, 8447 .free = raid5_free, 8448 .status = raid5_status, 8449 .error_handler = raid5_error, 8450 .hot_add_disk = raid5_add_disk, 8451 .hot_remove_disk= raid5_remove_disk, 8452 .spare_active = raid5_spare_active, 8453 .sync_request = raid5_sync_request, 8454 .resize = raid5_resize, 8455 .size = raid5_size, 8456 .check_reshape = raid5_check_reshape, 8457 .start_reshape = raid5_start_reshape, 8458 .finish_reshape = raid5_finish_reshape, 8459 .quiesce = raid5_quiesce, 8460 .takeover = raid4_takeover, 8461 .congested = raid5_congested, 8462 .change_consistency_policy = raid5_change_consistency_policy, 8463 }; 8464 8465 static int __init raid5_init(void) 8466 { 8467 int ret; 8468 8469 raid5_wq = alloc_workqueue("raid5wq", 8470 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 8471 if (!raid5_wq) 8472 return -ENOMEM; 8473 8474 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 8475 "md/raid5:prepare", 8476 raid456_cpu_up_prepare, 8477 raid456_cpu_dead); 8478 if (ret) { 8479 destroy_workqueue(raid5_wq); 8480 return ret; 8481 } 8482 register_md_personality(&raid6_personality); 8483 register_md_personality(&raid5_personality); 8484 register_md_personality(&raid4_personality); 8485 
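/* register_md_personality() only links each personality into md's
 * list and is not expected to fail, so the return values above are
 * intentionally ignored.
 */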
return 0; 8486 } 8487 8488 static void raid5_exit(void) 8489 { 8490 unregister_md_personality(&raid6_personality); 8491 unregister_md_personality(&raid5_personality); 8492 unregister_md_personality(&raid4_personality); 8493 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 8494 destroy_workqueue(raid5_wq); 8495 } 8496 8497 module_init(raid5_init); 8498 module_exit(raid5_exit); 8499 MODULE_LICENSE("GPL"); 8500 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 8501 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 8502 MODULE_ALIAS("md-raid5"); 8503 MODULE_ALIAS("md-raid4"); 8504 MODULE_ALIAS("md-level-5"); 8505 MODULE_ALIAS("md-level-4"); 8506 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 8507 MODULE_ALIAS("md-raid6"); 8508 MODULE_ALIAS("md-level-6"); 8509 8510 /* This used to be two separate modules, they were: */ 8511 MODULE_ALIAS("raid5"); 8512 MODULE_ALIAS("raid6"); 8513
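/*
 * Illustrative, never-compiled sketch (guarded out with #if 0): a minimal
 * user-space rendering of the round-up-to-power-of-two step used above when
 * setting discard_alignment and discard_granularity. The stripe width here
 * (3 data disks x 4 KiB) is a hypothetical example, not taken from any
 * particular array. Each pass of the loop fills in the bits below the
 * lowest set bit and adds one, which clears that bit and carries upward,
 * so the value climbs to the next power of two.
 */
#if 0
#include <stdio.h>

static unsigned long round_up_pow2(unsigned long x)
{
	/* Same idiom as the discard-granularity code above. */
	while ((x - 1) & x)
		x = (x | (x - 1)) + 1;
	return x;
}

int main(void)
{
	/* Hypothetical stripe: 3 data disks times a 4 KiB chunk. */
	unsigned long stripe = 3 * 4096;

	printf("%lu -> %lu\n", stripe, round_up_pow2(stripe)); /* 12288 -> 16384 */
	return 0;
}
#endif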