// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 *    miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

#define RAID5_MAX_REQ_STRIPES 256

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

static void raid5_quiesce(struct mddev *mddev, int quiesce);

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
{
	return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
	__acquires(&conf->device_lock)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
	__releases(&conf->device_lock)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}
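
/*
 * Illustrative sketch (not new driver logic): the lock nesting implied by the
 * helpers above, assuming a caller that already knows the bucket index "hash".
 * The names below are only for illustration:
 *
 *	lock_device_hash_lock(conf, hash);	// hash_locks[hash], then device_lock
 *	...					// touch per-bucket lists and counters
 *	unlock_device_hash_lock(conf, hash);	// released in reverse order
 *
 * When both locks are needed, the per-bucket hash lock is taken before
 * conf->device_lock, and lock_all_device_hash_locks_irq() below acquires the
 * buckets in index order so that it nests consistently with the
 * single-bucket helpers.
 */
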
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
	__acquires(&conf->device_lock)
{
	int i;
	spin_lock_irq(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
	__releases(&conf->device_lock)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
		spin_unlock(conf->hash_locks + i);
	spin_unlock_irq(conf->hash_locks);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1. This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static bool stripe_is_lowprio(struct stripe_head *sh)
{
	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
	__must_hold(&sh->raid_conf->device_lock)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		if (stripe_is_lowprio(sh))
			list_add_tail(&sh->lru, &group->loprio_list);
		else
			list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}
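
/*
 * Overview (informational, inferred from the body of do_release_stripe()
 * below): when the last reference to a stripe is dropped, the stripe is
 * parked on one of several lists depending on its state flags --
 * conf->delayed_list (STRIPE_DELAYED), conf->bitmap_list (STRIPE_BIT_DELAY,
 * waiting for a bitmap batch to be written), the handle/loprio lists or a
 * worker group when it still needs handling, or an inactive list / one of
 * the r5c cached-stripe lists when it does not.
 */
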
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
	__must_hold(&conf->device_lock)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * In the following cases, the stripe cannot be released to cached
	 * lists. Therefore, we make the stripe write out and set
	 * STRIPE_HANDLE:
	 *   1. when quiesce in r5c write back;
	 *   2. when resync is requested for the stripe.
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				if (stripe_is_lowprio(sh))
					list_add_tail(&sh->lru,
						      &conf->loprio_list);
				else
					list_add_tail(&sh->lru,
						      &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
	__must_hold(&conf->device_lock)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}
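
/*
 * Usage sketch for the temp_inactive_list argument (illustrative only; it
 * mirrors how raid5_release_stripe() and release_stripe_list() below drive
 * these helpers):
 *
 *	LIST_HEAD(tmp);
 *
 *	spin_lock_irq(&conf->device_lock);
 *	__release_stripe(conf, sh, &tmp);	// may park sh on tmp
 *	spin_unlock_irq(&conf->device_lock);
 *	release_inactive_stripe_list(conf, &tmp, sh->hash_lock_index);
 *
 * Collecting inactive stripes on a private list first means this path never
 * takes a hash-bucket lock while device_lock is held, consistent with the
 * hash-lock-before-device_lock order used by lock_device_hash_lock().
 */
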
322 */ 323 static void release_inactive_stripe_list(struct r5conf *conf, 324 struct list_head *temp_inactive_list, 325 int hash) 326 { 327 int size; 328 bool do_wakeup = false; 329 unsigned long flags; 330 331 if (hash == NR_STRIPE_HASH_LOCKS) { 332 size = NR_STRIPE_HASH_LOCKS; 333 hash = NR_STRIPE_HASH_LOCKS - 1; 334 } else 335 size = 1; 336 while (size) { 337 struct list_head *list = &temp_inactive_list[size - 1]; 338 339 /* 340 * We don't hold any lock here yet, raid5_get_active_stripe() might 341 * remove stripes from the list 342 */ 343 if (!list_empty_careful(list)) { 344 spin_lock_irqsave(conf->hash_locks + hash, flags); 345 if (list_empty(conf->inactive_list + hash) && 346 !list_empty(list)) 347 atomic_dec(&conf->empty_inactive_list_nr); 348 list_splice_tail_init(list, conf->inactive_list + hash); 349 do_wakeup = true; 350 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 351 } 352 size--; 353 hash--; 354 } 355 356 if (do_wakeup) { 357 wake_up(&conf->wait_for_stripe); 358 if (atomic_read(&conf->active_stripes) == 0) 359 wake_up(&conf->wait_for_quiescent); 360 if (conf->retry_read_aligned) 361 md_wakeup_thread(conf->mddev->thread); 362 } 363 } 364 365 static int release_stripe_list(struct r5conf *conf, 366 struct list_head *temp_inactive_list) 367 __must_hold(&conf->device_lock) 368 { 369 struct stripe_head *sh, *t; 370 int count = 0; 371 struct llist_node *head; 372 373 head = llist_del_all(&conf->released_stripes); 374 head = llist_reverse_order(head); 375 llist_for_each_entry_safe(sh, t, head, release_list) { 376 int hash; 377 378 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 379 smp_mb(); 380 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 381 /* 382 * Don't worry the bit is set here, because if the bit is set 383 * again, the count is always > 1. This is true for 384 * STRIPE_ON_UNPLUG_LIST bit too. 385 */ 386 hash = sh->hash_lock_index; 387 __release_stripe(conf, sh, &temp_inactive_list[hash]); 388 count++; 389 } 390 391 return count; 392 } 393 394 void raid5_release_stripe(struct stripe_head *sh) 395 { 396 struct r5conf *conf = sh->raid_conf; 397 unsigned long flags; 398 struct list_head list; 399 int hash; 400 bool wakeup; 401 402 /* Avoid release_list until the last reference. 403 */ 404 if (atomic_add_unless(&sh->count, -1, 1)) 405 return; 406 407 if (unlikely(!conf->mddev->thread) || 408 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 409 goto slow_path; 410 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 411 if (wakeup) 412 md_wakeup_thread(conf->mddev->thread); 413 return; 414 slow_path: 415 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 416 if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) { 417 INIT_LIST_HEAD(&list); 418 hash = sh->hash_lock_index; 419 do_release_stripe(conf, sh, &list); 420 spin_unlock_irqrestore(&conf->device_lock, flags); 421 release_inactive_stripe_list(conf, &list, hash); 422 } 423 } 424 425 static inline void remove_hash(struct stripe_head *sh) 426 { 427 pr_debug("remove_hash(), stripe %llu\n", 428 (unsigned long long)sh->sector); 429 430 hlist_del_init(&sh->hash); 431 } 432 433 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 434 { 435 struct hlist_head *hp = stripe_hash(conf, sh->sector); 436 437 pr_debug("insert_hash(), stripe %llu\n", 438 (unsigned long long)sh->sector); 439 440 hlist_add_head(&sh->hash, hp); 441 } 442 443 /* find an idle stripe, make sure it is unhashed, and return it. 
/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(conf->inactive_list + hash))
		goto out;
	first = (conf->inactive_list + hash)->next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
	BUG_ON(hash != sh->hash_lock_index);
	if (list_empty(conf->inactive_list + hash))
		atomic_inc(&conf->empty_inactive_list_nr);
out:
	return sh;
}

#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
static void free_stripe_pages(struct stripe_head *sh)
{
	int i;
	struct page *p;

	/* The page pool has not been allocated yet */
	if (!sh->pages)
		return;

	for (i = 0; i < sh->nr_pages; i++) {
		p = sh->pages[i];
		if (p)
			put_page(p);
		sh->pages[i] = NULL;
	}
}

static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	struct page *p;

	for (i = 0; i < sh->nr_pages; i++) {
		/* The page has already been allocated. */
		if (sh->pages[i])
			continue;

		p = alloc_page(gfp);
		if (!p) {
			free_stripe_pages(sh);
			return -ENOMEM;
		}
		sh->pages[i] = p;
	}
	return 0;
}

static int
init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
{
	int nr_pages, cnt;

	if (sh->pages)
		return 0;

	/* Each of the sh->dev[i] needs one conf->stripe_size */
	cnt = PAGE_SIZE / conf->stripe_size;
	nr_pages = (disks + cnt - 1) / cnt;

	sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!sh->pages)
		return -ENOMEM;
	sh->nr_pages = nr_pages;
	sh->stripes_per_page = cnt;
	return 0;
}
#endif

static void shrink_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
	for (i = 0; i < num ; i++) {
		struct page *p;

		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
#else
	for (i = 0; i < num; i++)
		sh->dev[i].page = NULL;
	free_stripe_pages(sh); /* Free pages */
#endif
}

static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	int num = sh->raid_conf->pool_size;

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(gfp))) {
			return 1;
		}
		sh->dev[i].page = page;
		sh->dev[i].orig_page = page;
		sh->dev[i].offset = 0;
	}
#else
	if (alloc_stripe_pages(sh, gfp))
		return -ENOMEM;

	for (i = 0; i < num; i++) {
		sh->dev[i].page = raid5_get_dev_page(sh, i);
		sh->dev[i].orig_page = sh->dev[i].page;
		sh->dev[i].offset = raid5_get_page_offset(sh, i);
	}
#endif
	return 0;
}

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			   struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i, seq;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));
	BUG_ON(sh->batch_head);

	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sector);
retry:
	seq = read_seqcount_begin(&conf->gen_lock);
591 sh->generation = conf->generation - previous; 592 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 593 sh->sector = sector; 594 stripe_set_idx(sector, conf, previous, sh); 595 sh->state = 0; 596 597 for (i = sh->disks; i--; ) { 598 struct r5dev *dev = &sh->dev[i]; 599 600 if (dev->toread || dev->read || dev->towrite || dev->written || 601 test_bit(R5_LOCKED, &dev->flags)) { 602 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 603 (unsigned long long)sh->sector, i, dev->toread, 604 dev->read, dev->towrite, dev->written, 605 test_bit(R5_LOCKED, &dev->flags)); 606 WARN_ON(1); 607 } 608 dev->flags = 0; 609 dev->sector = raid5_compute_blocknr(sh, i, previous); 610 } 611 if (read_seqcount_retry(&conf->gen_lock, seq)) 612 goto retry; 613 sh->overwrite_disks = 0; 614 insert_hash(conf, sh); 615 sh->cpu = smp_processor_id(); 616 set_bit(STRIPE_BATCH_READY, &sh->state); 617 } 618 619 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 620 short generation) 621 { 622 struct stripe_head *sh; 623 624 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 625 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 626 if (sh->sector == sector && sh->generation == generation) 627 return sh; 628 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 629 return NULL; 630 } 631 632 static struct stripe_head *find_get_stripe(struct r5conf *conf, 633 sector_t sector, short generation, int hash) 634 { 635 int inc_empty_inactive_list_flag; 636 struct stripe_head *sh; 637 638 sh = __find_stripe(conf, sector, generation); 639 if (!sh) 640 return NULL; 641 642 if (atomic_inc_not_zero(&sh->count)) 643 return sh; 644 645 /* 646 * Slow path. The reference count is zero which means the stripe must 647 * be on a list (sh->lru). Must remove the stripe from the list that 648 * references it with the device_lock held. 649 */ 650 651 spin_lock(&conf->device_lock); 652 if (!atomic_read(&sh->count)) { 653 if (!test_bit(STRIPE_HANDLE, &sh->state)) 654 atomic_inc(&conf->active_stripes); 655 BUG_ON(list_empty(&sh->lru) && 656 !test_bit(STRIPE_EXPANDING, &sh->state)); 657 inc_empty_inactive_list_flag = 0; 658 if (!list_empty(conf->inactive_list + hash)) 659 inc_empty_inactive_list_flag = 1; 660 list_del_init(&sh->lru); 661 if (list_empty(conf->inactive_list + hash) && 662 inc_empty_inactive_list_flag) 663 atomic_inc(&conf->empty_inactive_list_nr); 664 if (sh->group) { 665 sh->group->stripes_cnt--; 666 sh->group = NULL; 667 } 668 } 669 atomic_inc(&sh->count); 670 spin_unlock(&conf->device_lock); 671 672 return sh; 673 } 674 675 /* 676 * Need to check if array has failed when deciding whether to: 677 * - start an array 678 * - remove non-faulty devices 679 * - add a spare 680 * - allow a reshape 681 * This determination is simple when no reshape is happening. 682 * However if there is a reshape, we need to carefully check 683 * both the before and after sections. 684 * This is because some failed devices may only affect one 685 * of the two sections, and some non-in_sync devices may 686 * be insync in the section most affected by failed devices. 687 * 688 * Most calls to this function hold &conf->device_lock. Calls 689 * in raid5_run() do not require the lock as no other threads 690 * have been started yet. 
691 */ 692 int raid5_calc_degraded(struct r5conf *conf) 693 { 694 int degraded, degraded2; 695 int i; 696 697 rcu_read_lock(); 698 degraded = 0; 699 for (i = 0; i < conf->previous_raid_disks; i++) { 700 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 701 if (rdev && test_bit(Faulty, &rdev->flags)) 702 rdev = rcu_dereference(conf->disks[i].replacement); 703 if (!rdev || test_bit(Faulty, &rdev->flags)) 704 degraded++; 705 else if (test_bit(In_sync, &rdev->flags)) 706 ; 707 else 708 /* not in-sync or faulty. 709 * If the reshape increases the number of devices, 710 * this is being recovered by the reshape, so 711 * this 'previous' section is not in_sync. 712 * If the number of devices is being reduced however, 713 * the device can only be part of the array if 714 * we are reverting a reshape, so this section will 715 * be in-sync. 716 */ 717 if (conf->raid_disks >= conf->previous_raid_disks) 718 degraded++; 719 } 720 rcu_read_unlock(); 721 if (conf->raid_disks == conf->previous_raid_disks) 722 return degraded; 723 rcu_read_lock(); 724 degraded2 = 0; 725 for (i = 0; i < conf->raid_disks; i++) { 726 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 727 if (rdev && test_bit(Faulty, &rdev->flags)) 728 rdev = rcu_dereference(conf->disks[i].replacement); 729 if (!rdev || test_bit(Faulty, &rdev->flags)) 730 degraded2++; 731 else if (test_bit(In_sync, &rdev->flags)) 732 ; 733 else 734 /* not in-sync or faulty. 735 * If reshape increases the number of devices, this 736 * section has already been recovered, else it 737 * almost certainly hasn't. 738 */ 739 if (conf->raid_disks <= conf->previous_raid_disks) 740 degraded2++; 741 } 742 rcu_read_unlock(); 743 if (degraded2 > degraded) 744 return degraded2; 745 return degraded; 746 } 747 748 static bool has_failed(struct r5conf *conf) 749 { 750 int degraded = conf->mddev->degraded; 751 752 if (test_bit(MD_BROKEN, &conf->mddev->flags)) 753 return true; 754 755 if (conf->mddev->reshape_position != MaxSector) 756 degraded = raid5_calc_degraded(conf); 757 758 return degraded > conf->max_degraded; 759 } 760 761 enum stripe_result { 762 STRIPE_SUCCESS = 0, 763 STRIPE_RETRY, 764 STRIPE_SCHEDULE_AND_RETRY, 765 STRIPE_FAIL, 766 }; 767 768 struct stripe_request_ctx { 769 /* a reference to the last stripe_head for batching */ 770 struct stripe_head *batch_last; 771 772 /* first sector in the request */ 773 sector_t first_sector; 774 775 /* last sector in the request */ 776 sector_t last_sector; 777 778 /* 779 * bitmap to track stripe sectors that have been added to stripes 780 * add one to account for unaligned requests 781 */ 782 DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1); 783 784 /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ 785 bool do_flush; 786 }; 787 788 /* 789 * Block until another thread clears R5_INACTIVE_BLOCKED or 790 * there are fewer than 3/4 the maximum number of active stripes 791 * and there is an inactive stripe available. 
792 */ 793 static bool is_inactive_blocked(struct r5conf *conf, int hash) 794 { 795 if (list_empty(conf->inactive_list + hash)) 796 return false; 797 798 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 799 return true; 800 801 return (atomic_read(&conf->active_stripes) < 802 (conf->max_nr_stripes * 3 / 4)); 803 } 804 805 struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, 806 struct stripe_request_ctx *ctx, sector_t sector, 807 unsigned int flags) 808 { 809 struct stripe_head *sh; 810 int hash = stripe_hash_locks_hash(conf, sector); 811 int previous = !!(flags & R5_GAS_PREVIOUS); 812 813 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 814 815 spin_lock_irq(conf->hash_locks + hash); 816 817 for (;;) { 818 if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) { 819 /* 820 * Must release the reference to batch_last before 821 * waiting, on quiesce, otherwise the batch_last will 822 * hold a reference to a stripe and raid5_quiesce() 823 * will deadlock waiting for active_stripes to go to 824 * zero. 825 */ 826 if (ctx && ctx->batch_last) { 827 raid5_release_stripe(ctx->batch_last); 828 ctx->batch_last = NULL; 829 } 830 831 wait_event_lock_irq(conf->wait_for_quiescent, 832 !conf->quiesce, 833 *(conf->hash_locks + hash)); 834 } 835 836 sh = find_get_stripe(conf, sector, conf->generation - previous, 837 hash); 838 if (sh) 839 break; 840 841 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 842 sh = get_free_stripe(conf, hash); 843 if (sh) { 844 r5c_check_stripe_cache_usage(conf); 845 init_stripe(sh, sector, previous); 846 atomic_inc(&sh->count); 847 break; 848 } 849 850 if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) 851 set_bit(R5_ALLOC_MORE, &conf->cache_state); 852 } 853 854 if (flags & R5_GAS_NOBLOCK) 855 break; 856 857 set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); 858 r5l_wake_reclaim(conf->log, 0); 859 860 /* release batch_last before wait to avoid risk of deadlock */ 861 if (ctx && ctx->batch_last) { 862 raid5_release_stripe(ctx->batch_last); 863 ctx->batch_last = NULL; 864 } 865 866 wait_event_lock_irq(conf->wait_for_stripe, 867 is_inactive_blocked(conf, hash), 868 *(conf->hash_locks + hash)); 869 clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); 870 } 871 872 spin_unlock_irq(conf->hash_locks + hash); 873 return sh; 874 } 875 876 static bool is_full_stripe_write(struct stripe_head *sh) 877 { 878 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 879 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 880 } 881 882 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 883 __acquires(&sh1->stripe_lock) 884 __acquires(&sh2->stripe_lock) 885 { 886 if (sh1 > sh2) { 887 spin_lock_irq(&sh2->stripe_lock); 888 spin_lock_nested(&sh1->stripe_lock, 1); 889 } else { 890 spin_lock_irq(&sh1->stripe_lock); 891 spin_lock_nested(&sh2->stripe_lock, 1); 892 } 893 } 894 895 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 896 __releases(&sh1->stripe_lock) 897 __releases(&sh2->stripe_lock) 898 { 899 spin_unlock(&sh1->stripe_lock); 900 spin_unlock_irq(&sh2->stripe_lock); 901 } 902 903 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 904 static bool stripe_can_batch(struct stripe_head *sh) 905 { 906 struct r5conf *conf = sh->raid_conf; 907 908 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 909 return false; 910 return test_bit(STRIPE_BATCH_READY, &sh->state) && 911 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 912 
is_full_stripe_write(sh); 913 } 914 915 /* we only do back search */ 916 static void stripe_add_to_batch_list(struct r5conf *conf, 917 struct stripe_head *sh, struct stripe_head *last_sh) 918 { 919 struct stripe_head *head; 920 sector_t head_sector, tmp_sec; 921 int hash; 922 int dd_idx; 923 924 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 925 tmp_sec = sh->sector; 926 if (!sector_div(tmp_sec, conf->chunk_sectors)) 927 return; 928 head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); 929 930 if (last_sh && head_sector == last_sh->sector) { 931 head = last_sh; 932 atomic_inc(&head->count); 933 } else { 934 hash = stripe_hash_locks_hash(conf, head_sector); 935 spin_lock_irq(conf->hash_locks + hash); 936 head = find_get_stripe(conf, head_sector, conf->generation, 937 hash); 938 spin_unlock_irq(conf->hash_locks + hash); 939 if (!head) 940 return; 941 if (!stripe_can_batch(head)) 942 goto out; 943 } 944 945 lock_two_stripes(head, sh); 946 /* clear_batch_ready clear the flag */ 947 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 948 goto unlock_out; 949 950 if (sh->batch_head) 951 goto unlock_out; 952 953 dd_idx = 0; 954 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 955 dd_idx++; 956 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 957 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 958 goto unlock_out; 959 960 if (head->batch_head) { 961 spin_lock(&head->batch_head->batch_lock); 962 /* This batch list is already running */ 963 if (!stripe_can_batch(head)) { 964 spin_unlock(&head->batch_head->batch_lock); 965 goto unlock_out; 966 } 967 /* 968 * We must assign batch_head of this stripe within the 969 * batch_lock, otherwise clear_batch_ready of batch head 970 * stripe could clear BATCH_READY bit of this stripe and 971 * this stripe->batch_head doesn't get assigned, which 972 * could confuse clear_batch_ready for this stripe 973 */ 974 sh->batch_head = head->batch_head; 975 976 /* 977 * at this point, head's BATCH_READY could be cleared, but we 978 * can still add the stripe to batch list 979 */ 980 list_add(&sh->batch_list, &head->batch_list); 981 spin_unlock(&head->batch_head->batch_lock); 982 } else { 983 head->batch_head = head; 984 sh->batch_head = head->batch_head; 985 spin_lock(&head->batch_lock); 986 list_add_tail(&sh->batch_list, &head->batch_list); 987 spin_unlock(&head->batch_lock); 988 } 989 990 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 991 if (atomic_dec_return(&conf->preread_active_stripes) 992 < IO_THRESHOLD) 993 md_wakeup_thread(conf->mddev->thread); 994 995 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 996 int seq = sh->bm_seq; 997 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 998 sh->batch_head->bm_seq > seq) 999 seq = sh->batch_head->bm_seq; 1000 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 1001 sh->batch_head->bm_seq = seq; 1002 } 1003 1004 atomic_inc(&sh->count); 1005 unlock_out: 1006 unlock_two_stripes(head, sh); 1007 out: 1008 raid5_release_stripe(head); 1009 } 1010 1011 /* Determine if 'data_offset' or 'new_data_offset' should be used 1012 * in this stripe_head. 1013 */ 1014 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 1015 { 1016 sector_t progress = conf->reshape_progress; 1017 /* Need a memory barrier to make sure we see the value 1018 * of conf->generation, or ->data_offset that was set before 1019 * reshape_progress was updated. 
1020 */ 1021 smp_rmb(); 1022 if (progress == MaxSector) 1023 return 0; 1024 if (sh->generation == conf->generation - 1) 1025 return 0; 1026 /* We are in a reshape, and this is a new-generation stripe, 1027 * so use new_data_offset. 1028 */ 1029 return 1; 1030 } 1031 1032 static void dispatch_bio_list(struct bio_list *tmp) 1033 { 1034 struct bio *bio; 1035 1036 while ((bio = bio_list_pop(tmp))) 1037 submit_bio_noacct(bio); 1038 } 1039 1040 static int cmp_stripe(void *priv, const struct list_head *a, 1041 const struct list_head *b) 1042 { 1043 const struct r5pending_data *da = list_entry(a, 1044 struct r5pending_data, sibling); 1045 const struct r5pending_data *db = list_entry(b, 1046 struct r5pending_data, sibling); 1047 if (da->sector > db->sector) 1048 return 1; 1049 if (da->sector < db->sector) 1050 return -1; 1051 return 0; 1052 } 1053 1054 static void dispatch_defer_bios(struct r5conf *conf, int target, 1055 struct bio_list *list) 1056 { 1057 struct r5pending_data *data; 1058 struct list_head *first, *next = NULL; 1059 int cnt = 0; 1060 1061 if (conf->pending_data_cnt == 0) 1062 return; 1063 1064 list_sort(NULL, &conf->pending_list, cmp_stripe); 1065 1066 first = conf->pending_list.next; 1067 1068 /* temporarily move the head */ 1069 if (conf->next_pending_data) 1070 list_move_tail(&conf->pending_list, 1071 &conf->next_pending_data->sibling); 1072 1073 while (!list_empty(&conf->pending_list)) { 1074 data = list_first_entry(&conf->pending_list, 1075 struct r5pending_data, sibling); 1076 if (&data->sibling == first) 1077 first = data->sibling.next; 1078 next = data->sibling.next; 1079 1080 bio_list_merge(list, &data->bios); 1081 list_move(&data->sibling, &conf->free_list); 1082 cnt++; 1083 if (cnt >= target) 1084 break; 1085 } 1086 conf->pending_data_cnt -= cnt; 1087 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 1088 1089 if (next != &conf->pending_list) 1090 conf->next_pending_data = list_entry(next, 1091 struct r5pending_data, sibling); 1092 else 1093 conf->next_pending_data = NULL; 1094 /* list isn't empty */ 1095 if (first != &conf->pending_list) 1096 list_move_tail(&conf->pending_list, first); 1097 } 1098 1099 static void flush_deferred_bios(struct r5conf *conf) 1100 { 1101 struct bio_list tmp = BIO_EMPTY_LIST; 1102 1103 if (conf->pending_data_cnt == 0) 1104 return; 1105 1106 spin_lock(&conf->pending_bios_lock); 1107 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 1108 BUG_ON(conf->pending_data_cnt != 0); 1109 spin_unlock(&conf->pending_bios_lock); 1110 1111 dispatch_bio_list(&tmp); 1112 } 1113 1114 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 1115 struct bio_list *bios) 1116 { 1117 struct bio_list tmp = BIO_EMPTY_LIST; 1118 struct r5pending_data *ent; 1119 1120 spin_lock(&conf->pending_bios_lock); 1121 ent = list_first_entry(&conf->free_list, struct r5pending_data, 1122 sibling); 1123 list_move_tail(&ent->sibling, &conf->pending_list); 1124 ent->sector = sector; 1125 bio_list_init(&ent->bios); 1126 bio_list_merge(&ent->bios, bios); 1127 conf->pending_data_cnt++; 1128 if (conf->pending_data_cnt >= PENDING_IO_MAX) 1129 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); 1130 1131 spin_unlock(&conf->pending_bios_lock); 1132 1133 dispatch_bio_list(&tmp); 1134 } 1135 1136 static void 1137 raid5_end_read_request(struct bio *bi); 1138 static void 1139 raid5_end_write_request(struct bio *bi); 1140 1141 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 1142 { 1143 struct r5conf *conf = sh->raid_conf; 1144 int i, disks = 
sh->disks; 1145 struct stripe_head *head_sh = sh; 1146 struct bio_list pending_bios = BIO_EMPTY_LIST; 1147 struct r5dev *dev; 1148 bool should_defer; 1149 1150 might_sleep(); 1151 1152 if (log_stripe(sh, s) == 0) 1153 return; 1154 1155 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 1156 1157 for (i = disks; i--; ) { 1158 enum req_op op; 1159 blk_opf_t op_flags = 0; 1160 int replace_only = 0; 1161 struct bio *bi, *rbi; 1162 struct md_rdev *rdev, *rrdev = NULL; 1163 1164 sh = head_sh; 1165 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1166 op = REQ_OP_WRITE; 1167 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1168 op_flags = REQ_FUA; 1169 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1170 op = REQ_OP_DISCARD; 1171 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1172 op = REQ_OP_READ; 1173 else if (test_and_clear_bit(R5_WantReplace, 1174 &sh->dev[i].flags)) { 1175 op = REQ_OP_WRITE; 1176 replace_only = 1; 1177 } else 1178 continue; 1179 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1180 op_flags |= REQ_SYNC; 1181 1182 again: 1183 dev = &sh->dev[i]; 1184 bi = &dev->req; 1185 rbi = &dev->rreq; /* For writing to replacement */ 1186 1187 rcu_read_lock(); 1188 rrdev = rcu_dereference(conf->disks[i].replacement); 1189 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 1190 rdev = rcu_dereference(conf->disks[i].rdev); 1191 if (!rdev) { 1192 rdev = rrdev; 1193 rrdev = NULL; 1194 } 1195 if (op_is_write(op)) { 1196 if (replace_only) 1197 rdev = NULL; 1198 if (rdev == rrdev) 1199 /* We raced and saw duplicates */ 1200 rrdev = NULL; 1201 } else { 1202 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1203 rdev = rrdev; 1204 rrdev = NULL; 1205 } 1206 1207 if (rdev && test_bit(Faulty, &rdev->flags)) 1208 rdev = NULL; 1209 if (rdev) 1210 atomic_inc(&rdev->nr_pending); 1211 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1212 rrdev = NULL; 1213 if (rrdev) 1214 atomic_inc(&rrdev->nr_pending); 1215 rcu_read_unlock(); 1216 1217 /* We have already checked bad blocks for reads. Now 1218 * need to check for writes. We never accept write errors 1219 * on the replacement, so we don't to check rrdev. 1220 */ 1221 while (op_is_write(op) && rdev && 1222 test_bit(WriteErrorSeen, &rdev->flags)) { 1223 sector_t first_bad; 1224 int bad_sectors; 1225 int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 1226 &first_bad, &bad_sectors); 1227 if (!bad) 1228 break; 1229 1230 if (bad < 0) { 1231 set_bit(BlockedBadBlocks, &rdev->flags); 1232 if (!conf->mddev->external && 1233 conf->mddev->sb_flags) { 1234 /* It is very unlikely, but we might 1235 * still need to write out the 1236 * bad block log - better give it 1237 * a chance*/ 1238 md_check_recovery(conf->mddev); 1239 } 1240 /* 1241 * Because md_wait_for_blocked_rdev 1242 * will dec nr_pending, we must 1243 * increment it first. 1244 */ 1245 atomic_inc(&rdev->nr_pending); 1246 md_wait_for_blocked_rdev(rdev, conf->mddev); 1247 } else { 1248 /* Acknowledged bad block - skip the write */ 1249 rdev_dec_pending(rdev, conf->mddev); 1250 rdev = NULL; 1251 } 1252 } 1253 1254 if (rdev) { 1255 if (s->syncing || s->expanding || s->expanded 1256 || s->replacing) 1257 md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); 1258 1259 set_bit(STRIPE_IO_STARTED, &sh->state); 1260 1261 bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags); 1262 bi->bi_end_io = op_is_write(op) 1263 ? 
raid5_end_write_request 1264 : raid5_end_read_request; 1265 bi->bi_private = sh; 1266 1267 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1268 __func__, (unsigned long long)sh->sector, 1269 bi->bi_opf, i); 1270 atomic_inc(&sh->count); 1271 if (sh != head_sh) 1272 atomic_inc(&head_sh->count); 1273 if (use_new_offset(conf, sh)) 1274 bi->bi_iter.bi_sector = (sh->sector 1275 + rdev->new_data_offset); 1276 else 1277 bi->bi_iter.bi_sector = (sh->sector 1278 + rdev->data_offset); 1279 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1280 bi->bi_opf |= REQ_NOMERGE; 1281 1282 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1283 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1284 1285 if (!op_is_write(op) && 1286 test_bit(R5_InJournal, &sh->dev[i].flags)) 1287 /* 1288 * issuing read for a page in journal, this 1289 * must be preparing for prexor in rmw; read 1290 * the data into orig_page 1291 */ 1292 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1293 else 1294 sh->dev[i].vec.bv_page = sh->dev[i].page; 1295 bi->bi_vcnt = 1; 1296 bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); 1297 bi->bi_io_vec[0].bv_offset = sh->dev[i].offset; 1298 bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); 1299 /* 1300 * If this is discard request, set bi_vcnt 0. We don't 1301 * want to confuse SCSI because SCSI will replace payload 1302 */ 1303 if (op == REQ_OP_DISCARD) 1304 bi->bi_vcnt = 0; 1305 if (rrdev) 1306 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1307 1308 if (conf->mddev->gendisk) 1309 trace_block_bio_remap(bi, 1310 disk_devt(conf->mddev->gendisk), 1311 sh->dev[i].sector); 1312 if (should_defer && op_is_write(op)) 1313 bio_list_add(&pending_bios, bi); 1314 else 1315 submit_bio_noacct(bi); 1316 } 1317 if (rrdev) { 1318 if (s->syncing || s->expanding || s->expanded 1319 || s->replacing) 1320 md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); 1321 1322 set_bit(STRIPE_IO_STARTED, &sh->state); 1323 1324 bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags); 1325 BUG_ON(!op_is_write(op)); 1326 rbi->bi_end_io = raid5_end_write_request; 1327 rbi->bi_private = sh; 1328 1329 pr_debug("%s: for %llu schedule op %d on " 1330 "replacement disc %d\n", 1331 __func__, (unsigned long long)sh->sector, 1332 rbi->bi_opf, i); 1333 atomic_inc(&sh->count); 1334 if (sh != head_sh) 1335 atomic_inc(&head_sh->count); 1336 if (use_new_offset(conf, sh)) 1337 rbi->bi_iter.bi_sector = (sh->sector 1338 + rrdev->new_data_offset); 1339 else 1340 rbi->bi_iter.bi_sector = (sh->sector 1341 + rrdev->data_offset); 1342 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1343 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1344 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1345 rbi->bi_vcnt = 1; 1346 rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); 1347 rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset; 1348 rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); 1349 /* 1350 * If this is discard request, set bi_vcnt 0. 
We don't 1351 * want to confuse SCSI because SCSI will replace payload 1352 */ 1353 if (op == REQ_OP_DISCARD) 1354 rbi->bi_vcnt = 0; 1355 if (conf->mddev->gendisk) 1356 trace_block_bio_remap(rbi, 1357 disk_devt(conf->mddev->gendisk), 1358 sh->dev[i].sector); 1359 if (should_defer && op_is_write(op)) 1360 bio_list_add(&pending_bios, rbi); 1361 else 1362 submit_bio_noacct(rbi); 1363 } 1364 if (!rdev && !rrdev) { 1365 if (op_is_write(op)) 1366 set_bit(STRIPE_DEGRADED, &sh->state); 1367 pr_debug("skip op %d on disc %d for sector %llu\n", 1368 bi->bi_opf, i, (unsigned long long)sh->sector); 1369 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1370 set_bit(STRIPE_HANDLE, &sh->state); 1371 } 1372 1373 if (!head_sh->batch_head) 1374 continue; 1375 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1376 batch_list); 1377 if (sh != head_sh) 1378 goto again; 1379 } 1380 1381 if (should_defer && !bio_list_empty(&pending_bios)) 1382 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1383 } 1384 1385 static struct dma_async_tx_descriptor * 1386 async_copy_data(int frombio, struct bio *bio, struct page **page, 1387 unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx, 1388 struct stripe_head *sh, int no_skipcopy) 1389 { 1390 struct bio_vec bvl; 1391 struct bvec_iter iter; 1392 struct page *bio_page; 1393 int page_offset; 1394 struct async_submit_ctl submit; 1395 enum async_tx_flags flags = 0; 1396 struct r5conf *conf = sh->raid_conf; 1397 1398 if (bio->bi_iter.bi_sector >= sector) 1399 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1400 else 1401 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1402 1403 if (frombio) 1404 flags |= ASYNC_TX_FENCE; 1405 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1406 1407 bio_for_each_segment(bvl, bio, iter) { 1408 int len = bvl.bv_len; 1409 int clen; 1410 int b_offset = 0; 1411 1412 if (page_offset < 0) { 1413 b_offset = -page_offset; 1414 page_offset += b_offset; 1415 len -= b_offset; 1416 } 1417 1418 if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf)) 1419 clen = RAID5_STRIPE_SIZE(conf) - page_offset; 1420 else 1421 clen = len; 1422 1423 if (clen > 0) { 1424 b_offset += bvl.bv_offset; 1425 bio_page = bvl.bv_page; 1426 if (frombio) { 1427 if (conf->skip_copy && 1428 b_offset == 0 && page_offset == 0 && 1429 clen == RAID5_STRIPE_SIZE(conf) && 1430 !no_skipcopy) 1431 *page = bio_page; 1432 else 1433 tx = async_memcpy(*page, bio_page, page_offset + poff, 1434 b_offset, clen, &submit); 1435 } else 1436 tx = async_memcpy(bio_page, *page, b_offset, 1437 page_offset + poff, clen, &submit); 1438 } 1439 /* chain the operations */ 1440 submit.depend_tx = tx; 1441 1442 if (clen < len) /* hit end of page */ 1443 break; 1444 page_offset += len; 1445 } 1446 1447 return tx; 1448 } 1449 1450 static void ops_complete_biofill(void *stripe_head_ref) 1451 { 1452 struct stripe_head *sh = stripe_head_ref; 1453 int i; 1454 struct r5conf *conf = sh->raid_conf; 1455 1456 pr_debug("%s: stripe %llu\n", __func__, 1457 (unsigned long long)sh->sector); 1458 1459 /* clear completed biofills */ 1460 for (i = sh->disks; i--; ) { 1461 struct r5dev *dev = &sh->dev[i]; 1462 1463 /* acknowledge completion of a biofill operation */ 1464 /* and check if we need to reply to a read request, 1465 * new R5_Wantfill requests are held off until 1466 * !STRIPE_BIOFILL_RUN 1467 */ 1468 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1469 struct bio *rbi, *rbi2; 1470 1471 BUG_ON(!dev->read); 1472 rbi = dev->read; 1473 dev->read = NULL; 
1474 while (rbi && rbi->bi_iter.bi_sector < 1475 dev->sector + RAID5_STRIPE_SECTORS(conf)) { 1476 rbi2 = r5_next_bio(conf, rbi, dev->sector); 1477 bio_endio(rbi); 1478 rbi = rbi2; 1479 } 1480 } 1481 } 1482 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1483 1484 set_bit(STRIPE_HANDLE, &sh->state); 1485 raid5_release_stripe(sh); 1486 } 1487 1488 static void ops_run_biofill(struct stripe_head *sh) 1489 { 1490 struct dma_async_tx_descriptor *tx = NULL; 1491 struct async_submit_ctl submit; 1492 int i; 1493 struct r5conf *conf = sh->raid_conf; 1494 1495 BUG_ON(sh->batch_head); 1496 pr_debug("%s: stripe %llu\n", __func__, 1497 (unsigned long long)sh->sector); 1498 1499 for (i = sh->disks; i--; ) { 1500 struct r5dev *dev = &sh->dev[i]; 1501 if (test_bit(R5_Wantfill, &dev->flags)) { 1502 struct bio *rbi; 1503 spin_lock_irq(&sh->stripe_lock); 1504 dev->read = rbi = dev->toread; 1505 dev->toread = NULL; 1506 spin_unlock_irq(&sh->stripe_lock); 1507 while (rbi && rbi->bi_iter.bi_sector < 1508 dev->sector + RAID5_STRIPE_SECTORS(conf)) { 1509 tx = async_copy_data(0, rbi, &dev->page, 1510 dev->offset, 1511 dev->sector, tx, sh, 0); 1512 rbi = r5_next_bio(conf, rbi, dev->sector); 1513 } 1514 } 1515 } 1516 1517 atomic_inc(&sh->count); 1518 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1519 async_trigger_callback(&submit); 1520 } 1521 1522 static void mark_target_uptodate(struct stripe_head *sh, int target) 1523 { 1524 struct r5dev *tgt; 1525 1526 if (target < 0) 1527 return; 1528 1529 tgt = &sh->dev[target]; 1530 set_bit(R5_UPTODATE, &tgt->flags); 1531 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1532 clear_bit(R5_Wantcompute, &tgt->flags); 1533 } 1534 1535 static void ops_complete_compute(void *stripe_head_ref) 1536 { 1537 struct stripe_head *sh = stripe_head_ref; 1538 1539 pr_debug("%s: stripe %llu\n", __func__, 1540 (unsigned long long)sh->sector); 1541 1542 /* mark the computed target(s) as uptodate */ 1543 mark_target_uptodate(sh, sh->ops.target); 1544 mark_target_uptodate(sh, sh->ops.target2); 1545 1546 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1547 if (sh->check_state == check_state_compute_run) 1548 sh->check_state = check_state_compute_result; 1549 set_bit(STRIPE_HANDLE, &sh->state); 1550 raid5_release_stripe(sh); 1551 } 1552 1553 /* return a pointer to the address conversion region of the scribble buffer */ 1554 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1555 { 1556 return percpu->scribble + i * percpu->scribble_obj_size; 1557 } 1558 1559 /* return a pointer to the address conversion region of the scribble buffer */ 1560 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1561 struct raid5_percpu *percpu, int i) 1562 { 1563 return (void *) (to_addr_page(percpu, i) + sh->disks + 2); 1564 } 1565 1566 /* 1567 * Return a pointer to record offset address. 
1568 */ 1569 static unsigned int * 1570 to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu) 1571 { 1572 return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2); 1573 } 1574 1575 static struct dma_async_tx_descriptor * 1576 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1577 { 1578 int disks = sh->disks; 1579 struct page **xor_srcs = to_addr_page(percpu, 0); 1580 unsigned int *off_srcs = to_addr_offs(sh, percpu); 1581 int target = sh->ops.target; 1582 struct r5dev *tgt = &sh->dev[target]; 1583 struct page *xor_dest = tgt->page; 1584 unsigned int off_dest = tgt->offset; 1585 int count = 0; 1586 struct dma_async_tx_descriptor *tx; 1587 struct async_submit_ctl submit; 1588 int i; 1589 1590 BUG_ON(sh->batch_head); 1591 1592 pr_debug("%s: stripe %llu block: %d\n", 1593 __func__, (unsigned long long)sh->sector, target); 1594 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1595 1596 for (i = disks; i--; ) { 1597 if (i != target) { 1598 off_srcs[count] = sh->dev[i].offset; 1599 xor_srcs[count++] = sh->dev[i].page; 1600 } 1601 } 1602 1603 atomic_inc(&sh->count); 1604 1605 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1606 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1607 if (unlikely(count == 1)) 1608 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], 1609 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1610 else 1611 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, 1612 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1613 1614 return tx; 1615 } 1616 1617 /* set_syndrome_sources - populate source buffers for gen_syndrome 1618 * @srcs - (struct page *) array of size sh->disks 1619 * @offs - (unsigned int) array of offset for each page 1620 * @sh - stripe_head to parse 1621 * 1622 * Populates srcs in proper layout order for the stripe and returns the 1623 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1624 * destination buffer is recorded in srcs[count] and the Q destination 1625 * is recorded in srcs[count+1]]. 1626 */ 1627 static int set_syndrome_sources(struct page **srcs, 1628 unsigned int *offs, 1629 struct stripe_head *sh, 1630 int srctype) 1631 { 1632 int disks = sh->disks; 1633 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1634 int d0_idx = raid6_d0(sh); 1635 int count; 1636 int i; 1637 1638 for (i = 0; i < disks; i++) 1639 srcs[i] = NULL; 1640 1641 count = 0; 1642 i = d0_idx; 1643 do { 1644 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1645 struct r5dev *dev = &sh->dev[i]; 1646 1647 if (i == sh->qd_idx || i == sh->pd_idx || 1648 (srctype == SYNDROME_SRC_ALL) || 1649 (srctype == SYNDROME_SRC_WANT_DRAIN && 1650 (test_bit(R5_Wantdrain, &dev->flags) || 1651 test_bit(R5_InJournal, &dev->flags))) || 1652 (srctype == SYNDROME_SRC_WRITTEN && 1653 (dev->written || 1654 test_bit(R5_InJournal, &dev->flags)))) { 1655 if (test_bit(R5_InJournal, &dev->flags)) 1656 srcs[slot] = sh->dev[i].orig_page; 1657 else 1658 srcs[slot] = sh->dev[i].page; 1659 /* 1660 * For R5_InJournal, PAGE_SIZE must be 4KB and will 1661 * not shared page. In that case, dev[i].offset 1662 * is 0. 
1663 */ 1664 offs[slot] = sh->dev[i].offset; 1665 } 1666 i = raid6_next_disk(i, disks); 1667 } while (i != d0_idx); 1668 1669 return syndrome_disks; 1670 } 1671 1672 static struct dma_async_tx_descriptor * 1673 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1674 { 1675 int disks = sh->disks; 1676 struct page **blocks = to_addr_page(percpu, 0); 1677 unsigned int *offs = to_addr_offs(sh, percpu); 1678 int target; 1679 int qd_idx = sh->qd_idx; 1680 struct dma_async_tx_descriptor *tx; 1681 struct async_submit_ctl submit; 1682 struct r5dev *tgt; 1683 struct page *dest; 1684 unsigned int dest_off; 1685 int i; 1686 int count; 1687 1688 BUG_ON(sh->batch_head); 1689 if (sh->ops.target < 0) 1690 target = sh->ops.target2; 1691 else if (sh->ops.target2 < 0) 1692 target = sh->ops.target; 1693 else 1694 /* we should only have one valid target */ 1695 BUG(); 1696 BUG_ON(target < 0); 1697 pr_debug("%s: stripe %llu block: %d\n", 1698 __func__, (unsigned long long)sh->sector, target); 1699 1700 tgt = &sh->dev[target]; 1701 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1702 dest = tgt->page; 1703 dest_off = tgt->offset; 1704 1705 atomic_inc(&sh->count); 1706 1707 if (target == qd_idx) { 1708 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); 1709 blocks[count] = NULL; /* regenerating p is not necessary */ 1710 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1711 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1712 ops_complete_compute, sh, 1713 to_addr_conv(sh, percpu, 0)); 1714 tx = async_gen_syndrome(blocks, offs, count+2, 1715 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1716 } else { 1717 /* Compute any data- or p-drive using XOR */ 1718 count = 0; 1719 for (i = disks; i-- ; ) { 1720 if (i == target || i == qd_idx) 1721 continue; 1722 offs[count] = sh->dev[i].offset; 1723 blocks[count++] = sh->dev[i].page; 1724 } 1725 1726 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1727 NULL, ops_complete_compute, sh, 1728 to_addr_conv(sh, percpu, 0)); 1729 tx = async_xor_offs(dest, dest_off, blocks, offs, count, 1730 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1731 } 1732 1733 return tx; 1734 } 1735 1736 static struct dma_async_tx_descriptor * 1737 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1738 { 1739 int i, count, disks = sh->disks; 1740 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 1741 int d0_idx = raid6_d0(sh); 1742 int faila = -1, failb = -1; 1743 int target = sh->ops.target; 1744 int target2 = sh->ops.target2; 1745 struct r5dev *tgt = &sh->dev[target]; 1746 struct r5dev *tgt2 = &sh->dev[target2]; 1747 struct dma_async_tx_descriptor *tx; 1748 struct page **blocks = to_addr_page(percpu, 0); 1749 unsigned int *offs = to_addr_offs(sh, percpu); 1750 struct async_submit_ctl submit; 1751 1752 BUG_ON(sh->batch_head); 1753 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1754 __func__, (unsigned long long)sh->sector, target, target2); 1755 BUG_ON(target < 0 || target2 < 0); 1756 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1757 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1758 1759 /* we need to open-code set_syndrome_sources to handle the 1760 * slot number conversion for 'faila' and 'failb' 1761 */ 1762 for (i = 0; i < disks ; i++) { 1763 offs[i] = 0; 1764 blocks[i] = NULL; 1765 } 1766 count = 0; 1767 i = d0_idx; 1768 do { 1769 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1770 1771 offs[slot] = sh->dev[i].offset; 1772 blocks[slot] = sh->dev[i].page; 1773 1774 if (i == target) 1775 faila = slot; 1776 if (i == target2) 1777 failb = slot; 1778 i = raid6_next_disk(i, disks); 1779 } while (i != d0_idx); 1780 1781 BUG_ON(faila == failb); 1782 if (failb < faila) 1783 swap(faila, failb); 1784 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1785 __func__, (unsigned long long)sh->sector, faila, failb); 1786 1787 atomic_inc(&sh->count); 1788 1789 if (failb == syndrome_disks+1) { 1790 /* Q disk is one of the missing disks */ 1791 if (faila == syndrome_disks) { 1792 /* Missing P+Q, just recompute */ 1793 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1794 ops_complete_compute, sh, 1795 to_addr_conv(sh, percpu, 0)); 1796 return async_gen_syndrome(blocks, offs, syndrome_disks+2, 1797 RAID5_STRIPE_SIZE(sh->raid_conf), 1798 &submit); 1799 } else { 1800 struct page *dest; 1801 unsigned int dest_off; 1802 int data_target; 1803 int qd_idx = sh->qd_idx; 1804 1805 /* Missing D+Q: recompute D from P, then recompute Q */ 1806 if (target == qd_idx) 1807 data_target = target2; 1808 else 1809 data_target = target; 1810 1811 count = 0; 1812 for (i = disks; i-- ; ) { 1813 if (i == data_target || i == qd_idx) 1814 continue; 1815 offs[count] = sh->dev[i].offset; 1816 blocks[count++] = sh->dev[i].page; 1817 } 1818 dest = sh->dev[data_target].page; 1819 dest_off = sh->dev[data_target].offset; 1820 init_async_submit(&submit, 1821 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1822 NULL, NULL, NULL, 1823 to_addr_conv(sh, percpu, 0)); 1824 tx = async_xor_offs(dest, dest_off, blocks, offs, count, 1825 RAID5_STRIPE_SIZE(sh->raid_conf), 1826 &submit); 1827 1828 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); 1829 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1830 ops_complete_compute, sh, 1831 to_addr_conv(sh, percpu, 0)); 1832 return async_gen_syndrome(blocks, offs, count+2, 1833 RAID5_STRIPE_SIZE(sh->raid_conf), 1834 &submit); 1835 } 1836 } else { 1837 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1838 ops_complete_compute, sh, 1839 to_addr_conv(sh, percpu, 0)); 1840 if (failb == syndrome_disks) { 1841 /* We're missing D+P. */ 1842 return async_raid6_datap_recov(syndrome_disks+2, 1843 RAID5_STRIPE_SIZE(sh->raid_conf), 1844 faila, 1845 blocks, offs, &submit); 1846 } else { 1847 /* We're missing D+D. 
*/ 1848 return async_raid6_2data_recov(syndrome_disks+2, 1849 RAID5_STRIPE_SIZE(sh->raid_conf), 1850 faila, failb, 1851 blocks, offs, &submit); 1852 } 1853 } 1854 } 1855 1856 static void ops_complete_prexor(void *stripe_head_ref) 1857 { 1858 struct stripe_head *sh = stripe_head_ref; 1859 1860 pr_debug("%s: stripe %llu\n", __func__, 1861 (unsigned long long)sh->sector); 1862 1863 if (r5c_is_writeback(sh->raid_conf->log)) 1864 /* 1865 * raid5-cache write back uses orig_page during prexor. 1866 * After prexor, it is time to free orig_page 1867 */ 1868 r5c_release_extra_page(sh); 1869 } 1870 1871 static struct dma_async_tx_descriptor * 1872 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1873 struct dma_async_tx_descriptor *tx) 1874 { 1875 int disks = sh->disks; 1876 struct page **xor_srcs = to_addr_page(percpu, 0); 1877 unsigned int *off_srcs = to_addr_offs(sh, percpu); 1878 int count = 0, pd_idx = sh->pd_idx, i; 1879 struct async_submit_ctl submit; 1880 1881 /* existing parity data subtracted */ 1882 unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset; 1883 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1884 1885 BUG_ON(sh->batch_head); 1886 pr_debug("%s: stripe %llu\n", __func__, 1887 (unsigned long long)sh->sector); 1888 1889 for (i = disks; i--; ) { 1890 struct r5dev *dev = &sh->dev[i]; 1891 /* Only process blocks that are known to be uptodate */ 1892 if (test_bit(R5_InJournal, &dev->flags)) { 1893 /* 1894 * For this case, PAGE_SIZE must be equal to 4KB and 1895 * page offset is zero. 1896 */ 1897 off_srcs[count] = dev->offset; 1898 xor_srcs[count++] = dev->orig_page; 1899 } else if (test_bit(R5_Wantdrain, &dev->flags)) { 1900 off_srcs[count] = dev->offset; 1901 xor_srcs[count++] = dev->page; 1902 } 1903 } 1904 1905 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1906 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1907 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, 1908 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1909 1910 return tx; 1911 } 1912 1913 static struct dma_async_tx_descriptor * 1914 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1915 struct dma_async_tx_descriptor *tx) 1916 { 1917 struct page **blocks = to_addr_page(percpu, 0); 1918 unsigned int *offs = to_addr_offs(sh, percpu); 1919 int count; 1920 struct async_submit_ctl submit; 1921 1922 pr_debug("%s: stripe %llu\n", __func__, 1923 (unsigned long long)sh->sector); 1924 1925 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN); 1926 1927 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1928 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1929 tx = async_gen_syndrome(blocks, offs, count+2, 1930 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1931 1932 return tx; 1933 } 1934 1935 static struct dma_async_tx_descriptor * 1936 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1937 { 1938 struct r5conf *conf = sh->raid_conf; 1939 int disks = sh->disks; 1940 int i; 1941 struct stripe_head *head_sh = sh; 1942 1943 pr_debug("%s: stripe %llu\n", __func__, 1944 (unsigned long long)sh->sector); 1945 1946 for (i = disks; i--; ) { 1947 struct r5dev *dev; 1948 struct bio *chosen; 1949 1950 sh = head_sh; 1951 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1952 struct bio *wbi; 1953 1954 again: 1955 dev = &sh->dev[i]; 1956 /* 1957 * clear R5_InJournal, so when rewriting a page in 1958 * journal, it is not skipped by r5l_log_stripe() 1959 
*/ 1960 clear_bit(R5_InJournal, &dev->flags); 1961 spin_lock_irq(&sh->stripe_lock); 1962 chosen = dev->towrite; 1963 dev->towrite = NULL; 1964 sh->overwrite_disks = 0; 1965 BUG_ON(dev->written); 1966 wbi = dev->written = chosen; 1967 spin_unlock_irq(&sh->stripe_lock); 1968 WARN_ON(dev->page != dev->orig_page); 1969 1970 while (wbi && wbi->bi_iter.bi_sector < 1971 dev->sector + RAID5_STRIPE_SECTORS(conf)) { 1972 if (wbi->bi_opf & REQ_FUA) 1973 set_bit(R5_WantFUA, &dev->flags); 1974 if (wbi->bi_opf & REQ_SYNC) 1975 set_bit(R5_SyncIO, &dev->flags); 1976 if (bio_op(wbi) == REQ_OP_DISCARD) 1977 set_bit(R5_Discard, &dev->flags); 1978 else { 1979 tx = async_copy_data(1, wbi, &dev->page, 1980 dev->offset, 1981 dev->sector, tx, sh, 1982 r5c_is_writeback(conf->log)); 1983 if (dev->page != dev->orig_page && 1984 !r5c_is_writeback(conf->log)) { 1985 set_bit(R5_SkipCopy, &dev->flags); 1986 clear_bit(R5_UPTODATE, &dev->flags); 1987 clear_bit(R5_OVERWRITE, &dev->flags); 1988 } 1989 } 1990 wbi = r5_next_bio(conf, wbi, dev->sector); 1991 } 1992 1993 if (head_sh->batch_head) { 1994 sh = list_first_entry(&sh->batch_list, 1995 struct stripe_head, 1996 batch_list); 1997 if (sh == head_sh) 1998 continue; 1999 goto again; 2000 } 2001 } 2002 } 2003 2004 return tx; 2005 } 2006 2007 static void ops_complete_reconstruct(void *stripe_head_ref) 2008 { 2009 struct stripe_head *sh = stripe_head_ref; 2010 int disks = sh->disks; 2011 int pd_idx = sh->pd_idx; 2012 int qd_idx = sh->qd_idx; 2013 int i; 2014 bool fua = false, sync = false, discard = false; 2015 2016 pr_debug("%s: stripe %llu\n", __func__, 2017 (unsigned long long)sh->sector); 2018 2019 for (i = disks; i--; ) { 2020 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 2021 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 2022 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 2023 } 2024 2025 for (i = disks; i--; ) { 2026 struct r5dev *dev = &sh->dev[i]; 2027 2028 if (dev->written || i == pd_idx || i == qd_idx) { 2029 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { 2030 set_bit(R5_UPTODATE, &dev->flags); 2031 if (test_bit(STRIPE_EXPAND_READY, &sh->state)) 2032 set_bit(R5_Expanded, &dev->flags); 2033 } 2034 if (fua) 2035 set_bit(R5_WantFUA, &dev->flags); 2036 if (sync) 2037 set_bit(R5_SyncIO, &dev->flags); 2038 } 2039 } 2040 2041 if (sh->reconstruct_state == reconstruct_state_drain_run) 2042 sh->reconstruct_state = reconstruct_state_drain_result; 2043 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 2044 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 2045 else { 2046 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 2047 sh->reconstruct_state = reconstruct_state_result; 2048 } 2049 2050 set_bit(STRIPE_HANDLE, &sh->state); 2051 raid5_release_stripe(sh); 2052 } 2053 2054 static void 2055 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 2056 struct dma_async_tx_descriptor *tx) 2057 { 2058 int disks = sh->disks; 2059 struct page **xor_srcs; 2060 unsigned int *off_srcs; 2061 struct async_submit_ctl submit; 2062 int count, pd_idx = sh->pd_idx, i; 2063 struct page *xor_dest; 2064 unsigned int off_dest; 2065 int prexor = 0; 2066 unsigned long flags; 2067 int j = 0; 2068 struct stripe_head *head_sh = sh; 2069 int last_stripe; 2070 2071 pr_debug("%s: stripe %llu\n", __func__, 2072 (unsigned long long)sh->sector); 2073 2074 for (i = 0; i < sh->disks; i++) { 2075 if (pd_idx == i) 2076 continue; 2077 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 2078 break; 2079 } 2080 if (i >= sh->disks) { 2081 
atomic_inc(&sh->count); 2082 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 2083 ops_complete_reconstruct(sh); 2084 return; 2085 } 2086 again: 2087 count = 0; 2088 xor_srcs = to_addr_page(percpu, j); 2089 off_srcs = to_addr_offs(sh, percpu); 2090 /* check if prexor is active which means only process blocks 2091 * that are part of a read-modify-write (written) 2092 */ 2093 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 2094 prexor = 1; 2095 off_dest = off_srcs[count] = sh->dev[pd_idx].offset; 2096 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 2097 for (i = disks; i--; ) { 2098 struct r5dev *dev = &sh->dev[i]; 2099 if (head_sh->dev[i].written || 2100 test_bit(R5_InJournal, &head_sh->dev[i].flags)) { 2101 off_srcs[count] = dev->offset; 2102 xor_srcs[count++] = dev->page; 2103 } 2104 } 2105 } else { 2106 xor_dest = sh->dev[pd_idx].page; 2107 off_dest = sh->dev[pd_idx].offset; 2108 for (i = disks; i--; ) { 2109 struct r5dev *dev = &sh->dev[i]; 2110 if (i != pd_idx) { 2111 off_srcs[count] = dev->offset; 2112 xor_srcs[count++] = dev->page; 2113 } 2114 } 2115 } 2116 2117 /* 1/ if we prexor'd then the dest is reused as a source 2118 * 2/ if we did not prexor then we are redoing the parity 2119 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 2120 * for the synchronous xor case 2121 */ 2122 last_stripe = !head_sh->batch_head || 2123 list_first_entry(&sh->batch_list, 2124 struct stripe_head, batch_list) == head_sh; 2125 if (last_stripe) { 2126 flags = ASYNC_TX_ACK | 2127 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 2128 2129 atomic_inc(&head_sh->count); 2130 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 2131 to_addr_conv(sh, percpu, j)); 2132 } else { 2133 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 2134 init_async_submit(&submit, flags, tx, NULL, NULL, 2135 to_addr_conv(sh, percpu, j)); 2136 } 2137 2138 if (unlikely(count == 1)) 2139 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], 2140 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 2141 else 2142 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, 2143 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 2144 if (!last_stripe) { 2145 j++; 2146 sh = list_first_entry(&sh->batch_list, struct stripe_head, 2147 batch_list); 2148 goto again; 2149 } 2150 } 2151 2152 static void 2153 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 2154 struct dma_async_tx_descriptor *tx) 2155 { 2156 struct async_submit_ctl submit; 2157 struct page **blocks; 2158 unsigned int *offs; 2159 int count, i, j = 0; 2160 struct stripe_head *head_sh = sh; 2161 int last_stripe; 2162 int synflags; 2163 unsigned long txflags; 2164 2165 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 2166 2167 for (i = 0; i < sh->disks; i++) { 2168 if (sh->pd_idx == i || sh->qd_idx == i) 2169 continue; 2170 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 2171 break; 2172 } 2173 if (i >= sh->disks) { 2174 atomic_inc(&sh->count); 2175 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 2176 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 2177 ops_complete_reconstruct(sh); 2178 return; 2179 } 2180 2181 again: 2182 blocks = to_addr_page(percpu, j); 2183 offs = to_addr_offs(sh, percpu); 2184 2185 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 2186 synflags = SYNDROME_SRC_WRITTEN; 2187 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 2188 } else { 2189 synflags = SYNDROME_SRC_ALL; 2190 txflags = ASYNC_TX_ACK; 2191 } 2192 2193 count = 
set_syndrome_sources(blocks, offs, sh, synflags); 2194 last_stripe = !head_sh->batch_head || 2195 list_first_entry(&sh->batch_list, 2196 struct stripe_head, batch_list) == head_sh; 2197 2198 if (last_stripe) { 2199 atomic_inc(&head_sh->count); 2200 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 2201 head_sh, to_addr_conv(sh, percpu, j)); 2202 } else 2203 init_async_submit(&submit, 0, tx, NULL, NULL, 2204 to_addr_conv(sh, percpu, j)); 2205 tx = async_gen_syndrome(blocks, offs, count+2, 2206 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 2207 if (!last_stripe) { 2208 j++; 2209 sh = list_first_entry(&sh->batch_list, struct stripe_head, 2210 batch_list); 2211 goto again; 2212 } 2213 } 2214 2215 static void ops_complete_check(void *stripe_head_ref) 2216 { 2217 struct stripe_head *sh = stripe_head_ref; 2218 2219 pr_debug("%s: stripe %llu\n", __func__, 2220 (unsigned long long)sh->sector); 2221 2222 sh->check_state = check_state_check_result; 2223 set_bit(STRIPE_HANDLE, &sh->state); 2224 raid5_release_stripe(sh); 2225 } 2226 2227 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 2228 { 2229 int disks = sh->disks; 2230 int pd_idx = sh->pd_idx; 2231 int qd_idx = sh->qd_idx; 2232 struct page *xor_dest; 2233 unsigned int off_dest; 2234 struct page **xor_srcs = to_addr_page(percpu, 0); 2235 unsigned int *off_srcs = to_addr_offs(sh, percpu); 2236 struct dma_async_tx_descriptor *tx; 2237 struct async_submit_ctl submit; 2238 int count; 2239 int i; 2240 2241 pr_debug("%s: stripe %llu\n", __func__, 2242 (unsigned long long)sh->sector); 2243 2244 BUG_ON(sh->batch_head); 2245 count = 0; 2246 xor_dest = sh->dev[pd_idx].page; 2247 off_dest = sh->dev[pd_idx].offset; 2248 off_srcs[count] = off_dest; 2249 xor_srcs[count++] = xor_dest; 2250 for (i = disks; i--; ) { 2251 if (i == pd_idx || i == qd_idx) 2252 continue; 2253 off_srcs[count] = sh->dev[i].offset; 2254 xor_srcs[count++] = sh->dev[i].page; 2255 } 2256 2257 init_async_submit(&submit, 0, NULL, NULL, NULL, 2258 to_addr_conv(sh, percpu, 0)); 2259 tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, 2260 RAID5_STRIPE_SIZE(sh->raid_conf), 2261 &sh->ops.zero_sum_result, &submit); 2262 2263 atomic_inc(&sh->count); 2264 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2265 tx = async_trigger_callback(&submit); 2266 } 2267 2268 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2269 { 2270 struct page **srcs = to_addr_page(percpu, 0); 2271 unsigned int *offs = to_addr_offs(sh, percpu); 2272 struct async_submit_ctl submit; 2273 int count; 2274 2275 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2276 (unsigned long long)sh->sector, checkp); 2277 2278 BUG_ON(sh->batch_head); 2279 count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL); 2280 if (!checkp) 2281 srcs[count] = NULL; 2282 2283 atomic_inc(&sh->count); 2284 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2285 sh, to_addr_conv(sh, percpu, 0)); 2286 async_syndrome_val(srcs, offs, count+2, 2287 RAID5_STRIPE_SIZE(sh->raid_conf), 2288 &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit); 2289 } 2290 2291 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2292 { 2293 int overlap_clear = 0, i, disks = sh->disks; 2294 struct dma_async_tx_descriptor *tx = NULL; 2295 struct r5conf *conf = sh->raid_conf; 2296 int level = conf->level; 2297 struct raid5_percpu *percpu; 2298 2299 local_lock(&conf->percpu->lock); 2300 percpu = 
this_cpu_ptr(conf->percpu); 2301 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2302 ops_run_biofill(sh); 2303 overlap_clear++; 2304 } 2305 2306 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2307 if (level < 6) 2308 tx = ops_run_compute5(sh, percpu); 2309 else { 2310 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2311 tx = ops_run_compute6_1(sh, percpu); 2312 else 2313 tx = ops_run_compute6_2(sh, percpu); 2314 } 2315 /* terminate the chain if reconstruct is not set to be run */ 2316 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2317 async_tx_ack(tx); 2318 } 2319 2320 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2321 if (level < 6) 2322 tx = ops_run_prexor5(sh, percpu, tx); 2323 else 2324 tx = ops_run_prexor6(sh, percpu, tx); 2325 } 2326 2327 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2328 tx = ops_run_partial_parity(sh, percpu, tx); 2329 2330 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2331 tx = ops_run_biodrain(sh, tx); 2332 overlap_clear++; 2333 } 2334 2335 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2336 if (level < 6) 2337 ops_run_reconstruct5(sh, percpu, tx); 2338 else 2339 ops_run_reconstruct6(sh, percpu, tx); 2340 } 2341 2342 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2343 if (sh->check_state == check_state_run) 2344 ops_run_check_p(sh, percpu); 2345 else if (sh->check_state == check_state_run_q) 2346 ops_run_check_pq(sh, percpu, 0); 2347 else if (sh->check_state == check_state_run_pq) 2348 ops_run_check_pq(sh, percpu, 1); 2349 else 2350 BUG(); 2351 } 2352 2353 if (overlap_clear && !sh->batch_head) { 2354 for (i = disks; i--; ) { 2355 struct r5dev *dev = &sh->dev[i]; 2356 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2357 wake_up(&sh->raid_conf->wait_for_overlap); 2358 } 2359 } 2360 local_unlock(&conf->percpu->lock); 2361 } 2362 2363 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) 2364 { 2365 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2366 kfree(sh->pages); 2367 #endif 2368 if (sh->ppl_page) 2369 __free_page(sh->ppl_page); 2370 kmem_cache_free(sc, sh); 2371 } 2372 2373 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2374 int disks, struct r5conf *conf) 2375 { 2376 struct stripe_head *sh; 2377 2378 sh = kmem_cache_zalloc(sc, gfp); 2379 if (sh) { 2380 spin_lock_init(&sh->stripe_lock); 2381 spin_lock_init(&sh->batch_lock); 2382 INIT_LIST_HEAD(&sh->batch_list); 2383 INIT_LIST_HEAD(&sh->lru); 2384 INIT_LIST_HEAD(&sh->r5c); 2385 INIT_LIST_HEAD(&sh->log_list); 2386 atomic_set(&sh->count, 1); 2387 sh->raid_conf = conf; 2388 sh->log_start = MaxSector; 2389 2390 if (raid5_has_ppl(conf)) { 2391 sh->ppl_page = alloc_page(gfp); 2392 if (!sh->ppl_page) { 2393 free_stripe(sc, sh); 2394 return NULL; 2395 } 2396 } 2397 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2398 if (init_stripe_shared_pages(sh, conf, disks)) { 2399 free_stripe(sc, sh); 2400 return NULL; 2401 } 2402 #endif 2403 } 2404 return sh; 2405 } 2406 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2407 { 2408 struct stripe_head *sh; 2409 2410 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); 2411 if (!sh) 2412 return 0; 2413 2414 if (grow_buffers(sh, gfp)) { 2415 shrink_buffers(sh); 2416 free_stripe(conf->slab_cache, sh); 2417 return 0; 2418 } 2419 sh->hash_lock_index = 2420 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2421 /* we just created an active stripe so... 
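 * the increment below balances the decrement that raid5_release_stripe()
 * will do when it retires the stripe to the inactive list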
*/ 2422 atomic_inc(&conf->active_stripes); 2423 2424 raid5_release_stripe(sh); 2425 conf->max_nr_stripes++; 2426 return 1; 2427 } 2428 2429 static int grow_stripes(struct r5conf *conf, int num) 2430 { 2431 struct kmem_cache *sc; 2432 size_t namelen = sizeof(conf->cache_name[0]); 2433 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2434 2435 if (conf->mddev->gendisk) 2436 snprintf(conf->cache_name[0], namelen, 2437 "raid%d-%s", conf->level, mdname(conf->mddev)); 2438 else 2439 snprintf(conf->cache_name[0], namelen, 2440 "raid%d-%p", conf->level, conf->mddev); 2441 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); 2442 2443 conf->active_name = 0; 2444 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2445 struct_size_t(struct stripe_head, dev, devs), 2446 0, 0, NULL); 2447 if (!sc) 2448 return 1; 2449 conf->slab_cache = sc; 2450 conf->pool_size = devs; 2451 while (num--) 2452 if (!grow_one_stripe(conf, GFP_KERNEL)) 2453 return 1; 2454 2455 return 0; 2456 } 2457 2458 /** 2459 * scribble_alloc - allocate percpu scribble buffer for required size 2460 * of the scribble region 2461 * @percpu: from for_each_present_cpu() of the caller 2462 * @num: total number of disks in the array 2463 * @cnt: scribble objs count for required size of the scribble region 2464 * 2465 * The scribble buffer size must be enough to contain: 2466 * 1/ a struct page pointer for each device in the array +2 2467 * 2/ room to convert each entry in (1) to its corresponding dma 2468 * (dma_map_page()) or page (page_address()) address. 2469 * 2470 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2471 * calculate over all devices (not just the data blocks), using zeros in place 2472 * of the P and Q blocks. 2473 */ 2474 static int scribble_alloc(struct raid5_percpu *percpu, 2475 int num, int cnt) 2476 { 2477 size_t obj_size = 2478 sizeof(struct page *) * (num + 2) + 2479 sizeof(addr_conv_t) * (num + 2) + 2480 sizeof(unsigned int) * (num + 2); 2481 void *scribble; 2482 2483 /* 2484 * If here is in raid array suspend context, it is in memalloc noio 2485 * context as well, there is no potential recursive memory reclaim 2486 * I/Os with the GFP_KERNEL flag. 2487 */ 2488 scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL); 2489 if (!scribble) 2490 return -ENOMEM; 2491 2492 kvfree(percpu->scribble); 2493 2494 percpu->scribble = scribble; 2495 percpu->scribble_obj_size = obj_size; 2496 return 0; 2497 } 2498 2499 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2500 { 2501 unsigned long cpu; 2502 int err = 0; 2503 2504 /* Never shrink. */ 2505 if (conf->scribble_disks >= new_disks && 2506 conf->scribble_sectors >= new_sectors) 2507 return 0; 2508 2509 raid5_quiesce(conf->mddev, true); 2510 cpus_read_lock(); 2511 2512 for_each_present_cpu(cpu) { 2513 struct raid5_percpu *percpu; 2514 2515 percpu = per_cpu_ptr(conf->percpu, cpu); 2516 err = scribble_alloc(percpu, new_disks, 2517 new_sectors / RAID5_STRIPE_SECTORS(conf)); 2518 if (err) 2519 break; 2520 } 2521 2522 cpus_read_unlock(); 2523 raid5_quiesce(conf->mddev, false); 2524 2525 if (!err) { 2526 conf->scribble_disks = new_disks; 2527 conf->scribble_sectors = new_sectors; 2528 } 2529 return err; 2530 } 2531 2532 static int resize_stripes(struct r5conf *conf, int newsize) 2533 { 2534 /* Make all the stripes able to hold 'newsize' devices. 2535 * New slots in each stripe get 'page' set to a new page. 
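 * (Typically called from the reshape path when the number of devices
 *  in the array grows, so every stripe_head must be reallocated with
 *  room for the extra devices.)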
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step 2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	int err = 0;
	struct kmem_cache *sc;
	int i;
	int hash, cnt;

	md_allow_write(conf->mddev);

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       struct_size_t(struct stripe_head, dev, newsize),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	/* Need to ensure auto-resizing doesn't interfere */
	mutex_lock(&conf->cache_size_mutex);

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
		if (!nsh)
			break;

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			free_stripe(sc, nsh);
		}
		kmem_cache_destroy(sc);
		mutex_unlock(&conf->cache_size_mutex);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	hash = 0;
	cnt = 0;
	list_for_each_entry(nsh, &newstripes, lru) {
		lock_device_hash_lock(conf, hash);
		wait_event_cmd(conf->wait_for_stripe,
			       !list_empty(conf->inactive_list + hash),
			       unlock_device_hash_lock(conf, hash),
			       lock_device_hash_lock(conf, hash));
		osh = get_free_stripe(conf, hash);
		unlock_device_hash_lock(conf, hash);

#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
		for (i = 0; i < osh->nr_pages; i++) {
			nsh->pages[i] = osh->pages[i];
			osh->pages[i] = NULL;
		}
#endif
		for (i = 0; i < conf->pool_size; i++) {
			nsh->dev[i].page = osh->dev[i].page;
			nsh->dev[i].orig_page = osh->dev[i].page;
			nsh->dev[i].offset = osh->dev[i].offset;
		}
		nsh->hash_lock_index = hash;
		free_stripe(conf->slab_cache, osh);
		cnt++;
		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
			hash++;
			cnt = 0;
		}
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
2633 * At this point, we are holding all the stripes so the array 2634 * is completely stalled, so now is a good time to resize 2635 * conf->disks and the scribble region 2636 */ 2637 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO); 2638 if (ndisks) { 2639 for (i = 0; i < conf->pool_size; i++) 2640 ndisks[i] = conf->disks[i]; 2641 2642 for (i = conf->pool_size; i < newsize; i++) { 2643 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2644 if (!ndisks[i].extra_page) 2645 err = -ENOMEM; 2646 } 2647 2648 if (err) { 2649 for (i = conf->pool_size; i < newsize; i++) 2650 if (ndisks[i].extra_page) 2651 put_page(ndisks[i].extra_page); 2652 kfree(ndisks); 2653 } else { 2654 kfree(conf->disks); 2655 conf->disks = ndisks; 2656 } 2657 } else 2658 err = -ENOMEM; 2659 2660 conf->slab_cache = sc; 2661 conf->active_name = 1-conf->active_name; 2662 2663 /* Step 4, return new stripes to service */ 2664 while(!list_empty(&newstripes)) { 2665 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2666 list_del_init(&nsh->lru); 2667 2668 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2669 for (i = 0; i < nsh->nr_pages; i++) { 2670 if (nsh->pages[i]) 2671 continue; 2672 nsh->pages[i] = alloc_page(GFP_NOIO); 2673 if (!nsh->pages[i]) 2674 err = -ENOMEM; 2675 } 2676 2677 for (i = conf->raid_disks; i < newsize; i++) { 2678 if (nsh->dev[i].page) 2679 continue; 2680 nsh->dev[i].page = raid5_get_dev_page(nsh, i); 2681 nsh->dev[i].orig_page = nsh->dev[i].page; 2682 nsh->dev[i].offset = raid5_get_page_offset(nsh, i); 2683 } 2684 #else 2685 for (i=conf->raid_disks; i < newsize; i++) 2686 if (nsh->dev[i].page == NULL) { 2687 struct page *p = alloc_page(GFP_NOIO); 2688 nsh->dev[i].page = p; 2689 nsh->dev[i].orig_page = p; 2690 nsh->dev[i].offset = 0; 2691 if (!p) 2692 err = -ENOMEM; 2693 } 2694 #endif 2695 raid5_release_stripe(nsh); 2696 } 2697 /* critical section pass, GFP_NOIO no longer needed */ 2698 2699 if (!err) 2700 conf->pool_size = newsize; 2701 mutex_unlock(&conf->cache_size_mutex); 2702 2703 return err; 2704 } 2705 2706 static int drop_one_stripe(struct r5conf *conf) 2707 { 2708 struct stripe_head *sh; 2709 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2710 2711 spin_lock_irq(conf->hash_locks + hash); 2712 sh = get_free_stripe(conf, hash); 2713 spin_unlock_irq(conf->hash_locks + hash); 2714 if (!sh) 2715 return 0; 2716 BUG_ON(atomic_read(&sh->count)); 2717 shrink_buffers(sh); 2718 free_stripe(conf->slab_cache, sh); 2719 atomic_dec(&conf->active_stripes); 2720 conf->max_nr_stripes--; 2721 return 1; 2722 } 2723 2724 static void shrink_stripes(struct r5conf *conf) 2725 { 2726 while (conf->max_nr_stripes && 2727 drop_one_stripe(conf)) 2728 ; 2729 2730 kmem_cache_destroy(conf->slab_cache); 2731 conf->slab_cache = NULL; 2732 } 2733 2734 /* 2735 * This helper wraps rcu_dereference_protected() and can be used when 2736 * it is known that the nr_pending of the rdev is elevated. 2737 */ 2738 static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev) 2739 { 2740 return rcu_dereference_protected(rdev, 2741 atomic_read(&rcu_access_pointer(rdev)->nr_pending)); 2742 } 2743 2744 /* 2745 * This helper wraps rcu_dereference_protected() and should be used 2746 * when it is known that the mddev_lock() is held. This is safe 2747 * seeing raid5_remove_disk() has the same lock held. 
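 * (i.e. the rdev pointer cannot be cleared under the caller for as long
 *  as the reconfig mutex is held)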
2748 */ 2749 static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev, 2750 struct md_rdev __rcu *rdev) 2751 { 2752 return rcu_dereference_protected(rdev, 2753 lockdep_is_held(&mddev->reconfig_mutex)); 2754 } 2755 2756 static void raid5_end_read_request(struct bio * bi) 2757 { 2758 struct stripe_head *sh = bi->bi_private; 2759 struct r5conf *conf = sh->raid_conf; 2760 int disks = sh->disks, i; 2761 struct md_rdev *rdev = NULL; 2762 sector_t s; 2763 2764 for (i=0 ; i<disks; i++) 2765 if (bi == &sh->dev[i].req) 2766 break; 2767 2768 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2769 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2770 bi->bi_status); 2771 if (i == disks) { 2772 BUG(); 2773 return; 2774 } 2775 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2776 /* If replacement finished while this request was outstanding, 2777 * 'replacement' might be NULL already. 2778 * In that case it moved down to 'rdev'. 2779 * rdev is not removed until all requests are finished. 2780 */ 2781 rdev = rdev_pend_deref(conf->disks[i].replacement); 2782 if (!rdev) 2783 rdev = rdev_pend_deref(conf->disks[i].rdev); 2784 2785 if (use_new_offset(conf, sh)) 2786 s = sh->sector + rdev->new_data_offset; 2787 else 2788 s = sh->sector + rdev->data_offset; 2789 if (!bi->bi_status) { 2790 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2791 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2792 /* Note that this cannot happen on a 2793 * replacement device. We just fail those on 2794 * any error 2795 */ 2796 pr_info_ratelimited( 2797 "md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n", 2798 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf), 2799 (unsigned long long)s, 2800 rdev->bdev); 2801 atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors); 2802 clear_bit(R5_ReadError, &sh->dev[i].flags); 2803 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2804 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2805 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2806 2807 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2808 /* 2809 * end read for a page in journal, this 2810 * must be preparing for prexor in rmw 2811 */ 2812 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2813 2814 if (atomic_read(&rdev->read_errors)) 2815 atomic_set(&rdev->read_errors, 0); 2816 } else { 2817 int retry = 0; 2818 int set_bad = 0; 2819 2820 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2821 if (!(bi->bi_status == BLK_STS_PROTECTION)) 2822 atomic_inc(&rdev->read_errors); 2823 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2824 pr_warn_ratelimited( 2825 "md/raid:%s: read error on replacement device (sector %llu on %pg).\n", 2826 mdname(conf->mddev), 2827 (unsigned long long)s, 2828 rdev->bdev); 2829 else if (conf->mddev->degraded >= conf->max_degraded) { 2830 set_bad = 1; 2831 pr_warn_ratelimited( 2832 "md/raid:%s: read error not correctable (sector %llu on %pg).\n", 2833 mdname(conf->mddev), 2834 (unsigned long long)s, 2835 rdev->bdev); 2836 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2837 /* Oh, no!!! */ 2838 set_bad = 1; 2839 pr_warn_ratelimited( 2840 "md/raid:%s: read error NOT corrected!! 
(sector %llu on %pg).\n", 2841 mdname(conf->mddev), 2842 (unsigned long long)s, 2843 rdev->bdev); 2844 } else if (atomic_read(&rdev->read_errors) 2845 > conf->max_nr_stripes) { 2846 if (!test_bit(Faulty, &rdev->flags)) { 2847 pr_warn("md/raid:%s: %d read_errors > %d stripes\n", 2848 mdname(conf->mddev), 2849 atomic_read(&rdev->read_errors), 2850 conf->max_nr_stripes); 2851 pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n", 2852 mdname(conf->mddev), rdev->bdev); 2853 } 2854 } else 2855 retry = 1; 2856 if (set_bad && test_bit(In_sync, &rdev->flags) 2857 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2858 retry = 1; 2859 if (retry) 2860 if (sh->qd_idx >= 0 && sh->pd_idx == i) 2861 set_bit(R5_ReadError, &sh->dev[i].flags); 2862 else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2863 set_bit(R5_ReadError, &sh->dev[i].flags); 2864 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2865 } else 2866 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2867 else { 2868 clear_bit(R5_ReadError, &sh->dev[i].flags); 2869 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2870 if (!(set_bad 2871 && test_bit(In_sync, &rdev->flags) 2872 && rdev_set_badblocks( 2873 rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0))) 2874 md_error(conf->mddev, rdev); 2875 } 2876 } 2877 rdev_dec_pending(rdev, conf->mddev); 2878 bio_uninit(bi); 2879 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2880 set_bit(STRIPE_HANDLE, &sh->state); 2881 raid5_release_stripe(sh); 2882 } 2883 2884 static void raid5_end_write_request(struct bio *bi) 2885 { 2886 struct stripe_head *sh = bi->bi_private; 2887 struct r5conf *conf = sh->raid_conf; 2888 int disks = sh->disks, i; 2889 struct md_rdev *rdev; 2890 sector_t first_bad; 2891 int bad_sectors; 2892 int replacement = 0; 2893 2894 for (i = 0 ; i < disks; i++) { 2895 if (bi == &sh->dev[i].req) { 2896 rdev = rdev_pend_deref(conf->disks[i].rdev); 2897 break; 2898 } 2899 if (bi == &sh->dev[i].rreq) { 2900 rdev = rdev_pend_deref(conf->disks[i].replacement); 2901 if (rdev) 2902 replacement = 1; 2903 else 2904 /* rdev was removed and 'replacement' 2905 * replaced it. rdev is not removed 2906 * until all requests are finished. 2907 */ 2908 rdev = rdev_pend_deref(conf->disks[i].rdev); 2909 break; 2910 } 2911 } 2912 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2913 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2914 bi->bi_status); 2915 if (i == disks) { 2916 BUG(); 2917 return; 2918 } 2919 2920 if (replacement) { 2921 if (bi->bi_status) 2922 md_error(conf->mddev, rdev); 2923 else if (is_badblock(rdev, sh->sector, 2924 RAID5_STRIPE_SECTORS(conf), 2925 &first_bad, &bad_sectors)) 2926 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2927 } else { 2928 if (bi->bi_status) { 2929 set_bit(STRIPE_DEGRADED, &sh->state); 2930 set_bit(WriteErrorSeen, &rdev->flags); 2931 set_bit(R5_WriteError, &sh->dev[i].flags); 2932 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2933 set_bit(MD_RECOVERY_NEEDED, 2934 &rdev->mddev->recovery); 2935 } else if (is_badblock(rdev, sh->sector, 2936 RAID5_STRIPE_SECTORS(conf), 2937 &first_bad, &bad_sectors)) { 2938 set_bit(R5_MadeGood, &sh->dev[i].flags); 2939 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2940 /* That was a successful write so make 2941 * sure it looks like we already did 2942 * a re-write. 
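 * (otherwise the read-error recovery path would schedule another
 *  rewrite of a block that has just been written successfully)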
2943 */ 2944 set_bit(R5_ReWrite, &sh->dev[i].flags); 2945 } 2946 } 2947 rdev_dec_pending(rdev, conf->mddev); 2948 2949 if (sh->batch_head && bi->bi_status && !replacement) 2950 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2951 2952 bio_uninit(bi); 2953 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2954 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2955 set_bit(STRIPE_HANDLE, &sh->state); 2956 2957 if (sh->batch_head && sh != sh->batch_head) 2958 raid5_release_stripe(sh->batch_head); 2959 raid5_release_stripe(sh); 2960 } 2961 2962 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2963 { 2964 struct r5conf *conf = mddev->private; 2965 unsigned long flags; 2966 pr_debug("raid456: error called\n"); 2967 2968 pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n", 2969 mdname(mddev), rdev->bdev); 2970 2971 spin_lock_irqsave(&conf->device_lock, flags); 2972 set_bit(Faulty, &rdev->flags); 2973 clear_bit(In_sync, &rdev->flags); 2974 mddev->degraded = raid5_calc_degraded(conf); 2975 2976 if (has_failed(conf)) { 2977 set_bit(MD_BROKEN, &conf->mddev->flags); 2978 conf->recovery_disabled = mddev->recovery_disabled; 2979 2980 pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n", 2981 mdname(mddev), mddev->degraded, conf->raid_disks); 2982 } else { 2983 pr_crit("md/raid:%s: Operation continuing on %d devices.\n", 2984 mdname(mddev), conf->raid_disks - mddev->degraded); 2985 } 2986 2987 spin_unlock_irqrestore(&conf->device_lock, flags); 2988 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2989 2990 set_bit(Blocked, &rdev->flags); 2991 set_mask_bits(&mddev->sb_flags, 0, 2992 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2993 r5c_update_on_rdev_error(mddev, rdev); 2994 } 2995 2996 /* 2997 * Input: a 'big' sector number, 2998 * Output: index of the data and parity disk, and the sector # in them. 2999 */ 3000 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 3001 int previous, int *dd_idx, 3002 struct stripe_head *sh) 3003 { 3004 sector_t stripe, stripe2; 3005 sector_t chunk_number; 3006 unsigned int chunk_offset; 3007 int pd_idx, qd_idx; 3008 int ddf_layout = 0; 3009 sector_t new_sector; 3010 int algorithm = previous ? conf->prev_algo 3011 : conf->algorithm; 3012 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 3013 : conf->chunk_sectors; 3014 int raid_disks = previous ? conf->previous_raid_disks 3015 : conf->raid_disks; 3016 int data_disks = raid_disks - conf->max_degraded; 3017 3018 /* First compute the information on this sector */ 3019 3020 /* 3021 * Compute the chunk number and the sector offset inside the chunk 3022 */ 3023 chunk_offset = sector_div(r_sector, sectors_per_chunk); 3024 chunk_number = r_sector; 3025 3026 /* 3027 * Compute the stripe number 3028 */ 3029 stripe = chunk_number; 3030 *dd_idx = sector_div(stripe, data_disks); 3031 stripe2 = stripe; 3032 /* 3033 * Select the parity disk based on the user selected algorithm. 
3034 */ 3035 pd_idx = qd_idx = -1; 3036 switch(conf->level) { 3037 case 4: 3038 pd_idx = data_disks; 3039 break; 3040 case 5: 3041 switch (algorithm) { 3042 case ALGORITHM_LEFT_ASYMMETRIC: 3043 pd_idx = data_disks - sector_div(stripe2, raid_disks); 3044 if (*dd_idx >= pd_idx) 3045 (*dd_idx)++; 3046 break; 3047 case ALGORITHM_RIGHT_ASYMMETRIC: 3048 pd_idx = sector_div(stripe2, raid_disks); 3049 if (*dd_idx >= pd_idx) 3050 (*dd_idx)++; 3051 break; 3052 case ALGORITHM_LEFT_SYMMETRIC: 3053 pd_idx = data_disks - sector_div(stripe2, raid_disks); 3054 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 3055 break; 3056 case ALGORITHM_RIGHT_SYMMETRIC: 3057 pd_idx = sector_div(stripe2, raid_disks); 3058 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 3059 break; 3060 case ALGORITHM_PARITY_0: 3061 pd_idx = 0; 3062 (*dd_idx)++; 3063 break; 3064 case ALGORITHM_PARITY_N: 3065 pd_idx = data_disks; 3066 break; 3067 default: 3068 BUG(); 3069 } 3070 break; 3071 case 6: 3072 3073 switch (algorithm) { 3074 case ALGORITHM_LEFT_ASYMMETRIC: 3075 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 3076 qd_idx = pd_idx + 1; 3077 if (pd_idx == raid_disks-1) { 3078 (*dd_idx)++; /* Q D D D P */ 3079 qd_idx = 0; 3080 } else if (*dd_idx >= pd_idx) 3081 (*dd_idx) += 2; /* D D P Q D */ 3082 break; 3083 case ALGORITHM_RIGHT_ASYMMETRIC: 3084 pd_idx = sector_div(stripe2, raid_disks); 3085 qd_idx = pd_idx + 1; 3086 if (pd_idx == raid_disks-1) { 3087 (*dd_idx)++; /* Q D D D P */ 3088 qd_idx = 0; 3089 } else if (*dd_idx >= pd_idx) 3090 (*dd_idx) += 2; /* D D P Q D */ 3091 break; 3092 case ALGORITHM_LEFT_SYMMETRIC: 3093 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 3094 qd_idx = (pd_idx + 1) % raid_disks; 3095 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 3096 break; 3097 case ALGORITHM_RIGHT_SYMMETRIC: 3098 pd_idx = sector_div(stripe2, raid_disks); 3099 qd_idx = (pd_idx + 1) % raid_disks; 3100 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 3101 break; 3102 3103 case ALGORITHM_PARITY_0: 3104 pd_idx = 0; 3105 qd_idx = 1; 3106 (*dd_idx) += 2; 3107 break; 3108 case ALGORITHM_PARITY_N: 3109 pd_idx = data_disks; 3110 qd_idx = data_disks + 1; 3111 break; 3112 3113 case ALGORITHM_ROTATING_ZERO_RESTART: 3114 /* Exactly the same as RIGHT_ASYMMETRIC, but or 3115 * of blocks for computing Q is different. 
3116 */ 3117 pd_idx = sector_div(stripe2, raid_disks); 3118 qd_idx = pd_idx + 1; 3119 if (pd_idx == raid_disks-1) { 3120 (*dd_idx)++; /* Q D D D P */ 3121 qd_idx = 0; 3122 } else if (*dd_idx >= pd_idx) 3123 (*dd_idx) += 2; /* D D P Q D */ 3124 ddf_layout = 1; 3125 break; 3126 3127 case ALGORITHM_ROTATING_N_RESTART: 3128 /* Same a left_asymmetric, by first stripe is 3129 * D D D P Q rather than 3130 * Q D D D P 3131 */ 3132 stripe2 += 1; 3133 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 3134 qd_idx = pd_idx + 1; 3135 if (pd_idx == raid_disks-1) { 3136 (*dd_idx)++; /* Q D D D P */ 3137 qd_idx = 0; 3138 } else if (*dd_idx >= pd_idx) 3139 (*dd_idx) += 2; /* D D P Q D */ 3140 ddf_layout = 1; 3141 break; 3142 3143 case ALGORITHM_ROTATING_N_CONTINUE: 3144 /* Same as left_symmetric but Q is before P */ 3145 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 3146 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 3147 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 3148 ddf_layout = 1; 3149 break; 3150 3151 case ALGORITHM_LEFT_ASYMMETRIC_6: 3152 /* RAID5 left_asymmetric, with Q on last device */ 3153 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 3154 if (*dd_idx >= pd_idx) 3155 (*dd_idx)++; 3156 qd_idx = raid_disks - 1; 3157 break; 3158 3159 case ALGORITHM_RIGHT_ASYMMETRIC_6: 3160 pd_idx = sector_div(stripe2, raid_disks-1); 3161 if (*dd_idx >= pd_idx) 3162 (*dd_idx)++; 3163 qd_idx = raid_disks - 1; 3164 break; 3165 3166 case ALGORITHM_LEFT_SYMMETRIC_6: 3167 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 3168 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 3169 qd_idx = raid_disks - 1; 3170 break; 3171 3172 case ALGORITHM_RIGHT_SYMMETRIC_6: 3173 pd_idx = sector_div(stripe2, raid_disks-1); 3174 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 3175 qd_idx = raid_disks - 1; 3176 break; 3177 3178 case ALGORITHM_PARITY_0_6: 3179 pd_idx = 0; 3180 (*dd_idx)++; 3181 qd_idx = raid_disks - 1; 3182 break; 3183 3184 default: 3185 BUG(); 3186 } 3187 break; 3188 } 3189 3190 if (sh) { 3191 sh->pd_idx = pd_idx; 3192 sh->qd_idx = qd_idx; 3193 sh->ddf_layout = ddf_layout; 3194 } 3195 /* 3196 * Finally, compute the new sector number 3197 */ 3198 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 3199 return new_sector; 3200 } 3201 3202 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 3203 { 3204 struct r5conf *conf = sh->raid_conf; 3205 int raid_disks = sh->disks; 3206 int data_disks = raid_disks - conf->max_degraded; 3207 sector_t new_sector = sh->sector, check; 3208 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 3209 : conf->chunk_sectors; 3210 int algorithm = previous ? 
conf->prev_algo 3211 : conf->algorithm; 3212 sector_t stripe; 3213 int chunk_offset; 3214 sector_t chunk_number; 3215 int dummy1, dd_idx = i; 3216 sector_t r_sector; 3217 struct stripe_head sh2; 3218 3219 chunk_offset = sector_div(new_sector, sectors_per_chunk); 3220 stripe = new_sector; 3221 3222 if (i == sh->pd_idx) 3223 return 0; 3224 switch(conf->level) { 3225 case 4: break; 3226 case 5: 3227 switch (algorithm) { 3228 case ALGORITHM_LEFT_ASYMMETRIC: 3229 case ALGORITHM_RIGHT_ASYMMETRIC: 3230 if (i > sh->pd_idx) 3231 i--; 3232 break; 3233 case ALGORITHM_LEFT_SYMMETRIC: 3234 case ALGORITHM_RIGHT_SYMMETRIC: 3235 if (i < sh->pd_idx) 3236 i += raid_disks; 3237 i -= (sh->pd_idx + 1); 3238 break; 3239 case ALGORITHM_PARITY_0: 3240 i -= 1; 3241 break; 3242 case ALGORITHM_PARITY_N: 3243 break; 3244 default: 3245 BUG(); 3246 } 3247 break; 3248 case 6: 3249 if (i == sh->qd_idx) 3250 return 0; /* It is the Q disk */ 3251 switch (algorithm) { 3252 case ALGORITHM_LEFT_ASYMMETRIC: 3253 case ALGORITHM_RIGHT_ASYMMETRIC: 3254 case ALGORITHM_ROTATING_ZERO_RESTART: 3255 case ALGORITHM_ROTATING_N_RESTART: 3256 if (sh->pd_idx == raid_disks-1) 3257 i--; /* Q D D D P */ 3258 else if (i > sh->pd_idx) 3259 i -= 2; /* D D P Q D */ 3260 break; 3261 case ALGORITHM_LEFT_SYMMETRIC: 3262 case ALGORITHM_RIGHT_SYMMETRIC: 3263 if (sh->pd_idx == raid_disks-1) 3264 i--; /* Q D D D P */ 3265 else { 3266 /* D D P Q D */ 3267 if (i < sh->pd_idx) 3268 i += raid_disks; 3269 i -= (sh->pd_idx + 2); 3270 } 3271 break; 3272 case ALGORITHM_PARITY_0: 3273 i -= 2; 3274 break; 3275 case ALGORITHM_PARITY_N: 3276 break; 3277 case ALGORITHM_ROTATING_N_CONTINUE: 3278 /* Like left_symmetric, but P is before Q */ 3279 if (sh->pd_idx == 0) 3280 i--; /* P D D D Q */ 3281 else { 3282 /* D D Q P D */ 3283 if (i < sh->pd_idx) 3284 i += raid_disks; 3285 i -= (sh->pd_idx + 1); 3286 } 3287 break; 3288 case ALGORITHM_LEFT_ASYMMETRIC_6: 3289 case ALGORITHM_RIGHT_ASYMMETRIC_6: 3290 if (i > sh->pd_idx) 3291 i--; 3292 break; 3293 case ALGORITHM_LEFT_SYMMETRIC_6: 3294 case ALGORITHM_RIGHT_SYMMETRIC_6: 3295 if (i < sh->pd_idx) 3296 i += data_disks + 1; 3297 i -= (sh->pd_idx + 1); 3298 break; 3299 case ALGORITHM_PARITY_0_6: 3300 i -= 1; 3301 break; 3302 default: 3303 BUG(); 3304 } 3305 break; 3306 } 3307 3308 chunk_number = stripe * data_disks + i; 3309 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3310 3311 check = raid5_compute_sector(conf, r_sector, 3312 previous, &dummy1, &sh2); 3313 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3314 || sh2.qd_idx != sh->qd_idx) { 3315 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3316 mdname(conf->mddev)); 3317 return 0; 3318 } 3319 return r_sector; 3320 } 3321 3322 /* 3323 * There are cases where we want handle_stripe_dirtying() and 3324 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3325 * 3326 * This function checks whether we want to delay the towrite. Specifically, 3327 * we delay the towrite when: 3328 * 3329 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3330 * stripe has data in journal (for other devices). 3331 * 3332 * In this case, when reading data for the non-overwrite dev, it is 3333 * necessary to handle complex rmw of write back cache (prexor with 3334 * orig_page, and xor with page). To keep read path simple, we would 3335 * like to flush data in journal to RAID disks first, so complex rmw 3336 * is handled in the write patch (handle_stripe_dirtying). 3337 * 3338 * 2. 
when journal space is critical (R5C_LOG_CRITICAL=1)
 *
 *    It is important to be able to flush all stripes in raid5-cache.
 *    Therefore, we need to reserve some space on the journal device for
 *    these flushes. If the flush operation includes pending writes to the
 *    stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
 *    for the flush out. If we exclude these pending writes from the flush
 *    operation, we only need (conf->max_degraded + 1) pages per stripe.
 *    Therefore, excluding pending writes in these cases enables more
 *    efficient use of the journal device.
 *
 *    Note: To make sure the stripe makes progress, we only delay
 *    towrite for stripes with data already in journal (injournal > 0).
 *    When LOG_CRITICAL, stripes with injournal == 0 will be sent to
 *    no_space_stripes list.
 *
 *   3. during journal failure
 *    On journal failure, we try to flush all cached data to raid disks
 *    based on data in the stripe cache. The array is read-only to upper
 *    layers, so we skip all pending writes.
 *
 */
static inline bool delay_towrite(struct r5conf *conf,
				 struct r5dev *dev,
				 struct stripe_head_state *s)
{
	/* case 1 above */
	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
	    !test_bit(R5_Insync, &dev->flags) && s->injournal)
		return true;
	/* case 2 above */
	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    s->injournal > 0)
		return true;
	/* case 3 above */
	if (s->log_failed && s->injournal)
		return true;
	return false;
}

static void
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			int rcw, int expand)
{
	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;

	if (rcw) {
		/*
		 * In some cases, handle_stripe_dirtying initially decided to
		 * run rmw and allocated an extra page for prexor. However, rcw
		 * turns out to be cheaper later on. We need to free the extra
		 * page now, because we won't be able to do that in
		 * ops_complete_prexor().
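		 * (that callback only runs when a prexor is actually
		 *  scheduled, and the rcw path below never schedules one)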
3392 */ 3393 r5c_release_extra_page(sh); 3394 3395 for (i = disks; i--; ) { 3396 struct r5dev *dev = &sh->dev[i]; 3397 3398 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3399 set_bit(R5_LOCKED, &dev->flags); 3400 set_bit(R5_Wantdrain, &dev->flags); 3401 if (!expand) 3402 clear_bit(R5_UPTODATE, &dev->flags); 3403 s->locked++; 3404 } else if (test_bit(R5_InJournal, &dev->flags)) { 3405 set_bit(R5_LOCKED, &dev->flags); 3406 s->locked++; 3407 } 3408 } 3409 /* if we are not expanding this is a proper write request, and 3410 * there will be bios with new data to be drained into the 3411 * stripe cache 3412 */ 3413 if (!expand) { 3414 if (!s->locked) 3415 /* False alarm, nothing to do */ 3416 return; 3417 sh->reconstruct_state = reconstruct_state_drain_run; 3418 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3419 } else 3420 sh->reconstruct_state = reconstruct_state_run; 3421 3422 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3423 3424 if (s->locked + conf->max_degraded == disks) 3425 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3426 atomic_inc(&conf->pending_full_writes); 3427 } else { 3428 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3429 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3430 BUG_ON(level == 6 && 3431 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3432 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3433 3434 for (i = disks; i--; ) { 3435 struct r5dev *dev = &sh->dev[i]; 3436 if (i == pd_idx || i == qd_idx) 3437 continue; 3438 3439 if (dev->towrite && 3440 (test_bit(R5_UPTODATE, &dev->flags) || 3441 test_bit(R5_Wantcompute, &dev->flags))) { 3442 set_bit(R5_Wantdrain, &dev->flags); 3443 set_bit(R5_LOCKED, &dev->flags); 3444 clear_bit(R5_UPTODATE, &dev->flags); 3445 s->locked++; 3446 } else if (test_bit(R5_InJournal, &dev->flags)) { 3447 set_bit(R5_LOCKED, &dev->flags); 3448 s->locked++; 3449 } 3450 } 3451 if (!s->locked) 3452 /* False alarm - nothing to do */ 3453 return; 3454 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3455 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3456 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3457 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3458 } 3459 3460 /* keep the parity disk(s) locked while asynchronous operations 3461 * are in flight 3462 */ 3463 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3464 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3465 s->locked++; 3466 3467 if (level == 6) { 3468 int qd_idx = sh->qd_idx; 3469 struct r5dev *dev = &sh->dev[qd_idx]; 3470 3471 set_bit(R5_LOCKED, &dev->flags); 3472 clear_bit(R5_UPTODATE, &dev->flags); 3473 s->locked++; 3474 } 3475 3476 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && 3477 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3478 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3479 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3480 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3481 3482 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3483 __func__, (unsigned long long)sh->sector, 3484 s->locked, s->ops_request); 3485 } 3486 3487 static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, 3488 int dd_idx, int forwrite) 3489 { 3490 struct r5conf *conf = sh->raid_conf; 3491 struct bio **bip; 3492 3493 pr_debug("checking bi b#%llu to stripe s#%llu\n", 3494 bi->bi_iter.bi_sector, sh->sector); 3495 3496 /* Don't allow new IO added to stripes in batch list */ 3497 if (sh->batch_head) 3498 return true; 3499 3500 if (forwrite) 3501 bip = &sh->dev[dd_idx].towrite; 3502 else 3503 bip = &sh->dev[dd_idx].toread; 3504 
3505 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3506 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3507 return true; 3508 bip = &(*bip)->bi_next; 3509 } 3510 3511 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3512 return true; 3513 3514 if (forwrite && raid5_has_ppl(conf)) { 3515 /* 3516 * With PPL only writes to consecutive data chunks within a 3517 * stripe are allowed because for a single stripe_head we can 3518 * only have one PPL entry at a time, which describes one data 3519 * range. Not really an overlap, but wait_for_overlap can be 3520 * used to handle this. 3521 */ 3522 sector_t sector; 3523 sector_t first = 0; 3524 sector_t last = 0; 3525 int count = 0; 3526 int i; 3527 3528 for (i = 0; i < sh->disks; i++) { 3529 if (i != sh->pd_idx && 3530 (i == dd_idx || sh->dev[i].towrite)) { 3531 sector = sh->dev[i].sector; 3532 if (count == 0 || sector < first) 3533 first = sector; 3534 if (sector > last) 3535 last = sector; 3536 count++; 3537 } 3538 } 3539 3540 if (first + conf->chunk_sectors * (count - 1) != last) 3541 return true; 3542 } 3543 3544 return false; 3545 } 3546 3547 static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, 3548 int dd_idx, int forwrite, int previous) 3549 { 3550 struct r5conf *conf = sh->raid_conf; 3551 struct bio **bip; 3552 int firstwrite = 0; 3553 3554 if (forwrite) { 3555 bip = &sh->dev[dd_idx].towrite; 3556 if (!*bip) 3557 firstwrite = 1; 3558 } else { 3559 bip = &sh->dev[dd_idx].toread; 3560 } 3561 3562 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) 3563 bip = &(*bip)->bi_next; 3564 3565 if (!forwrite || previous) 3566 clear_bit(STRIPE_BATCH_READY, &sh->state); 3567 3568 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3569 if (*bip) 3570 bi->bi_next = *bip; 3571 *bip = bi; 3572 bio_inc_remaining(bi); 3573 md_write_inc(conf->mddev, bi); 3574 3575 if (forwrite) { 3576 /* check if page is covered */ 3577 sector_t sector = sh->dev[dd_idx].sector; 3578 for (bi=sh->dev[dd_idx].towrite; 3579 sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) && 3580 bi && bi->bi_iter.bi_sector <= sector; 3581 bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) { 3582 if (bio_end_sector(bi) >= sector) 3583 sector = bio_end_sector(bi); 3584 } 3585 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf)) 3586 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3587 sh->overwrite_disks++; 3588 } 3589 3590 pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n", 3591 (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, 3592 sh->dev[dd_idx].sector); 3593 3594 if (conf->mddev->bitmap && firstwrite) { 3595 /* Cannot hold spinlock over bitmap_startwrite, 3596 * but must ensure this isn't added to a batch until 3597 * we have added to the bitmap and set bm_seq. 3598 * So set STRIPE_BITMAP_PENDING to prevent 3599 * batching. 3600 * If multiple __add_stripe_bio() calls race here they 3601 * much all set STRIPE_BITMAP_PENDING. So only the first one 3602 * to complete "bitmap_startwrite" gets to set 3603 * STRIPE_BIT_DELAY. This is important as once a stripe 3604 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3605 * any more. 
3606 */ 3607 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3608 spin_unlock_irq(&sh->stripe_lock); 3609 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3610 RAID5_STRIPE_SECTORS(conf), 0); 3611 spin_lock_irq(&sh->stripe_lock); 3612 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3613 if (!sh->batch_head) { 3614 sh->bm_seq = conf->seq_flush+1; 3615 set_bit(STRIPE_BIT_DELAY, &sh->state); 3616 } 3617 } 3618 } 3619 3620 /* 3621 * Each stripe/dev can have one or more bios attached. 3622 * toread/towrite point to the first in a chain. 3623 * The bi_next chain must be in order. 3624 */ 3625 static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi, 3626 int dd_idx, int forwrite, int previous) 3627 { 3628 spin_lock_irq(&sh->stripe_lock); 3629 3630 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { 3631 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3632 spin_unlock_irq(&sh->stripe_lock); 3633 return false; 3634 } 3635 3636 __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); 3637 spin_unlock_irq(&sh->stripe_lock); 3638 return true; 3639 } 3640 3641 static void end_reshape(struct r5conf *conf); 3642 3643 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3644 struct stripe_head *sh) 3645 { 3646 int sectors_per_chunk = 3647 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3648 int dd_idx; 3649 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3650 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3651 3652 raid5_compute_sector(conf, 3653 stripe * (disks - conf->max_degraded) 3654 *sectors_per_chunk + chunk_offset, 3655 previous, 3656 &dd_idx, sh); 3657 } 3658 3659 static void 3660 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3661 struct stripe_head_state *s, int disks) 3662 { 3663 int i; 3664 BUG_ON(sh->batch_head); 3665 for (i = disks; i--; ) { 3666 struct bio *bi; 3667 int bitmap_end = 0; 3668 3669 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3670 struct md_rdev *rdev; 3671 rcu_read_lock(); 3672 rdev = rcu_dereference(conf->disks[i].rdev); 3673 if (rdev && test_bit(In_sync, &rdev->flags) && 3674 !test_bit(Faulty, &rdev->flags)) 3675 atomic_inc(&rdev->nr_pending); 3676 else 3677 rdev = NULL; 3678 rcu_read_unlock(); 3679 if (rdev) { 3680 if (!rdev_set_badblocks( 3681 rdev, 3682 sh->sector, 3683 RAID5_STRIPE_SECTORS(conf), 0)) 3684 md_error(conf->mddev, rdev); 3685 rdev_dec_pending(rdev, conf->mddev); 3686 } 3687 } 3688 spin_lock_irq(&sh->stripe_lock); 3689 /* fail all writes first */ 3690 bi = sh->dev[i].towrite; 3691 sh->dev[i].towrite = NULL; 3692 sh->overwrite_disks = 0; 3693 spin_unlock_irq(&sh->stripe_lock); 3694 if (bi) 3695 bitmap_end = 1; 3696 3697 log_stripe_write_finished(sh); 3698 3699 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3700 wake_up(&conf->wait_for_overlap); 3701 3702 while (bi && bi->bi_iter.bi_sector < 3703 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { 3704 struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector); 3705 3706 md_write_end(conf->mddev); 3707 bio_io_error(bi); 3708 bi = nextbi; 3709 } 3710 if (bitmap_end) 3711 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3712 RAID5_STRIPE_SECTORS(conf), 0, 0); 3713 bitmap_end = 0; 3714 /* and fail all 'written' */ 3715 bi = sh->dev[i].written; 3716 sh->dev[i].written = NULL; 3717 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3718 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3719 sh->dev[i].page = sh->dev[i].orig_page; 3720 } 3721 3722 if (bi) bitmap_end = 1; 3723 while (bi && 
bi->bi_iter.bi_sector < 3724 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { 3725 struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); 3726 3727 md_write_end(conf->mddev); 3728 bio_io_error(bi); 3729 bi = bi2; 3730 } 3731 3732 /* fail any reads if this device is non-operational and 3733 * the data has not reached the cache yet. 3734 */ 3735 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3736 s->failed > conf->max_degraded && 3737 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3738 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3739 spin_lock_irq(&sh->stripe_lock); 3740 bi = sh->dev[i].toread; 3741 sh->dev[i].toread = NULL; 3742 spin_unlock_irq(&sh->stripe_lock); 3743 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3744 wake_up(&conf->wait_for_overlap); 3745 if (bi) 3746 s->to_read--; 3747 while (bi && bi->bi_iter.bi_sector < 3748 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { 3749 struct bio *nextbi = 3750 r5_next_bio(conf, bi, sh->dev[i].sector); 3751 3752 bio_io_error(bi); 3753 bi = nextbi; 3754 } 3755 } 3756 if (bitmap_end) 3757 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3758 RAID5_STRIPE_SECTORS(conf), 0, 0); 3759 /* If we were in the middle of a write the parity block might 3760 * still be locked - so just clear all R5_LOCKED flags 3761 */ 3762 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3763 } 3764 s->to_write = 0; 3765 s->written = 0; 3766 3767 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3768 if (atomic_dec_and_test(&conf->pending_full_writes)) 3769 md_wakeup_thread(conf->mddev->thread); 3770 } 3771 3772 static void 3773 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3774 struct stripe_head_state *s) 3775 { 3776 int abort = 0; 3777 int i; 3778 3779 BUG_ON(sh->batch_head); 3780 clear_bit(STRIPE_SYNCING, &sh->state); 3781 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3782 wake_up(&conf->wait_for_overlap); 3783 s->syncing = 0; 3784 s->replacing = 0; 3785 /* There is nothing more to do for sync/check/repair. 3786 * Don't even need to abort as that is handled elsewhere 3787 * if needed, and not always wanted e.g. if there is a known 3788 * bad block here. 
3789 * For recover/replace we need to record a bad block on all 3790 * non-sync devices, or abort the recovery 3791 */ 3792 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3793 /* During recovery devices cannot be removed, so 3794 * locking and refcounting of rdevs is not needed 3795 */ 3796 rcu_read_lock(); 3797 for (i = 0; i < conf->raid_disks; i++) { 3798 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3799 if (rdev 3800 && !test_bit(Faulty, &rdev->flags) 3801 && !test_bit(In_sync, &rdev->flags) 3802 && !rdev_set_badblocks(rdev, sh->sector, 3803 RAID5_STRIPE_SECTORS(conf), 0)) 3804 abort = 1; 3805 rdev = rcu_dereference(conf->disks[i].replacement); 3806 if (rdev 3807 && !test_bit(Faulty, &rdev->flags) 3808 && !test_bit(In_sync, &rdev->flags) 3809 && !rdev_set_badblocks(rdev, sh->sector, 3810 RAID5_STRIPE_SECTORS(conf), 0)) 3811 abort = 1; 3812 } 3813 rcu_read_unlock(); 3814 if (abort) 3815 conf->recovery_disabled = 3816 conf->mddev->recovery_disabled; 3817 } 3818 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort); 3819 } 3820 3821 static int want_replace(struct stripe_head *sh, int disk_idx) 3822 { 3823 struct md_rdev *rdev; 3824 int rv = 0; 3825 3826 rcu_read_lock(); 3827 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3828 if (rdev 3829 && !test_bit(Faulty, &rdev->flags) 3830 && !test_bit(In_sync, &rdev->flags) 3831 && (rdev->recovery_offset <= sh->sector 3832 || rdev->mddev->recovery_cp <= sh->sector)) 3833 rv = 1; 3834 rcu_read_unlock(); 3835 return rv; 3836 } 3837 3838 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3839 int disk_idx, int disks) 3840 { 3841 struct r5dev *dev = &sh->dev[disk_idx]; 3842 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3843 &sh->dev[s->failed_num[1]] }; 3844 int i; 3845 bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW); 3846 3847 3848 if (test_bit(R5_LOCKED, &dev->flags) || 3849 test_bit(R5_UPTODATE, &dev->flags)) 3850 /* No point reading this as we already have it or have 3851 * decided to get it. 3852 */ 3853 return 0; 3854 3855 if (dev->toread || 3856 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3857 /* We need this block to directly satisfy a request */ 3858 return 1; 3859 3860 if (s->syncing || s->expanding || 3861 (s->replacing && want_replace(sh, disk_idx))) 3862 /* When syncing, or expanding we read everything. 3863 * When replacing, we need the replaced block. 3864 */ 3865 return 1; 3866 3867 if ((s->failed >= 1 && fdev[0]->toread) || 3868 (s->failed >= 2 && fdev[1]->toread)) 3869 /* If we want to read from a failed device, then 3870 * we need to actually read every other device. 3871 */ 3872 return 1; 3873 3874 /* Sometimes neither read-modify-write nor reconstruct-write 3875 * cycles can work. In those cases we read every block we 3876 * can. Then the parity-update is certain to have enough to 3877 * work with. 3878 * This can only be a problem when we need to write something, 3879 * and some device has failed. If either of those tests 3880 * fail we need look no further. 3881 */ 3882 if (!s->failed || !s->to_write) 3883 return 0; 3884 3885 if (test_bit(R5_Insync, &dev->flags) && 3886 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3887 /* Pre-reads at not permitted until after short delay 3888 * to gather multiple requests. However if this 3889 * device is no Insync, the block could only be computed 3890 * and there is no need to delay that. 
3891 */ 3892 return 0; 3893 3894 for (i = 0; i < s->failed && i < 2; i++) { 3895 if (fdev[i]->towrite && 3896 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3897 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3898 /* If we have a partial write to a failed 3899 * device, then we will need to reconstruct 3900 * the content of that device, so all other 3901 * devices must be read. 3902 */ 3903 return 1; 3904 3905 if (s->failed >= 2 && 3906 (fdev[i]->towrite || 3907 s->failed_num[i] == sh->pd_idx || 3908 s->failed_num[i] == sh->qd_idx) && 3909 !test_bit(R5_UPTODATE, &fdev[i]->flags)) 3910 /* In max degraded raid6, If the failed disk is P, Q, 3911 * or we want to read the failed disk, we need to do 3912 * reconstruct-write. 3913 */ 3914 force_rcw = true; 3915 } 3916 3917 /* If we are forced to do a reconstruct-write, because parity 3918 * cannot be trusted and we are currently recovering it, there 3919 * is extra need to be careful. 3920 * If one of the devices that we would need to read, because 3921 * it is not being overwritten (and maybe not written at all) 3922 * is missing/faulty, then we need to read everything we can. 3923 */ 3924 if (!force_rcw && 3925 sh->sector < sh->raid_conf->mddev->recovery_cp) 3926 /* reconstruct-write isn't being forced */ 3927 return 0; 3928 for (i = 0; i < s->failed && i < 2; i++) { 3929 if (s->failed_num[i] != sh->pd_idx && 3930 s->failed_num[i] != sh->qd_idx && 3931 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3932 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3933 return 1; 3934 } 3935 3936 return 0; 3937 } 3938 3939 /* fetch_block - checks the given member device to see if its data needs 3940 * to be read or computed to satisfy a request. 3941 * 3942 * Returns 1 when no more member devices need to be checked, otherwise returns 3943 * 0 to tell the loop in handle_stripe_fill to continue 3944 */ 3945 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3946 int disk_idx, int disks) 3947 { 3948 struct r5dev *dev = &sh->dev[disk_idx]; 3949 3950 /* is the data in this block needed, and can we get it? */ 3951 if (need_this_block(sh, s, disk_idx, disks)) { 3952 /* we would like to get this block, possibly by computing it, 3953 * otherwise read it if the backing disk is insync 3954 */ 3955 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3956 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3957 BUG_ON(sh->batch_head); 3958 3959 /* 3960 * In the raid6 case if the only non-uptodate disk is P 3961 * then we already trusted P to compute the other failed 3962 * drives. It is safe to compute rather than re-read P. 3963 * In other cases we only compute blocks from failed 3964 * devices, otherwise check/repair might fail to detect 3965 * a real inconsistency. 3966 */ 3967 3968 if ((s->uptodate == disks - 1) && 3969 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || 3970 (s->failed && (disk_idx == s->failed_num[0] || 3971 disk_idx == s->failed_num[1])))) { 3972 /* have disk failed, and we're requested to fetch it; 3973 * do compute it 3974 */ 3975 pr_debug("Computing stripe %llu block %d\n", 3976 (unsigned long long)sh->sector, disk_idx); 3977 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3978 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3979 set_bit(R5_Wantcompute, &dev->flags); 3980 sh->ops.target = disk_idx; 3981 sh->ops.target2 = -1; /* no 2nd target */ 3982 s->req_compute = 1; 3983 /* Careful: from this point on 'uptodate' is in the eye 3984 * of raid_run_ops which services 'compute' operations 3985 * before writes. 
R5_Wantcompute flags a block that will 3986 * be R5_UPTODATE by the time it is needed for a 3987 * subsequent operation. 3988 */ 3989 s->uptodate++; 3990 return 1; 3991 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3992 /* Computing 2-failure is *very* expensive; only 3993 * do it if failed >= 2 3994 */ 3995 int other; 3996 for (other = disks; other--; ) { 3997 if (other == disk_idx) 3998 continue; 3999 if (!test_bit(R5_UPTODATE, 4000 &sh->dev[other].flags)) 4001 break; 4002 } 4003 BUG_ON(other < 0); 4004 pr_debug("Computing stripe %llu blocks %d,%d\n", 4005 (unsigned long long)sh->sector, 4006 disk_idx, other); 4007 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4008 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4009 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 4010 set_bit(R5_Wantcompute, &sh->dev[other].flags); 4011 sh->ops.target = disk_idx; 4012 sh->ops.target2 = other; 4013 s->uptodate += 2; 4014 s->req_compute = 1; 4015 return 1; 4016 } else if (test_bit(R5_Insync, &dev->flags)) { 4017 set_bit(R5_LOCKED, &dev->flags); 4018 set_bit(R5_Wantread, &dev->flags); 4019 s->locked++; 4020 pr_debug("Reading block %d (sync=%d)\n", 4021 disk_idx, s->syncing); 4022 } 4023 } 4024 4025 return 0; 4026 } 4027 4028 /* 4029 * handle_stripe_fill - read or compute data to satisfy pending requests. 4030 */ 4031 static void handle_stripe_fill(struct stripe_head *sh, 4032 struct stripe_head_state *s, 4033 int disks) 4034 { 4035 int i; 4036 4037 /* look for blocks to read/compute, skip this if a compute 4038 * is already in flight, or if the stripe contents are in the 4039 * midst of changing due to a write 4040 */ 4041 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 4042 !sh->reconstruct_state) { 4043 4044 /* 4045 * For degraded stripe with data in journal, do not handle 4046 * read requests yet, instead, flush the stripe to raid 4047 * disks first, this avoids handling complex rmw of write 4048 * back cache (prexor with orig_page, and then xor with 4049 * page) in the read path 4050 */ 4051 if (s->to_read && s->injournal && s->failed) { 4052 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 4053 r5c_make_stripe_write_out(sh); 4054 goto out; 4055 } 4056 4057 for (i = disks; i--; ) 4058 if (fetch_block(sh, s, i, disks)) 4059 break; 4060 } 4061 out: 4062 set_bit(STRIPE_HANDLE, &sh->state); 4063 } 4064 4065 static void break_stripe_batch_list(struct stripe_head *head_sh, 4066 unsigned long handle_flags); 4067 /* handle_stripe_clean_event 4068 * any written block on an uptodate or failed drive can be returned. 4069 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 4070 * never LOCKED, so we don't need to test 'failed' directly. 
4071 */ 4072 static void handle_stripe_clean_event(struct r5conf *conf, 4073 struct stripe_head *sh, int disks) 4074 { 4075 int i; 4076 struct r5dev *dev; 4077 int discard_pending = 0; 4078 struct stripe_head *head_sh = sh; 4079 bool do_endio = false; 4080 4081 for (i = disks; i--; ) 4082 if (sh->dev[i].written) { 4083 dev = &sh->dev[i]; 4084 if (!test_bit(R5_LOCKED, &dev->flags) && 4085 (test_bit(R5_UPTODATE, &dev->flags) || 4086 test_bit(R5_Discard, &dev->flags) || 4087 test_bit(R5_SkipCopy, &dev->flags))) { 4088 /* We can return any write requests */ 4089 struct bio *wbi, *wbi2; 4090 pr_debug("Return write for disc %d\n", i); 4091 if (test_and_clear_bit(R5_Discard, &dev->flags)) 4092 clear_bit(R5_UPTODATE, &dev->flags); 4093 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 4094 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 4095 } 4096 do_endio = true; 4097 4098 returnbi: 4099 dev->page = dev->orig_page; 4100 wbi = dev->written; 4101 dev->written = NULL; 4102 while (wbi && wbi->bi_iter.bi_sector < 4103 dev->sector + RAID5_STRIPE_SECTORS(conf)) { 4104 wbi2 = r5_next_bio(conf, wbi, dev->sector); 4105 md_write_end(conf->mddev); 4106 bio_endio(wbi); 4107 wbi = wbi2; 4108 } 4109 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 4110 RAID5_STRIPE_SECTORS(conf), 4111 !test_bit(STRIPE_DEGRADED, &sh->state), 4112 0); 4113 if (head_sh->batch_head) { 4114 sh = list_first_entry(&sh->batch_list, 4115 struct stripe_head, 4116 batch_list); 4117 if (sh != head_sh) { 4118 dev = &sh->dev[i]; 4119 goto returnbi; 4120 } 4121 } 4122 sh = head_sh; 4123 dev = &sh->dev[i]; 4124 } else if (test_bit(R5_Discard, &dev->flags)) 4125 discard_pending = 1; 4126 } 4127 4128 log_stripe_write_finished(sh); 4129 4130 if (!discard_pending && 4131 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 4132 int hash; 4133 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 4134 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4135 if (sh->qd_idx >= 0) { 4136 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 4137 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 4138 } 4139 /* now that discard is done we can proceed with any sync */ 4140 clear_bit(STRIPE_DISCARD, &sh->state); 4141 /* 4142 * SCSI discard will change some bio fields and the stripe has 4143 * no updated data, so remove it from hash list and the stripe 4144 * will be reinitialized 4145 */ 4146 unhash: 4147 hash = sh->hash_lock_index; 4148 spin_lock_irq(conf->hash_locks + hash); 4149 remove_hash(sh); 4150 spin_unlock_irq(conf->hash_locks + hash); 4151 if (head_sh->batch_head) { 4152 sh = list_first_entry(&sh->batch_list, 4153 struct stripe_head, batch_list); 4154 if (sh != head_sh) 4155 goto unhash; 4156 } 4157 sh = head_sh; 4158 4159 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 4160 set_bit(STRIPE_HANDLE, &sh->state); 4161 4162 } 4163 4164 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 4165 if (atomic_dec_and_test(&conf->pending_full_writes)) 4166 md_wakeup_thread(conf->mddev->thread); 4167 4168 if (head_sh->batch_head && do_endio) 4169 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 4170 } 4171 4172 /* 4173 * For RMW in write back cache, we need extra page in prexor to store the 4174 * old data. This page is stored in dev->orig_page. 4175 * 4176 * This function checks whether we have data for prexor. 
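 * (R5_UPTODATE alone is not enough for an R5_InJournal block: dev->page may already carry the new data, so the old copy kept in dev->orig_page must also be valid, which is what R5_OrigPageUPTDODATE tracks.)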
The exact logic 4177 * is: 4178 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 4179 */ 4180 static inline bool uptodate_for_rmw(struct r5dev *dev) 4181 { 4182 return (test_bit(R5_UPTODATE, &dev->flags)) && 4183 (!test_bit(R5_InJournal, &dev->flags) || 4184 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 4185 } 4186 4187 static int handle_stripe_dirtying(struct r5conf *conf, 4188 struct stripe_head *sh, 4189 struct stripe_head_state *s, 4190 int disks) 4191 { 4192 int rmw = 0, rcw = 0, i; 4193 sector_t recovery_cp = conf->mddev->recovery_cp; 4194 4195 /* Check whether resync is now happening or should start. 4196 * If yes, then the array is dirty (after unclean shutdown or 4197 * initial creation), so parity in some stripes might be inconsistent. 4198 * In this case, we need to always do reconstruct-write, to ensure 4199 * that in case of drive failure or read-error correction, we 4200 * generate correct data from the parity. 4201 */ 4202 if (conf->rmw_level == PARITY_DISABLE_RMW || 4203 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 4204 s->failed == 0)) { 4205 /* Calculate the real rcw later - for now make it 4206 * look like rcw is cheaper 4207 */ 4208 rcw = 1; rmw = 2; 4209 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 4210 conf->rmw_level, (unsigned long long)recovery_cp, 4211 (unsigned long long)sh->sector); 4212 } else for (i = disks; i--; ) { 4213 /* would I have to read this buffer for read_modify_write */ 4214 struct r5dev *dev = &sh->dev[i]; 4215 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 4216 i == sh->pd_idx || i == sh->qd_idx || 4217 test_bit(R5_InJournal, &dev->flags)) && 4218 !test_bit(R5_LOCKED, &dev->flags) && 4219 !(uptodate_for_rmw(dev) || 4220 test_bit(R5_Wantcompute, &dev->flags))) { 4221 if (test_bit(R5_Insync, &dev->flags)) 4222 rmw++; 4223 else 4224 rmw += 2*disks; /* cannot read it */ 4225 } 4226 /* Would I have to read this buffer for reconstruct_write */ 4227 if (!test_bit(R5_OVERWRITE, &dev->flags) && 4228 i != sh->pd_idx && i != sh->qd_idx && 4229 !test_bit(R5_LOCKED, &dev->flags) && 4230 !(test_bit(R5_UPTODATE, &dev->flags) || 4231 test_bit(R5_Wantcompute, &dev->flags))) { 4232 if (test_bit(R5_Insync, &dev->flags)) 4233 rcw++; 4234 else 4235 rcw += 2*disks; 4236 } 4237 } 4238 4239 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 4240 (unsigned long long)sh->sector, sh->state, rmw, rcw); 4241 set_bit(STRIPE_HANDLE, &sh->state); 4242 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 4243 /* prefer read-modify-write, but need to get some data */ 4244 if (conf->mddev->queue) 4245 blk_add_trace_msg(conf->mddev->queue, 4246 "raid5 rmw %llu %d", 4247 (unsigned long long)sh->sector, rmw); 4248 for (i = disks; i--; ) { 4249 struct r5dev *dev = &sh->dev[i]; 4250 if (test_bit(R5_InJournal, &dev->flags) && 4251 dev->page == dev->orig_page && 4252 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 4253 /* alloc page for prexor */ 4254 struct page *p = alloc_page(GFP_NOIO); 4255 4256 if (p) { 4257 dev->orig_page = p; 4258 continue; 4259 } 4260 4261 /* 4262 * alloc_page() failed, try use 4263 * disk_info->extra_page 4264 */ 4265 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 4266 &conf->cache_state)) { 4267 r5c_use_extra_page(sh); 4268 break; 4269 } 4270 4271 /* extra_page in use, add to delayed_list */ 4272 set_bit(STRIPE_DELAYED, &sh->state); 4273 s->waiting_extra_page = 1; 4274 return -EAGAIN; 4275 } 4276 } 4277 4278 for (i = disks; i--; ) { 4279 struct r5dev *dev = 
&sh->dev[i]; 4280 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 4281 i == sh->pd_idx || i == sh->qd_idx || 4282 test_bit(R5_InJournal, &dev->flags)) && 4283 !test_bit(R5_LOCKED, &dev->flags) && 4284 !(uptodate_for_rmw(dev) || 4285 test_bit(R5_Wantcompute, &dev->flags)) && 4286 test_bit(R5_Insync, &dev->flags)) { 4287 if (test_bit(STRIPE_PREREAD_ACTIVE, 4288 &sh->state)) { 4289 pr_debug("Read_old block %d for r-m-w\n", 4290 i); 4291 set_bit(R5_LOCKED, &dev->flags); 4292 set_bit(R5_Wantread, &dev->flags); 4293 s->locked++; 4294 } else 4295 set_bit(STRIPE_DELAYED, &sh->state); 4296 } 4297 } 4298 } 4299 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 4300 /* want reconstruct write, but need to get some data */ 4301 int qread =0; 4302 rcw = 0; 4303 for (i = disks; i--; ) { 4304 struct r5dev *dev = &sh->dev[i]; 4305 if (!test_bit(R5_OVERWRITE, &dev->flags) && 4306 i != sh->pd_idx && i != sh->qd_idx && 4307 !test_bit(R5_LOCKED, &dev->flags) && 4308 !(test_bit(R5_UPTODATE, &dev->flags) || 4309 test_bit(R5_Wantcompute, &dev->flags))) { 4310 rcw++; 4311 if (test_bit(R5_Insync, &dev->flags) && 4312 test_bit(STRIPE_PREREAD_ACTIVE, 4313 &sh->state)) { 4314 pr_debug("Read_old block " 4315 "%d for Reconstruct\n", i); 4316 set_bit(R5_LOCKED, &dev->flags); 4317 set_bit(R5_Wantread, &dev->flags); 4318 s->locked++; 4319 qread++; 4320 } else 4321 set_bit(STRIPE_DELAYED, &sh->state); 4322 } 4323 } 4324 if (rcw && conf->mddev->queue) 4325 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 4326 (unsigned long long)sh->sector, 4327 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 4328 } 4329 4330 if (rcw > disks && rmw > disks && 4331 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4332 set_bit(STRIPE_DELAYED, &sh->state); 4333 4334 /* now if nothing is locked, and if we have enough data, 4335 * we can start a write request 4336 */ 4337 /* since handle_stripe can be called at any time we need to handle the 4338 * case where a compute block operation has been submitted and then a 4339 * subsequent call wants to start a write request. raid_run_ops only 4340 * handles the case where compute block and reconstruct are requested 4341 * simultaneously. If this is not the case then new writes need to be 4342 * held off until the compute completes. 
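 * The test below therefore schedules reconstruction only when a compute was requested in this pass (s->req_compute) or no compute is currently running, and only once nothing is locked, the bitmap is not delaying us, and one of the rmw/rcw read counts has reached zero.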
4343 */ 4344 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4345 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4346 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4347 schedule_reconstruction(sh, s, rcw == 0, 0); 4348 return 0; 4349 } 4350 4351 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4352 struct stripe_head_state *s, int disks) 4353 { 4354 struct r5dev *dev = NULL; 4355 4356 BUG_ON(sh->batch_head); 4357 set_bit(STRIPE_HANDLE, &sh->state); 4358 4359 switch (sh->check_state) { 4360 case check_state_idle: 4361 /* start a new check operation if there are no failures */ 4362 if (s->failed == 0) { 4363 BUG_ON(s->uptodate != disks); 4364 sh->check_state = check_state_run; 4365 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4366 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4367 s->uptodate--; 4368 break; 4369 } 4370 dev = &sh->dev[s->failed_num[0]]; 4371 fallthrough; 4372 case check_state_compute_result: 4373 sh->check_state = check_state_idle; 4374 if (!dev) 4375 dev = &sh->dev[sh->pd_idx]; 4376 4377 /* check that a write has not made the stripe insync */ 4378 if (test_bit(STRIPE_INSYNC, &sh->state)) 4379 break; 4380 4381 /* either failed parity check, or recovery is happening */ 4382 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4383 BUG_ON(s->uptodate != disks); 4384 4385 set_bit(R5_LOCKED, &dev->flags); 4386 s->locked++; 4387 set_bit(R5_Wantwrite, &dev->flags); 4388 4389 clear_bit(STRIPE_DEGRADED, &sh->state); 4390 set_bit(STRIPE_INSYNC, &sh->state); 4391 break; 4392 case check_state_run: 4393 break; /* we will be called again upon completion */ 4394 case check_state_check_result: 4395 sh->check_state = check_state_idle; 4396 4397 /* if a failure occurred during the check operation, leave 4398 * STRIPE_INSYNC not set and let the stripe be handled again 4399 */ 4400 if (s->failed) 4401 break; 4402 4403 /* handle a successful check operation, if parity is correct 4404 * we are done. Otherwise update the mismatch count and repair 4405 * parity if !MD_RECOVERY_CHECK 4406 */ 4407 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4408 /* parity is correct (on disc, 4409 * not in buffer any more) 4410 */ 4411 set_bit(STRIPE_INSYNC, &sh->state); 4412 else { 4413 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); 4414 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4415 /* don't try to repair!! 
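MD_RECOVERY_CHECK means this is a check-only pass, so just record the mismatch (counted above) and mark the stripe in sync.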
*/ 4416 set_bit(STRIPE_INSYNC, &sh->state); 4417 pr_warn_ratelimited("%s: mismatch sector in range " 4418 "%llu-%llu\n", mdname(conf->mddev), 4419 (unsigned long long) sh->sector, 4420 (unsigned long long) sh->sector + 4421 RAID5_STRIPE_SECTORS(conf)); 4422 } else { 4423 sh->check_state = check_state_compute_run; 4424 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4425 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4426 set_bit(R5_Wantcompute, 4427 &sh->dev[sh->pd_idx].flags); 4428 sh->ops.target = sh->pd_idx; 4429 sh->ops.target2 = -1; 4430 s->uptodate++; 4431 } 4432 } 4433 break; 4434 case check_state_compute_run: 4435 break; 4436 default: 4437 pr_err("%s: unknown check_state: %d sector: %llu\n", 4438 __func__, sh->check_state, 4439 (unsigned long long) sh->sector); 4440 BUG(); 4441 } 4442 } 4443 4444 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4445 struct stripe_head_state *s, 4446 int disks) 4447 { 4448 int pd_idx = sh->pd_idx; 4449 int qd_idx = sh->qd_idx; 4450 struct r5dev *dev; 4451 4452 BUG_ON(sh->batch_head); 4453 set_bit(STRIPE_HANDLE, &sh->state); 4454 4455 BUG_ON(s->failed > 2); 4456 4457 /* Want to check and possibly repair P and Q. 4458 * However there could be one 'failed' device, in which 4459 * case we can only check one of them, possibly using the 4460 * other to generate missing data 4461 */ 4462 4463 switch (sh->check_state) { 4464 case check_state_idle: 4465 /* start a new check operation if there are < 2 failures */ 4466 if (s->failed == s->q_failed) { 4467 /* The only possible failed device holds Q, so it 4468 * makes sense to check P (If anything else were failed, 4469 * we would have used P to recreate it). 4470 */ 4471 sh->check_state = check_state_run; 4472 } 4473 if (!s->q_failed && s->failed < 2) { 4474 /* Q is not failed, and we didn't use it to generate 4475 * anything, so it makes sense to check it 4476 */ 4477 if (sh->check_state == check_state_run) 4478 sh->check_state = check_state_run_pq; 4479 else 4480 sh->check_state = check_state_run_q; 4481 } 4482 4483 /* discard potentially stale zero_sum_result */ 4484 sh->ops.zero_sum_result = 0; 4485 4486 if (sh->check_state == check_state_run) { 4487 /* async_xor_zero_sum destroys the contents of P */ 4488 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4489 s->uptodate--; 4490 } 4491 if (sh->check_state >= check_state_run && 4492 sh->check_state <= check_state_run_pq) { 4493 /* async_syndrome_zero_sum preserves P and Q, so 4494 * no need to mark them !uptodate here 4495 */ 4496 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4497 break; 4498 } 4499 4500 /* we have 2-disk failure */ 4501 BUG_ON(s->failed != 2); 4502 fallthrough; 4503 case check_state_compute_result: 4504 sh->check_state = check_state_idle; 4505 4506 /* check that a write has not made the stripe insync */ 4507 if (test_bit(STRIPE_INSYNC, &sh->state)) 4508 break; 4509 4510 /* now write out any block on a failed drive, 4511 * or P or Q if they were recomputed 4512 */ 4513 dev = NULL; 4514 if (s->failed == 2) { 4515 dev = &sh->dev[s->failed_num[1]]; 4516 s->locked++; 4517 set_bit(R5_LOCKED, &dev->flags); 4518 set_bit(R5_Wantwrite, &dev->flags); 4519 } 4520 if (s->failed >= 1) { 4521 dev = &sh->dev[s->failed_num[0]]; 4522 s->locked++; 4523 set_bit(R5_LOCKED, &dev->flags); 4524 set_bit(R5_Wantwrite, &dev->flags); 4525 } 4526 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4527 dev = &sh->dev[pd_idx]; 4528 s->locked++; 4529 set_bit(R5_LOCKED, &dev->flags); 4530 set_bit(R5_Wantwrite, &dev->flags); 4531 } 4532 if 
(sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4533 dev = &sh->dev[qd_idx]; 4534 s->locked++; 4535 set_bit(R5_LOCKED, &dev->flags); 4536 set_bit(R5_Wantwrite, &dev->flags); 4537 } 4538 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags), 4539 "%s: disk%td not up to date\n", 4540 mdname(conf->mddev), 4541 dev - (struct r5dev *) &sh->dev)) { 4542 clear_bit(R5_LOCKED, &dev->flags); 4543 clear_bit(R5_Wantwrite, &dev->flags); 4544 s->locked--; 4545 } 4546 clear_bit(STRIPE_DEGRADED, &sh->state); 4547 4548 set_bit(STRIPE_INSYNC, &sh->state); 4549 break; 4550 case check_state_run: 4551 case check_state_run_q: 4552 case check_state_run_pq: 4553 break; /* we will be called again upon completion */ 4554 case check_state_check_result: 4555 sh->check_state = check_state_idle; 4556 4557 /* handle a successful check operation, if parity is correct 4558 * we are done. Otherwise update the mismatch count and repair 4559 * parity if !MD_RECOVERY_CHECK 4560 */ 4561 if (sh->ops.zero_sum_result == 0) { 4562 /* both parities are correct */ 4563 if (!s->failed) 4564 set_bit(STRIPE_INSYNC, &sh->state); 4565 else { 4566 /* in contrast to the raid5 case we can validate 4567 * parity, but still have a failure to write 4568 * back 4569 */ 4570 sh->check_state = check_state_compute_result; 4571 /* Returning at this point means that we may go 4572 * off and bring p and/or q uptodate again so 4573 * we make sure to check zero_sum_result again 4574 * to verify if p or q need writeback 4575 */ 4576 } 4577 } else { 4578 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); 4579 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4580 /* don't try to repair!! */ 4581 set_bit(STRIPE_INSYNC, &sh->state); 4582 pr_warn_ratelimited("%s: mismatch sector in range " 4583 "%llu-%llu\n", mdname(conf->mddev), 4584 (unsigned long long) sh->sector, 4585 (unsigned long long) sh->sector + 4586 RAID5_STRIPE_SECTORS(conf)); 4587 } else { 4588 int *target = &sh->ops.target; 4589 4590 sh->ops.target = -1; 4591 sh->ops.target2 = -1; 4592 sh->check_state = check_state_compute_run; 4593 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4594 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4595 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4596 set_bit(R5_Wantcompute, 4597 &sh->dev[pd_idx].flags); 4598 *target = pd_idx; 4599 target = &sh->ops.target2; 4600 s->uptodate++; 4601 } 4602 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4603 set_bit(R5_Wantcompute, 4604 &sh->dev[qd_idx].flags); 4605 *target = qd_idx; 4606 s->uptodate++; 4607 } 4608 } 4609 } 4610 break; 4611 case check_state_compute_run: 4612 break; 4613 default: 4614 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4615 __func__, sh->check_state, 4616 (unsigned long long) sh->sector); 4617 BUG(); 4618 } 4619 } 4620 4621 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4622 { 4623 int i; 4624 4625 /* We have read all the blocks in this stripe and now we need to 4626 * copy some of them into a target stripe for expand. 
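 * Each data block is copied with async_memcpy() into the stripe it maps to under the new layout; once every data block of a target stripe is marked R5_Expanded, that stripe is flagged STRIPE_EXPAND_READY so it can be handled and written out once its parity is computed.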
4627 */ 4628 struct dma_async_tx_descriptor *tx = NULL; 4629 BUG_ON(sh->batch_head); 4630 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4631 for (i = 0; i < sh->disks; i++) 4632 if (i != sh->pd_idx && i != sh->qd_idx) { 4633 int dd_idx, j; 4634 struct stripe_head *sh2; 4635 struct async_submit_ctl submit; 4636 4637 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4638 sector_t s = raid5_compute_sector(conf, bn, 0, 4639 &dd_idx, NULL); 4640 sh2 = raid5_get_active_stripe(conf, NULL, s, 4641 R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); 4642 if (sh2 == NULL) 4643 /* so far only the early blocks of this stripe 4644 * have been requested. When later blocks 4645 * get requested, we will try again 4646 */ 4647 continue; 4648 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4649 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4650 /* must have already done this block */ 4651 raid5_release_stripe(sh2); 4652 continue; 4653 } 4654 4655 /* place all the copies on one channel */ 4656 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4657 tx = async_memcpy(sh2->dev[dd_idx].page, 4658 sh->dev[i].page, sh2->dev[dd_idx].offset, 4659 sh->dev[i].offset, RAID5_STRIPE_SIZE(conf), 4660 &submit); 4661 4662 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4663 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4664 for (j = 0; j < conf->raid_disks; j++) 4665 if (j != sh2->pd_idx && 4666 j != sh2->qd_idx && 4667 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4668 break; 4669 if (j == conf->raid_disks) { 4670 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4671 set_bit(STRIPE_HANDLE, &sh2->state); 4672 } 4673 raid5_release_stripe(sh2); 4674 4675 } 4676 /* done submitting copies, wait for them to complete */ 4677 async_tx_quiesce(&tx); 4678 } 4679 4680 /* 4681 * handle_stripe - do things to a stripe. 4682 * 4683 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4684 * state of various bits to see what needs to be done. 
4685 * Possible results: 4686 * return some read requests which now have data 4687 * return some write requests which are safely on storage 4688 * schedule a read on some buffers 4689 * schedule a write of some buffers 4690 * return confirmation of parity correctness 4691 * 4692 */ 4693 4694 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4695 { 4696 struct r5conf *conf = sh->raid_conf; 4697 int disks = sh->disks; 4698 struct r5dev *dev; 4699 int i; 4700 int do_recovery = 0; 4701 4702 memset(s, 0, sizeof(*s)); 4703 4704 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4705 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4706 s->failed_num[0] = -1; 4707 s->failed_num[1] = -1; 4708 s->log_failed = r5l_log_disk_error(conf); 4709 4710 /* Now to look around and see what can be done */ 4711 rcu_read_lock(); 4712 for (i=disks; i--; ) { 4713 struct md_rdev *rdev; 4714 sector_t first_bad; 4715 int bad_sectors; 4716 int is_bad = 0; 4717 4718 dev = &sh->dev[i]; 4719 4720 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4721 i, dev->flags, 4722 dev->toread, dev->towrite, dev->written); 4723 /* maybe we can reply to a read 4724 * 4725 * new wantfill requests are only permitted while 4726 * ops_complete_biofill is guaranteed to be inactive 4727 */ 4728 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4729 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4730 set_bit(R5_Wantfill, &dev->flags); 4731 4732 /* now count some things */ 4733 if (test_bit(R5_LOCKED, &dev->flags)) 4734 s->locked++; 4735 if (test_bit(R5_UPTODATE, &dev->flags)) 4736 s->uptodate++; 4737 if (test_bit(R5_Wantcompute, &dev->flags)) { 4738 s->compute++; 4739 BUG_ON(s->compute > 2); 4740 } 4741 4742 if (test_bit(R5_Wantfill, &dev->flags)) 4743 s->to_fill++; 4744 else if (dev->toread) 4745 s->to_read++; 4746 if (dev->towrite) { 4747 s->to_write++; 4748 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4749 s->non_overwrite++; 4750 } 4751 if (dev->written) 4752 s->written++; 4753 /* Prefer to use the replacement for reads, but only 4754 * if it is recovered enough and has no bad blocks. 
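 * Otherwise we fall back to the primary rdev and, if a live replacement exists, record R5_NeedReplace so the block can be copied to it later.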
4755 */ 4756 rdev = rcu_dereference(conf->disks[i].replacement); 4757 if (rdev && !test_bit(Faulty, &rdev->flags) && 4758 rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && 4759 !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 4760 &first_bad, &bad_sectors)) 4761 set_bit(R5_ReadRepl, &dev->flags); 4762 else { 4763 if (rdev && !test_bit(Faulty, &rdev->flags)) 4764 set_bit(R5_NeedReplace, &dev->flags); 4765 else 4766 clear_bit(R5_NeedReplace, &dev->flags); 4767 rdev = rcu_dereference(conf->disks[i].rdev); 4768 clear_bit(R5_ReadRepl, &dev->flags); 4769 } 4770 if (rdev && test_bit(Faulty, &rdev->flags)) 4771 rdev = NULL; 4772 if (rdev) { 4773 is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 4774 &first_bad, &bad_sectors); 4775 if (s->blocked_rdev == NULL 4776 && (test_bit(Blocked, &rdev->flags) 4777 || is_bad < 0)) { 4778 if (is_bad < 0) 4779 set_bit(BlockedBadBlocks, 4780 &rdev->flags); 4781 s->blocked_rdev = rdev; 4782 atomic_inc(&rdev->nr_pending); 4783 } 4784 } 4785 clear_bit(R5_Insync, &dev->flags); 4786 if (!rdev) 4787 /* Not in-sync */; 4788 else if (is_bad) { 4789 /* also not in-sync */ 4790 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4791 test_bit(R5_UPTODATE, &dev->flags)) { 4792 /* treat as in-sync, but with a read error 4793 * which we can now try to correct 4794 */ 4795 set_bit(R5_Insync, &dev->flags); 4796 set_bit(R5_ReadError, &dev->flags); 4797 } 4798 } else if (test_bit(In_sync, &rdev->flags)) 4799 set_bit(R5_Insync, &dev->flags); 4800 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) 4801 /* in sync if before recovery_offset */ 4802 set_bit(R5_Insync, &dev->flags); 4803 else if (test_bit(R5_UPTODATE, &dev->flags) && 4804 test_bit(R5_Expanded, &dev->flags)) 4805 /* If we've reshaped into here, we assume it is Insync. 4806 * We will shortly update recovery_offset to make 4807 * it official. 
4808 */ 4809 set_bit(R5_Insync, &dev->flags); 4810 4811 if (test_bit(R5_WriteError, &dev->flags)) { 4812 /* This flag does not apply to '.replacement' 4813 * only to .rdev, so make sure to check that*/ 4814 struct md_rdev *rdev2 = rcu_dereference( 4815 conf->disks[i].rdev); 4816 if (rdev2 == rdev) 4817 clear_bit(R5_Insync, &dev->flags); 4818 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4819 s->handle_bad_blocks = 1; 4820 atomic_inc(&rdev2->nr_pending); 4821 } else 4822 clear_bit(R5_WriteError, &dev->flags); 4823 } 4824 if (test_bit(R5_MadeGood, &dev->flags)) { 4825 /* This flag does not apply to '.replacement' 4826 * only to .rdev, so make sure to check that*/ 4827 struct md_rdev *rdev2 = rcu_dereference( 4828 conf->disks[i].rdev); 4829 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4830 s->handle_bad_blocks = 1; 4831 atomic_inc(&rdev2->nr_pending); 4832 } else 4833 clear_bit(R5_MadeGood, &dev->flags); 4834 } 4835 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4836 struct md_rdev *rdev2 = rcu_dereference( 4837 conf->disks[i].replacement); 4838 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4839 s->handle_bad_blocks = 1; 4840 atomic_inc(&rdev2->nr_pending); 4841 } else 4842 clear_bit(R5_MadeGoodRepl, &dev->flags); 4843 } 4844 if (!test_bit(R5_Insync, &dev->flags)) { 4845 /* The ReadError flag will just be confusing now */ 4846 clear_bit(R5_ReadError, &dev->flags); 4847 clear_bit(R5_ReWrite, &dev->flags); 4848 } 4849 if (test_bit(R5_ReadError, &dev->flags)) 4850 clear_bit(R5_Insync, &dev->flags); 4851 if (!test_bit(R5_Insync, &dev->flags)) { 4852 if (s->failed < 2) 4853 s->failed_num[s->failed] = i; 4854 s->failed++; 4855 if (rdev && !test_bit(Faulty, &rdev->flags)) 4856 do_recovery = 1; 4857 else if (!rdev) { 4858 rdev = rcu_dereference( 4859 conf->disks[i].replacement); 4860 if (rdev && !test_bit(Faulty, &rdev->flags)) 4861 do_recovery = 1; 4862 } 4863 } 4864 4865 if (test_bit(R5_InJournal, &dev->flags)) 4866 s->injournal++; 4867 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4868 s->just_cached++; 4869 } 4870 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4871 /* If there is a failed device being replaced, 4872 * we must be recovering. 4873 * else if we are after recovery_cp, we must be syncing 4874 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4875 * else we can only be replacing 4876 * sync and recovery both need to read all devices, and so 4877 * use the same flag. 4878 */ 4879 if (do_recovery || 4880 sh->sector >= conf->mddev->recovery_cp || 4881 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4882 s->syncing = 1; 4883 else 4884 s->replacing = 1; 4885 } 4886 rcu_read_unlock(); 4887 } 4888 4889 /* 4890 * Return '1' if this is a member of batch, or '0' if it is a lone stripe or 4891 * a head which can now be handled. 
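 * A return of '1' makes handle_stripe() bail out; such a stripe is processed as part of its batch head instead.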
4892 */ 4893 static int clear_batch_ready(struct stripe_head *sh) 4894 { 4895 struct stripe_head *tmp; 4896 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4897 return (sh->batch_head && sh->batch_head != sh); 4898 spin_lock(&sh->stripe_lock); 4899 if (!sh->batch_head) { 4900 spin_unlock(&sh->stripe_lock); 4901 return 0; 4902 } 4903 4904 /* 4905 * this stripe could be added to a batch list before we check 4906 * BATCH_READY, skips it 4907 */ 4908 if (sh->batch_head != sh) { 4909 spin_unlock(&sh->stripe_lock); 4910 return 1; 4911 } 4912 spin_lock(&sh->batch_lock); 4913 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4914 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4915 spin_unlock(&sh->batch_lock); 4916 spin_unlock(&sh->stripe_lock); 4917 4918 /* 4919 * BATCH_READY is cleared, no new stripes can be added. 4920 * batch_list can be accessed without lock 4921 */ 4922 return 0; 4923 } 4924 4925 static void break_stripe_batch_list(struct stripe_head *head_sh, 4926 unsigned long handle_flags) 4927 { 4928 struct stripe_head *sh, *next; 4929 int i; 4930 int do_wakeup = 0; 4931 4932 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4933 4934 list_del_init(&sh->batch_list); 4935 4936 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4937 (1 << STRIPE_SYNCING) | 4938 (1 << STRIPE_REPLACED) | 4939 (1 << STRIPE_DELAYED) | 4940 (1 << STRIPE_BIT_DELAY) | 4941 (1 << STRIPE_FULL_WRITE) | 4942 (1 << STRIPE_BIOFILL_RUN) | 4943 (1 << STRIPE_COMPUTE_RUN) | 4944 (1 << STRIPE_DISCARD) | 4945 (1 << STRIPE_BATCH_READY) | 4946 (1 << STRIPE_BATCH_ERR) | 4947 (1 << STRIPE_BITMAP_PENDING)), 4948 "stripe state: %lx\n", sh->state); 4949 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4950 (1 << STRIPE_REPLACED)), 4951 "head stripe state: %lx\n", head_sh->state); 4952 4953 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4954 (1 << STRIPE_PREREAD_ACTIVE) | 4955 (1 << STRIPE_DEGRADED) | 4956 (1 << STRIPE_ON_UNPLUG_LIST)), 4957 head_sh->state & (1 << STRIPE_INSYNC)); 4958 4959 sh->check_state = head_sh->check_state; 4960 sh->reconstruct_state = head_sh->reconstruct_state; 4961 spin_lock_irq(&sh->stripe_lock); 4962 sh->batch_head = NULL; 4963 spin_unlock_irq(&sh->stripe_lock); 4964 for (i = 0; i < sh->disks; i++) { 4965 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4966 do_wakeup = 1; 4967 sh->dev[i].flags = head_sh->dev[i].flags & 4968 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4969 } 4970 if (handle_flags == 0 || 4971 sh->state & handle_flags) 4972 set_bit(STRIPE_HANDLE, &sh->state); 4973 raid5_release_stripe(sh); 4974 } 4975 spin_lock_irq(&head_sh->stripe_lock); 4976 head_sh->batch_head = NULL; 4977 spin_unlock_irq(&head_sh->stripe_lock); 4978 for (i = 0; i < head_sh->disks; i++) 4979 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4980 do_wakeup = 1; 4981 if (head_sh->state & handle_flags) 4982 set_bit(STRIPE_HANDLE, &head_sh->state); 4983 4984 if (do_wakeup) 4985 wake_up(&head_sh->raid_conf->wait_for_overlap); 4986 } 4987 4988 static void handle_stripe(struct stripe_head *sh) 4989 { 4990 struct stripe_head_state s; 4991 struct r5conf *conf = sh->raid_conf; 4992 int i; 4993 int prexor; 4994 int disks = sh->disks; 4995 struct r5dev *pdev, *qdev; 4996 4997 clear_bit(STRIPE_HANDLE, &sh->state); 4998 4999 /* 5000 * handle_stripe should not continue handle the batched stripe, only 5001 * the head of batch list or lone stripe can continue. Otherwise we 5002 * could see break_stripe_batch_list warns about the STRIPE_ACTIVE 5003 * is set for the batched stripe. 
5004 */ 5005 if (clear_batch_ready(sh)) 5006 return; 5007 5008 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 5009 /* already being handled, ensure it gets handled 5010 * again when current action finishes */ 5011 set_bit(STRIPE_HANDLE, &sh->state); 5012 return; 5013 } 5014 5015 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 5016 break_stripe_batch_list(sh, 0); 5017 5018 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 5019 spin_lock(&sh->stripe_lock); 5020 /* 5021 * Cannot process 'sync' concurrently with 'discard'. 5022 * Flush data in r5cache before 'sync'. 5023 */ 5024 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 5025 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && 5026 !test_bit(STRIPE_DISCARD, &sh->state) && 5027 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 5028 set_bit(STRIPE_SYNCING, &sh->state); 5029 clear_bit(STRIPE_INSYNC, &sh->state); 5030 clear_bit(STRIPE_REPLACED, &sh->state); 5031 } 5032 spin_unlock(&sh->stripe_lock); 5033 } 5034 clear_bit(STRIPE_DELAYED, &sh->state); 5035 5036 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 5037 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 5038 (unsigned long long)sh->sector, sh->state, 5039 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 5040 sh->check_state, sh->reconstruct_state); 5041 5042 analyse_stripe(sh, &s); 5043 5044 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 5045 goto finish; 5046 5047 if (s.handle_bad_blocks || 5048 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 5049 set_bit(STRIPE_HANDLE, &sh->state); 5050 goto finish; 5051 } 5052 5053 if (unlikely(s.blocked_rdev)) { 5054 if (s.syncing || s.expanding || s.expanded || 5055 s.replacing || s.to_write || s.written) { 5056 set_bit(STRIPE_HANDLE, &sh->state); 5057 goto finish; 5058 } 5059 /* There is nothing for the blocked_rdev to block */ 5060 rdev_dec_pending(s.blocked_rdev, conf->mddev); 5061 s.blocked_rdev = NULL; 5062 } 5063 5064 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 5065 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 5066 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 5067 } 5068 5069 pr_debug("locked=%d uptodate=%d to_read=%d" 5070 " to_write=%d failed=%d failed_num=%d,%d\n", 5071 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 5072 s.failed_num[0], s.failed_num[1]); 5073 /* 5074 * check if the array has lost more than max_degraded devices and, 5075 * if so, some requests might need to be failed. 
5076 * 5077 * When journal device failed (log_failed), we will only process 5078 * the stripe if there is data need write to raid disks 5079 */ 5080 if (s.failed > conf->max_degraded || 5081 (s.log_failed && s.injournal == 0)) { 5082 sh->check_state = 0; 5083 sh->reconstruct_state = 0; 5084 break_stripe_batch_list(sh, 0); 5085 if (s.to_read+s.to_write+s.written) 5086 handle_failed_stripe(conf, sh, &s, disks); 5087 if (s.syncing + s.replacing) 5088 handle_failed_sync(conf, sh, &s); 5089 } 5090 5091 /* Now we check to see if any write operations have recently 5092 * completed 5093 */ 5094 prexor = 0; 5095 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 5096 prexor = 1; 5097 if (sh->reconstruct_state == reconstruct_state_drain_result || 5098 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 5099 sh->reconstruct_state = reconstruct_state_idle; 5100 5101 /* All the 'written' buffers and the parity block are ready to 5102 * be written back to disk 5103 */ 5104 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 5105 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 5106 BUG_ON(sh->qd_idx >= 0 && 5107 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 5108 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 5109 for (i = disks; i--; ) { 5110 struct r5dev *dev = &sh->dev[i]; 5111 if (test_bit(R5_LOCKED, &dev->flags) && 5112 (i == sh->pd_idx || i == sh->qd_idx || 5113 dev->written || test_bit(R5_InJournal, 5114 &dev->flags))) { 5115 pr_debug("Writing block %d\n", i); 5116 set_bit(R5_Wantwrite, &dev->flags); 5117 if (prexor) 5118 continue; 5119 if (s.failed > 1) 5120 continue; 5121 if (!test_bit(R5_Insync, &dev->flags) || 5122 ((i == sh->pd_idx || i == sh->qd_idx) && 5123 s.failed == 0)) 5124 set_bit(STRIPE_INSYNC, &sh->state); 5125 } 5126 } 5127 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5128 s.dec_preread_active = 1; 5129 } 5130 5131 /* 5132 * might be able to return some write requests if the parity blocks 5133 * are safe, or on a failed drive 5134 */ 5135 pdev = &sh->dev[sh->pd_idx]; 5136 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 5137 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 5138 qdev = &sh->dev[sh->qd_idx]; 5139 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 5140 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 5141 || conf->level < 6; 5142 5143 if (s.written && 5144 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 5145 && !test_bit(R5_LOCKED, &pdev->flags) 5146 && (test_bit(R5_UPTODATE, &pdev->flags) || 5147 test_bit(R5_Discard, &pdev->flags))))) && 5148 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 5149 && !test_bit(R5_LOCKED, &qdev->flags) 5150 && (test_bit(R5_UPTODATE, &qdev->flags) || 5151 test_bit(R5_Discard, &qdev->flags)))))) 5152 handle_stripe_clean_event(conf, sh, disks); 5153 5154 if (s.just_cached) 5155 r5c_handle_cached_data_endio(conf, sh, disks); 5156 log_stripe_write_finished(sh); 5157 5158 /* Now we might consider reading some blocks, either to check/generate 5159 * parity, or to satisfy requests 5160 * or to load a block that is being partially written. 5161 */ 5162 if (s.to_read || s.non_overwrite 5163 || (s.to_write && s.failed) 5164 || (s.syncing && (s.uptodate + s.compute < disks)) 5165 || s.replacing 5166 || s.expanding) 5167 handle_stripe_fill(sh, &s, disks); 5168 5169 /* 5170 * When the stripe finishes full journal write cycle (write to journal 5171 * and raid disk), this is the clean up procedure so it is ready for 5172 * next operation. 
5173 */ 5174 r5c_finish_stripe_write_out(conf, sh, &s); 5175 5176 /* 5177 * Now to consider new write requests, cache write back and what else, 5178 * if anything should be read. We do not handle new writes when: 5179 * 1/ A 'write' operation (copy+xor) is already in flight. 5180 * 2/ A 'check' operation is in flight, as it may clobber the parity 5181 * block. 5182 * 3/ A r5c cache log write is in flight. 5183 */ 5184 5185 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 5186 if (!r5c_is_writeback(conf->log)) { 5187 if (s.to_write) 5188 handle_stripe_dirtying(conf, sh, &s, disks); 5189 } else { /* write back cache */ 5190 int ret = 0; 5191 5192 /* First, try handle writes in caching phase */ 5193 if (s.to_write) 5194 ret = r5c_try_caching_write(conf, sh, &s, 5195 disks); 5196 /* 5197 * If caching phase failed: ret == -EAGAIN 5198 * OR 5199 * stripe under reclaim: !caching && injournal 5200 * 5201 * fall back to handle_stripe_dirtying() 5202 */ 5203 if (ret == -EAGAIN || 5204 /* stripe under reclaim: !caching && injournal */ 5205 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 5206 s.injournal > 0)) { 5207 ret = handle_stripe_dirtying(conf, sh, &s, 5208 disks); 5209 if (ret == -EAGAIN) 5210 goto finish; 5211 } 5212 } 5213 } 5214 5215 /* maybe we need to check and possibly fix the parity for this stripe 5216 * Any reads will already have been scheduled, so we just see if enough 5217 * data is available. The parity check is held off while parity 5218 * dependent operations are in flight. 5219 */ 5220 if (sh->check_state || 5221 (s.syncing && s.locked == 0 && 5222 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 5223 !test_bit(STRIPE_INSYNC, &sh->state))) { 5224 if (conf->level == 6) 5225 handle_parity_checks6(conf, sh, &s, disks); 5226 else 5227 handle_parity_checks5(conf, sh, &s, disks); 5228 } 5229 5230 if ((s.replacing || s.syncing) && s.locked == 0 5231 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 5232 && !test_bit(STRIPE_REPLACED, &sh->state)) { 5233 /* Write out to replacement devices where possible */ 5234 for (i = 0; i < conf->raid_disks; i++) 5235 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 5236 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 5237 set_bit(R5_WantReplace, &sh->dev[i].flags); 5238 set_bit(R5_LOCKED, &sh->dev[i].flags); 5239 s.locked++; 5240 } 5241 if (s.replacing) 5242 set_bit(STRIPE_INSYNC, &sh->state); 5243 set_bit(STRIPE_REPLACED, &sh->state); 5244 } 5245 if ((s.syncing || s.replacing) && s.locked == 0 && 5246 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 5247 test_bit(STRIPE_INSYNC, &sh->state)) { 5248 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); 5249 clear_bit(STRIPE_SYNCING, &sh->state); 5250 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 5251 wake_up(&conf->wait_for_overlap); 5252 } 5253 5254 /* If the failed drives are just a ReadError, then we might need 5255 * to progress the repair/check process 5256 */ 5257 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 5258 for (i = 0; i < s.failed; i++) { 5259 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 5260 if (test_bit(R5_ReadError, &dev->flags) 5261 && !test_bit(R5_LOCKED, &dev->flags) 5262 && test_bit(R5_UPTODATE, &dev->flags) 5263 ) { 5264 if (!test_bit(R5_ReWrite, &dev->flags)) { 5265 set_bit(R5_Wantwrite, &dev->flags); 5266 set_bit(R5_ReWrite, &dev->flags); 5267 } else 5268 /* let's read it back */ 5269 set_bit(R5_Wantread, &dev->flags); 5270 set_bit(R5_LOCKED, &dev->flags); 5271 s.locked++; 5272 } 5273 } 5274 5275 /* Finish reconstruct operations 
initiated by the expansion process */ 5276 if (sh->reconstruct_state == reconstruct_state_result) { 5277 struct stripe_head *sh_src 5278 = raid5_get_active_stripe(conf, NULL, sh->sector, 5279 R5_GAS_PREVIOUS | R5_GAS_NOBLOCK | 5280 R5_GAS_NOQUIESCE); 5281 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 5282 /* sh cannot be written until sh_src has been read. 5283 * so arrange for sh to be delayed a little 5284 */ 5285 set_bit(STRIPE_DELAYED, &sh->state); 5286 set_bit(STRIPE_HANDLE, &sh->state); 5287 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 5288 &sh_src->state)) 5289 atomic_inc(&conf->preread_active_stripes); 5290 raid5_release_stripe(sh_src); 5291 goto finish; 5292 } 5293 if (sh_src) 5294 raid5_release_stripe(sh_src); 5295 5296 sh->reconstruct_state = reconstruct_state_idle; 5297 clear_bit(STRIPE_EXPANDING, &sh->state); 5298 for (i = conf->raid_disks; i--; ) { 5299 set_bit(R5_Wantwrite, &sh->dev[i].flags); 5300 set_bit(R5_LOCKED, &sh->dev[i].flags); 5301 s.locked++; 5302 } 5303 } 5304 5305 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 5306 !sh->reconstruct_state) { 5307 /* Need to write out all blocks after computing parity */ 5308 sh->disks = conf->raid_disks; 5309 stripe_set_idx(sh->sector, conf, 0, sh); 5310 schedule_reconstruction(sh, &s, 1, 1); 5311 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 5312 clear_bit(STRIPE_EXPAND_READY, &sh->state); 5313 atomic_dec(&conf->reshape_stripes); 5314 wake_up(&conf->wait_for_overlap); 5315 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); 5316 } 5317 5318 if (s.expanding && s.locked == 0 && 5319 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 5320 handle_stripe_expansion(conf, sh); 5321 5322 finish: 5323 /* wait for this device to become unblocked */ 5324 if (unlikely(s.blocked_rdev)) { 5325 if (conf->mddev->external) 5326 md_wait_for_blocked_rdev(s.blocked_rdev, 5327 conf->mddev); 5328 else 5329 /* Internal metadata will immediately 5330 * be written by raid5d, so we don't 5331 * need to wait here. 5332 */ 5333 rdev_dec_pending(s.blocked_rdev, 5334 conf->mddev); 5335 } 5336 5337 if (s.handle_bad_blocks) 5338 for (i = disks; i--; ) { 5339 struct md_rdev *rdev; 5340 struct r5dev *dev = &sh->dev[i]; 5341 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 5342 /* We own a safe reference to the rdev */ 5343 rdev = rdev_pend_deref(conf->disks[i].rdev); 5344 if (!rdev_set_badblocks(rdev, sh->sector, 5345 RAID5_STRIPE_SECTORS(conf), 0)) 5346 md_error(conf->mddev, rdev); 5347 rdev_dec_pending(rdev, conf->mddev); 5348 } 5349 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 5350 rdev = rdev_pend_deref(conf->disks[i].rdev); 5351 rdev_clear_badblocks(rdev, sh->sector, 5352 RAID5_STRIPE_SECTORS(conf), 0); 5353 rdev_dec_pending(rdev, conf->mddev); 5354 } 5355 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 5356 rdev = rdev_pend_deref(conf->disks[i].replacement); 5357 if (!rdev) 5358 /* rdev have been moved down */ 5359 rdev = rdev_pend_deref(conf->disks[i].rdev); 5360 rdev_clear_badblocks(rdev, sh->sector, 5361 RAID5_STRIPE_SECTORS(conf), 0); 5362 rdev_dec_pending(rdev, conf->mddev); 5363 } 5364 } 5365 5366 if (s.ops_request) 5367 raid_run_ops(sh, s.ops_request); 5368 5369 ops_run_io(sh, &s); 5370 5371 if (s.dec_preread_active) { 5372 /* We delay this until after ops_run_io so that if make_request 5373 * is waiting on a flush, it won't continue until the writes 5374 * have actually been submitted. 
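 * Once the count drops below IO_THRESHOLD, raid5d is woken so that stripes parked on the delayed list can be activated again (see raid5_activate_delayed() below).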
5375 */ 5376 atomic_dec(&conf->preread_active_stripes); 5377 if (atomic_read(&conf->preread_active_stripes) < 5378 IO_THRESHOLD) 5379 md_wakeup_thread(conf->mddev->thread); 5380 } 5381 5382 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5383 } 5384 5385 static void raid5_activate_delayed(struct r5conf *conf) 5386 __must_hold(&conf->device_lock) 5387 { 5388 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5389 while (!list_empty(&conf->delayed_list)) { 5390 struct list_head *l = conf->delayed_list.next; 5391 struct stripe_head *sh; 5392 sh = list_entry(l, struct stripe_head, lru); 5393 list_del_init(l); 5394 clear_bit(STRIPE_DELAYED, &sh->state); 5395 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5396 atomic_inc(&conf->preread_active_stripes); 5397 list_add_tail(&sh->lru, &conf->hold_list); 5398 raid5_wakeup_stripe_thread(sh); 5399 } 5400 } 5401 } 5402 5403 static void activate_bit_delay(struct r5conf *conf, 5404 struct list_head *temp_inactive_list) 5405 __must_hold(&conf->device_lock) 5406 { 5407 struct list_head head; 5408 list_add(&head, &conf->bitmap_list); 5409 list_del_init(&conf->bitmap_list); 5410 while (!list_empty(&head)) { 5411 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5412 int hash; 5413 list_del_init(&sh->lru); 5414 atomic_inc(&sh->count); 5415 hash = sh->hash_lock_index; 5416 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5417 } 5418 } 5419 5420 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5421 { 5422 struct r5conf *conf = mddev->private; 5423 sector_t sector = bio->bi_iter.bi_sector; 5424 unsigned int chunk_sectors; 5425 unsigned int bio_sectors = bio_sectors(bio); 5426 5427 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5428 return chunk_sectors >= 5429 ((sector & (chunk_sectors - 1)) + bio_sectors); 5430 } 5431 5432 /* 5433 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5434 * later sampled by raid5d. 5435 */ 5436 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 5437 { 5438 unsigned long flags; 5439 5440 spin_lock_irqsave(&conf->device_lock, flags); 5441 5442 bi->bi_next = conf->retry_read_aligned_list; 5443 conf->retry_read_aligned_list = bi; 5444 5445 spin_unlock_irqrestore(&conf->device_lock, flags); 5446 md_wakeup_thread(conf->mddev->thread); 5447 } 5448 5449 static struct bio *remove_bio_from_retry(struct r5conf *conf, 5450 unsigned int *offset) 5451 { 5452 struct bio *bi; 5453 5454 bi = conf->retry_read_aligned; 5455 if (bi) { 5456 *offset = conf->retry_read_offset; 5457 conf->retry_read_aligned = NULL; 5458 return bi; 5459 } 5460 bi = conf->retry_read_aligned_list; 5461 if(bi) { 5462 conf->retry_read_aligned_list = bi->bi_next; 5463 bi->bi_next = NULL; 5464 *offset = 0; 5465 } 5466 5467 return bi; 5468 } 5469 5470 /* 5471 * The "raid5_align_endio" should check if the read succeeded and if it 5472 * did, call bio_endio on the original bio (having bio_put the new bio 5473 * first). 5474 * If the read failed.. 
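 * the bio is handed back for a retry: it is queued with add_bio_to_retry() and later picked up by raid5d via remove_bio_from_retry().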
5475 */ 5476 static void raid5_align_endio(struct bio *bi) 5477 { 5478 struct bio *raid_bi = bi->bi_private; 5479 struct md_rdev *rdev = (void *)raid_bi->bi_next; 5480 struct mddev *mddev = rdev->mddev; 5481 struct r5conf *conf = mddev->private; 5482 blk_status_t error = bi->bi_status; 5483 5484 bio_put(bi); 5485 raid_bi->bi_next = NULL; 5486 rdev_dec_pending(rdev, conf->mddev); 5487 5488 if (!error) { 5489 bio_endio(raid_bi); 5490 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5491 wake_up(&conf->wait_for_quiescent); 5492 return; 5493 } 5494 5495 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5496 5497 add_bio_to_retry(raid_bi, conf); 5498 } 5499 5500 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5501 { 5502 struct r5conf *conf = mddev->private; 5503 struct bio *align_bio; 5504 struct md_rdev *rdev; 5505 sector_t sector, end_sector, first_bad; 5506 int bad_sectors, dd_idx; 5507 bool did_inc; 5508 5509 if (!in_chunk_boundary(mddev, raid_bio)) { 5510 pr_debug("%s: non aligned\n", __func__); 5511 return 0; 5512 } 5513 5514 sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0, 5515 &dd_idx, NULL); 5516 end_sector = sector + bio_sectors(raid_bio); 5517 5518 rcu_read_lock(); 5519 if (r5c_big_stripe_cached(conf, sector)) 5520 goto out_rcu_unlock; 5521 5522 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5523 if (!rdev || test_bit(Faulty, &rdev->flags) || 5524 rdev->recovery_offset < end_sector) { 5525 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5526 if (!rdev) 5527 goto out_rcu_unlock; 5528 if (test_bit(Faulty, &rdev->flags) || 5529 !(test_bit(In_sync, &rdev->flags) || 5530 rdev->recovery_offset >= end_sector)) 5531 goto out_rcu_unlock; 5532 } 5533 5534 atomic_inc(&rdev->nr_pending); 5535 rcu_read_unlock(); 5536 5537 if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, 5538 &bad_sectors)) { 5539 rdev_dec_pending(rdev, mddev); 5540 return 0; 5541 } 5542 5543 md_account_bio(mddev, &raid_bio); 5544 raid_bio->bi_next = (void *)rdev; 5545 5546 align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, 5547 &mddev->bio_set); 5548 align_bio->bi_end_io = raid5_align_endio; 5549 align_bio->bi_private = raid_bio; 5550 align_bio->bi_iter.bi_sector = sector; 5551 5552 /* No reshape active, so we can trust rdev->data_offset */ 5553 align_bio->bi_iter.bi_sector += rdev->data_offset; 5554 5555 did_inc = false; 5556 if (conf->quiesce == 0) { 5557 atomic_inc(&conf->active_aligned_reads); 5558 did_inc = true; 5559 } 5560 /* need a memory barrier to detect the race with raid5_quiesce() */ 5561 if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) { 5562 /* quiesce is in progress, so we need to undo io activation and wait 5563 * for it to finish 5564 */ 5565 if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads)) 5566 wake_up(&conf->wait_for_quiescent); 5567 spin_lock_irq(&conf->device_lock); 5568 wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, 5569 conf->device_lock); 5570 atomic_inc(&conf->active_aligned_reads); 5571 spin_unlock_irq(&conf->device_lock); 5572 } 5573 5574 if (mddev->gendisk) 5575 trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), 5576 raid_bio->bi_iter.bi_sector); 5577 submit_bio_noacct(align_bio); 5578 return 1; 5579 5580 out_rcu_unlock: 5581 rcu_read_unlock(); 5582 return 0; 5583 } 5584 5585 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5586 { 5587 struct bio *split; 5588 sector_t sector = raid_bio->bi_iter.bi_sector; 5589 
unsigned chunk_sects = mddev->chunk_sectors; 5590 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5591 5592 if (sectors < bio_sectors(raid_bio)) { 5593 struct r5conf *conf = mddev->private; 5594 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); 5595 bio_chain(split, raid_bio); 5596 submit_bio_noacct(raid_bio); 5597 raid_bio = split; 5598 } 5599 5600 if (!raid5_read_one_chunk(mddev, raid_bio)) 5601 return raid_bio; 5602 5603 return NULL; 5604 } 5605 5606 /* __get_priority_stripe - get the next stripe to process 5607 * 5608 * Full stripe writes are allowed to pass preread active stripes up until 5609 * the bypass_threshold is exceeded. In general the bypass_count 5610 * increments when the handle_list is handled before the hold_list; however, it 5611 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5612 * stripe with in flight i/o. The bypass_count will be reset when the 5613 * head of the hold_list has changed, i.e. the head was promoted to the 5614 * handle_list. 5615 */ 5616 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5617 __must_hold(&conf->device_lock) 5618 { 5619 struct stripe_head *sh, *tmp; 5620 struct list_head *handle_list = NULL; 5621 struct r5worker_group *wg; 5622 bool second_try = !r5c_is_writeback(conf->log) && 5623 !r5l_log_disk_error(conf); 5624 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || 5625 r5l_log_disk_error(conf); 5626 5627 again: 5628 wg = NULL; 5629 sh = NULL; 5630 if (conf->worker_cnt_per_group == 0) { 5631 handle_list = try_loprio ? &conf->loprio_list : 5632 &conf->handle_list; 5633 } else if (group != ANY_GROUP) { 5634 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5635 &conf->worker_groups[group].handle_list; 5636 wg = &conf->worker_groups[group]; 5637 } else { 5638 int i; 5639 for (i = 0; i < conf->group_cnt; i++) { 5640 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5641 &conf->worker_groups[i].handle_list; 5642 wg = &conf->worker_groups[i]; 5643 if (!list_empty(handle_list)) 5644 break; 5645 } 5646 } 5647 5648 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5649 __func__, 5650 list_empty(handle_list) ? "empty" : "busy", 5651 list_empty(&conf->hold_list) ? 
"empty" : "busy", 5652 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5653 5654 if (!list_empty(handle_list)) { 5655 sh = list_entry(handle_list->next, typeof(*sh), lru); 5656 5657 if (list_empty(&conf->hold_list)) 5658 conf->bypass_count = 0; 5659 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5660 if (conf->hold_list.next == conf->last_hold) 5661 conf->bypass_count++; 5662 else { 5663 conf->last_hold = conf->hold_list.next; 5664 conf->bypass_count -= conf->bypass_threshold; 5665 if (conf->bypass_count < 0) 5666 conf->bypass_count = 0; 5667 } 5668 } 5669 } else if (!list_empty(&conf->hold_list) && 5670 ((conf->bypass_threshold && 5671 conf->bypass_count > conf->bypass_threshold) || 5672 atomic_read(&conf->pending_full_writes) == 0)) { 5673 5674 list_for_each_entry(tmp, &conf->hold_list, lru) { 5675 if (conf->worker_cnt_per_group == 0 || 5676 group == ANY_GROUP || 5677 !cpu_online(tmp->cpu) || 5678 cpu_to_group(tmp->cpu) == group) { 5679 sh = tmp; 5680 break; 5681 } 5682 } 5683 5684 if (sh) { 5685 conf->bypass_count -= conf->bypass_threshold; 5686 if (conf->bypass_count < 0) 5687 conf->bypass_count = 0; 5688 } 5689 wg = NULL; 5690 } 5691 5692 if (!sh) { 5693 if (second_try) 5694 return NULL; 5695 second_try = true; 5696 try_loprio = !try_loprio; 5697 goto again; 5698 } 5699 5700 if (wg) { 5701 wg->stripes_cnt--; 5702 sh->group = NULL; 5703 } 5704 list_del_init(&sh->lru); 5705 BUG_ON(atomic_inc_return(&sh->count) != 1); 5706 return sh; 5707 } 5708 5709 struct raid5_plug_cb { 5710 struct blk_plug_cb cb; 5711 struct list_head list; 5712 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5713 }; 5714 5715 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5716 { 5717 struct raid5_plug_cb *cb = container_of( 5718 blk_cb, struct raid5_plug_cb, cb); 5719 struct stripe_head *sh; 5720 struct mddev *mddev = cb->cb.data; 5721 struct r5conf *conf = mddev->private; 5722 int cnt = 0; 5723 int hash; 5724 5725 if (cb->list.next && !list_empty(&cb->list)) { 5726 spin_lock_irq(&conf->device_lock); 5727 while (!list_empty(&cb->list)) { 5728 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5729 list_del_init(&sh->lru); 5730 /* 5731 * avoid race release_stripe_plug() sees 5732 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5733 * is still in our list 5734 */ 5735 smp_mb__before_atomic(); 5736 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5737 /* 5738 * STRIPE_ON_RELEASE_LIST could be set here. 
In that 5739 * case, the count is always > 1 here 5740 */ 5741 hash = sh->hash_lock_index; 5742 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5743 cnt++; 5744 } 5745 spin_unlock_irq(&conf->device_lock); 5746 } 5747 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5748 NR_STRIPE_HASH_LOCKS); 5749 if (mddev->queue) 5750 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5751 kfree(cb); 5752 } 5753 5754 static void release_stripe_plug(struct mddev *mddev, 5755 struct stripe_head *sh) 5756 { 5757 struct blk_plug_cb *blk_cb = blk_check_plugged( 5758 raid5_unplug, mddev, 5759 sizeof(struct raid5_plug_cb)); 5760 struct raid5_plug_cb *cb; 5761 5762 if (!blk_cb) { 5763 raid5_release_stripe(sh); 5764 return; 5765 } 5766 5767 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5768 5769 if (cb->list.next == NULL) { 5770 int i; 5771 INIT_LIST_HEAD(&cb->list); 5772 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5773 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5774 } 5775 5776 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5777 list_add_tail(&sh->lru, &cb->list); 5778 else 5779 raid5_release_stripe(sh); 5780 } 5781 5782 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5783 { 5784 struct r5conf *conf = mddev->private; 5785 sector_t logical_sector, last_sector; 5786 struct stripe_head *sh; 5787 int stripe_sectors; 5788 5789 /* We need to handle this when io_uring supports discard/trim */ 5790 if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT)) 5791 return; 5792 5793 if (mddev->reshape_position != MaxSector) 5794 /* Skip discard while reshape is happening */ 5795 return; 5796 5797 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); 5798 last_sector = bio_end_sector(bi); 5799 5800 bi->bi_next = NULL; 5801 5802 stripe_sectors = conf->chunk_sectors * 5803 (conf->raid_disks - conf->max_degraded); 5804 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5805 stripe_sectors); 5806 sector_div(last_sector, stripe_sectors); 5807 5808 logical_sector *= conf->chunk_sectors; 5809 last_sector *= conf->chunk_sectors; 5810 5811 for (; logical_sector < last_sector; 5812 logical_sector += RAID5_STRIPE_SECTORS(conf)) { 5813 DEFINE_WAIT(w); 5814 int d; 5815 again: 5816 sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); 5817 prepare_to_wait(&conf->wait_for_overlap, &w, 5818 TASK_UNINTERRUPTIBLE); 5819 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5820 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5821 raid5_release_stripe(sh); 5822 schedule(); 5823 goto again; 5824 } 5825 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5826 spin_lock_irq(&sh->stripe_lock); 5827 for (d = 0; d < conf->raid_disks; d++) { 5828 if (d == sh->pd_idx || d == sh->qd_idx) 5829 continue; 5830 if (sh->dev[d].towrite || sh->dev[d].toread) { 5831 set_bit(R5_Overlap, &sh->dev[d].flags); 5832 spin_unlock_irq(&sh->stripe_lock); 5833 raid5_release_stripe(sh); 5834 schedule(); 5835 goto again; 5836 } 5837 } 5838 set_bit(STRIPE_DISCARD, &sh->state); 5839 finish_wait(&conf->wait_for_overlap, &w); 5840 sh->overwrite_disks = 0; 5841 for (d = 0; d < conf->raid_disks; d++) { 5842 if (d == sh->pd_idx || d == sh->qd_idx) 5843 continue; 5844 sh->dev[d].towrite = bi; 5845 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5846 bio_inc_remaining(bi); 5847 md_write_inc(mddev, bi); 5848 sh->overwrite_disks++; 5849 } 5850 spin_unlock_irq(&sh->stripe_lock); 5851 if (conf->mddev->bitmap) { 5852 for (d = 0; 5853 d < conf->raid_disks - conf->max_degraded; 5854 d++) 5855 
md_bitmap_startwrite(mddev->bitmap, 5856 sh->sector, 5857 RAID5_STRIPE_SECTORS(conf), 5858 0); 5859 sh->bm_seq = conf->seq_flush + 1; 5860 set_bit(STRIPE_BIT_DELAY, &sh->state); 5861 } 5862 5863 set_bit(STRIPE_HANDLE, &sh->state); 5864 clear_bit(STRIPE_DELAYED, &sh->state); 5865 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5866 atomic_inc(&conf->preread_active_stripes); 5867 release_stripe_plug(mddev, sh); 5868 } 5869 5870 bio_endio(bi); 5871 } 5872 5873 static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, 5874 sector_t reshape_sector) 5875 { 5876 return mddev->reshape_backwards ? sector < reshape_sector : 5877 sector >= reshape_sector; 5878 } 5879 5880 static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min, 5881 sector_t max, sector_t reshape_sector) 5882 { 5883 return mddev->reshape_backwards ? max < reshape_sector : 5884 min >= reshape_sector; 5885 } 5886 5887 static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf, 5888 struct stripe_head *sh) 5889 { 5890 sector_t max_sector = 0, min_sector = MaxSector; 5891 bool ret = false; 5892 int dd_idx; 5893 5894 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { 5895 if (dd_idx == sh->pd_idx) 5896 continue; 5897 5898 min_sector = min(min_sector, sh->dev[dd_idx].sector); 5899 max_sector = min(max_sector, sh->dev[dd_idx].sector); 5900 } 5901 5902 spin_lock_irq(&conf->device_lock); 5903 5904 if (!range_ahead_of_reshape(mddev, min_sector, max_sector, 5905 conf->reshape_progress)) 5906 /* mismatch, need to try again */ 5907 ret = true; 5908 5909 spin_unlock_irq(&conf->device_lock); 5910 5911 return ret; 5912 } 5913 5914 static int add_all_stripe_bios(struct r5conf *conf, 5915 struct stripe_request_ctx *ctx, struct stripe_head *sh, 5916 struct bio *bi, int forwrite, int previous) 5917 { 5918 int dd_idx; 5919 int ret = 1; 5920 5921 spin_lock_irq(&sh->stripe_lock); 5922 5923 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { 5924 struct r5dev *dev = &sh->dev[dd_idx]; 5925 5926 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 5927 continue; 5928 5929 if (dev->sector < ctx->first_sector || 5930 dev->sector >= ctx->last_sector) 5931 continue; 5932 5933 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { 5934 set_bit(R5_Overlap, &dev->flags); 5935 ret = 0; 5936 continue; 5937 } 5938 } 5939 5940 if (!ret) 5941 goto out; 5942 5943 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { 5944 struct r5dev *dev = &sh->dev[dd_idx]; 5945 5946 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 5947 continue; 5948 5949 if (dev->sector < ctx->first_sector || 5950 dev->sector >= ctx->last_sector) 5951 continue; 5952 5953 __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); 5954 clear_bit((dev->sector - ctx->first_sector) >> 5955 RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); 5956 } 5957 5958 out: 5959 spin_unlock_irq(&sh->stripe_lock); 5960 return ret; 5961 } 5962 5963 static enum stripe_result make_stripe_request(struct mddev *mddev, 5964 struct r5conf *conf, struct stripe_request_ctx *ctx, 5965 sector_t logical_sector, struct bio *bi) 5966 { 5967 const int rw = bio_data_dir(bi); 5968 enum stripe_result ret; 5969 struct stripe_head *sh; 5970 sector_t new_sector; 5971 int previous = 0, flags = 0; 5972 int seq, dd_idx; 5973 5974 seq = read_seqcount_begin(&conf->gen_lock); 5975 5976 if (unlikely(conf->reshape_progress != MaxSector)) { 5977 /* 5978 * Spinlock is needed as reshape_progress may be 5979 * 64bit on a 32bit platform, and so it might be 5980 * possible to see a half-updated value 5981 * Of course 
reshape_progress could change after 5982 * the lock is dropped, so once we get a reference 5983 * to the stripe that we think it is, we will have 5984 * to check again. 5985 */ 5986 spin_lock_irq(&conf->device_lock); 5987 if (ahead_of_reshape(mddev, logical_sector, 5988 conf->reshape_progress)) { 5989 previous = 1; 5990 } else { 5991 if (ahead_of_reshape(mddev, logical_sector, 5992 conf->reshape_safe)) { 5993 spin_unlock_irq(&conf->device_lock); 5994 return STRIPE_SCHEDULE_AND_RETRY; 5995 } 5996 } 5997 spin_unlock_irq(&conf->device_lock); 5998 } 5999 6000 new_sector = raid5_compute_sector(conf, logical_sector, previous, 6001 &dd_idx, NULL); 6002 pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, 6003 new_sector, logical_sector); 6004 6005 if (previous) 6006 flags |= R5_GAS_PREVIOUS; 6007 if (bi->bi_opf & REQ_RAHEAD) 6008 flags |= R5_GAS_NOBLOCK; 6009 sh = raid5_get_active_stripe(conf, ctx, new_sector, flags); 6010 if (unlikely(!sh)) { 6011 /* cannot get stripe, just give-up */ 6012 bi->bi_status = BLK_STS_IOERR; 6013 return STRIPE_FAIL; 6014 } 6015 6016 if (unlikely(previous) && 6017 stripe_ahead_of_reshape(mddev, conf, sh)) { 6018 /* 6019 * Expansion moved on while waiting for a stripe. 6020 * Expansion could still move past after this 6021 * test, but as we are holding a reference to 6022 * 'sh', we know that if that happens, 6023 * STRIPE_EXPANDING will get set and the expansion 6024 * won't proceed until we finish with the stripe. 6025 */ 6026 ret = STRIPE_SCHEDULE_AND_RETRY; 6027 goto out_release; 6028 } 6029 6030 if (read_seqcount_retry(&conf->gen_lock, seq)) { 6031 /* Might have got the wrong stripe_head by accident */ 6032 ret = STRIPE_RETRY; 6033 goto out_release; 6034 } 6035 6036 if (test_bit(STRIPE_EXPANDING, &sh->state) || 6037 !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { 6038 /* 6039 * Stripe is busy expanding or add failed due to 6040 * overlap. Flush everything and wait a while. 6041 */ 6042 md_wakeup_thread(mddev->thread); 6043 ret = STRIPE_SCHEDULE_AND_RETRY; 6044 goto out_release; 6045 } 6046 6047 if (stripe_can_batch(sh)) { 6048 stripe_add_to_batch_list(conf, sh, ctx->batch_last); 6049 if (ctx->batch_last) 6050 raid5_release_stripe(ctx->batch_last); 6051 atomic_inc(&sh->count); 6052 ctx->batch_last = sh; 6053 } 6054 6055 if (ctx->do_flush) { 6056 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 6057 /* we only need flush for one stripe */ 6058 ctx->do_flush = false; 6059 } 6060 6061 set_bit(STRIPE_HANDLE, &sh->state); 6062 clear_bit(STRIPE_DELAYED, &sh->state); 6063 if ((!sh->batch_head || sh == sh->batch_head) && 6064 (bi->bi_opf & REQ_SYNC) && 6065 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 6066 atomic_inc(&conf->preread_active_stripes); 6067 6068 release_stripe_plug(mddev, sh); 6069 return STRIPE_SUCCESS; 6070 6071 out_release: 6072 raid5_release_stripe(sh); 6073 return ret; 6074 } 6075 6076 /* 6077 * If the bio covers multiple data disks, find sector within the bio that has 6078 * the lowest chunk offset in the first chunk. 
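 *
 * For example (illustrative numbers only): with 64-sector chunks and a bio
 * covering logical sectors 120-199, sector 120 sits at chunk offset 56, so
 * only 8 sectors remain before the chunk boundary.  The function then
 * returns sector 128, the first sector of the next data chunk (chunk
 * offset 0); if the starting chunk had been the last data chunk of its
 * stripe, the original starting sector would be returned instead.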
6079 */ 6080 static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, 6081 struct bio *bi) 6082 { 6083 int sectors_per_chunk = conf->chunk_sectors; 6084 int raid_disks = conf->raid_disks; 6085 int dd_idx; 6086 struct stripe_head sh; 6087 unsigned int chunk_offset; 6088 sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); 6089 sector_t sector; 6090 6091 /* We pass in fake stripe_head to get back parity disk numbers */ 6092 sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, &sh); 6093 chunk_offset = sector_div(sector, sectors_per_chunk); 6094 if (sectors_per_chunk - chunk_offset >= bio_sectors(bi)) 6095 return r_sector; 6096 /* 6097 * Bio crosses to the next data disk. Check whether it's in the same 6098 * chunk. 6099 */ 6100 dd_idx++; 6101 while (dd_idx == sh.pd_idx || dd_idx == sh.qd_idx) 6102 dd_idx++; 6103 if (dd_idx >= raid_disks) 6104 return r_sector; 6105 return r_sector + sectors_per_chunk - chunk_offset; 6106 } 6107 6108 static bool raid5_make_request(struct mddev *mddev, struct bio * bi) 6109 { 6110 DEFINE_WAIT_FUNC(wait, woken_wake_function); 6111 struct r5conf *conf = mddev->private; 6112 sector_t logical_sector; 6113 struct stripe_request_ctx ctx = {}; 6114 const int rw = bio_data_dir(bi); 6115 enum stripe_result res; 6116 int s, stripe_cnt; 6117 6118 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 6119 int ret = log_handle_flush_request(conf, bi); 6120 6121 if (ret == 0) 6122 return true; 6123 if (ret == -ENODEV) { 6124 if (md_flush_request(mddev, bi)) 6125 return true; 6126 } 6127 /* ret == -EAGAIN, fallback */ 6128 /* 6129 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 6130 * we need to flush journal device 6131 */ 6132 ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; 6133 } 6134 6135 if (!md_write_start(mddev, bi)) 6136 return false; 6137 /* 6138 * If array is degraded, better not do chunk aligned read because 6139 * later we might have to read it again in order to reconstruct 6140 * data on failed drives. 6141 */ 6142 if (rw == READ && mddev->degraded == 0 && 6143 mddev->reshape_position == MaxSector) { 6144 bi = chunk_aligned_read(mddev, bi); 6145 if (!bi) 6146 return true; 6147 } 6148 6149 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 6150 make_discard_request(mddev, bi); 6151 md_write_end(mddev); 6152 return true; 6153 } 6154 6155 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); 6156 ctx.first_sector = logical_sector; 6157 ctx.last_sector = bio_end_sector(bi); 6158 bi->bi_next = NULL; 6159 6160 stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, 6161 RAID5_STRIPE_SECTORS(conf)); 6162 bitmap_set(ctx.sectors_to_do, 0, stripe_cnt); 6163 6164 pr_debug("raid456: %s, logical %llu to %llu\n", __func__, 6165 bi->bi_iter.bi_sector, ctx.last_sector); 6166 6167 /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ 6168 if ((bi->bi_opf & REQ_NOWAIT) && 6169 (conf->reshape_progress != MaxSector) && 6170 !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && 6171 ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { 6172 bio_wouldblock_error(bi); 6173 if (rw == WRITE) 6174 md_write_end(mddev); 6175 return true; 6176 } 6177 md_account_bio(mddev, &bi); 6178 6179 /* 6180 * Lets start with the stripe with the lowest chunk offset in the first 6181 * chunk. That has the best chances of creating IOs adjacent to 6182 * previous IOs in case of sequential IO and thus creates the most 6183 * sequential IO pattern. 
We don't bother with the optimization when 6184 * reshaping as the performance benefit is not worth the complexity. 6185 */ 6186 if (likely(conf->reshape_progress == MaxSector)) 6187 logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); 6188 s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); 6189 6190 add_wait_queue(&conf->wait_for_overlap, &wait); 6191 while (1) { 6192 res = make_stripe_request(mddev, conf, &ctx, logical_sector, 6193 bi); 6194 if (res == STRIPE_FAIL) 6195 break; 6196 6197 if (res == STRIPE_RETRY) 6198 continue; 6199 6200 if (res == STRIPE_SCHEDULE_AND_RETRY) { 6201 /* 6202 * Must release the reference to batch_last before 6203 * scheduling and waiting for work to be done, 6204 * otherwise the batch_last stripe head could prevent 6205 * raid5_activate_delayed() from making progress 6206 * and thus deadlocking. 6207 */ 6208 if (ctx.batch_last) { 6209 raid5_release_stripe(ctx.batch_last); 6210 ctx.batch_last = NULL; 6211 } 6212 6213 wait_woken(&wait, TASK_UNINTERRUPTIBLE, 6214 MAX_SCHEDULE_TIMEOUT); 6215 continue; 6216 } 6217 6218 s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s); 6219 if (s == stripe_cnt) 6220 break; 6221 6222 logical_sector = ctx.first_sector + 6223 (s << RAID5_STRIPE_SHIFT(conf)); 6224 } 6225 remove_wait_queue(&conf->wait_for_overlap, &wait); 6226 6227 if (ctx.batch_last) 6228 raid5_release_stripe(ctx.batch_last); 6229 6230 if (rw == WRITE) 6231 md_write_end(mddev); 6232 bio_endio(bi); 6233 return true; 6234 } 6235 6236 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 6237 6238 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 6239 { 6240 /* reshaping is quite different to recovery/resync so it is 6241 * handled quite separately ... here. 6242 * 6243 * On each call to sync_request, we gather one chunk worth of 6244 * destination stripes and flag them as expanding. 6245 * Then we find all the source stripes and request reads. 6246 * As the reads complete, handle_stripe will copy the data 6247 * into the destination stripe and release that stripe. 6248 */ 6249 struct r5conf *conf = mddev->private; 6250 struct stripe_head *sh; 6251 struct md_rdev *rdev; 6252 sector_t first_sector, last_sector; 6253 int raid_disks = conf->previous_raid_disks; 6254 int data_disks = raid_disks - conf->max_degraded; 6255 int new_data_disks = conf->raid_disks - conf->max_degraded; 6256 int i; 6257 int dd_idx; 6258 sector_t writepos, readpos, safepos; 6259 sector_t stripe_addr; 6260 int reshape_sectors; 6261 struct list_head stripes; 6262 sector_t retn; 6263 6264 if (sector_nr == 0) { 6265 /* If restarting in the middle, skip the initial sectors */ 6266 if (mddev->reshape_backwards && 6267 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 6268 sector_nr = raid5_size(mddev, 0, 0) 6269 - conf->reshape_progress; 6270 } else if (mddev->reshape_backwards && 6271 conf->reshape_progress == MaxSector) { 6272 /* shouldn't happen, but just in case, finish up.*/ 6273 sector_nr = MaxSector; 6274 } else if (!mddev->reshape_backwards && 6275 conf->reshape_progress > 0) 6276 sector_nr = conf->reshape_progress; 6277 sector_div(sector_nr, new_data_disks); 6278 if (sector_nr) { 6279 mddev->curr_resync_completed = sector_nr; 6280 sysfs_notify_dirent_safe(mddev->sysfs_completed); 6281 *skipped = 1; 6282 retn = sector_nr; 6283 goto finish; 6284 } 6285 } 6286 6287 /* We need to process a full chunk at a time. 
6288 * If old and new chunk sizes differ, we need to process the 6289 * largest of these 6290 */ 6291 6292 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 6293 6294 /* We update the metadata at least every 10 seconds, or when 6295 * the data about to be copied would over-write the source of 6296 * the data at the front of the range. i.e. one new_stripe 6297 * along from reshape_progress new_maps to after where 6298 * reshape_safe old_maps to 6299 */ 6300 writepos = conf->reshape_progress; 6301 sector_div(writepos, new_data_disks); 6302 readpos = conf->reshape_progress; 6303 sector_div(readpos, data_disks); 6304 safepos = conf->reshape_safe; 6305 sector_div(safepos, data_disks); 6306 if (mddev->reshape_backwards) { 6307 BUG_ON(writepos < reshape_sectors); 6308 writepos -= reshape_sectors; 6309 readpos += reshape_sectors; 6310 safepos += reshape_sectors; 6311 } else { 6312 writepos += reshape_sectors; 6313 /* readpos and safepos are worst-case calculations. 6314 * A negative number is overly pessimistic, and causes 6315 * obvious problems for unsigned storage. So clip to 0. 6316 */ 6317 readpos -= min_t(sector_t, reshape_sectors, readpos); 6318 safepos -= min_t(sector_t, reshape_sectors, safepos); 6319 } 6320 6321 /* Having calculated the 'writepos' possibly use it 6322 * to set 'stripe_addr' which is where we will write to. 6323 */ 6324 if (mddev->reshape_backwards) { 6325 BUG_ON(conf->reshape_progress == 0); 6326 stripe_addr = writepos; 6327 BUG_ON((mddev->dev_sectors & 6328 ~((sector_t)reshape_sectors - 1)) 6329 - reshape_sectors - stripe_addr 6330 != sector_nr); 6331 } else { 6332 BUG_ON(writepos != sector_nr + reshape_sectors); 6333 stripe_addr = sector_nr; 6334 } 6335 6336 /* 'writepos' is the most advanced device address we might write. 6337 * 'readpos' is the least advanced device address we might read. 6338 * 'safepos' is the least address recorded in the metadata as having 6339 * been reshaped. 6340 * If there is a min_offset_diff, these are adjusted either by 6341 * increasing the safepos/readpos if diff is negative, or 6342 * increasing writepos if diff is positive. 6343 * If 'readpos' is then behind 'writepos', there is no way that we can 6344 * ensure safety in the face of a crash - that must be done by userspace 6345 * making a backup of the data. So in that case there is no particular 6346 * rush to update metadata. 6347 * Otherwise if 'safepos' is behind 'writepos', then we really need to 6348 * update the metadata to advance 'safepos' to match 'readpos' so that 6349 * we can be safe in the event of a crash. 6350 * So we insist on updating metadata if safepos is behind writepos and 6351 * readpos is beyond writepos. 6352 * In any case, update the metadata every 10 seconds. 6353 * Maybe that number should be configurable, but I'm not sure it is 6354 * worth it.... maybe it could be a multiple of safemode_delay??? 6355 */ 6356 if (conf->min_offset_diff < 0) { 6357 safepos += -conf->min_offset_diff; 6358 readpos += -conf->min_offset_diff; 6359 } else 6360 writepos += conf->min_offset_diff; 6361 6362 if ((mddev->reshape_backwards 6363 ? (safepos > writepos && readpos < writepos) 6364 : (safepos < writepos && readpos > writepos)) || 6365 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 6366 /* Cannot proceed until we've updated the superblock... 
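 * Roughly, the steps below are: wait for all in-flight reshape stripes to
 * drain, record the new reshape_position in the superblock
 * (MD_SB_CHANGE_DEVS), wait for that write to complete, and only then
 * advance reshape_safe and wake any writers waiting for the window to
 * move on.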
*/ 6367 wait_event(conf->wait_for_overlap, 6368 atomic_read(&conf->reshape_stripes)==0 6369 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6370 if (atomic_read(&conf->reshape_stripes) != 0) 6371 return 0; 6372 mddev->reshape_position = conf->reshape_progress; 6373 mddev->curr_resync_completed = sector_nr; 6374 if (!mddev->reshape_backwards) 6375 /* Can update recovery_offset */ 6376 rdev_for_each(rdev, mddev) 6377 if (rdev->raid_disk >= 0 && 6378 !test_bit(Journal, &rdev->flags) && 6379 !test_bit(In_sync, &rdev->flags) && 6380 rdev->recovery_offset < sector_nr) 6381 rdev->recovery_offset = sector_nr; 6382 6383 conf->reshape_checkpoint = jiffies; 6384 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6385 md_wakeup_thread(mddev->thread); 6386 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 6387 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6388 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6389 return 0; 6390 spin_lock_irq(&conf->device_lock); 6391 conf->reshape_safe = mddev->reshape_position; 6392 spin_unlock_irq(&conf->device_lock); 6393 wake_up(&conf->wait_for_overlap); 6394 sysfs_notify_dirent_safe(mddev->sysfs_completed); 6395 } 6396 6397 INIT_LIST_HEAD(&stripes); 6398 for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { 6399 int j; 6400 int skipped_disk = 0; 6401 sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i, 6402 R5_GAS_NOQUIESCE); 6403 set_bit(STRIPE_EXPANDING, &sh->state); 6404 atomic_inc(&conf->reshape_stripes); 6405 /* If any of this stripe is beyond the end of the old 6406 * array, then we need to zero those blocks 6407 */ 6408 for (j=sh->disks; j--;) { 6409 sector_t s; 6410 if (j == sh->pd_idx) 6411 continue; 6412 if (conf->level == 6 && 6413 j == sh->qd_idx) 6414 continue; 6415 s = raid5_compute_blocknr(sh, j, 0); 6416 if (s < raid5_size(mddev, 0, 0)) { 6417 skipped_disk = 1; 6418 continue; 6419 } 6420 memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf)); 6421 set_bit(R5_Expanded, &sh->dev[j].flags); 6422 set_bit(R5_UPTODATE, &sh->dev[j].flags); 6423 } 6424 if (!skipped_disk) { 6425 set_bit(STRIPE_EXPAND_READY, &sh->state); 6426 set_bit(STRIPE_HANDLE, &sh->state); 6427 } 6428 list_add(&sh->lru, &stripes); 6429 } 6430 spin_lock_irq(&conf->device_lock); 6431 if (mddev->reshape_backwards) 6432 conf->reshape_progress -= reshape_sectors * new_data_disks; 6433 else 6434 conf->reshape_progress += reshape_sectors * new_data_disks; 6435 spin_unlock_irq(&conf->device_lock); 6436 /* Ok, those stripe are ready. We can start scheduling 6437 * reads on the source stripes. 6438 * The source stripes are determined by mapping the first and last 6439 * block on the destination stripes. 
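 * (stripe_addr is a per-device stripe offset in the new geometry, so
 * stripe_addr * new_data_disks is the corresponding array-logical
 * address; passing it to raid5_compute_sector() with 'previous' set
 * maps it through the old geometry to locate the source stripes.)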
6440 */ 6441 first_sector = 6442 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 6443 1, &dd_idx, NULL); 6444 last_sector = 6445 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 6446 * new_data_disks - 1), 6447 1, &dd_idx, NULL); 6448 if (last_sector >= mddev->dev_sectors) 6449 last_sector = mddev->dev_sectors - 1; 6450 while (first_sector <= last_sector) { 6451 sh = raid5_get_active_stripe(conf, NULL, first_sector, 6452 R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE); 6453 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 6454 set_bit(STRIPE_HANDLE, &sh->state); 6455 raid5_release_stripe(sh); 6456 first_sector += RAID5_STRIPE_SECTORS(conf); 6457 } 6458 /* Now that the sources are clearly marked, we can release 6459 * the destination stripes 6460 */ 6461 while (!list_empty(&stripes)) { 6462 sh = list_entry(stripes.next, struct stripe_head, lru); 6463 list_del_init(&sh->lru); 6464 raid5_release_stripe(sh); 6465 } 6466 /* If this takes us to the resync_max point where we have to pause, 6467 * then we need to write out the superblock. 6468 */ 6469 sector_nr += reshape_sectors; 6470 retn = reshape_sectors; 6471 finish: 6472 if (mddev->curr_resync_completed > mddev->resync_max || 6473 (sector_nr - mddev->curr_resync_completed) * 2 6474 >= mddev->resync_max - mddev->curr_resync_completed) { 6475 /* Cannot proceed until we've updated the superblock... */ 6476 wait_event(conf->wait_for_overlap, 6477 atomic_read(&conf->reshape_stripes) == 0 6478 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6479 if (atomic_read(&conf->reshape_stripes) != 0) 6480 goto ret; 6481 mddev->reshape_position = conf->reshape_progress; 6482 mddev->curr_resync_completed = sector_nr; 6483 if (!mddev->reshape_backwards) 6484 /* Can update recovery_offset */ 6485 rdev_for_each(rdev, mddev) 6486 if (rdev->raid_disk >= 0 && 6487 !test_bit(Journal, &rdev->flags) && 6488 !test_bit(In_sync, &rdev->flags) && 6489 rdev->recovery_offset < sector_nr) 6490 rdev->recovery_offset = sector_nr; 6491 conf->reshape_checkpoint = jiffies; 6492 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6493 md_wakeup_thread(mddev->thread); 6494 wait_event(mddev->sb_wait, 6495 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 6496 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6497 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6498 goto ret; 6499 spin_lock_irq(&conf->device_lock); 6500 conf->reshape_safe = mddev->reshape_position; 6501 spin_unlock_irq(&conf->device_lock); 6502 wake_up(&conf->wait_for_overlap); 6503 sysfs_notify_dirent_safe(mddev->sysfs_completed); 6504 } 6505 ret: 6506 return retn; 6507 } 6508 6509 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 6510 int *skipped) 6511 { 6512 struct r5conf *conf = mddev->private; 6513 struct stripe_head *sh; 6514 sector_t max_sector = mddev->dev_sectors; 6515 sector_t sync_blocks; 6516 int still_degraded = 0; 6517 int i; 6518 6519 if (sector_nr >= max_sector) { 6520 /* just being told to finish up .. 
nothing much to do */ 6521 6522 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 6523 end_reshape(conf); 6524 return 0; 6525 } 6526 6527 if (mddev->curr_resync < max_sector) /* aborted */ 6528 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 6529 &sync_blocks, 1); 6530 else /* completed sync */ 6531 conf->fullsync = 0; 6532 md_bitmap_close_sync(mddev->bitmap); 6533 6534 return 0; 6535 } 6536 6537 /* Allow raid5_quiesce to complete */ 6538 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 6539 6540 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6541 return reshape_request(mddev, sector_nr, skipped); 6542 6543 /* No need to check resync_max as we never do more than one 6544 * stripe, and as resync_max will always be on a chunk boundary, 6545 * if the check in md_do_sync didn't fire, there is no chance 6546 * of overstepping resync_max here 6547 */ 6548 6549 /* if there is too many failed drives and we are trying 6550 * to resync, then assert that we are finished, because there is 6551 * nothing we can do. 6552 */ 6553 if (mddev->degraded >= conf->max_degraded && 6554 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6555 sector_t rv = mddev->dev_sectors - sector_nr; 6556 *skipped = 1; 6557 return rv; 6558 } 6559 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6560 !conf->fullsync && 6561 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 6562 sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { 6563 /* we can skip this block, and probably more */ 6564 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); 6565 *skipped = 1; 6566 /* keep things rounded to whole stripes */ 6567 return sync_blocks * RAID5_STRIPE_SECTORS(conf); 6568 } 6569 6570 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6571 6572 sh = raid5_get_active_stripe(conf, NULL, sector_nr, 6573 R5_GAS_NOBLOCK); 6574 if (sh == NULL) { 6575 sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0); 6576 /* make sure we don't swamp the stripe cache if someone else 6577 * is trying to get access 6578 */ 6579 schedule_timeout_uninterruptible(1); 6580 } 6581 /* Need to check if array will still be degraded after recovery/resync 6582 * Note in case of > 1 drive failures it's possible we're rebuilding 6583 * one drive while leaving another faulty drive in array. 6584 */ 6585 rcu_read_lock(); 6586 for (i = 0; i < conf->raid_disks; i++) { 6587 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 6588 6589 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6590 still_degraded = 1; 6591 } 6592 rcu_read_unlock(); 6593 6594 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 6595 6596 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6597 set_bit(STRIPE_HANDLE, &sh->state); 6598 6599 raid5_release_stripe(sh); 6600 6601 return RAID5_STRIPE_SECTORS(conf); 6602 } 6603 6604 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, 6605 unsigned int offset) 6606 { 6607 /* We may not be able to submit a whole bio at once as there 6608 * may not be enough stripe_heads available. 6609 * We cannot pre-allocate enough stripe_heads as we may need 6610 * more than exist in the cache (if we allow ever large chunks). 6611 * So we do one stripe head at a time and record in 6612 * ->bi_hw_segments how many have been done. 6613 * 6614 * We *know* that this entire raid_bio is in one chunk, so 6615 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
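 *
 * Since the whole bio lies in one chunk, the device sector below advances
 * in lockstep with the logical sector, RAID5_STRIPE_SECTORS at a time,
 * and 'offset' says how many of those steps a previous attempt already
 * completed, so a retry resumes where it ran out of stripe_heads.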
6616 */ 6617 struct stripe_head *sh; 6618 int dd_idx; 6619 sector_t sector, logical_sector, last_sector; 6620 int scnt = 0; 6621 int handled = 0; 6622 6623 logical_sector = raid_bio->bi_iter.bi_sector & 6624 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); 6625 sector = raid5_compute_sector(conf, logical_sector, 6626 0, &dd_idx, NULL); 6627 last_sector = bio_end_sector(raid_bio); 6628 6629 for (; logical_sector < last_sector; 6630 logical_sector += RAID5_STRIPE_SECTORS(conf), 6631 sector += RAID5_STRIPE_SECTORS(conf), 6632 scnt++) { 6633 6634 if (scnt < offset) 6635 /* already done this stripe */ 6636 continue; 6637 6638 sh = raid5_get_active_stripe(conf, NULL, sector, 6639 R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); 6640 if (!sh) { 6641 /* failed to get a stripe - must wait */ 6642 conf->retry_read_aligned = raid_bio; 6643 conf->retry_read_offset = scnt; 6644 return handled; 6645 } 6646 6647 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6648 raid5_release_stripe(sh); 6649 conf->retry_read_aligned = raid_bio; 6650 conf->retry_read_offset = scnt; 6651 return handled; 6652 } 6653 6654 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6655 handle_stripe(sh); 6656 raid5_release_stripe(sh); 6657 handled++; 6658 } 6659 6660 bio_endio(raid_bio); 6661 6662 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6663 wake_up(&conf->wait_for_quiescent); 6664 return handled; 6665 } 6666 6667 static int handle_active_stripes(struct r5conf *conf, int group, 6668 struct r5worker *worker, 6669 struct list_head *temp_inactive_list) 6670 __must_hold(&conf->device_lock) 6671 { 6672 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6673 int i, batch_size = 0, hash; 6674 bool release_inactive = false; 6675 6676 while (batch_size < MAX_STRIPE_BATCH && 6677 (sh = __get_priority_stripe(conf, group)) != NULL) 6678 batch[batch_size++] = sh; 6679 6680 if (batch_size == 0) { 6681 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6682 if (!list_empty(temp_inactive_list + i)) 6683 break; 6684 if (i == NR_STRIPE_HASH_LOCKS) { 6685 spin_unlock_irq(&conf->device_lock); 6686 log_flush_stripe_to_raid(conf); 6687 spin_lock_irq(&conf->device_lock); 6688 return batch_size; 6689 } 6690 release_inactive = true; 6691 } 6692 spin_unlock_irq(&conf->device_lock); 6693 6694 release_inactive_stripe_list(conf, temp_inactive_list, 6695 NR_STRIPE_HASH_LOCKS); 6696 6697 r5l_flush_stripe_to_raid(conf->log); 6698 if (release_inactive) { 6699 spin_lock_irq(&conf->device_lock); 6700 return 0; 6701 } 6702 6703 for (i = 0; i < batch_size; i++) 6704 handle_stripe(batch[i]); 6705 log_write_stripe_run(conf); 6706 6707 cond_resched(); 6708 6709 spin_lock_irq(&conf->device_lock); 6710 for (i = 0; i < batch_size; i++) { 6711 hash = batch[i]->hash_lock_index; 6712 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6713 } 6714 return batch_size; 6715 } 6716 6717 static void raid5_do_work(struct work_struct *work) 6718 { 6719 struct r5worker *worker = container_of(work, struct r5worker, work); 6720 struct r5worker_group *group = worker->group; 6721 struct r5conf *conf = group->conf; 6722 struct mddev *mddev = conf->mddev; 6723 int group_id = group - conf->worker_groups; 6724 int handled; 6725 struct blk_plug plug; 6726 6727 pr_debug("+++ raid5worker active\n"); 6728 6729 blk_start_plug(&plug); 6730 handled = 0; 6731 spin_lock_irq(&conf->device_lock); 6732 while (1) { 6733 int batch_size, released; 6734 6735 released = release_stripe_list(conf, worker->temp_inactive_list); 6736 6737 batch_size = handle_active_stripes(conf, group_id, worker, 6738 
worker->temp_inactive_list); 6739 worker->working = false; 6740 if (!batch_size && !released) 6741 break; 6742 handled += batch_size; 6743 wait_event_lock_irq(mddev->sb_wait, 6744 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6745 conf->device_lock); 6746 } 6747 pr_debug("%d stripes handled\n", handled); 6748 6749 spin_unlock_irq(&conf->device_lock); 6750 6751 flush_deferred_bios(conf); 6752 6753 r5l_flush_stripe_to_raid(conf->log); 6754 6755 async_tx_issue_pending_all(); 6756 blk_finish_plug(&plug); 6757 6758 pr_debug("--- raid5worker inactive\n"); 6759 } 6760 6761 /* 6762 * This is our raid5 kernel thread. 6763 * 6764 * We scan the hash table for stripes which can be handled now. 6765 * During the scan, completed stripes are saved for us by the interrupt 6766 * handler, so that they will not have to wait for our next wakeup. 6767 */ 6768 static void raid5d(struct md_thread *thread) 6769 { 6770 struct mddev *mddev = thread->mddev; 6771 struct r5conf *conf = mddev->private; 6772 int handled; 6773 struct blk_plug plug; 6774 6775 pr_debug("+++ raid5d active\n"); 6776 6777 md_check_recovery(mddev); 6778 6779 blk_start_plug(&plug); 6780 handled = 0; 6781 spin_lock_irq(&conf->device_lock); 6782 while (1) { 6783 struct bio *bio; 6784 int batch_size, released; 6785 unsigned int offset; 6786 6787 released = release_stripe_list(conf, conf->temp_inactive_list); 6788 if (released) 6789 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6790 6791 if ( 6792 !list_empty(&conf->bitmap_list)) { 6793 /* Now is a good time to flush some bitmap updates */ 6794 conf->seq_flush++; 6795 spin_unlock_irq(&conf->device_lock); 6796 md_bitmap_unplug(mddev->bitmap); 6797 spin_lock_irq(&conf->device_lock); 6798 conf->seq_write = conf->seq_flush; 6799 activate_bit_delay(conf, conf->temp_inactive_list); 6800 } 6801 raid5_activate_delayed(conf); 6802 6803 while ((bio = remove_bio_from_retry(conf, &offset))) { 6804 int ok; 6805 spin_unlock_irq(&conf->device_lock); 6806 ok = retry_aligned_read(conf, bio, offset); 6807 spin_lock_irq(&conf->device_lock); 6808 if (!ok) 6809 break; 6810 handled++; 6811 } 6812 6813 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6814 conf->temp_inactive_list); 6815 if (!batch_size && !released) 6816 break; 6817 handled += batch_size; 6818 6819 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6820 spin_unlock_irq(&conf->device_lock); 6821 md_check_recovery(mddev); 6822 spin_lock_irq(&conf->device_lock); 6823 6824 /* 6825 * Waiting on MD_SB_CHANGE_PENDING below may deadlock 6826 * seeing md_check_recovery() is needed to clear 6827 * the flag when using mdmon. 6828 */ 6829 continue; 6830 } 6831 6832 wait_event_lock_irq(mddev->sb_wait, 6833 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6834 conf->device_lock); 6835 } 6836 pr_debug("%d stripes handled\n", handled); 6837 6838 spin_unlock_irq(&conf->device_lock); 6839 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6840 mutex_trylock(&conf->cache_size_mutex)) { 6841 grow_one_stripe(conf, __GFP_NOWARN); 6842 /* Set flag even if allocation failed. 
This helps 6843 * slow down allocation requests when mem is short 6844 */ 6845 set_bit(R5_DID_ALLOC, &conf->cache_state); 6846 mutex_unlock(&conf->cache_size_mutex); 6847 } 6848 6849 flush_deferred_bios(conf); 6850 6851 r5l_flush_stripe_to_raid(conf->log); 6852 6853 async_tx_issue_pending_all(); 6854 blk_finish_plug(&plug); 6855 6856 pr_debug("--- raid5d inactive\n"); 6857 } 6858 6859 static ssize_t 6860 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6861 { 6862 struct r5conf *conf; 6863 int ret = 0; 6864 spin_lock(&mddev->lock); 6865 conf = mddev->private; 6866 if (conf) 6867 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6868 spin_unlock(&mddev->lock); 6869 return ret; 6870 } 6871 6872 int 6873 raid5_set_cache_size(struct mddev *mddev, int size) 6874 { 6875 int result = 0; 6876 struct r5conf *conf = mddev->private; 6877 6878 if (size <= 16 || size > 32768) 6879 return -EINVAL; 6880 6881 conf->min_nr_stripes = size; 6882 mutex_lock(&conf->cache_size_mutex); 6883 while (size < conf->max_nr_stripes && 6884 drop_one_stripe(conf)) 6885 ; 6886 mutex_unlock(&conf->cache_size_mutex); 6887 6888 md_allow_write(mddev); 6889 6890 mutex_lock(&conf->cache_size_mutex); 6891 while (size > conf->max_nr_stripes) 6892 if (!grow_one_stripe(conf, GFP_KERNEL)) { 6893 conf->min_nr_stripes = conf->max_nr_stripes; 6894 result = -ENOMEM; 6895 break; 6896 } 6897 mutex_unlock(&conf->cache_size_mutex); 6898 6899 return result; 6900 } 6901 EXPORT_SYMBOL(raid5_set_cache_size); 6902 6903 static ssize_t 6904 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6905 { 6906 struct r5conf *conf; 6907 unsigned long new; 6908 int err; 6909 6910 if (len >= PAGE_SIZE) 6911 return -EINVAL; 6912 if (kstrtoul(page, 10, &new)) 6913 return -EINVAL; 6914 err = mddev_lock(mddev); 6915 if (err) 6916 return err; 6917 conf = mddev->private; 6918 if (!conf) 6919 err = -ENODEV; 6920 else 6921 err = raid5_set_cache_size(mddev, new); 6922 mddev_unlock(mddev); 6923 6924 return err ?: len; 6925 } 6926 6927 static struct md_sysfs_entry 6928 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6929 raid5_show_stripe_cache_size, 6930 raid5_store_stripe_cache_size); 6931 6932 static ssize_t 6933 raid5_show_rmw_level(struct mddev *mddev, char *page) 6934 { 6935 struct r5conf *conf = mddev->private; 6936 if (conf) 6937 return sprintf(page, "%d\n", conf->rmw_level); 6938 else 6939 return 0; 6940 } 6941 6942 static ssize_t 6943 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6944 { 6945 struct r5conf *conf = mddev->private; 6946 unsigned long new; 6947 6948 if (!conf) 6949 return -ENODEV; 6950 6951 if (len >= PAGE_SIZE) 6952 return -EINVAL; 6953 6954 if (kstrtoul(page, 10, &new)) 6955 return -EINVAL; 6956 6957 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6958 return -EINVAL; 6959 6960 if (new != PARITY_DISABLE_RMW && 6961 new != PARITY_ENABLE_RMW && 6962 new != PARITY_PREFER_RMW) 6963 return -EINVAL; 6964 6965 conf->rmw_level = new; 6966 return len; 6967 } 6968 6969 static struct md_sysfs_entry 6970 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6971 raid5_show_rmw_level, 6972 raid5_store_rmw_level); 6973 6974 static ssize_t 6975 raid5_show_stripe_size(struct mddev *mddev, char *page) 6976 { 6977 struct r5conf *conf; 6978 int ret = 0; 6979 6980 spin_lock(&mddev->lock); 6981 conf = mddev->private; 6982 if (conf) 6983 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf)); 6984 spin_unlock(&mddev->lock); 6985 return ret; 6986 } 6987 6988 #if 
PAGE_SIZE != DEFAULT_STRIPE_SIZE 6989 static ssize_t 6990 raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) 6991 { 6992 struct r5conf *conf; 6993 unsigned long new; 6994 int err; 6995 int size; 6996 6997 if (len >= PAGE_SIZE) 6998 return -EINVAL; 6999 if (kstrtoul(page, 10, &new)) 7000 return -EINVAL; 7001 7002 /* 7003 * The value should not be bigger than PAGE_SIZE. It requires to 7004 * be multiple of DEFAULT_STRIPE_SIZE and the value should be power 7005 * of two. 7006 */ 7007 if (new % DEFAULT_STRIPE_SIZE != 0 || 7008 new > PAGE_SIZE || new == 0 || 7009 new != roundup_pow_of_two(new)) 7010 return -EINVAL; 7011 7012 err = mddev_suspend_and_lock(mddev); 7013 if (err) 7014 return err; 7015 7016 conf = mddev->private; 7017 if (!conf) { 7018 err = -ENODEV; 7019 goto out_unlock; 7020 } 7021 7022 if (new == conf->stripe_size) 7023 goto out_unlock; 7024 7025 pr_debug("md/raid: change stripe_size from %lu to %lu\n", 7026 conf->stripe_size, new); 7027 7028 if (mddev->sync_thread || 7029 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7030 mddev->reshape_position != MaxSector || 7031 mddev->sysfs_active) { 7032 err = -EBUSY; 7033 goto out_unlock; 7034 } 7035 7036 mutex_lock(&conf->cache_size_mutex); 7037 size = conf->max_nr_stripes; 7038 7039 shrink_stripes(conf); 7040 7041 conf->stripe_size = new; 7042 conf->stripe_shift = ilog2(new) - 9; 7043 conf->stripe_sectors = new >> 9; 7044 if (grow_stripes(conf, size)) { 7045 pr_warn("md/raid:%s: couldn't allocate buffers\n", 7046 mdname(mddev)); 7047 err = -ENOMEM; 7048 } 7049 mutex_unlock(&conf->cache_size_mutex); 7050 7051 out_unlock: 7052 mddev_unlock_and_resume(mddev); 7053 return err ?: len; 7054 } 7055 7056 static struct md_sysfs_entry 7057 raid5_stripe_size = __ATTR(stripe_size, 0644, 7058 raid5_show_stripe_size, 7059 raid5_store_stripe_size); 7060 #else 7061 static struct md_sysfs_entry 7062 raid5_stripe_size = __ATTR(stripe_size, 0444, 7063 raid5_show_stripe_size, 7064 NULL); 7065 #endif 7066 7067 static ssize_t 7068 raid5_show_preread_threshold(struct mddev *mddev, char *page) 7069 { 7070 struct r5conf *conf; 7071 int ret = 0; 7072 spin_lock(&mddev->lock); 7073 conf = mddev->private; 7074 if (conf) 7075 ret = sprintf(page, "%d\n", conf->bypass_threshold); 7076 spin_unlock(&mddev->lock); 7077 return ret; 7078 } 7079 7080 static ssize_t 7081 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 7082 { 7083 struct r5conf *conf; 7084 unsigned long new; 7085 int err; 7086 7087 if (len >= PAGE_SIZE) 7088 return -EINVAL; 7089 if (kstrtoul(page, 10, &new)) 7090 return -EINVAL; 7091 7092 err = mddev_lock(mddev); 7093 if (err) 7094 return err; 7095 conf = mddev->private; 7096 if (!conf) 7097 err = -ENODEV; 7098 else if (new > conf->min_nr_stripes) 7099 err = -EINVAL; 7100 else 7101 conf->bypass_threshold = new; 7102 mddev_unlock(mddev); 7103 return err ?: len; 7104 } 7105 7106 static struct md_sysfs_entry 7107 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 7108 S_IRUGO | S_IWUSR, 7109 raid5_show_preread_threshold, 7110 raid5_store_preread_threshold); 7111 7112 static ssize_t 7113 raid5_show_skip_copy(struct mddev *mddev, char *page) 7114 { 7115 struct r5conf *conf; 7116 int ret = 0; 7117 spin_lock(&mddev->lock); 7118 conf = mddev->private; 7119 if (conf) 7120 ret = sprintf(page, "%d\n", conf->skip_copy); 7121 spin_unlock(&mddev->lock); 7122 return ret; 7123 } 7124 7125 static ssize_t 7126 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 7127 { 7128 struct 
r5conf *conf; 7129 unsigned long new; 7130 int err; 7131 7132 if (len >= PAGE_SIZE) 7133 return -EINVAL; 7134 if (kstrtoul(page, 10, &new)) 7135 return -EINVAL; 7136 new = !!new; 7137 7138 err = mddev_suspend_and_lock(mddev); 7139 if (err) 7140 return err; 7141 conf = mddev->private; 7142 if (!conf) 7143 err = -ENODEV; 7144 else if (new != conf->skip_copy) { 7145 struct request_queue *q = mddev->queue; 7146 7147 conf->skip_copy = new; 7148 if (new) 7149 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); 7150 else 7151 blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); 7152 } 7153 mddev_unlock_and_resume(mddev); 7154 return err ?: len; 7155 } 7156 7157 static struct md_sysfs_entry 7158 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 7159 raid5_show_skip_copy, 7160 raid5_store_skip_copy); 7161 7162 static ssize_t 7163 stripe_cache_active_show(struct mddev *mddev, char *page) 7164 { 7165 struct r5conf *conf = mddev->private; 7166 if (conf) 7167 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 7168 else 7169 return 0; 7170 } 7171 7172 static struct md_sysfs_entry 7173 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 7174 7175 static ssize_t 7176 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 7177 { 7178 struct r5conf *conf; 7179 int ret = 0; 7180 spin_lock(&mddev->lock); 7181 conf = mddev->private; 7182 if (conf) 7183 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 7184 spin_unlock(&mddev->lock); 7185 return ret; 7186 } 7187 7188 static int alloc_thread_groups(struct r5conf *conf, int cnt, 7189 int *group_cnt, 7190 struct r5worker_group **worker_groups); 7191 static ssize_t 7192 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 7193 { 7194 struct r5conf *conf; 7195 unsigned int new; 7196 int err; 7197 struct r5worker_group *new_groups, *old_groups; 7198 int group_cnt; 7199 7200 if (len >= PAGE_SIZE) 7201 return -EINVAL; 7202 if (kstrtouint(page, 10, &new)) 7203 return -EINVAL; 7204 /* 8192 should be big enough */ 7205 if (new > 8192) 7206 return -EINVAL; 7207 7208 err = mddev_suspend_and_lock(mddev); 7209 if (err) 7210 return err; 7211 conf = mddev->private; 7212 if (!conf) 7213 err = -ENODEV; 7214 else if (new != conf->worker_cnt_per_group) { 7215 old_groups = conf->worker_groups; 7216 if (old_groups) 7217 flush_workqueue(raid5_wq); 7218 7219 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); 7220 if (!err) { 7221 spin_lock_irq(&conf->device_lock); 7222 conf->group_cnt = group_cnt; 7223 conf->worker_cnt_per_group = new; 7224 conf->worker_groups = new_groups; 7225 spin_unlock_irq(&conf->device_lock); 7226 7227 if (old_groups) 7228 kfree(old_groups[0].workers); 7229 kfree(old_groups); 7230 } 7231 } 7232 mddev_unlock_and_resume(mddev); 7233 7234 return err ?: len; 7235 } 7236 7237 static struct md_sysfs_entry 7238 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 7239 raid5_show_group_thread_cnt, 7240 raid5_store_group_thread_cnt); 7241 7242 static struct attribute *raid5_attrs[] = { 7243 &raid5_stripecache_size.attr, 7244 &raid5_stripecache_active.attr, 7245 &raid5_preread_bypass_threshold.attr, 7246 &raid5_group_thread_cnt.attr, 7247 &raid5_skip_copy.attr, 7248 &raid5_rmw_level.attr, 7249 &raid5_stripe_size.attr, 7250 &r5c_journal_mode.attr, 7251 &ppl_write_hint.attr, 7252 NULL, 7253 }; 7254 static const struct attribute_group raid5_attrs_group = { 7255 .name = NULL, 7256 .attrs = raid5_attrs, 7257 }; 7258 7259 static int alloc_thread_groups(struct r5conf *conf, int cnt, int 
*group_cnt, 7260 struct r5worker_group **worker_groups) 7261 { 7262 int i, j, k; 7263 ssize_t size; 7264 struct r5worker *workers; 7265 7266 if (cnt == 0) { 7267 *group_cnt = 0; 7268 *worker_groups = NULL; 7269 return 0; 7270 } 7271 *group_cnt = num_possible_nodes(); 7272 size = sizeof(struct r5worker) * cnt; 7273 workers = kcalloc(size, *group_cnt, GFP_NOIO); 7274 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group), 7275 GFP_NOIO); 7276 if (!*worker_groups || !workers) { 7277 kfree(workers); 7278 kfree(*worker_groups); 7279 return -ENOMEM; 7280 } 7281 7282 for (i = 0; i < *group_cnt; i++) { 7283 struct r5worker_group *group; 7284 7285 group = &(*worker_groups)[i]; 7286 INIT_LIST_HEAD(&group->handle_list); 7287 INIT_LIST_HEAD(&group->loprio_list); 7288 group->conf = conf; 7289 group->workers = workers + i * cnt; 7290 7291 for (j = 0; j < cnt; j++) { 7292 struct r5worker *worker = group->workers + j; 7293 worker->group = group; 7294 INIT_WORK(&worker->work, raid5_do_work); 7295 7296 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 7297 INIT_LIST_HEAD(worker->temp_inactive_list + k); 7298 } 7299 } 7300 7301 return 0; 7302 } 7303 7304 static void free_thread_groups(struct r5conf *conf) 7305 { 7306 if (conf->worker_groups) 7307 kfree(conf->worker_groups[0].workers); 7308 kfree(conf->worker_groups); 7309 conf->worker_groups = NULL; 7310 } 7311 7312 static sector_t 7313 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 7314 { 7315 struct r5conf *conf = mddev->private; 7316 7317 if (!sectors) 7318 sectors = mddev->dev_sectors; 7319 if (!raid_disks) 7320 /* size is defined by the smallest of previous and new size */ 7321 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 7322 7323 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7324 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 7325 return sectors * (raid_disks - conf->max_degraded); 7326 } 7327 7328 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 7329 { 7330 safe_put_page(percpu->spare_page); 7331 percpu->spare_page = NULL; 7332 kvfree(percpu->scribble); 7333 percpu->scribble = NULL; 7334 } 7335 7336 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 7337 { 7338 if (conf->level == 6 && !percpu->spare_page) { 7339 percpu->spare_page = alloc_page(GFP_KERNEL); 7340 if (!percpu->spare_page) 7341 return -ENOMEM; 7342 } 7343 7344 if (scribble_alloc(percpu, 7345 max(conf->raid_disks, 7346 conf->previous_raid_disks), 7347 max(conf->chunk_sectors, 7348 conf->prev_chunk_sectors) 7349 / RAID5_STRIPE_SECTORS(conf))) { 7350 free_scratch_buffer(conf, percpu); 7351 return -ENOMEM; 7352 } 7353 7354 local_lock_init(&percpu->lock); 7355 return 0; 7356 } 7357 7358 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 7359 { 7360 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 7361 7362 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 7363 return 0; 7364 } 7365 7366 static void raid5_free_percpu(struct r5conf *conf) 7367 { 7368 if (!conf->percpu) 7369 return; 7370 7371 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 7372 free_percpu(conf->percpu); 7373 } 7374 7375 static void free_conf(struct r5conf *conf) 7376 { 7377 int i; 7378 7379 log_exit(conf); 7380 7381 shrinker_free(conf->shrinker); 7382 free_thread_groups(conf); 7383 shrink_stripes(conf); 7384 raid5_free_percpu(conf); 7385 for (i = 0; i < conf->pool_size; i++) 7386 if (conf->disks[i].extra_page) 7387 
put_page(conf->disks[i].extra_page); 7388 kfree(conf->disks); 7389 bioset_exit(&conf->bio_split); 7390 kfree(conf->stripe_hashtbl); 7391 kfree(conf->pending_data); 7392 kfree(conf); 7393 } 7394 7395 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 7396 { 7397 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 7398 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 7399 7400 if (alloc_scratch_buffer(conf, percpu)) { 7401 pr_warn("%s: failed memory allocation for cpu%u\n", 7402 __func__, cpu); 7403 return -ENOMEM; 7404 } 7405 return 0; 7406 } 7407 7408 static int raid5_alloc_percpu(struct r5conf *conf) 7409 { 7410 int err = 0; 7411 7412 conf->percpu = alloc_percpu(struct raid5_percpu); 7413 if (!conf->percpu) 7414 return -ENOMEM; 7415 7416 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 7417 if (!err) { 7418 conf->scribble_disks = max(conf->raid_disks, 7419 conf->previous_raid_disks); 7420 conf->scribble_sectors = max(conf->chunk_sectors, 7421 conf->prev_chunk_sectors); 7422 } 7423 return err; 7424 } 7425 7426 static unsigned long raid5_cache_scan(struct shrinker *shrink, 7427 struct shrink_control *sc) 7428 { 7429 struct r5conf *conf = shrink->private_data; 7430 unsigned long ret = SHRINK_STOP; 7431 7432 if (mutex_trylock(&conf->cache_size_mutex)) { 7433 ret= 0; 7434 while (ret < sc->nr_to_scan && 7435 conf->max_nr_stripes > conf->min_nr_stripes) { 7436 if (drop_one_stripe(conf) == 0) { 7437 ret = SHRINK_STOP; 7438 break; 7439 } 7440 ret++; 7441 } 7442 mutex_unlock(&conf->cache_size_mutex); 7443 } 7444 return ret; 7445 } 7446 7447 static unsigned long raid5_cache_count(struct shrinker *shrink, 7448 struct shrink_control *sc) 7449 { 7450 struct r5conf *conf = shrink->private_data; 7451 7452 if (conf->max_nr_stripes < conf->min_nr_stripes) 7453 /* unlikely, but not impossible */ 7454 return 0; 7455 return conf->max_nr_stripes - conf->min_nr_stripes; 7456 } 7457 7458 static struct r5conf *setup_conf(struct mddev *mddev) 7459 { 7460 struct r5conf *conf; 7461 int raid_disk, memory, max_disks; 7462 struct md_rdev *rdev; 7463 struct disk_info *disk; 7464 char pers_name[6]; 7465 int i; 7466 int group_cnt; 7467 struct r5worker_group *new_group; 7468 int ret = -ENOMEM; 7469 7470 if (mddev->new_level != 5 7471 && mddev->new_level != 4 7472 && mddev->new_level != 6) { 7473 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 7474 mdname(mddev), mddev->new_level); 7475 return ERR_PTR(-EIO); 7476 } 7477 if ((mddev->new_level == 5 7478 && !algorithm_valid_raid5(mddev->new_layout)) || 7479 (mddev->new_level == 6 7480 && !algorithm_valid_raid6(mddev->new_layout))) { 7481 pr_warn("md/raid:%s: layout %d not supported\n", 7482 mdname(mddev), mddev->new_layout); 7483 return ERR_PTR(-EIO); 7484 } 7485 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 7486 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 7487 mdname(mddev), mddev->raid_disks); 7488 return ERR_PTR(-EINVAL); 7489 } 7490 7491 if (!mddev->new_chunk_sectors || 7492 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 7493 !is_power_of_2(mddev->new_chunk_sectors)) { 7494 pr_warn("md/raid:%s: invalid chunk size %d\n", 7495 mdname(mddev), mddev->new_chunk_sectors << 9); 7496 return ERR_PTR(-EINVAL); 7497 } 7498 7499 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 7500 if (conf == NULL) 7501 goto abort; 7502 7503 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 7504 conf->stripe_size = DEFAULT_STRIPE_SIZE; 7505 conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9; 
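	/*
	 * stripe_shift is the stripe size expressed as a shift over 512-byte
	 * sectors and stripe_sectors is the same quantity as a count; e.g. a
	 * 4096-byte stripe gives stripe_shift == 3 and stripe_sectors == 8.
	 */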
7506 conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9; 7507 #endif 7508 INIT_LIST_HEAD(&conf->free_list); 7509 INIT_LIST_HEAD(&conf->pending_list); 7510 conf->pending_data = kcalloc(PENDING_IO_MAX, 7511 sizeof(struct r5pending_data), 7512 GFP_KERNEL); 7513 if (!conf->pending_data) 7514 goto abort; 7515 for (i = 0; i < PENDING_IO_MAX; i++) 7516 list_add(&conf->pending_data[i].sibling, &conf->free_list); 7517 /* Don't enable multi-threading by default*/ 7518 if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { 7519 conf->group_cnt = group_cnt; 7520 conf->worker_cnt_per_group = 0; 7521 conf->worker_groups = new_group; 7522 } else 7523 goto abort; 7524 spin_lock_init(&conf->device_lock); 7525 seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock); 7526 mutex_init(&conf->cache_size_mutex); 7527 7528 init_waitqueue_head(&conf->wait_for_quiescent); 7529 init_waitqueue_head(&conf->wait_for_stripe); 7530 init_waitqueue_head(&conf->wait_for_overlap); 7531 INIT_LIST_HEAD(&conf->handle_list); 7532 INIT_LIST_HEAD(&conf->loprio_list); 7533 INIT_LIST_HEAD(&conf->hold_list); 7534 INIT_LIST_HEAD(&conf->delayed_list); 7535 INIT_LIST_HEAD(&conf->bitmap_list); 7536 init_llist_head(&conf->released_stripes); 7537 atomic_set(&conf->active_stripes, 0); 7538 atomic_set(&conf->preread_active_stripes, 0); 7539 atomic_set(&conf->active_aligned_reads, 0); 7540 spin_lock_init(&conf->pending_bios_lock); 7541 conf->batch_bio_dispatch = true; 7542 rdev_for_each(rdev, mddev) { 7543 if (test_bit(Journal, &rdev->flags)) 7544 continue; 7545 if (bdev_nonrot(rdev->bdev)) { 7546 conf->batch_bio_dispatch = false; 7547 break; 7548 } 7549 } 7550 7551 conf->bypass_threshold = BYPASS_THRESHOLD; 7552 conf->recovery_disabled = mddev->recovery_disabled - 1; 7553 7554 conf->raid_disks = mddev->raid_disks; 7555 if (mddev->reshape_position == MaxSector) 7556 conf->previous_raid_disks = mddev->raid_disks; 7557 else 7558 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 7559 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 7560 7561 conf->disks = kcalloc(max_disks, sizeof(struct disk_info), 7562 GFP_KERNEL); 7563 7564 if (!conf->disks) 7565 goto abort; 7566 7567 for (i = 0; i < max_disks; i++) { 7568 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 7569 if (!conf->disks[i].extra_page) 7570 goto abort; 7571 } 7572 7573 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 7574 if (ret) 7575 goto abort; 7576 conf->mddev = mddev; 7577 7578 ret = -ENOMEM; 7579 conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL); 7580 if (!conf->stripe_hashtbl) 7581 goto abort; 7582 7583 /* We init hash_locks[0] separately to that it can be used 7584 * as the reference lock in the spin_lock_nest_lock() call 7585 * in lock_all_device_hash_locks_irq in order to convince 7586 * lockdep that we know what we are doing. 
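 * Each of the NR_STRIPE_HASH_LOCKS locks also guards the matching
 * per-hash inactive_list initialised just below.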
7587 */ 7588 spin_lock_init(conf->hash_locks); 7589 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 7590 spin_lock_init(conf->hash_locks + i); 7591 7592 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 7593 INIT_LIST_HEAD(conf->inactive_list + i); 7594 7595 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 7596 INIT_LIST_HEAD(conf->temp_inactive_list + i); 7597 7598 atomic_set(&conf->r5c_cached_full_stripes, 0); 7599 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 7600 atomic_set(&conf->r5c_cached_partial_stripes, 0); 7601 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 7602 atomic_set(&conf->r5c_flushing_full_stripes, 0); 7603 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 7604 7605 conf->level = mddev->new_level; 7606 conf->chunk_sectors = mddev->new_chunk_sectors; 7607 ret = raid5_alloc_percpu(conf); 7608 if (ret) 7609 goto abort; 7610 7611 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 7612 7613 ret = -EIO; 7614 rdev_for_each(rdev, mddev) { 7615 raid_disk = rdev->raid_disk; 7616 if (raid_disk >= max_disks 7617 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 7618 continue; 7619 disk = conf->disks + raid_disk; 7620 7621 if (test_bit(Replacement, &rdev->flags)) { 7622 if (disk->replacement) 7623 goto abort; 7624 RCU_INIT_POINTER(disk->replacement, rdev); 7625 } else { 7626 if (disk->rdev) 7627 goto abort; 7628 RCU_INIT_POINTER(disk->rdev, rdev); 7629 } 7630 7631 if (test_bit(In_sync, &rdev->flags)) { 7632 pr_info("md/raid:%s: device %pg operational as raid disk %d\n", 7633 mdname(mddev), rdev->bdev, raid_disk); 7634 } else if (rdev->saved_raid_disk != raid_disk) 7635 /* Cannot rely on bitmap to complete recovery */ 7636 conf->fullsync = 1; 7637 } 7638 7639 conf->level = mddev->new_level; 7640 if (conf->level == 6) { 7641 conf->max_degraded = 2; 7642 if (raid6_call.xor_syndrome) 7643 conf->rmw_level = PARITY_ENABLE_RMW; 7644 else 7645 conf->rmw_level = PARITY_DISABLE_RMW; 7646 } else { 7647 conf->max_degraded = 1; 7648 conf->rmw_level = PARITY_ENABLE_RMW; 7649 } 7650 conf->algorithm = mddev->new_layout; 7651 conf->reshape_progress = mddev->reshape_position; 7652 if (conf->reshape_progress != MaxSector) { 7653 conf->prev_chunk_sectors = mddev->chunk_sectors; 7654 conf->prev_algo = mddev->layout; 7655 } else { 7656 conf->prev_chunk_sectors = conf->chunk_sectors; 7657 conf->prev_algo = conf->algorithm; 7658 } 7659 7660 conf->min_nr_stripes = NR_STRIPES; 7661 if (mddev->reshape_position != MaxSector) { 7662 int stripes = max_t(int, 7663 ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4, 7664 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4); 7665 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7666 if (conf->min_nr_stripes != NR_STRIPES) 7667 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7668 mdname(mddev), conf->min_nr_stripes); 7669 } 7670 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7671 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7672 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7673 if (grow_stripes(conf, conf->min_nr_stripes)) { 7674 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 7675 mdname(mddev), memory); 7676 ret = -ENOMEM; 7677 goto abort; 7678 } else 7679 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7680 /* 7681 * Losing a stripe head costs more than the time to refill it, 7682 * it reduces the queue depth and so can hurt throughput. 7683 * So set it rather large, scaled by number of devices. 
7684 */ 7685 conf->shrinker = shrinker_alloc(0, "md-raid5:%s", mdname(mddev)); 7686 if (!conf->shrinker) { 7687 ret = -ENOMEM; 7688 pr_warn("md/raid:%s: couldn't allocate shrinker.\n", 7689 mdname(mddev)); 7690 goto abort; 7691 } 7692 7693 conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7694 conf->shrinker->scan_objects = raid5_cache_scan; 7695 conf->shrinker->count_objects = raid5_cache_count; 7696 conf->shrinker->batch = 128; 7697 conf->shrinker->private_data = conf; 7698 7699 shrinker_register(conf->shrinker); 7700 7701 sprintf(pers_name, "raid%d", mddev->new_level); 7702 rcu_assign_pointer(conf->thread, 7703 md_register_thread(raid5d, mddev, pers_name)); 7704 if (!conf->thread) { 7705 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7706 mdname(mddev)); 7707 ret = -ENOMEM; 7708 goto abort; 7709 } 7710 7711 return conf; 7712 7713 abort: 7714 if (conf) 7715 free_conf(conf); 7716 return ERR_PTR(ret); 7717 } 7718 7719 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7720 { 7721 switch (algo) { 7722 case ALGORITHM_PARITY_0: 7723 if (raid_disk < max_degraded) 7724 return 1; 7725 break; 7726 case ALGORITHM_PARITY_N: 7727 if (raid_disk >= raid_disks - max_degraded) 7728 return 1; 7729 break; 7730 case ALGORITHM_PARITY_0_6: 7731 if (raid_disk == 0 || 7732 raid_disk == raid_disks - 1) 7733 return 1; 7734 break; 7735 case ALGORITHM_LEFT_ASYMMETRIC_6: 7736 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7737 case ALGORITHM_LEFT_SYMMETRIC_6: 7738 case ALGORITHM_RIGHT_SYMMETRIC_6: 7739 if (raid_disk == raid_disks - 1) 7740 return 1; 7741 } 7742 return 0; 7743 } 7744 7745 static void raid5_set_io_opt(struct r5conf *conf) 7746 { 7747 blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * 7748 (conf->raid_disks - conf->max_degraded)); 7749 } 7750 7751 static int raid5_run(struct mddev *mddev) 7752 { 7753 struct r5conf *conf; 7754 int dirty_parity_disks = 0; 7755 struct md_rdev *rdev; 7756 struct md_rdev *journal_dev = NULL; 7757 sector_t reshape_offset = 0; 7758 int i; 7759 long long min_offset_diff = 0; 7760 int first = 1; 7761 7762 if (mddev->recovery_cp != MaxSector) 7763 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7764 mdname(mddev)); 7765 7766 rdev_for_each(rdev, mddev) { 7767 long long diff; 7768 7769 if (test_bit(Journal, &rdev->flags)) { 7770 journal_dev = rdev; 7771 continue; 7772 } 7773 if (rdev->raid_disk < 0) 7774 continue; 7775 diff = (rdev->new_data_offset - rdev->data_offset); 7776 if (first) { 7777 min_offset_diff = diff; 7778 first = 0; 7779 } else if (mddev->reshape_backwards && 7780 diff < min_offset_diff) 7781 min_offset_diff = diff; 7782 else if (!mddev->reshape_backwards && 7783 diff > min_offset_diff) 7784 min_offset_diff = diff; 7785 } 7786 7787 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && 7788 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { 7789 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", 7790 mdname(mddev)); 7791 return -EINVAL; 7792 } 7793 7794 if (mddev->reshape_position != MaxSector) { 7795 /* Check that we can continue the reshape. 7796 * Difficulties arise if the stripe we would write to 7797 * next is at or after the stripe we would read from next. 7798 * For a reshape that changes the number of devices, this 7799 * is only possible for a very short time, and mdadm makes 7800 * sure that time appears to have past before assembling 7801 * the array. So we fail if that time hasn't passed. 
* For a reshape that keeps the number of devices the same 7803 * mdadm must be monitoring the reshape and keeping the 7804 * critical areas read-only and backed up. It will start 7805 * the array in read-only mode, so we check for that. 7806 */ 7807 sector_t here_new, here_old; 7808 int old_disks; 7809 int max_degraded = (mddev->level == 6 ? 2 : 1); 7810 int chunk_sectors; 7811 int new_data_disks; 7812 7813 if (journal_dev) { 7814 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7815 mdname(mddev)); 7816 return -EINVAL; 7817 } 7818 7819 if (mddev->new_level != mddev->level) { 7820 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7821 mdname(mddev)); 7822 return -EINVAL; 7823 } 7824 old_disks = mddev->raid_disks - mddev->delta_disks; 7825 /* reshape_position must be on a new-stripe boundary, and one 7826 * further up in new geometry must map after here in old 7827 * geometry. 7828 * If the chunk sizes are different, then as we perform reshape 7829 * in units of the largest of the two, reshape_position needs 7830 * to be a multiple of the largest chunk size times new data disks. 7831 */ 7832 here_new = mddev->reshape_position; 7833 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 7834 new_data_disks = mddev->raid_disks - max_degraded; 7835 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7836 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7837 mdname(mddev)); 7838 return -EINVAL; 7839 } 7840 reshape_offset = here_new * chunk_sectors; 7841 /* here_new is the stripe we will write to */ 7842 here_old = mddev->reshape_position; 7843 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 7844 /* here_old is the first stripe that we might need to read 7845 * from */ 7846 if (mddev->delta_disks == 0) { 7847 /* We cannot be sure it is safe to start an in-place 7848 * reshape. It is only safe if user-space is monitoring 7849 * and taking constant backups. 7850 * mdadm always starts a situation like this in 7851 * readonly mode so it can take control before 7852 * allowing any writes. So just check for that. 7853 */ 7854 if (abs(min_offset_diff) >= mddev->chunk_sectors && 7855 abs(min_offset_diff) >= mddev->new_chunk_sectors) 7856 /* not really in-place - so OK */; 7857 else if (mddev->ro == 0) { 7858 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7859 mdname(mddev)); 7860 return -EINVAL; 7861 } 7862 } else if (mddev->reshape_backwards 7863 ?
(here_new * chunk_sectors + min_offset_diff <= 7864 here_old * chunk_sectors) 7865 : (here_new * chunk_sectors >= 7866 here_old * chunk_sectors + (-min_offset_diff))) { 7867 /* Reading from the same stripe as writing to - bad */ 7868 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7869 mdname(mddev)); 7870 return -EINVAL; 7871 } 7872 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7873 /* OK, we should be able to continue; */ 7874 } else { 7875 BUG_ON(mddev->level != mddev->new_level); 7876 BUG_ON(mddev->layout != mddev->new_layout); 7877 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7878 BUG_ON(mddev->delta_disks != 0); 7879 } 7880 7881 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 7882 test_bit(MD_HAS_PPL, &mddev->flags)) { 7883 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7884 mdname(mddev)); 7885 clear_bit(MD_HAS_PPL, &mddev->flags); 7886 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); 7887 } 7888 7889 if (mddev->private == NULL) 7890 conf = setup_conf(mddev); 7891 else 7892 conf = mddev->private; 7893 7894 if (IS_ERR(conf)) 7895 return PTR_ERR(conf); 7896 7897 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7898 if (!journal_dev) { 7899 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7900 mdname(mddev)); 7901 mddev->ro = 1; 7902 set_disk_ro(mddev->gendisk, 1); 7903 } else if (mddev->recovery_cp == MaxSector) 7904 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7905 } 7906 7907 conf->min_offset_diff = min_offset_diff; 7908 rcu_assign_pointer(mddev->thread, conf->thread); 7909 rcu_assign_pointer(conf->thread, NULL); 7910 mddev->private = conf; 7911 7912 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7913 i++) { 7914 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); 7915 if (!rdev && conf->disks[i].replacement) { 7916 /* The replacement is all we have yet */ 7917 rdev = rdev_mdlock_deref(mddev, 7918 conf->disks[i].replacement); 7919 conf->disks[i].replacement = NULL; 7920 clear_bit(Replacement, &rdev->flags); 7921 rcu_assign_pointer(conf->disks[i].rdev, rdev); 7922 } 7923 if (!rdev) 7924 continue; 7925 if (rcu_access_pointer(conf->disks[i].replacement) && 7926 conf->reshape_progress != MaxSector) { 7927 /* replacements and reshape simply do not mix. */ 7928 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7929 goto abort; 7930 } 7931 if (test_bit(In_sync, &rdev->flags)) 7932 continue; 7933 /* This disc is not fully in-sync. However if it 7934 * just stored parity (beyond the recovery_offset), 7935 * when we don't need to be concerned about the 7936 * array being dirty. 7937 * When reshape goes 'backwards', we never have 7938 * partially completed devices, so we only need 7939 * to worry about reshape going forwards. 7940 */ 7941 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7942 if (mddev->major_version == 0 && 7943 mddev->minor_version > 90) 7944 rdev->recovery_offset = reshape_offset; 7945 7946 if (rdev->recovery_offset < reshape_offset) { 7947 /* We need to check old and new layout */ 7948 if (!only_parity(rdev->raid_disk, 7949 conf->algorithm, 7950 conf->raid_disks, 7951 conf->max_degraded)) 7952 continue; 7953 } 7954 if (!only_parity(rdev->raid_disk, 7955 conf->prev_algo, 7956 conf->previous_raid_disks, 7957 conf->max_degraded)) 7958 continue; 7959 dirty_parity_disks++; 7960 } 7961 7962 /* 7963 * 0 for a fully functional array, 1 or 2 for a degraded array. 
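 * (max_degraded is 1 for raid4/raid5 and 2 for raid6, so anything above
 * that is rejected by the has_failed() check just below.)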
7964 */ 7965 mddev->degraded = raid5_calc_degraded(conf); 7966 7967 if (has_failed(conf)) { 7968 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7969 mdname(mddev), mddev->degraded, conf->raid_disks); 7970 goto abort; 7971 } 7972 7973 /* device size must be a multiple of chunk size */ 7974 mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1); 7975 mddev->resync_max_sectors = mddev->dev_sectors; 7976 7977 if (mddev->degraded > dirty_parity_disks && 7978 mddev->recovery_cp != MaxSector) { 7979 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7980 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7981 mdname(mddev)); 7982 else if (mddev->ok_start_degraded) 7983 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7984 mdname(mddev)); 7985 else { 7986 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7987 mdname(mddev)); 7988 goto abort; 7989 } 7990 } 7991 7992 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7993 mdname(mddev), conf->level, 7994 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7995 mddev->new_layout); 7996 7997 print_raid5_conf(conf); 7998 7999 if (conf->reshape_progress != MaxSector) { 8000 conf->reshape_safe = conf->reshape_progress; 8001 atomic_set(&conf->reshape_stripes, 0); 8002 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8003 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8004 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8005 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8006 rcu_assign_pointer(mddev->sync_thread, 8007 md_register_thread(md_do_sync, mddev, "reshape")); 8008 if (!mddev->sync_thread) 8009 goto abort; 8010 } 8011 8012 /* Ok, everything is just fine now */ 8013 if (mddev->to_remove == &raid5_attrs_group) 8014 mddev->to_remove = NULL; 8015 else if (mddev->kobj.sd && 8016 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 8017 pr_warn("raid5: failed to create sysfs attributes for %s\n", 8018 mdname(mddev)); 8019 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 8020 8021 if (mddev->queue) { 8022 int chunk_size; 8023 /* read-ahead size must cover two whole stripes, which 8024 * is 2 * (datadisks) * chunksize where 'n' is the 8025 * number of raid devices 8026 */ 8027 int data_disks = conf->previous_raid_disks - conf->max_degraded; 8028 int stripe = data_disks * 8029 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 8030 8031 chunk_size = mddev->chunk_sectors << 9; 8032 blk_queue_io_min(mddev->queue, chunk_size); 8033 raid5_set_io_opt(conf); 8034 mddev->queue->limits.raid_partial_stripes_expensive = 1; 8035 /* 8036 * We can only discard a whole stripe. It doesn't make sense to 8037 * discard data disk but write parity disk 8038 */ 8039 stripe = stripe * PAGE_SIZE; 8040 stripe = roundup_pow_of_two(stripe); 8041 mddev->queue->limits.discard_granularity = stripe; 8042 8043 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 8044 8045 rdev_for_each(rdev, mddev) { 8046 disk_stack_limits(mddev->gendisk, rdev->bdev, 8047 rdev->data_offset << 9); 8048 disk_stack_limits(mddev->gendisk, rdev->bdev, 8049 rdev->new_data_offset << 9); 8050 } 8051 8052 /* 8053 * zeroing is required, otherwise data 8054 * could be lost. Consider a scenario: discard a stripe 8055 * (the stripe could be inconsistent if 8056 * discard_zeroes_data is 0); write one disk of the 8057 * stripe (the stripe could be inconsistent again 8058 * depending on which disks are used to calculate 8059 * parity); the disk is broken; The stripe data of this 8060 * disk is lost. 
8061 * 8062 * We only allow DISCARD if the sysadmin has confirmed that 8063 * only safe devices are in use by setting a module parameter. 8064 * A better idea might be to turn DISCARD into WRITE_ZEROES 8065 * requests, as that is required to be safe. 8066 */ 8067 if (!devices_handle_discard_safely || 8068 mddev->queue->limits.max_discard_sectors < (stripe >> 9) || 8069 mddev->queue->limits.discard_granularity < stripe) 8070 blk_queue_max_discard_sectors(mddev->queue, 0); 8071 8072 /* 8073 * Requests require having a bitmap for each stripe. 8074 * Limit the max sectors based on this. 8075 */ 8076 blk_queue_max_hw_sectors(mddev->queue, 8077 RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); 8078 8079 /* No restrictions on the number of segments in the request */ 8080 blk_queue_max_segments(mddev->queue, USHRT_MAX); 8081 } 8082 8083 if (log_init(conf, journal_dev, raid5_has_ppl(conf))) 8084 goto abort; 8085 8086 return 0; 8087 abort: 8088 md_unregister_thread(mddev, &mddev->thread); 8089 print_raid5_conf(conf); 8090 free_conf(conf); 8091 mddev->private = NULL; 8092 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 8093 return -EIO; 8094 } 8095 8096 static void raid5_free(struct mddev *mddev, void *priv) 8097 { 8098 struct r5conf *conf = priv; 8099 8100 free_conf(conf); 8101 mddev->to_remove = &raid5_attrs_group; 8102 } 8103 8104 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 8105 { 8106 struct r5conf *conf = mddev->private; 8107 int i; 8108 8109 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 8110 conf->chunk_sectors / 2, mddev->layout); 8111 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 8112 rcu_read_lock(); 8113 for (i = 0; i < conf->raid_disks; i++) { 8114 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 8115 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 8116 } 8117 rcu_read_unlock(); 8118 seq_printf (seq, "]"); 8119 } 8120 8121 static void print_raid5_conf (struct r5conf *conf) 8122 { 8123 struct md_rdev *rdev; 8124 int i; 8125 8126 pr_debug("RAID conf printout:\n"); 8127 if (!conf) { 8128 pr_debug("(conf==NULL)\n"); 8129 return; 8130 } 8131 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 8132 conf->raid_disks, 8133 conf->raid_disks - conf->mddev->degraded); 8134 8135 rcu_read_lock(); 8136 for (i = 0; i < conf->raid_disks; i++) { 8137 rdev = rcu_dereference(conf->disks[i].rdev); 8138 if (rdev) 8139 pr_debug(" disk %d, o:%d, dev:%pg\n", 8140 i, !test_bit(Faulty, &rdev->flags), 8141 rdev->bdev); 8142 } 8143 rcu_read_unlock(); 8144 } 8145 8146 static int raid5_spare_active(struct mddev *mddev) 8147 { 8148 int i; 8149 struct r5conf *conf = mddev->private; 8150 struct md_rdev *rdev, *replacement; 8151 int count = 0; 8152 unsigned long flags; 8153 8154 for (i = 0; i < conf->raid_disks; i++) { 8155 rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); 8156 replacement = rdev_mdlock_deref(mddev, 8157 conf->disks[i].replacement); 8158 if (replacement 8159 && replacement->recovery_offset == MaxSector 8160 && !test_bit(Faulty, &replacement->flags) 8161 && !test_and_set_bit(In_sync, &replacement->flags)) { 8162 /* Replacement has just become active. */ 8163 if (!rdev 8164 || !test_and_clear_bit(In_sync, &rdev->flags)) 8165 count++; 8166 if (rdev) { 8167 /* Replaced device not technically faulty, 8168 * but we need to be sure it gets removed 8169 * and never re-added. 
8170 */ 8171 set_bit(Faulty, &rdev->flags); 8172 sysfs_notify_dirent_safe( 8173 rdev->sysfs_state); 8174 } 8175 sysfs_notify_dirent_safe(replacement->sysfs_state); 8176 } else if (rdev 8177 && rdev->recovery_offset == MaxSector 8178 && !test_bit(Faulty, &rdev->flags) 8179 && !test_and_set_bit(In_sync, &rdev->flags)) { 8180 count++; 8181 sysfs_notify_dirent_safe(rdev->sysfs_state); 8182 } 8183 } 8184 spin_lock_irqsave(&conf->device_lock, flags); 8185 mddev->degraded = raid5_calc_degraded(conf); 8186 spin_unlock_irqrestore(&conf->device_lock, flags); 8187 print_raid5_conf(conf); 8188 return count; 8189 } 8190 8191 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 8192 { 8193 struct r5conf *conf = mddev->private; 8194 int err = 0; 8195 int number = rdev->raid_disk; 8196 struct md_rdev __rcu **rdevp; 8197 struct disk_info *p; 8198 struct md_rdev *tmp; 8199 8200 print_raid5_conf(conf); 8201 if (test_bit(Journal, &rdev->flags) && conf->log) { 8202 /* 8203 * we can't wait pending write here, as this is called in 8204 * raid5d, wait will deadlock. 8205 * neilb: there is no locking about new writes here, 8206 * so this cannot be safe. 8207 */ 8208 if (atomic_read(&conf->active_stripes) || 8209 atomic_read(&conf->r5c_cached_full_stripes) || 8210 atomic_read(&conf->r5c_cached_partial_stripes)) { 8211 return -EBUSY; 8212 } 8213 log_exit(conf); 8214 return 0; 8215 } 8216 if (unlikely(number >= conf->pool_size)) 8217 return 0; 8218 p = conf->disks + number; 8219 if (rdev == rcu_access_pointer(p->rdev)) 8220 rdevp = &p->rdev; 8221 else if (rdev == rcu_access_pointer(p->replacement)) 8222 rdevp = &p->replacement; 8223 else 8224 return 0; 8225 8226 if (number >= conf->raid_disks && 8227 conf->reshape_progress == MaxSector) 8228 clear_bit(In_sync, &rdev->flags); 8229 8230 if (test_bit(In_sync, &rdev->flags) || 8231 atomic_read(&rdev->nr_pending)) { 8232 err = -EBUSY; 8233 goto abort; 8234 } 8235 /* Only remove non-faulty devices if recovery 8236 * isn't possible. 
8237 */ 8238 if (!test_bit(Faulty, &rdev->flags) && 8239 mddev->recovery_disabled != conf->recovery_disabled && 8240 !has_failed(conf) && 8241 (!rcu_access_pointer(p->replacement) || 8242 rcu_access_pointer(p->replacement) == rdev) && 8243 number < conf->raid_disks) { 8244 err = -EBUSY; 8245 goto abort; 8246 } 8247 *rdevp = NULL; 8248 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 8249 lockdep_assert_held(&mddev->reconfig_mutex); 8250 synchronize_rcu(); 8251 if (atomic_read(&rdev->nr_pending)) { 8252 /* lost the race, try later */ 8253 err = -EBUSY; 8254 rcu_assign_pointer(*rdevp, rdev); 8255 } 8256 } 8257 if (!err) { 8258 err = log_modify(conf, rdev, false); 8259 if (err) 8260 goto abort; 8261 } 8262 8263 tmp = rcu_access_pointer(p->replacement); 8264 if (tmp) { 8265 /* We must have just cleared 'rdev' */ 8266 rcu_assign_pointer(p->rdev, tmp); 8267 clear_bit(Replacement, &tmp->flags); 8268 smp_mb(); /* Make sure other CPUs may see both as identical 8269 * but will never see neither - if they are careful 8270 */ 8271 rcu_assign_pointer(p->replacement, NULL); 8272 8273 if (!err) 8274 err = log_modify(conf, tmp, true); 8275 } 8276 8277 clear_bit(WantReplacement, &rdev->flags); 8278 abort: 8279 8280 print_raid5_conf(conf); 8281 return err; 8282 } 8283 8284 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 8285 { 8286 struct r5conf *conf = mddev->private; 8287 int ret, err = -EEXIST; 8288 int disk; 8289 struct disk_info *p; 8290 struct md_rdev *tmp; 8291 int first = 0; 8292 int last = conf->raid_disks - 1; 8293 8294 if (test_bit(Journal, &rdev->flags)) { 8295 if (conf->log) 8296 return -EBUSY; 8297 8298 rdev->raid_disk = 0; 8299 /* 8300 * The array is in readonly mode if journal is missing, so no 8301 * write requests running. We should be safe 8302 */ 8303 ret = log_init(conf, rdev, false); 8304 if (ret) 8305 return ret; 8306 8307 ret = r5l_start(conf->log); 8308 if (ret) 8309 return ret; 8310 8311 return 0; 8312 } 8313 if (mddev->recovery_disabled == conf->recovery_disabled) 8314 return -EBUSY; 8315 8316 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 8317 /* no point adding a device */ 8318 return -EINVAL; 8319 8320 if (rdev->raid_disk >= 0) 8321 first = last = rdev->raid_disk; 8322 8323 /* 8324 * find the disk ... but prefer rdev->saved_raid_disk 8325 * if possible. 
8326 */ 8327 if (rdev->saved_raid_disk >= first && 8328 rdev->saved_raid_disk <= last && 8329 conf->disks[rdev->saved_raid_disk].rdev == NULL) 8330 first = rdev->saved_raid_disk; 8331 8332 for (disk = first; disk <= last; disk++) { 8333 p = conf->disks + disk; 8334 if (p->rdev == NULL) { 8335 clear_bit(In_sync, &rdev->flags); 8336 rdev->raid_disk = disk; 8337 if (rdev->saved_raid_disk != disk) 8338 conf->fullsync = 1; 8339 rcu_assign_pointer(p->rdev, rdev); 8340 8341 err = log_modify(conf, rdev, true); 8342 8343 goto out; 8344 } 8345 } 8346 for (disk = first; disk <= last; disk++) { 8347 p = conf->disks + disk; 8348 tmp = rdev_mdlock_deref(mddev, p->rdev); 8349 if (test_bit(WantReplacement, &tmp->flags) && 8350 mddev->reshape_position == MaxSector && 8351 p->replacement == NULL) { 8352 clear_bit(In_sync, &rdev->flags); 8353 set_bit(Replacement, &rdev->flags); 8354 rdev->raid_disk = disk; 8355 err = 0; 8356 conf->fullsync = 1; 8357 rcu_assign_pointer(p->replacement, rdev); 8358 break; 8359 } 8360 } 8361 out: 8362 print_raid5_conf(conf); 8363 return err; 8364 } 8365 8366 static int raid5_resize(struct mddev *mddev, sector_t sectors) 8367 { 8368 /* no resync is happening, and there is enough space 8369 * on all devices, so we can resize. 8370 * We need to make sure resync covers any new space. 8371 * If the array is shrinking we should possibly wait until 8372 * any io in the removed space completes, but it hardly seems 8373 * worth it. 8374 */ 8375 sector_t newsize; 8376 struct r5conf *conf = mddev->private; 8377 8378 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 8379 return -EINVAL; 8380 sectors &= ~((sector_t)conf->chunk_sectors - 1); 8381 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 8382 if (mddev->external_size && 8383 mddev->array_sectors > newsize) 8384 return -EINVAL; 8385 if (mddev->bitmap) { 8386 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); 8387 if (ret) 8388 return ret; 8389 } 8390 md_set_array_sectors(mddev, newsize); 8391 if (sectors > mddev->dev_sectors && 8392 mddev->recovery_cp > mddev->dev_sectors) { 8393 mddev->recovery_cp = mddev->dev_sectors; 8394 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8395 } 8396 mddev->dev_sectors = sectors; 8397 mddev->resync_max_sectors = sectors; 8398 return 0; 8399 } 8400 8401 static int check_stripe_cache(struct mddev *mddev) 8402 { 8403 /* Can only proceed if there are plenty of stripe_heads. 8404 * We need a minimum of one full stripe,, and for sensible progress 8405 * it is best to have about 4 times that. 8406 * If we require 4 times, then the default 256 4K stripe_heads will 8407 * allow for chunk sizes up to 256K, which is probably OK. 8408 * If the chunk size is greater, user-space should request more 8409 * stripe_heads first. 8410 */ 8411 struct r5conf *conf = mddev->private; 8412 if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 8413 > conf->min_nr_stripes || 8414 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 8415 > conf->min_nr_stripes) { 8416 pr_warn("md/raid:%s: reshape: not enough stripes. 
Needed %lu\n", 8417 mdname(mddev), 8418 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 8419 / RAID5_STRIPE_SIZE(conf))*4); 8420 return 0; 8421 } 8422 return 1; 8423 } 8424 8425 static int check_reshape(struct mddev *mddev) 8426 { 8427 struct r5conf *conf = mddev->private; 8428 8429 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 8430 return -EINVAL; 8431 if (mddev->delta_disks == 0 && 8432 mddev->new_layout == mddev->layout && 8433 mddev->new_chunk_sectors == mddev->chunk_sectors) 8434 return 0; /* nothing to do */ 8435 if (has_failed(conf)) 8436 return -EINVAL; 8437 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 8438 /* We might be able to shrink, but the devices must 8439 * be made bigger first. 8440 * For raid6, 4 is the minimum size. 8441 * Otherwise 2 is the minimum 8442 */ 8443 int min = 2; 8444 if (mddev->level == 6) 8445 min = 4; 8446 if (mddev->raid_disks + mddev->delta_disks < min) 8447 return -EINVAL; 8448 } 8449 8450 if (!check_stripe_cache(mddev)) 8451 return -ENOSPC; 8452 8453 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 8454 mddev->delta_disks > 0) 8455 if (resize_chunks(conf, 8456 conf->previous_raid_disks 8457 + max(0, mddev->delta_disks), 8458 max(mddev->new_chunk_sectors, 8459 mddev->chunk_sectors) 8460 ) < 0) 8461 return -ENOMEM; 8462 8463 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) 8464 return 0; /* never bother to shrink */ 8465 return resize_stripes(conf, (conf->previous_raid_disks 8466 + mddev->delta_disks)); 8467 } 8468 8469 static int raid5_start_reshape(struct mddev *mddev) 8470 { 8471 struct r5conf *conf = mddev->private; 8472 struct md_rdev *rdev; 8473 int spares = 0; 8474 int i; 8475 unsigned long flags; 8476 8477 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 8478 return -EBUSY; 8479 8480 if (!check_stripe_cache(mddev)) 8481 return -ENOSPC; 8482 8483 if (has_failed(conf)) 8484 return -EINVAL; 8485 8486 /* raid5 can't handle concurrent reshape and recovery */ 8487 if (mddev->recovery_cp < MaxSector) 8488 return -EBUSY; 8489 for (i = 0; i < conf->raid_disks; i++) 8490 if (rdev_mdlock_deref(mddev, conf->disks[i].replacement)) 8491 return -EBUSY; 8492 8493 rdev_for_each(rdev, mddev) { 8494 if (!test_bit(In_sync, &rdev->flags) 8495 && !test_bit(Faulty, &rdev->flags)) 8496 spares++; 8497 } 8498 8499 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 8500 /* Not enough devices even to make a degraded array 8501 * of that size 8502 */ 8503 return -EINVAL; 8504 8505 /* Refuse to reduce size of the array. Any reductions in 8506 * array size must be through explicit setting of array_size 8507 * attribute. 8508 */ 8509 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 8510 < mddev->array_sectors) { 8511 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 8512 mdname(mddev)); 8513 return -EINVAL; 8514 } 8515 8516 atomic_set(&conf->reshape_stripes, 0); 8517 spin_lock_irq(&conf->device_lock); 8518 write_seqcount_begin(&conf->gen_lock); 8519 conf->previous_raid_disks = conf->raid_disks; 8520 conf->raid_disks += mddev->delta_disks; 8521 conf->prev_chunk_sectors = conf->chunk_sectors; 8522 conf->chunk_sectors = mddev->new_chunk_sectors; 8523 conf->prev_algo = conf->algorithm; 8524 conf->algorithm = mddev->new_layout; 8525 conf->generation++; 8526 /* Code that selects data_offset needs to see the generation update 8527 * if reshape_progress has been set - so a memory barrier needed. 
8528 */ 8529 smp_mb(); 8530 if (mddev->reshape_backwards) 8531 conf->reshape_progress = raid5_size(mddev, 0, 0); 8532 else 8533 conf->reshape_progress = 0; 8534 conf->reshape_safe = conf->reshape_progress; 8535 write_seqcount_end(&conf->gen_lock); 8536 spin_unlock_irq(&conf->device_lock); 8537 8538 /* Now make sure any requests that proceeded on the assumption 8539 * the reshape wasn't running - like Discard or Read - have 8540 * completed. 8541 */ 8542 raid5_quiesce(mddev, true); 8543 raid5_quiesce(mddev, false); 8544 8545 /* Add some new drives, as many as will fit. 8546 * We know there are enough to make the newly sized array work. 8547 * Don't add devices if we are reducing the number of 8548 * devices in the array. This is because it is not possible 8549 * to correctly record the "partially reconstructed" state of 8550 * such devices during the reshape and confusion could result. 8551 */ 8552 if (mddev->delta_disks >= 0) { 8553 rdev_for_each(rdev, mddev) 8554 if (rdev->raid_disk < 0 && 8555 !test_bit(Faulty, &rdev->flags)) { 8556 if (raid5_add_disk(mddev, rdev) == 0) { 8557 if (rdev->raid_disk 8558 >= conf->previous_raid_disks) 8559 set_bit(In_sync, &rdev->flags); 8560 else 8561 rdev->recovery_offset = 0; 8562 8563 /* Failure here is OK */ 8564 sysfs_link_rdev(mddev, rdev); 8565 } 8566 } else if (rdev->raid_disk >= conf->previous_raid_disks 8567 && !test_bit(Faulty, &rdev->flags)) { 8568 /* This is a spare that was manually added */ 8569 set_bit(In_sync, &rdev->flags); 8570 } 8571 8572 /* When a reshape changes the number of devices, 8573 * ->degraded is measured against the larger of the 8574 * pre and post number of devices. 8575 */ 8576 spin_lock_irqsave(&conf->device_lock, flags); 8577 mddev->degraded = raid5_calc_degraded(conf); 8578 spin_unlock_irqrestore(&conf->device_lock, flags); 8579 } 8580 mddev->raid_disks = conf->raid_disks; 8581 mddev->reshape_position = conf->reshape_progress; 8582 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8583 8584 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8585 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8586 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8587 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8588 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8589 rcu_assign_pointer(mddev->sync_thread, 8590 md_register_thread(md_do_sync, mddev, "reshape")); 8591 if (!mddev->sync_thread) { 8592 mddev->recovery = 0; 8593 spin_lock_irq(&conf->device_lock); 8594 write_seqcount_begin(&conf->gen_lock); 8595 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 8596 mddev->new_chunk_sectors = 8597 conf->chunk_sectors = conf->prev_chunk_sectors; 8598 mddev->new_layout = conf->algorithm = conf->prev_algo; 8599 rdev_for_each(rdev, mddev) 8600 rdev->new_data_offset = rdev->data_offset; 8601 smp_wmb(); 8602 conf->generation --; 8603 conf->reshape_progress = MaxSector; 8604 mddev->reshape_position = MaxSector; 8605 write_seqcount_end(&conf->gen_lock); 8606 spin_unlock_irq(&conf->device_lock); 8607 return -EAGAIN; 8608 } 8609 conf->reshape_checkpoint = jiffies; 8610 md_wakeup_thread(mddev->sync_thread); 8611 md_new_event(); 8612 return 0; 8613 } 8614 8615 /* This is called from the reshape thread and should make any 8616 * changes needed in 'conf' 8617 */ 8618 static void end_reshape(struct r5conf *conf) 8619 { 8620 8621 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 8622 struct md_rdev *rdev; 8623 8624 spin_lock_irq(&conf->device_lock); 8625 conf->previous_raid_disks = conf->raid_disks; 8626 md_finish_reshape(conf->mddev); 8627 
smp_wmb(); 8628 conf->reshape_progress = MaxSector; 8629 conf->mddev->reshape_position = MaxSector; 8630 rdev_for_each(rdev, conf->mddev) 8631 if (rdev->raid_disk >= 0 && 8632 !test_bit(Journal, &rdev->flags) && 8633 !test_bit(In_sync, &rdev->flags)) 8634 rdev->recovery_offset = MaxSector; 8635 spin_unlock_irq(&conf->device_lock); 8636 wake_up(&conf->wait_for_overlap); 8637 8638 if (conf->mddev->queue) 8639 raid5_set_io_opt(conf); 8640 } 8641 } 8642 8643 /* This is called from the raid5d thread with mddev_lock held. 8644 * It makes config changes to the device. 8645 */ 8646 static void raid5_finish_reshape(struct mddev *mddev) 8647 { 8648 struct r5conf *conf = mddev->private; 8649 struct md_rdev *rdev; 8650 8651 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8652 8653 if (mddev->delta_disks <= 0) { 8654 int d; 8655 spin_lock_irq(&conf->device_lock); 8656 mddev->degraded = raid5_calc_degraded(conf); 8657 spin_unlock_irq(&conf->device_lock); 8658 for (d = conf->raid_disks ; 8659 d < conf->raid_disks - mddev->delta_disks; 8660 d++) { 8661 rdev = rdev_mdlock_deref(mddev, 8662 conf->disks[d].rdev); 8663 if (rdev) 8664 clear_bit(In_sync, &rdev->flags); 8665 rdev = rdev_mdlock_deref(mddev, 8666 conf->disks[d].replacement); 8667 if (rdev) 8668 clear_bit(In_sync, &rdev->flags); 8669 } 8670 } 8671 mddev->layout = conf->algorithm; 8672 mddev->chunk_sectors = conf->chunk_sectors; 8673 mddev->reshape_position = MaxSector; 8674 mddev->delta_disks = 0; 8675 mddev->reshape_backwards = 0; 8676 } 8677 } 8678 8679 static void raid5_quiesce(struct mddev *mddev, int quiesce) 8680 { 8681 struct r5conf *conf = mddev->private; 8682 8683 if (quiesce) { 8684 /* stop all writes */ 8685 lock_all_device_hash_locks_irq(conf); 8686 /* '2' tells resync/reshape to pause so that all 8687 * active stripes can drain 8688 */ 8689 r5c_flush_cache(conf, INT_MAX); 8690 /* need a memory barrier to make sure read_one_chunk() sees 8691 * quiesce started and reverts to slow (locked) path. 
8692 */ 8693 smp_store_release(&conf->quiesce, 2); 8694 wait_event_cmd(conf->wait_for_quiescent, 8695 atomic_read(&conf->active_stripes) == 0 && 8696 atomic_read(&conf->active_aligned_reads) == 0, 8697 unlock_all_device_hash_locks_irq(conf), 8698 lock_all_device_hash_locks_irq(conf)); 8699 conf->quiesce = 1; 8700 unlock_all_device_hash_locks_irq(conf); 8701 /* allow reshape to continue */ 8702 wake_up(&conf->wait_for_overlap); 8703 } else { 8704 /* re-enable writes */ 8705 lock_all_device_hash_locks_irq(conf); 8706 conf->quiesce = 0; 8707 wake_up(&conf->wait_for_quiescent); 8708 wake_up(&conf->wait_for_overlap); 8709 unlock_all_device_hash_locks_irq(conf); 8710 } 8711 log_quiesce(conf, quiesce); 8712 } 8713 8714 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8715 { 8716 struct r0conf *raid0_conf = mddev->private; 8717 sector_t sectors; 8718 8719 /* for raid0 takeover only one zone is supported */ 8720 if (raid0_conf->nr_strip_zones > 1) { 8721 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8722 mdname(mddev)); 8723 return ERR_PTR(-EINVAL); 8724 } 8725 8726 sectors = raid0_conf->strip_zone[0].zone_end; 8727 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8728 mddev->dev_sectors = sectors; 8729 mddev->new_level = level; 8730 mddev->new_layout = ALGORITHM_PARITY_N; 8731 mddev->new_chunk_sectors = mddev->chunk_sectors; 8732 mddev->raid_disks += 1; 8733 mddev->delta_disks = 1; 8734 /* make sure it will be not marked as dirty */ 8735 mddev->recovery_cp = MaxSector; 8736 8737 return setup_conf(mddev); 8738 } 8739 8740 static void *raid5_takeover_raid1(struct mddev *mddev) 8741 { 8742 int chunksect; 8743 void *ret; 8744 8745 if (mddev->raid_disks != 2 || 8746 mddev->degraded > 1) 8747 return ERR_PTR(-EINVAL); 8748 8749 /* Should check if there are write-behind devices? 
*/ 8750 8751 chunksect = 64*2; /* 64K by default */ 8752 8753 /* The array must be an exact multiple of chunksize */ 8754 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8755 chunksect >>= 1; 8756 8757 if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private)) 8758 /* array size does not allow a suitable chunk size */ 8759 return ERR_PTR(-EINVAL); 8760 8761 mddev->new_level = 5; 8762 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8763 mddev->new_chunk_sectors = chunksect; 8764 8765 ret = setup_conf(mddev); 8766 if (!IS_ERR(ret)) 8767 mddev_clear_unsupported_flags(mddev, 8768 UNSUPPORTED_MDDEV_FLAGS); 8769 return ret; 8770 } 8771 8772 static void *raid5_takeover_raid6(struct mddev *mddev) 8773 { 8774 int new_layout; 8775 8776 switch (mddev->layout) { 8777 case ALGORITHM_LEFT_ASYMMETRIC_6: 8778 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8779 break; 8780 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8781 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8782 break; 8783 case ALGORITHM_LEFT_SYMMETRIC_6: 8784 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8785 break; 8786 case ALGORITHM_RIGHT_SYMMETRIC_6: 8787 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8788 break; 8789 case ALGORITHM_PARITY_0_6: 8790 new_layout = ALGORITHM_PARITY_0; 8791 break; 8792 case ALGORITHM_PARITY_N: 8793 new_layout = ALGORITHM_PARITY_N; 8794 break; 8795 default: 8796 return ERR_PTR(-EINVAL); 8797 } 8798 mddev->new_level = 5; 8799 mddev->new_layout = new_layout; 8800 mddev->delta_disks = -1; 8801 mddev->raid_disks -= 1; 8802 return setup_conf(mddev); 8803 } 8804 8805 static int raid5_check_reshape(struct mddev *mddev) 8806 { 8807 /* For a 2-drive array, the layout and chunk size can be changed 8808 * immediately as not restriping is needed. 8809 * For larger arrays we record the new value - after validation 8810 * to be used by a reshape pass. 8811 */ 8812 struct r5conf *conf = mddev->private; 8813 int new_chunk = mddev->new_chunk_sectors; 8814 8815 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8816 return -EINVAL; 8817 if (new_chunk > 0) { 8818 if (!is_power_of_2(new_chunk)) 8819 return -EINVAL; 8820 if (new_chunk < (PAGE_SIZE>>9)) 8821 return -EINVAL; 8822 if (mddev->array_sectors & (new_chunk-1)) 8823 /* not factor of array size */ 8824 return -EINVAL; 8825 } 8826 8827 /* They look valid */ 8828 8829 if (mddev->raid_disks == 2) { 8830 /* can make the change immediately */ 8831 if (mddev->new_layout >= 0) { 8832 conf->algorithm = mddev->new_layout; 8833 mddev->layout = mddev->new_layout; 8834 } 8835 if (new_chunk > 0) { 8836 conf->chunk_sectors = new_chunk ; 8837 mddev->chunk_sectors = new_chunk; 8838 } 8839 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8840 md_wakeup_thread(mddev->thread); 8841 } 8842 return check_reshape(mddev); 8843 } 8844 8845 static int raid6_check_reshape(struct mddev *mddev) 8846 { 8847 int new_chunk = mddev->new_chunk_sectors; 8848 8849 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8850 return -EINVAL; 8851 if (new_chunk > 0) { 8852 if (!is_power_of_2(new_chunk)) 8853 return -EINVAL; 8854 if (new_chunk < (PAGE_SIZE >> 9)) 8855 return -EINVAL; 8856 if (mddev->array_sectors & (new_chunk-1)) 8857 /* not factor of array size */ 8858 return -EINVAL; 8859 } 8860 8861 /* They look valid */ 8862 return check_reshape(mddev); 8863 } 8864 8865 static void *raid5_takeover(struct mddev *mddev) 8866 { 8867 /* raid5 can take over: 8868 * raid0 - if there is only one strip zone - make it a raid4 layout 8869 * raid1 - if there are two drives. 
We need to know the chunk size 8870 * raid4 - trivial - just use a raid4 layout. 8871 * raid6 - Providing it is a *_6 layout 8872 */ 8873 if (mddev->level == 0) 8874 return raid45_takeover_raid0(mddev, 5); 8875 if (mddev->level == 1) 8876 return raid5_takeover_raid1(mddev); 8877 if (mddev->level == 4) { 8878 mddev->new_layout = ALGORITHM_PARITY_N; 8879 mddev->new_level = 5; 8880 return setup_conf(mddev); 8881 } 8882 if (mddev->level == 6) 8883 return raid5_takeover_raid6(mddev); 8884 8885 return ERR_PTR(-EINVAL); 8886 } 8887 8888 static void *raid4_takeover(struct mddev *mddev) 8889 { 8890 /* raid4 can take over: 8891 * raid0 - if there is only one strip zone 8892 * raid5 - if layout is right 8893 */ 8894 if (mddev->level == 0) 8895 return raid45_takeover_raid0(mddev, 4); 8896 if (mddev->level == 5 && 8897 mddev->layout == ALGORITHM_PARITY_N) { 8898 mddev->new_layout = 0; 8899 mddev->new_level = 4; 8900 return setup_conf(mddev); 8901 } 8902 return ERR_PTR(-EINVAL); 8903 } 8904 8905 static struct md_personality raid5_personality; 8906 8907 static void *raid6_takeover(struct mddev *mddev) 8908 { 8909 /* Currently can only take over a raid5. We map the 8910 * personality to an equivalent raid6 personality 8911 * with the Q block at the end. 8912 */ 8913 int new_layout; 8914 8915 if (mddev->pers != &raid5_personality) 8916 return ERR_PTR(-EINVAL); 8917 if (mddev->degraded > 1) 8918 return ERR_PTR(-EINVAL); 8919 if (mddev->raid_disks > 253) 8920 return ERR_PTR(-EINVAL); 8921 if (mddev->raid_disks < 3) 8922 return ERR_PTR(-EINVAL); 8923 8924 switch (mddev->layout) { 8925 case ALGORITHM_LEFT_ASYMMETRIC: 8926 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8927 break; 8928 case ALGORITHM_RIGHT_ASYMMETRIC: 8929 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8930 break; 8931 case ALGORITHM_LEFT_SYMMETRIC: 8932 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8933 break; 8934 case ALGORITHM_RIGHT_SYMMETRIC: 8935 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8936 break; 8937 case ALGORITHM_PARITY_0: 8938 new_layout = ALGORITHM_PARITY_0_6; 8939 break; 8940 case ALGORITHM_PARITY_N: 8941 new_layout = ALGORITHM_PARITY_N; 8942 break; 8943 default: 8944 return ERR_PTR(-EINVAL); 8945 } 8946 mddev->new_level = 6; 8947 mddev->new_layout = new_layout; 8948 mddev->delta_disks = 1; 8949 mddev->raid_disks += 1; 8950 return setup_conf(mddev); 8951 } 8952 8953 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) 8954 { 8955 struct r5conf *conf; 8956 int err; 8957 8958 err = mddev_suspend_and_lock(mddev); 8959 if (err) 8960 return err; 8961 conf = mddev->private; 8962 if (!conf) { 8963 mddev_unlock_and_resume(mddev); 8964 return -ENODEV; 8965 } 8966 8967 if (strncmp(buf, "ppl", 3) == 0) { 8968 /* ppl only works with RAID 5 */ 8969 if (!raid5_has_ppl(conf) && conf->level == 5) { 8970 err = log_init(conf, NULL, true); 8971 if (!err) { 8972 err = resize_stripes(conf, conf->pool_size); 8973 if (err) 8974 log_exit(conf); 8975 } 8976 } else 8977 err = -EINVAL; 8978 } else if (strncmp(buf, "resync", 6) == 0) { 8979 if (raid5_has_ppl(conf)) { 8980 log_exit(conf); 8981 err = resize_stripes(conf, conf->pool_size); 8982 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && 8983 r5l_log_disk_error(conf)) { 8984 bool journal_dev_exists = false; 8985 struct md_rdev *rdev; 8986 8987 rdev_for_each(rdev, mddev) 8988 if (test_bit(Journal, &rdev->flags)) { 8989 journal_dev_exists = true; 8990 break; 8991 } 8992 8993 if (!journal_dev_exists) 8994 clear_bit(MD_HAS_JOURNAL, &mddev->flags); 8995 else /* need remove journal 
device first */ 8996 err = -EBUSY; 8997 } else 8998 err = -EINVAL; 8999 } else { 9000 err = -EINVAL; 9001 } 9002 9003 if (!err) 9004 md_update_sb(mddev, 1); 9005 9006 mddev_unlock_and_resume(mddev); 9007 9008 return err; 9009 } 9010 9011 static int raid5_start(struct mddev *mddev) 9012 { 9013 struct r5conf *conf = mddev->private; 9014 9015 return r5l_start(conf->log); 9016 } 9017 9018 static struct md_personality raid6_personality = 9019 { 9020 .name = "raid6", 9021 .level = 6, 9022 .owner = THIS_MODULE, 9023 .make_request = raid5_make_request, 9024 .run = raid5_run, 9025 .start = raid5_start, 9026 .free = raid5_free, 9027 .status = raid5_status, 9028 .error_handler = raid5_error, 9029 .hot_add_disk = raid5_add_disk, 9030 .hot_remove_disk= raid5_remove_disk, 9031 .spare_active = raid5_spare_active, 9032 .sync_request = raid5_sync_request, 9033 .resize = raid5_resize, 9034 .size = raid5_size, 9035 .check_reshape = raid6_check_reshape, 9036 .start_reshape = raid5_start_reshape, 9037 .finish_reshape = raid5_finish_reshape, 9038 .quiesce = raid5_quiesce, 9039 .takeover = raid6_takeover, 9040 .change_consistency_policy = raid5_change_consistency_policy, 9041 }; 9042 static struct md_personality raid5_personality = 9043 { 9044 .name = "raid5", 9045 .level = 5, 9046 .owner = THIS_MODULE, 9047 .make_request = raid5_make_request, 9048 .run = raid5_run, 9049 .start = raid5_start, 9050 .free = raid5_free, 9051 .status = raid5_status, 9052 .error_handler = raid5_error, 9053 .hot_add_disk = raid5_add_disk, 9054 .hot_remove_disk= raid5_remove_disk, 9055 .spare_active = raid5_spare_active, 9056 .sync_request = raid5_sync_request, 9057 .resize = raid5_resize, 9058 .size = raid5_size, 9059 .check_reshape = raid5_check_reshape, 9060 .start_reshape = raid5_start_reshape, 9061 .finish_reshape = raid5_finish_reshape, 9062 .quiesce = raid5_quiesce, 9063 .takeover = raid5_takeover, 9064 .change_consistency_policy = raid5_change_consistency_policy, 9065 }; 9066 9067 static struct md_personality raid4_personality = 9068 { 9069 .name = "raid4", 9070 .level = 4, 9071 .owner = THIS_MODULE, 9072 .make_request = raid5_make_request, 9073 .run = raid5_run, 9074 .start = raid5_start, 9075 .free = raid5_free, 9076 .status = raid5_status, 9077 .error_handler = raid5_error, 9078 .hot_add_disk = raid5_add_disk, 9079 .hot_remove_disk= raid5_remove_disk, 9080 .spare_active = raid5_spare_active, 9081 .sync_request = raid5_sync_request, 9082 .resize = raid5_resize, 9083 .size = raid5_size, 9084 .check_reshape = raid5_check_reshape, 9085 .start_reshape = raid5_start_reshape, 9086 .finish_reshape = raid5_finish_reshape, 9087 .quiesce = raid5_quiesce, 9088 .takeover = raid4_takeover, 9089 .change_consistency_policy = raid5_change_consistency_policy, 9090 }; 9091 9092 static int __init raid5_init(void) 9093 { 9094 int ret; 9095 9096 raid5_wq = alloc_workqueue("raid5wq", 9097 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 9098 if (!raid5_wq) 9099 return -ENOMEM; 9100 9101 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 9102 "md/raid5:prepare", 9103 raid456_cpu_up_prepare, 9104 raid456_cpu_dead); 9105 if (ret) { 9106 destroy_workqueue(raid5_wq); 9107 return ret; 9108 } 9109 register_md_personality(&raid6_personality); 9110 register_md_personality(&raid5_personality); 9111 register_md_personality(&raid4_personality); 9112 return 0; 9113 } 9114 9115 static void raid5_exit(void) 9116 { 9117 unregister_md_personality(&raid6_personality); 9118 unregister_md_personality(&raid5_personality); 9119 
unregister_md_personality(&raid4_personality); 9120 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 9121 destroy_workqueue(raid5_wq); 9122 } 9123 9124 module_init(raid5_init); 9125 module_exit(raid5_exit); 9126 MODULE_LICENSE("GPL"); 9127 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 9128 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 9129 MODULE_ALIAS("md-raid5"); 9130 MODULE_ALIAS("md-raid4"); 9131 MODULE_ALIAS("md-level-5"); 9132 MODULE_ALIAS("md-level-4"); 9133 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 9134 MODULE_ALIAS("md-raid6"); 9135 MODULE_ALIAS("md-level-6"); 9136 9137 /* This used to be two separate modules, they were: */ 9138 MODULE_ALIAS("raid5"); 9139 MODULE_ALIAS("raid6"); 9140