1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * raid5.c : Multiple Devices driver for Linux 4 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 5 * Copyright (C) 1999, 2000 Ingo Molnar 6 * Copyright (C) 2002, 2003 H. Peter Anvin 7 * 8 * RAID-4/5/6 management functions. 9 * Thanks to Penguin Computing for making the RAID-6 development possible 10 * by donating a test server! 11 */ 12 13 /* 14 * BITMAP UNPLUGGING: 15 * 16 * The sequencing for updating the bitmap reliably is a little 17 * subtle (and I got it wrong the first time) so it deserves some 18 * explanation. 19 * 20 * We group bitmap updates into batches. Each batch has a number. 21 * We may write out several batches at once, but that isn't very important. 22 * conf->seq_write is the number of the last batch successfully written. 23 * conf->seq_flush is the number of the last batch that was closed to 24 * new additions. 25 * When we discover that we will need to write to any block in a stripe 26 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 27 * the number of the batch it will be in. This is seq_flush+1. 28 * When we are ready to do a write, if that batch hasn't been written yet, 29 * we plug the array and queue the stripe for later. 30 * When an unplug happens, we increment bm_flush, thus closing the current 31 * batch. 32 * When we notice that bm_flush > bm_write, we write out all pending updates 33 * to the bitmap, and advance bm_write to where bm_flush was. 34 * This may occasionally write a bit out twice, but is sure never to 35 * miss any bits. 36 */ 37 38 #include <linux/blkdev.h> 39 #include <linux/kthread.h> 40 #include <linux/raid/pq.h> 41 #include <linux/async_tx.h> 42 #include <linux/module.h> 43 #include <linux/async.h> 44 #include <linux/seq_file.h> 45 #include <linux/cpu.h> 46 #include <linux/slab.h> 47 #include <linux/ratelimit.h> 48 #include <linux/nodemask.h> 49 50 #include <trace/events/block.h> 51 #include <linux/list_sort.h> 52 53 #include "md.h" 54 #include "raid5.h" 55 #include "raid0.h" 56 #include "md-bitmap.h" 57 #include "raid5-log.h" 58 59 #define UNSUPPORTED_MDDEV_FLAGS \ 60 ((1L << MD_FAILFAST_SUPPORTED) | \ 61 (1L << MD_FAILLAST_DEV) | \ 62 (1L << MD_SERIALIZE_POLICY)) 63 64 65 #define cpu_to_group(cpu) cpu_to_node(cpu) 66 #define ANY_GROUP NUMA_NO_NODE 67 68 #define RAID5_MAX_REQ_STRIPES 256 69 70 static bool devices_handle_discard_safely = false; 71 module_param(devices_handle_discard_safely, bool, 0644); 72 MODULE_PARM_DESC(devices_handle_discard_safely, 73 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 74 static struct workqueue_struct *raid5_wq; 75 76 static void raid5_quiesce(struct mddev *mddev, int quiesce); 77 78 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 79 { 80 int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK; 81 return &conf->stripe_hashtbl[hash]; 82 } 83 84 static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) 85 { 86 return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK; 87 } 88 89 static inline void lock_device_hash_lock(struct r5conf *conf, int hash) 90 __acquires(&conf->device_lock) 91 { 92 spin_lock_irq(conf->hash_locks + hash); 93 spin_lock(&conf->device_lock); 94 } 95 96 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) 97 __releases(&conf->device_lock) 98 { 99 spin_unlock(&conf->device_lock); 100 spin_unlock_irq(conf->hash_locks + hash); 101 } 102 103 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) 104 __acquires(&conf->device_lock) 105 { 106 int i; 107 spin_lock_irq(conf->hash_locks); 108 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 109 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); 110 spin_lock(&conf->device_lock); 111 } 112 113 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) 114 __releases(&conf->device_lock) 115 { 116 int i; 117 spin_unlock(&conf->device_lock); 118 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--) 119 spin_unlock(conf->hash_locks + i); 120 spin_unlock_irq(conf->hash_locks); 121 } 122 123 /* Find first data disk in a raid6 stripe */ 124 static inline int raid6_d0(struct stripe_head *sh) 125 { 126 if (sh->ddf_layout) 127 /* ddf always start from first device */ 128 return 0; 129 /* md starts just after Q block */ 130 if (sh->qd_idx == sh->disks - 1) 131 return 0; 132 else 133 return sh->qd_idx + 1; 134 } 135 static inline int raid6_next_disk(int disk, int raid_disks) 136 { 137 disk++; 138 return (disk < raid_disks) ? disk : 0; 139 } 140 141 /* When walking through the disks in a raid5, starting at raid6_d0, 142 * We need to map each disk to a 'slot', where the data disks are slot 143 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 144 * is raid_disks-1. This help does that mapping. 145 */ 146 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 147 int *count, int syndrome_disks) 148 { 149 int slot = *count; 150 151 if (sh->ddf_layout) 152 (*count)++; 153 if (idx == sh->pd_idx) 154 return syndrome_disks; 155 if (idx == sh->qd_idx) 156 return syndrome_disks + 1; 157 if (!sh->ddf_layout) 158 (*count)++; 159 return slot; 160 } 161 162 static void print_raid5_conf(struct r5conf *conf); 163 164 static int stripe_operations_active(struct stripe_head *sh) 165 { 166 return sh->check_state || sh->reconstruct_state || 167 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 168 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 169 } 170 171 static bool stripe_is_lowprio(struct stripe_head *sh) 172 { 173 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || 174 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && 175 !test_bit(STRIPE_R5C_CACHING, &sh->state); 176 } 177 178 static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 179 __must_hold(&sh->raid_conf->device_lock) 180 { 181 struct r5conf *conf = sh->raid_conf; 182 struct r5worker_group *group; 183 int thread_cnt; 184 int i, cpu = sh->cpu; 185 186 if (!cpu_online(cpu)) { 187 cpu = cpumask_any(cpu_online_mask); 188 sh->cpu = cpu; 189 } 190 191 if (list_empty(&sh->lru)) { 192 struct r5worker_group *group; 193 group = conf->worker_groups + cpu_to_group(cpu); 194 if (stripe_is_lowprio(sh)) 195 list_add_tail(&sh->lru, &group->loprio_list); 196 else 197 list_add_tail(&sh->lru, &group->handle_list); 198 group->stripes_cnt++; 199 sh->group = group; 200 } 201 202 if (conf->worker_cnt_per_group == 0) { 203 md_wakeup_thread(conf->mddev->thread); 204 return; 205 } 206 207 group = conf->worker_groups + cpu_to_group(sh->cpu); 208 209 group->workers[0].working = true; 210 /* at least one worker should run to avoid race */ 211 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 212 213 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 214 /* wakeup more workers */ 215 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 216 if (group->workers[i].working == false) { 217 group->workers[i].working = true; 218 queue_work_on(sh->cpu, raid5_wq, 219 &group->workers[i].work); 220 thread_cnt--; 221 } 222 } 223 } 224 225 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, 226 struct list_head *temp_inactive_list) 227 __must_hold(&conf->device_lock) 228 { 229 int i; 230 int injournal = 0; /* number of date pages with R5_InJournal */ 231 232 BUG_ON(!list_empty(&sh->lru)); 233 BUG_ON(atomic_read(&conf->active_stripes)==0); 234 235 if (r5c_is_writeback(conf->log)) 236 for (i = sh->disks; i--; ) 237 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 238 injournal++; 239 /* 240 * In the following cases, the stripe cannot be released to cached 241 * lists. Therefore, we make the stripe write out and set 242 * STRIPE_HANDLE: 243 * 1. when quiesce in r5c write back; 244 * 2. when resync is requested fot the stripe. 245 */ 246 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) || 247 (conf->quiesce && r5c_is_writeback(conf->log) && 248 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) { 249 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 250 r5c_make_stripe_write_out(sh); 251 set_bit(STRIPE_HANDLE, &sh->state); 252 } 253 254 if (test_bit(STRIPE_HANDLE, &sh->state)) { 255 if (test_bit(STRIPE_DELAYED, &sh->state) && 256 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 257 list_add_tail(&sh->lru, &conf->delayed_list); 258 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 259 sh->bm_seq - conf->seq_write > 0) 260 list_add_tail(&sh->lru, &conf->bitmap_list); 261 else { 262 clear_bit(STRIPE_DELAYED, &sh->state); 263 clear_bit(STRIPE_BIT_DELAY, &sh->state); 264 if (conf->worker_cnt_per_group == 0) { 265 if (stripe_is_lowprio(sh)) 266 list_add_tail(&sh->lru, 267 &conf->loprio_list); 268 else 269 list_add_tail(&sh->lru, 270 &conf->handle_list); 271 } else { 272 raid5_wakeup_stripe_thread(sh); 273 return; 274 } 275 } 276 md_wakeup_thread(conf->mddev->thread); 277 } else { 278 BUG_ON(stripe_operations_active(sh)); 279 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 280 if (atomic_dec_return(&conf->preread_active_stripes) 281 < IO_THRESHOLD) 282 md_wakeup_thread(conf->mddev->thread); 283 atomic_dec(&conf->active_stripes); 284 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 285 if (!r5c_is_writeback(conf->log)) 286 list_add_tail(&sh->lru, temp_inactive_list); 287 else { 288 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); 289 if (injournal == 0) 290 list_add_tail(&sh->lru, temp_inactive_list); 291 else if (injournal == conf->raid_disks - conf->max_degraded) { 292 /* full stripe */ 293 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) 294 atomic_inc(&conf->r5c_cached_full_stripes); 295 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 296 atomic_dec(&conf->r5c_cached_partial_stripes); 297 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); 298 r5c_check_cached_full_stripe(conf); 299 } else 300 /* 301 * STRIPE_R5C_PARTIAL_STRIPE is set in 302 * r5c_try_caching_write(). No need to 303 * set it again. 304 */ 305 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); 306 } 307 } 308 } 309 } 310 311 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, 312 struct list_head *temp_inactive_list) 313 __must_hold(&conf->device_lock) 314 { 315 if (atomic_dec_and_test(&sh->count)) 316 do_release_stripe(conf, sh, temp_inactive_list); 317 } 318 319 /* 320 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list 321 * 322 * Be careful: Only one task can add/delete stripes from temp_inactive_list at 323 * given time. Adding stripes only takes device lock, while deleting stripes 324 * only takes hash lock. 325 */ 326 static void release_inactive_stripe_list(struct r5conf *conf, 327 struct list_head *temp_inactive_list, 328 int hash) 329 { 330 int size; 331 bool do_wakeup = false; 332 unsigned long flags; 333 334 if (hash == NR_STRIPE_HASH_LOCKS) { 335 size = NR_STRIPE_HASH_LOCKS; 336 hash = NR_STRIPE_HASH_LOCKS - 1; 337 } else 338 size = 1; 339 while (size) { 340 struct list_head *list = &temp_inactive_list[size - 1]; 341 342 /* 343 * We don't hold any lock here yet, raid5_get_active_stripe() might 344 * remove stripes from the list 345 */ 346 if (!list_empty_careful(list)) { 347 spin_lock_irqsave(conf->hash_locks + hash, flags); 348 if (list_empty(conf->inactive_list + hash) && 349 !list_empty(list)) 350 atomic_dec(&conf->empty_inactive_list_nr); 351 list_splice_tail_init(list, conf->inactive_list + hash); 352 do_wakeup = true; 353 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 354 } 355 size--; 356 hash--; 357 } 358 359 if (do_wakeup) { 360 wake_up(&conf->wait_for_stripe); 361 if (atomic_read(&conf->active_stripes) == 0) 362 wake_up(&conf->wait_for_quiescent); 363 if (conf->retry_read_aligned) 364 md_wakeup_thread(conf->mddev->thread); 365 } 366 } 367 368 static int release_stripe_list(struct r5conf *conf, 369 struct list_head *temp_inactive_list) 370 __must_hold(&conf->device_lock) 371 { 372 struct stripe_head *sh, *t; 373 int count = 0; 374 struct llist_node *head; 375 376 head = llist_del_all(&conf->released_stripes); 377 head = llist_reverse_order(head); 378 llist_for_each_entry_safe(sh, t, head, release_list) { 379 int hash; 380 381 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 382 smp_mb(); 383 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 384 /* 385 * Don't worry the bit is set here, because if the bit is set 386 * again, the count is always > 1. This is true for 387 * STRIPE_ON_UNPLUG_LIST bit too. 388 */ 389 hash = sh->hash_lock_index; 390 __release_stripe(conf, sh, &temp_inactive_list[hash]); 391 count++; 392 } 393 394 return count; 395 } 396 397 void raid5_release_stripe(struct stripe_head *sh) 398 { 399 struct r5conf *conf = sh->raid_conf; 400 unsigned long flags; 401 struct list_head list; 402 int hash; 403 bool wakeup; 404 405 /* Avoid release_list until the last reference. 406 */ 407 if (atomic_add_unless(&sh->count, -1, 1)) 408 return; 409 410 if (unlikely(!conf->mddev->thread) || 411 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 412 goto slow_path; 413 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 414 if (wakeup) 415 md_wakeup_thread(conf->mddev->thread); 416 return; 417 slow_path: 418 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 419 if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) { 420 INIT_LIST_HEAD(&list); 421 hash = sh->hash_lock_index; 422 do_release_stripe(conf, sh, &list); 423 spin_unlock_irqrestore(&conf->device_lock, flags); 424 release_inactive_stripe_list(conf, &list, hash); 425 } 426 } 427 428 static inline void remove_hash(struct stripe_head *sh) 429 { 430 pr_debug("remove_hash(), stripe %llu\n", 431 (unsigned long long)sh->sector); 432 433 hlist_del_init(&sh->hash); 434 } 435 436 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 437 { 438 struct hlist_head *hp = stripe_hash(conf, sh->sector); 439 440 pr_debug("insert_hash(), stripe %llu\n", 441 (unsigned long long)sh->sector); 442 443 hlist_add_head(&sh->hash, hp); 444 } 445 446 /* find an idle stripe, make sure it is unhashed, and return it. */ 447 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 448 { 449 struct stripe_head *sh = NULL; 450 struct list_head *first; 451 452 if (list_empty(conf->inactive_list + hash)) 453 goto out; 454 first = (conf->inactive_list + hash)->next; 455 sh = list_entry(first, struct stripe_head, lru); 456 list_del_init(first); 457 remove_hash(sh); 458 atomic_inc(&conf->active_stripes); 459 BUG_ON(hash != sh->hash_lock_index); 460 if (list_empty(conf->inactive_list + hash)) 461 atomic_inc(&conf->empty_inactive_list_nr); 462 out: 463 return sh; 464 } 465 466 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 467 static void free_stripe_pages(struct stripe_head *sh) 468 { 469 int i; 470 struct page *p; 471 472 /* Have not allocate page pool */ 473 if (!sh->pages) 474 return; 475 476 for (i = 0; i < sh->nr_pages; i++) { 477 p = sh->pages[i]; 478 if (p) 479 put_page(p); 480 sh->pages[i] = NULL; 481 } 482 } 483 484 static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp) 485 { 486 int i; 487 struct page *p; 488 489 for (i = 0; i < sh->nr_pages; i++) { 490 /* The page have allocated. */ 491 if (sh->pages[i]) 492 continue; 493 494 p = alloc_page(gfp); 495 if (!p) { 496 free_stripe_pages(sh); 497 return -ENOMEM; 498 } 499 sh->pages[i] = p; 500 } 501 return 0; 502 } 503 504 static int 505 init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks) 506 { 507 int nr_pages, cnt; 508 509 if (sh->pages) 510 return 0; 511 512 /* Each of the sh->dev[i] need one conf->stripe_size */ 513 cnt = PAGE_SIZE / conf->stripe_size; 514 nr_pages = (disks + cnt - 1) / cnt; 515 516 sh->pages = kzalloc_objs(struct page *, nr_pages); 517 if (!sh->pages) 518 return -ENOMEM; 519 sh->nr_pages = nr_pages; 520 sh->stripes_per_page = cnt; 521 return 0; 522 } 523 #endif 524 525 static void shrink_buffers(struct stripe_head *sh) 526 { 527 int i; 528 int num = sh->raid_conf->pool_size; 529 530 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE 531 for (i = 0; i < num ; i++) { 532 struct page *p; 533 534 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 535 p = sh->dev[i].page; 536 if (!p) 537 continue; 538 sh->dev[i].page = NULL; 539 put_page(p); 540 } 541 #else 542 for (i = 0; i < num; i++) 543 sh->dev[i].page = NULL; 544 free_stripe_pages(sh); /* Free pages */ 545 #endif 546 } 547 548 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 549 { 550 int i; 551 int num = sh->raid_conf->pool_size; 552 553 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE 554 for (i = 0; i < num; i++) { 555 struct page *page; 556 557 if (!(page = alloc_page(gfp))) { 558 return 1; 559 } 560 sh->dev[i].page = page; 561 sh->dev[i].orig_page = page; 562 sh->dev[i].offset = 0; 563 } 564 #else 565 if (alloc_stripe_pages(sh, gfp)) 566 return -ENOMEM; 567 568 for (i = 0; i < num; i++) { 569 sh->dev[i].page = raid5_get_dev_page(sh, i); 570 sh->dev[i].orig_page = sh->dev[i].page; 571 sh->dev[i].offset = raid5_get_page_offset(sh, i); 572 } 573 #endif 574 return 0; 575 } 576 577 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 578 struct stripe_head *sh); 579 580 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 581 { 582 struct r5conf *conf = sh->raid_conf; 583 int i, seq; 584 585 BUG_ON(atomic_read(&sh->count) != 0); 586 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 587 BUG_ON(stripe_operations_active(sh)); 588 BUG_ON(sh->batch_head); 589 590 pr_debug("init_stripe called, stripe %llu\n", 591 (unsigned long long)sector); 592 retry: 593 seq = read_seqcount_begin(&conf->gen_lock); 594 sh->generation = conf->generation - previous; 595 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 596 sh->sector = sector; 597 stripe_set_idx(sector, conf, previous, sh); 598 sh->state = 0; 599 600 for (i = sh->disks; i--; ) { 601 struct r5dev *dev = &sh->dev[i]; 602 603 if (dev->toread || dev->read || dev->towrite || dev->written || 604 test_bit(R5_LOCKED, &dev->flags)) { 605 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 606 (unsigned long long)sh->sector, i, dev->toread, 607 dev->read, dev->towrite, dev->written, 608 test_bit(R5_LOCKED, &dev->flags)); 609 WARN_ON(1); 610 } 611 dev->flags = 0; 612 dev->sector = raid5_compute_blocknr(sh, i, previous); 613 } 614 if (read_seqcount_retry(&conf->gen_lock, seq)) 615 goto retry; 616 sh->overwrite_disks = 0; 617 insert_hash(conf, sh); 618 sh->cpu = smp_processor_id(); 619 set_bit(STRIPE_BATCH_READY, &sh->state); 620 } 621 622 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 623 short generation) 624 { 625 struct stripe_head *sh; 626 627 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 628 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 629 if (sh->sector == sector && sh->generation == generation) 630 return sh; 631 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 632 return NULL; 633 } 634 635 static struct stripe_head *find_get_stripe(struct r5conf *conf, 636 sector_t sector, short generation, int hash) 637 { 638 int inc_empty_inactive_list_flag; 639 struct stripe_head *sh; 640 641 sh = __find_stripe(conf, sector, generation); 642 if (!sh) 643 return NULL; 644 645 if (atomic_inc_not_zero(&sh->count)) 646 return sh; 647 648 /* 649 * Slow path. The reference count is zero which means the stripe must 650 * be on a list (sh->lru). Must remove the stripe from the list that 651 * references it with the device_lock held. 652 */ 653 654 spin_lock(&conf->device_lock); 655 if (!atomic_read(&sh->count)) { 656 if (!test_bit(STRIPE_HANDLE, &sh->state)) 657 atomic_inc(&conf->active_stripes); 658 BUG_ON(list_empty(&sh->lru) && 659 !test_bit(STRIPE_EXPANDING, &sh->state)); 660 inc_empty_inactive_list_flag = 0; 661 if (!list_empty(conf->inactive_list + hash)) 662 inc_empty_inactive_list_flag = 1; 663 list_del_init(&sh->lru); 664 if (list_empty(conf->inactive_list + hash) && 665 inc_empty_inactive_list_flag) 666 atomic_inc(&conf->empty_inactive_list_nr); 667 if (sh->group) { 668 sh->group->stripes_cnt--; 669 sh->group = NULL; 670 } 671 } 672 atomic_inc(&sh->count); 673 spin_unlock(&conf->device_lock); 674 675 return sh; 676 } 677 678 /* 679 * Need to check if array has failed when deciding whether to: 680 * - start an array 681 * - remove non-faulty devices 682 * - add a spare 683 * - allow a reshape 684 * This determination is simple when no reshape is happening. 685 * However if there is a reshape, we need to carefully check 686 * both the before and after sections. 687 * This is because some failed devices may only affect one 688 * of the two sections, and some non-in_sync devices may 689 * be insync in the section most affected by failed devices. 690 * 691 * Most calls to this function hold &conf->device_lock. Calls 692 * in raid5_run() do not require the lock as no other threads 693 * have been started yet. 694 */ 695 int raid5_calc_degraded(struct r5conf *conf) 696 { 697 int degraded, degraded2; 698 int i; 699 700 degraded = 0; 701 for (i = 0; i < conf->previous_raid_disks; i++) { 702 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); 703 704 if (rdev && test_bit(Faulty, &rdev->flags)) 705 rdev = READ_ONCE(conf->disks[i].replacement); 706 if (!rdev || test_bit(Faulty, &rdev->flags)) 707 degraded++; 708 else if (test_bit(In_sync, &rdev->flags)) 709 ; 710 else 711 /* not in-sync or faulty. 712 * If the reshape increases the number of devices, 713 * this is being recovered by the reshape, so 714 * this 'previous' section is not in_sync. 715 * If the number of devices is being reduced however, 716 * the device can only be part of the array if 717 * we are reverting a reshape, so this section will 718 * be in-sync. 719 */ 720 if (conf->raid_disks >= conf->previous_raid_disks) 721 degraded++; 722 } 723 if (conf->raid_disks == conf->previous_raid_disks) 724 return degraded; 725 degraded2 = 0; 726 for (i = 0; i < conf->raid_disks; i++) { 727 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); 728 729 if (rdev && test_bit(Faulty, &rdev->flags)) 730 rdev = READ_ONCE(conf->disks[i].replacement); 731 if (!rdev || test_bit(Faulty, &rdev->flags)) 732 degraded2++; 733 else if (test_bit(In_sync, &rdev->flags)) 734 ; 735 else 736 /* not in-sync or faulty. 737 * If reshape increases the number of devices, this 738 * section has already been recovered, else it 739 * almost certainly hasn't. 740 */ 741 if (conf->raid_disks <= conf->previous_raid_disks) 742 degraded2++; 743 } 744 if (degraded2 > degraded) 745 return degraded2; 746 return degraded; 747 } 748 749 static bool has_failed(struct r5conf *conf) 750 { 751 int degraded = conf->mddev->degraded; 752 753 if (test_bit(MD_BROKEN, &conf->mddev->flags)) 754 return true; 755 756 if (conf->mddev->reshape_position != MaxSector) 757 degraded = raid5_calc_degraded(conf); 758 759 return degraded > conf->max_degraded; 760 } 761 762 enum stripe_result { 763 STRIPE_SUCCESS = 0, 764 STRIPE_RETRY, 765 STRIPE_SCHEDULE_AND_RETRY, 766 STRIPE_FAIL, 767 STRIPE_WAIT_RESHAPE, 768 }; 769 770 struct stripe_request_ctx { 771 /* a reference to the last stripe_head for batching */ 772 struct stripe_head *batch_last; 773 774 /* first sector in the request */ 775 sector_t first_sector; 776 777 /* last sector in the request */ 778 sector_t last_sector; 779 780 /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ 781 bool do_flush; 782 783 /* 784 * bitmap to track stripe sectors that have been added to stripes 785 * add one to account for unaligned requests 786 */ 787 unsigned long sectors_to_do[]; 788 }; 789 790 /* 791 * Block until another thread clears R5_INACTIVE_BLOCKED or 792 * there are fewer than 3/4 the maximum number of active stripes 793 * and there is an inactive stripe available. 794 */ 795 static bool is_inactive_blocked(struct r5conf *conf, int hash) 796 { 797 if (list_empty(conf->inactive_list + hash)) 798 return false; 799 800 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 801 return true; 802 803 return (atomic_read(&conf->active_stripes) < 804 (conf->max_nr_stripes * 3 / 4)); 805 } 806 807 struct stripe_head *raid5_get_active_stripe(struct r5conf *conf, 808 struct stripe_request_ctx *ctx, sector_t sector, 809 unsigned int flags) 810 { 811 struct stripe_head *sh; 812 int hash = stripe_hash_locks_hash(conf, sector); 813 int previous = !!(flags & R5_GAS_PREVIOUS); 814 815 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 816 817 spin_lock_irq(conf->hash_locks + hash); 818 819 for (;;) { 820 if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) { 821 /* 822 * Must release the reference to batch_last before 823 * waiting, on quiesce, otherwise the batch_last will 824 * hold a reference to a stripe and raid5_quiesce() 825 * will deadlock waiting for active_stripes to go to 826 * zero. 827 */ 828 if (ctx && ctx->batch_last) { 829 raid5_release_stripe(ctx->batch_last); 830 ctx->batch_last = NULL; 831 } 832 833 wait_event_lock_irq(conf->wait_for_quiescent, 834 !conf->quiesce, 835 *(conf->hash_locks + hash)); 836 } 837 838 sh = find_get_stripe(conf, sector, conf->generation - previous, 839 hash); 840 if (sh) 841 break; 842 843 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 844 sh = get_free_stripe(conf, hash); 845 if (sh) { 846 r5c_check_stripe_cache_usage(conf); 847 init_stripe(sh, sector, previous); 848 atomic_inc(&sh->count); 849 break; 850 } 851 852 if (!test_bit(R5_DID_ALLOC, &conf->cache_state)) 853 set_bit(R5_ALLOC_MORE, &conf->cache_state); 854 } 855 856 if (flags & R5_GAS_NOBLOCK) 857 break; 858 859 set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); 860 r5l_wake_reclaim(conf->log, 0); 861 862 /* release batch_last before wait to avoid risk of deadlock */ 863 if (ctx && ctx->batch_last) { 864 raid5_release_stripe(ctx->batch_last); 865 ctx->batch_last = NULL; 866 } 867 868 wait_event_lock_irq(conf->wait_for_stripe, 869 is_inactive_blocked(conf, hash), 870 *(conf->hash_locks + hash)); 871 clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); 872 } 873 874 spin_unlock_irq(conf->hash_locks + hash); 875 return sh; 876 } 877 878 static bool is_full_stripe_write(struct stripe_head *sh) 879 { 880 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 881 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 882 } 883 884 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 885 __acquires(&sh1->stripe_lock) 886 __acquires(&sh2->stripe_lock) 887 { 888 if (sh1 > sh2) { 889 spin_lock_irq(&sh2->stripe_lock); 890 spin_lock_nested(&sh1->stripe_lock, 1); 891 } else { 892 spin_lock_irq(&sh1->stripe_lock); 893 spin_lock_nested(&sh2->stripe_lock, 1); 894 } 895 } 896 897 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 898 __releases(&sh1->stripe_lock) 899 __releases(&sh2->stripe_lock) 900 { 901 spin_unlock(&sh1->stripe_lock); 902 spin_unlock_irq(&sh2->stripe_lock); 903 } 904 905 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 906 static bool stripe_can_batch(struct stripe_head *sh) 907 { 908 struct r5conf *conf = sh->raid_conf; 909 910 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 911 return false; 912 return test_bit(STRIPE_BATCH_READY, &sh->state) && 913 is_full_stripe_write(sh); 914 } 915 916 /* we only do back search */ 917 static void stripe_add_to_batch_list(struct r5conf *conf, 918 struct stripe_head *sh, struct stripe_head *last_sh) 919 { 920 struct stripe_head *head; 921 sector_t head_sector, tmp_sec; 922 int hash; 923 int dd_idx; 924 925 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 926 tmp_sec = sh->sector; 927 if (!sector_div(tmp_sec, conf->chunk_sectors)) 928 return; 929 head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); 930 931 if (last_sh && head_sector == last_sh->sector) { 932 head = last_sh; 933 atomic_inc(&head->count); 934 } else { 935 hash = stripe_hash_locks_hash(conf, head_sector); 936 spin_lock_irq(conf->hash_locks + hash); 937 head = find_get_stripe(conf, head_sector, conf->generation, 938 hash); 939 spin_unlock_irq(conf->hash_locks + hash); 940 if (!head) 941 return; 942 if (!stripe_can_batch(head)) 943 goto out; 944 } 945 946 lock_two_stripes(head, sh); 947 /* clear_batch_ready clear the flag */ 948 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 949 goto unlock_out; 950 951 if (sh->batch_head) 952 goto unlock_out; 953 954 dd_idx = 0; 955 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 956 dd_idx++; 957 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 958 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 959 goto unlock_out; 960 961 if (head->batch_head) { 962 spin_lock(&head->batch_head->batch_lock); 963 /* This batch list is already running */ 964 if (!stripe_can_batch(head)) { 965 spin_unlock(&head->batch_head->batch_lock); 966 goto unlock_out; 967 } 968 /* 969 * We must assign batch_head of this stripe within the 970 * batch_lock, otherwise clear_batch_ready of batch head 971 * stripe could clear BATCH_READY bit of this stripe and 972 * this stripe->batch_head doesn't get assigned, which 973 * could confuse clear_batch_ready for this stripe 974 */ 975 sh->batch_head = head->batch_head; 976 977 /* 978 * at this point, head's BATCH_READY could be cleared, but we 979 * can still add the stripe to batch list 980 */ 981 list_add(&sh->batch_list, &head->batch_list); 982 spin_unlock(&head->batch_head->batch_lock); 983 } else { 984 head->batch_head = head; 985 sh->batch_head = head->batch_head; 986 spin_lock(&head->batch_lock); 987 list_add_tail(&sh->batch_list, &head->batch_list); 988 spin_unlock(&head->batch_lock); 989 } 990 991 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 992 if (atomic_dec_return(&conf->preread_active_stripes) 993 < IO_THRESHOLD) 994 md_wakeup_thread(conf->mddev->thread); 995 996 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 997 int seq = sh->bm_seq; 998 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 999 sh->batch_head->bm_seq - seq > 0) 1000 seq = sh->batch_head->bm_seq; 1001 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 1002 sh->batch_head->bm_seq = seq; 1003 } 1004 1005 atomic_inc(&sh->count); 1006 unlock_out: 1007 unlock_two_stripes(head, sh); 1008 out: 1009 raid5_release_stripe(head); 1010 } 1011 1012 /* Determine if 'data_offset' or 'new_data_offset' should be used 1013 * in this stripe_head. 1014 */ 1015 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 1016 { 1017 sector_t progress = conf->reshape_progress; 1018 /* Need a memory barrier to make sure we see the value 1019 * of conf->generation, or ->data_offset that was set before 1020 * reshape_progress was updated. 1021 */ 1022 smp_rmb(); 1023 if (progress == MaxSector) 1024 return 0; 1025 if (sh->generation == conf->generation - 1) 1026 return 0; 1027 /* We are in a reshape, and this is a new-generation stripe, 1028 * so use new_data_offset. 1029 */ 1030 return 1; 1031 } 1032 1033 static void dispatch_bio_list(struct bio_list *tmp) 1034 { 1035 struct bio *bio; 1036 1037 while ((bio = bio_list_pop(tmp))) 1038 submit_bio_noacct(bio); 1039 } 1040 1041 static int cmp_stripe(void *priv, const struct list_head *a, 1042 const struct list_head *b) 1043 { 1044 const struct r5pending_data *da = list_entry(a, 1045 struct r5pending_data, sibling); 1046 const struct r5pending_data *db = list_entry(b, 1047 struct r5pending_data, sibling); 1048 if (da->sector > db->sector) 1049 return 1; 1050 if (da->sector < db->sector) 1051 return -1; 1052 return 0; 1053 } 1054 1055 static void dispatch_defer_bios(struct r5conf *conf, int target, 1056 struct bio_list *list) 1057 { 1058 struct r5pending_data *data; 1059 struct list_head *first, *next = NULL; 1060 int cnt = 0; 1061 1062 if (conf->pending_data_cnt == 0) 1063 return; 1064 1065 list_sort(NULL, &conf->pending_list, cmp_stripe); 1066 1067 first = conf->pending_list.next; 1068 1069 /* temporarily move the head */ 1070 if (conf->next_pending_data) 1071 list_move_tail(&conf->pending_list, 1072 &conf->next_pending_data->sibling); 1073 1074 while (!list_empty(&conf->pending_list)) { 1075 data = list_first_entry(&conf->pending_list, 1076 struct r5pending_data, sibling); 1077 if (&data->sibling == first) 1078 first = data->sibling.next; 1079 next = data->sibling.next; 1080 1081 bio_list_merge(list, &data->bios); 1082 list_move(&data->sibling, &conf->free_list); 1083 cnt++; 1084 if (cnt >= target) 1085 break; 1086 } 1087 conf->pending_data_cnt -= cnt; 1088 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 1089 1090 if (next != &conf->pending_list) 1091 conf->next_pending_data = list_entry(next, 1092 struct r5pending_data, sibling); 1093 else 1094 conf->next_pending_data = NULL; 1095 /* list isn't empty */ 1096 if (first != &conf->pending_list) 1097 list_move_tail(&conf->pending_list, first); 1098 } 1099 1100 static void flush_deferred_bios(struct r5conf *conf) 1101 { 1102 struct bio_list tmp = BIO_EMPTY_LIST; 1103 1104 if (conf->pending_data_cnt == 0) 1105 return; 1106 1107 spin_lock(&conf->pending_bios_lock); 1108 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 1109 BUG_ON(conf->pending_data_cnt != 0); 1110 spin_unlock(&conf->pending_bios_lock); 1111 1112 dispatch_bio_list(&tmp); 1113 } 1114 1115 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 1116 struct bio_list *bios) 1117 { 1118 struct bio_list tmp = BIO_EMPTY_LIST; 1119 struct r5pending_data *ent; 1120 1121 spin_lock(&conf->pending_bios_lock); 1122 ent = list_first_entry(&conf->free_list, struct r5pending_data, 1123 sibling); 1124 list_move_tail(&ent->sibling, &conf->pending_list); 1125 ent->sector = sector; 1126 bio_list_init(&ent->bios); 1127 bio_list_merge(&ent->bios, bios); 1128 conf->pending_data_cnt++; 1129 if (conf->pending_data_cnt >= PENDING_IO_MAX) 1130 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); 1131 1132 spin_unlock(&conf->pending_bios_lock); 1133 1134 dispatch_bio_list(&tmp); 1135 } 1136 1137 static bool raid5_discard_limits(struct mddev *mddev, struct bio *bi) 1138 { 1139 struct r5conf *conf = mddev->private; 1140 1141 if (mddev->bitmap_id == ID_LLBITMAP) 1142 return true; 1143 1144 if (!conf->raid5_discard_unsupported) 1145 return true; 1146 1147 bi->bi_status = BLK_STS_NOTSUPP; 1148 bio_endio(bi); 1149 return false; 1150 } 1151 1152 static void 1153 raid5_end_read_request(struct bio *bi); 1154 static void 1155 raid5_end_write_request(struct bio *bi); 1156 1157 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 1158 { 1159 struct r5conf *conf = sh->raid_conf; 1160 int i, disks = sh->disks; 1161 struct stripe_head *head_sh = sh; 1162 struct bio_list pending_bios = BIO_EMPTY_LIST; 1163 struct r5dev *dev; 1164 bool should_defer; 1165 1166 might_sleep(); 1167 1168 if (log_stripe(sh, s) == 0) 1169 return; 1170 1171 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 1172 1173 for (i = disks; i--; ) { 1174 enum req_op op; 1175 blk_opf_t op_flags = 0; 1176 int replace_only = 0; 1177 struct bio *bi, *rbi; 1178 struct md_rdev *rdev, *rrdev = NULL; 1179 1180 sh = head_sh; 1181 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1182 op = REQ_OP_WRITE; 1183 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1184 op_flags = REQ_FUA; 1185 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1186 op = REQ_OP_DISCARD; 1187 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1188 op = REQ_OP_READ; 1189 else if (test_and_clear_bit(R5_WantReplace, 1190 &sh->dev[i].flags)) { 1191 op = REQ_OP_WRITE; 1192 replace_only = 1; 1193 } else 1194 continue; 1195 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1196 op_flags |= REQ_SYNC; 1197 1198 again: 1199 dev = &sh->dev[i]; 1200 bi = &dev->req; 1201 rbi = &dev->rreq; /* For writing to replacement */ 1202 1203 rdev = conf->disks[i].rdev; 1204 rrdev = conf->disks[i].replacement; 1205 if (op_is_write(op)) { 1206 if (replace_only) 1207 rdev = NULL; 1208 if (rdev == rrdev) 1209 /* We raced and saw duplicates */ 1210 rrdev = NULL; 1211 } else { 1212 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1213 rdev = rrdev; 1214 rrdev = NULL; 1215 } 1216 1217 if (rdev && test_bit(Faulty, &rdev->flags)) 1218 rdev = NULL; 1219 if (rdev) 1220 atomic_inc(&rdev->nr_pending); 1221 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1222 rrdev = NULL; 1223 if (rrdev) 1224 atomic_inc(&rrdev->nr_pending); 1225 1226 /* We have already checked bad blocks for reads. Now 1227 * need to check for writes. We never accept write errors 1228 * on the replacement, so we don't to check rrdev. 1229 */ 1230 while (op_is_write(op) && rdev && 1231 test_bit(WriteErrorSeen, &rdev->flags)) { 1232 int bad = rdev_has_badblock(rdev, sh->sector, 1233 RAID5_STRIPE_SECTORS(conf)); 1234 if (!bad) 1235 break; 1236 1237 if (bad < 0) { 1238 set_bit(BlockedBadBlocks, &rdev->flags); 1239 if (!conf->mddev->external && 1240 conf->mddev->sb_flags) { 1241 /* It is very unlikely, but we might 1242 * still need to write out the 1243 * bad block log - better give it 1244 * a chance*/ 1245 md_check_recovery(conf->mddev); 1246 } 1247 /* 1248 * Because md_wait_for_blocked_rdev 1249 * will dec nr_pending, we must 1250 * increment it first. 1251 */ 1252 atomic_inc(&rdev->nr_pending); 1253 md_wait_for_blocked_rdev(rdev, conf->mddev); 1254 } else { 1255 /* Acknowledged bad block - skip the write */ 1256 rdev_dec_pending(rdev, conf->mddev); 1257 rdev = NULL; 1258 } 1259 } 1260 1261 if (rdev) { 1262 set_bit(STRIPE_IO_STARTED, &sh->state); 1263 1264 bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags); 1265 bi->bi_end_io = op_is_write(op) 1266 ? raid5_end_write_request 1267 : raid5_end_read_request; 1268 bi->bi_private = sh; 1269 1270 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1271 __func__, (unsigned long long)sh->sector, 1272 bi->bi_opf, i); 1273 atomic_inc(&sh->count); 1274 if (sh != head_sh) 1275 atomic_inc(&head_sh->count); 1276 if (use_new_offset(conf, sh)) 1277 bi->bi_iter.bi_sector = (sh->sector 1278 + rdev->new_data_offset); 1279 else 1280 bi->bi_iter.bi_sector = (sh->sector 1281 + rdev->data_offset); 1282 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1283 bi->bi_opf |= REQ_NOMERGE; 1284 1285 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1286 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1287 1288 if (!op_is_write(op) && 1289 test_bit(R5_InJournal, &sh->dev[i].flags)) 1290 /* 1291 * issuing read for a page in journal, this 1292 * must be preparing for prexor in rmw; read 1293 * the data into orig_page 1294 */ 1295 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1296 else 1297 sh->dev[i].vec.bv_page = sh->dev[i].page; 1298 bi->bi_vcnt = 1; 1299 bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); 1300 bi->bi_io_vec[0].bv_offset = sh->dev[i].offset; 1301 bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); 1302 /* 1303 * If this is discard request, set bi_vcnt 0. We don't 1304 * want to confuse SCSI because SCSI will replace payload 1305 */ 1306 if (op == REQ_OP_DISCARD) 1307 bi->bi_vcnt = 0; 1308 if (rrdev) 1309 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1310 1311 mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector); 1312 if (should_defer && op_is_write(op)) 1313 bio_list_add(&pending_bios, bi); 1314 else 1315 submit_bio_noacct(bi); 1316 } 1317 if (rrdev) { 1318 set_bit(STRIPE_IO_STARTED, &sh->state); 1319 1320 bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags); 1321 BUG_ON(!op_is_write(op)); 1322 rbi->bi_end_io = raid5_end_write_request; 1323 rbi->bi_private = sh; 1324 1325 pr_debug("%s: for %llu schedule op %d on " 1326 "replacement disc %d\n", 1327 __func__, (unsigned long long)sh->sector, 1328 rbi->bi_opf, i); 1329 atomic_inc(&sh->count); 1330 if (sh != head_sh) 1331 atomic_inc(&head_sh->count); 1332 if (use_new_offset(conf, sh)) 1333 rbi->bi_iter.bi_sector = (sh->sector 1334 + rrdev->new_data_offset); 1335 else 1336 rbi->bi_iter.bi_sector = (sh->sector 1337 + rrdev->data_offset); 1338 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1339 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1340 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1341 rbi->bi_vcnt = 1; 1342 rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); 1343 rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset; 1344 rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); 1345 /* 1346 * If this is discard request, set bi_vcnt 0. We don't 1347 * want to confuse SCSI because SCSI will replace payload 1348 */ 1349 if (op == REQ_OP_DISCARD) 1350 rbi->bi_vcnt = 0; 1351 mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector); 1352 if (should_defer && op_is_write(op)) 1353 bio_list_add(&pending_bios, rbi); 1354 else 1355 submit_bio_noacct(rbi); 1356 } 1357 if (!rdev && !rrdev) { 1358 pr_debug("skip op %d on disc %d for sector %llu\n", 1359 bi->bi_opf, i, (unsigned long long)sh->sector); 1360 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1361 set_bit(STRIPE_HANDLE, &sh->state); 1362 } 1363 1364 if (!head_sh->batch_head) 1365 continue; 1366 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1367 batch_list); 1368 if (sh != head_sh) 1369 goto again; 1370 } 1371 1372 if (should_defer && !bio_list_empty(&pending_bios)) 1373 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1374 } 1375 1376 static struct dma_async_tx_descriptor * 1377 async_copy_data(int frombio, struct bio *bio, struct page **page, 1378 unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx, 1379 struct stripe_head *sh, int no_skipcopy) 1380 { 1381 struct bio_vec bvl; 1382 struct bvec_iter iter; 1383 struct page *bio_page; 1384 int page_offset; 1385 struct async_submit_ctl submit; 1386 enum async_tx_flags flags = 0; 1387 struct r5conf *conf = sh->raid_conf; 1388 1389 if (bio->bi_iter.bi_sector >= sector) 1390 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1391 else 1392 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1393 1394 if (frombio) 1395 flags |= ASYNC_TX_FENCE; 1396 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1397 1398 bio_for_each_segment(bvl, bio, iter) { 1399 int len = bvl.bv_len; 1400 int clen; 1401 int b_offset = 0; 1402 1403 if (page_offset < 0) { 1404 b_offset = -page_offset; 1405 page_offset += b_offset; 1406 len -= b_offset; 1407 } 1408 1409 if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf)) 1410 clen = RAID5_STRIPE_SIZE(conf) - page_offset; 1411 else 1412 clen = len; 1413 1414 if (clen > 0) { 1415 b_offset += bvl.bv_offset; 1416 bio_page = bvl.bv_page; 1417 if (frombio) { 1418 if (conf->skip_copy && 1419 b_offset == 0 && page_offset == 0 && 1420 clen == RAID5_STRIPE_SIZE(conf) && 1421 !no_skipcopy) 1422 *page = bio_page; 1423 else 1424 tx = async_memcpy(*page, bio_page, page_offset + poff, 1425 b_offset, clen, &submit); 1426 } else 1427 tx = async_memcpy(bio_page, *page, b_offset, 1428 page_offset + poff, clen, &submit); 1429 } 1430 /* chain the operations */ 1431 submit.depend_tx = tx; 1432 1433 if (clen < len) /* hit end of page */ 1434 break; 1435 page_offset += len; 1436 } 1437 1438 return tx; 1439 } 1440 1441 static void ops_complete_biofill(void *stripe_head_ref) 1442 { 1443 struct stripe_head *sh = stripe_head_ref; 1444 int i; 1445 struct r5conf *conf = sh->raid_conf; 1446 1447 pr_debug("%s: stripe %llu\n", __func__, 1448 (unsigned long long)sh->sector); 1449 1450 /* clear completed biofills */ 1451 for (i = sh->disks; i--; ) { 1452 struct r5dev *dev = &sh->dev[i]; 1453 1454 /* acknowledge completion of a biofill operation */ 1455 /* and check if we need to reply to a read request, 1456 * new R5_Wantfill requests are held off until 1457 * !STRIPE_BIOFILL_RUN 1458 */ 1459 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1460 struct bio *rbi, *rbi2; 1461 1462 BUG_ON(!dev->read); 1463 rbi = dev->read; 1464 dev->read = NULL; 1465 while (rbi && rbi->bi_iter.bi_sector < 1466 dev->sector + RAID5_STRIPE_SECTORS(conf)) { 1467 rbi2 = r5_next_bio(conf, rbi, dev->sector); 1468 bio_endio(rbi); 1469 rbi = rbi2; 1470 } 1471 } 1472 } 1473 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1474 1475 set_bit(STRIPE_HANDLE, &sh->state); 1476 raid5_release_stripe(sh); 1477 } 1478 1479 static void ops_run_biofill(struct stripe_head *sh) 1480 { 1481 struct dma_async_tx_descriptor *tx = NULL; 1482 struct async_submit_ctl submit; 1483 int i; 1484 struct r5conf *conf = sh->raid_conf; 1485 1486 BUG_ON(sh->batch_head); 1487 pr_debug("%s: stripe %llu\n", __func__, 1488 (unsigned long long)sh->sector); 1489 1490 for (i = sh->disks; i--; ) { 1491 struct r5dev *dev = &sh->dev[i]; 1492 if (test_bit(R5_Wantfill, &dev->flags)) { 1493 struct bio *rbi; 1494 spin_lock_irq(&sh->stripe_lock); 1495 dev->read = rbi = dev->toread; 1496 dev->toread = NULL; 1497 spin_unlock_irq(&sh->stripe_lock); 1498 while (rbi && rbi->bi_iter.bi_sector < 1499 dev->sector + RAID5_STRIPE_SECTORS(conf)) { 1500 tx = async_copy_data(0, rbi, &dev->page, 1501 dev->offset, 1502 dev->sector, tx, sh, 0); 1503 rbi = r5_next_bio(conf, rbi, dev->sector); 1504 } 1505 } 1506 } 1507 1508 atomic_inc(&sh->count); 1509 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1510 async_trigger_callback(&submit); 1511 } 1512 1513 static void mark_target_uptodate(struct stripe_head *sh, int target) 1514 { 1515 struct r5dev *tgt; 1516 1517 if (target < 0) 1518 return; 1519 1520 tgt = &sh->dev[target]; 1521 set_bit(R5_UPTODATE, &tgt->flags); 1522 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1523 clear_bit(R5_Wantcompute, &tgt->flags); 1524 } 1525 1526 static void ops_complete_compute(void *stripe_head_ref) 1527 { 1528 struct stripe_head *sh = stripe_head_ref; 1529 1530 pr_debug("%s: stripe %llu\n", __func__, 1531 (unsigned long long)sh->sector); 1532 1533 /* mark the computed target(s) as uptodate */ 1534 mark_target_uptodate(sh, sh->ops.target); 1535 mark_target_uptodate(sh, sh->ops.target2); 1536 1537 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1538 if (sh->check_state == check_state_compute_run) 1539 sh->check_state = check_state_compute_result; 1540 set_bit(STRIPE_HANDLE, &sh->state); 1541 raid5_release_stripe(sh); 1542 } 1543 1544 /* return a pointer to the address conversion region of the scribble buffer */ 1545 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1546 { 1547 return percpu->scribble + i * percpu->scribble_obj_size; 1548 } 1549 1550 /* return a pointer to the address conversion region of the scribble buffer */ 1551 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1552 struct raid5_percpu *percpu, int i) 1553 { 1554 return (void *) (to_addr_page(percpu, i) + sh->disks + 2); 1555 } 1556 1557 /* 1558 * Return a pointer to record offset address. 1559 */ 1560 static unsigned int * 1561 to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu) 1562 { 1563 return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2); 1564 } 1565 1566 static struct dma_async_tx_descriptor * 1567 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1568 { 1569 int disks = sh->disks; 1570 struct page **xor_srcs = to_addr_page(percpu, 0); 1571 unsigned int *off_srcs = to_addr_offs(sh, percpu); 1572 int target = sh->ops.target; 1573 struct r5dev *tgt = &sh->dev[target]; 1574 struct page *xor_dest = tgt->page; 1575 unsigned int off_dest = tgt->offset; 1576 int count = 0; 1577 struct dma_async_tx_descriptor *tx; 1578 struct async_submit_ctl submit; 1579 int i; 1580 1581 BUG_ON(sh->batch_head); 1582 1583 pr_debug("%s: stripe %llu block: %d\n", 1584 __func__, (unsigned long long)sh->sector, target); 1585 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1586 1587 for (i = disks; i--; ) { 1588 if (i != target) { 1589 off_srcs[count] = sh->dev[i].offset; 1590 xor_srcs[count++] = sh->dev[i].page; 1591 } 1592 } 1593 1594 atomic_inc(&sh->count); 1595 1596 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1597 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1598 if (unlikely(count == 1)) 1599 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], 1600 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1601 else 1602 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, 1603 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1604 1605 return tx; 1606 } 1607 1608 /* set_syndrome_sources - populate source buffers for gen_syndrome 1609 * @srcs - (struct page *) array of size sh->disks 1610 * @offs - (unsigned int) array of offset for each page 1611 * @sh - stripe_head to parse 1612 * 1613 * Populates srcs in proper layout order for the stripe and returns the 1614 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1615 * destination buffer is recorded in srcs[count] and the Q destination 1616 * is recorded in srcs[count+1]]. 1617 */ 1618 static int set_syndrome_sources(struct page **srcs, 1619 unsigned int *offs, 1620 struct stripe_head *sh, 1621 int srctype) 1622 { 1623 int disks = sh->disks; 1624 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1625 int d0_idx = raid6_d0(sh); 1626 int count; 1627 int i; 1628 1629 for (i = 0; i < disks; i++) 1630 srcs[i] = NULL; 1631 1632 count = 0; 1633 i = d0_idx; 1634 do { 1635 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1636 struct r5dev *dev = &sh->dev[i]; 1637 1638 if (i == sh->qd_idx || i == sh->pd_idx || 1639 (srctype == SYNDROME_SRC_ALL) || 1640 (srctype == SYNDROME_SRC_WANT_DRAIN && 1641 (test_bit(R5_Wantdrain, &dev->flags) || 1642 test_bit(R5_InJournal, &dev->flags))) || 1643 (srctype == SYNDROME_SRC_WRITTEN && 1644 (dev->written || 1645 test_bit(R5_InJournal, &dev->flags)))) { 1646 if (test_bit(R5_InJournal, &dev->flags)) 1647 srcs[slot] = sh->dev[i].orig_page; 1648 else 1649 srcs[slot] = sh->dev[i].page; 1650 /* 1651 * For R5_InJournal, PAGE_SIZE must be 4KB and will 1652 * not shared page. In that case, dev[i].offset 1653 * is 0. 1654 */ 1655 offs[slot] = sh->dev[i].offset; 1656 } 1657 i = raid6_next_disk(i, disks); 1658 } while (i != d0_idx); 1659 1660 return syndrome_disks; 1661 } 1662 1663 static struct dma_async_tx_descriptor * 1664 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1665 { 1666 int disks = sh->disks; 1667 struct page **blocks = to_addr_page(percpu, 0); 1668 unsigned int *offs = to_addr_offs(sh, percpu); 1669 int target; 1670 int qd_idx = sh->qd_idx; 1671 struct dma_async_tx_descriptor *tx; 1672 struct async_submit_ctl submit; 1673 struct r5dev *tgt; 1674 struct page *dest; 1675 unsigned int dest_off; 1676 int i; 1677 int count; 1678 1679 BUG_ON(sh->batch_head); 1680 if (sh->ops.target < 0) 1681 target = sh->ops.target2; 1682 else if (sh->ops.target2 < 0) 1683 target = sh->ops.target; 1684 else 1685 /* we should only have one valid target */ 1686 BUG(); 1687 BUG_ON(target < 0); 1688 pr_debug("%s: stripe %llu block: %d\n", 1689 __func__, (unsigned long long)sh->sector, target); 1690 1691 tgt = &sh->dev[target]; 1692 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1693 dest = tgt->page; 1694 dest_off = tgt->offset; 1695 1696 atomic_inc(&sh->count); 1697 1698 if (target == qd_idx) { 1699 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); 1700 blocks[count] = NULL; /* regenerating p is not necessary */ 1701 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1702 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1703 ops_complete_compute, sh, 1704 to_addr_conv(sh, percpu, 0)); 1705 tx = async_gen_syndrome(blocks, offs, count+2, 1706 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1707 } else { 1708 /* Compute any data- or p-drive using XOR */ 1709 count = 0; 1710 for (i = disks; i-- ; ) { 1711 if (i == target || i == qd_idx) 1712 continue; 1713 offs[count] = sh->dev[i].offset; 1714 blocks[count++] = sh->dev[i].page; 1715 } 1716 1717 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1718 NULL, ops_complete_compute, sh, 1719 to_addr_conv(sh, percpu, 0)); 1720 tx = async_xor_offs(dest, dest_off, blocks, offs, count, 1721 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1722 } 1723 1724 return tx; 1725 } 1726 1727 static struct dma_async_tx_descriptor * 1728 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1729 { 1730 int i, count, disks = sh->disks; 1731 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1732 int d0_idx = raid6_d0(sh); 1733 int faila = -1, failb = -1; 1734 int target = sh->ops.target; 1735 int target2 = sh->ops.target2; 1736 struct r5dev *tgt = &sh->dev[target]; 1737 struct r5dev *tgt2 = &sh->dev[target2]; 1738 struct dma_async_tx_descriptor *tx; 1739 struct page **blocks = to_addr_page(percpu, 0); 1740 unsigned int *offs = to_addr_offs(sh, percpu); 1741 struct async_submit_ctl submit; 1742 1743 BUG_ON(sh->batch_head); 1744 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1745 __func__, (unsigned long long)sh->sector, target, target2); 1746 BUG_ON(target < 0 || target2 < 0); 1747 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1748 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1749 1750 /* we need to open-code set_syndrome_sources to handle the 1751 * slot number conversion for 'faila' and 'failb' 1752 */ 1753 for (i = 0; i < disks ; i++) { 1754 offs[i] = 0; 1755 blocks[i] = NULL; 1756 } 1757 count = 0; 1758 i = d0_idx; 1759 do { 1760 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1761 1762 offs[slot] = sh->dev[i].offset; 1763 blocks[slot] = sh->dev[i].page; 1764 1765 if (i == target) 1766 faila = slot; 1767 if (i == target2) 1768 failb = slot; 1769 i = raid6_next_disk(i, disks); 1770 } while (i != d0_idx); 1771 1772 BUG_ON(faila == failb); 1773 if (failb < faila) 1774 swap(faila, failb); 1775 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1776 __func__, (unsigned long long)sh->sector, faila, failb); 1777 1778 atomic_inc(&sh->count); 1779 1780 if (failb == syndrome_disks+1) { 1781 /* Q disk is one of the missing disks */ 1782 if (faila == syndrome_disks) { 1783 /* Missing P+Q, just recompute */ 1784 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1785 ops_complete_compute, sh, 1786 to_addr_conv(sh, percpu, 0)); 1787 return async_gen_syndrome(blocks, offs, syndrome_disks+2, 1788 RAID5_STRIPE_SIZE(sh->raid_conf), 1789 &submit); 1790 } else { 1791 struct page *dest; 1792 unsigned int dest_off; 1793 int data_target; 1794 int qd_idx = sh->qd_idx; 1795 1796 /* Missing D+Q: recompute D from P, then recompute Q */ 1797 if (target == qd_idx) 1798 data_target = target2; 1799 else 1800 data_target = target; 1801 1802 count = 0; 1803 for (i = disks; i-- ; ) { 1804 if (i == data_target || i == qd_idx) 1805 continue; 1806 offs[count] = sh->dev[i].offset; 1807 blocks[count++] = sh->dev[i].page; 1808 } 1809 dest = sh->dev[data_target].page; 1810 dest_off = sh->dev[data_target].offset; 1811 init_async_submit(&submit, 1812 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1813 NULL, NULL, NULL, 1814 to_addr_conv(sh, percpu, 0)); 1815 tx = async_xor_offs(dest, dest_off, blocks, offs, count, 1816 RAID5_STRIPE_SIZE(sh->raid_conf), 1817 &submit); 1818 1819 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); 1820 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1821 ops_complete_compute, sh, 1822 to_addr_conv(sh, percpu, 0)); 1823 return async_gen_syndrome(blocks, offs, count+2, 1824 RAID5_STRIPE_SIZE(sh->raid_conf), 1825 &submit); 1826 } 1827 } else { 1828 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1829 ops_complete_compute, sh, 1830 to_addr_conv(sh, percpu, 0)); 1831 if (failb == syndrome_disks) { 1832 /* We're missing D+P. */ 1833 return async_raid6_datap_recov(syndrome_disks+2, 1834 RAID5_STRIPE_SIZE(sh->raid_conf), 1835 faila, 1836 blocks, offs, &submit); 1837 } else { 1838 /* We're missing D+D. */ 1839 return async_raid6_2data_recov(syndrome_disks+2, 1840 RAID5_STRIPE_SIZE(sh->raid_conf), 1841 faila, failb, 1842 blocks, offs, &submit); 1843 } 1844 } 1845 } 1846 1847 static void ops_complete_prexor(void *stripe_head_ref) 1848 { 1849 struct stripe_head *sh = stripe_head_ref; 1850 1851 pr_debug("%s: stripe %llu\n", __func__, 1852 (unsigned long long)sh->sector); 1853 1854 if (r5c_is_writeback(sh->raid_conf->log)) 1855 /* 1856 * raid5-cache write back uses orig_page during prexor. 1857 * After prexor, it is time to free orig_page 1858 */ 1859 r5c_release_extra_page(sh); 1860 } 1861 1862 static struct dma_async_tx_descriptor * 1863 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1864 struct dma_async_tx_descriptor *tx) 1865 { 1866 int disks = sh->disks; 1867 struct page **xor_srcs = to_addr_page(percpu, 0); 1868 unsigned int *off_srcs = to_addr_offs(sh, percpu); 1869 int count = 0, pd_idx = sh->pd_idx, i; 1870 struct async_submit_ctl submit; 1871 1872 /* existing parity data subtracted */ 1873 unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset; 1874 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1875 1876 BUG_ON(sh->batch_head); 1877 pr_debug("%s: stripe %llu\n", __func__, 1878 (unsigned long long)sh->sector); 1879 1880 for (i = disks; i--; ) { 1881 struct r5dev *dev = &sh->dev[i]; 1882 /* Only process blocks that are known to be uptodate */ 1883 if (test_bit(R5_InJournal, &dev->flags)) { 1884 /* 1885 * For this case, PAGE_SIZE must be equal to 4KB and 1886 * page offset is zero. 1887 */ 1888 off_srcs[count] = dev->offset; 1889 xor_srcs[count++] = dev->orig_page; 1890 } else if (test_bit(R5_Wantdrain, &dev->flags)) { 1891 off_srcs[count] = dev->offset; 1892 xor_srcs[count++] = dev->page; 1893 } 1894 } 1895 1896 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1897 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1898 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, 1899 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1900 1901 return tx; 1902 } 1903 1904 static struct dma_async_tx_descriptor * 1905 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1906 struct dma_async_tx_descriptor *tx) 1907 { 1908 struct page **blocks = to_addr_page(percpu, 0); 1909 unsigned int *offs = to_addr_offs(sh, percpu); 1910 int count; 1911 struct async_submit_ctl submit; 1912 1913 pr_debug("%s: stripe %llu\n", __func__, 1914 (unsigned long long)sh->sector); 1915 1916 count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN); 1917 1918 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1919 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1920 tx = async_gen_syndrome(blocks, offs, count+2, 1921 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 1922 1923 return tx; 1924 } 1925 1926 static struct dma_async_tx_descriptor * 1927 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1928 { 1929 struct r5conf *conf = sh->raid_conf; 1930 int disks = sh->disks; 1931 int i; 1932 struct stripe_head *head_sh = sh; 1933 1934 pr_debug("%s: stripe %llu\n", __func__, 1935 (unsigned long long)sh->sector); 1936 1937 for (i = disks; i--; ) { 1938 struct r5dev *dev; 1939 struct bio *chosen; 1940 1941 sh = head_sh; 1942 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1943 struct bio *wbi; 1944 1945 again: 1946 dev = &sh->dev[i]; 1947 /* 1948 * clear R5_InJournal, so when rewriting a page in 1949 * journal, it is not skipped by r5l_log_stripe() 1950 */ 1951 clear_bit(R5_InJournal, &dev->flags); 1952 spin_lock_irq(&sh->stripe_lock); 1953 chosen = dev->towrite; 1954 dev->towrite = NULL; 1955 sh->overwrite_disks = 0; 1956 BUG_ON(dev->written); 1957 wbi = dev->written = chosen; 1958 spin_unlock_irq(&sh->stripe_lock); 1959 WARN_ON(dev->page != dev->orig_page); 1960 1961 while (wbi && wbi->bi_iter.bi_sector < 1962 dev->sector + RAID5_STRIPE_SECTORS(conf)) { 1963 if (wbi->bi_opf & REQ_FUA) 1964 set_bit(R5_WantFUA, &dev->flags); 1965 if (wbi->bi_opf & REQ_SYNC) 1966 set_bit(R5_SyncIO, &dev->flags); 1967 if (bio_op(wbi) == REQ_OP_DISCARD) 1968 set_bit(R5_Discard, &dev->flags); 1969 else { 1970 tx = async_copy_data(1, wbi, &dev->page, 1971 dev->offset, 1972 dev->sector, tx, sh, 1973 r5c_is_writeback(conf->log)); 1974 if (dev->page != dev->orig_page && 1975 !r5c_is_writeback(conf->log)) { 1976 set_bit(R5_SkipCopy, &dev->flags); 1977 clear_bit(R5_UPTODATE, &dev->flags); 1978 clear_bit(R5_OVERWRITE, &dev->flags); 1979 } 1980 } 1981 wbi = r5_next_bio(conf, wbi, dev->sector); 1982 } 1983 1984 if (head_sh->batch_head) { 1985 sh = list_first_entry(&sh->batch_list, 1986 struct stripe_head, 1987 batch_list); 1988 if (sh == head_sh) 1989 continue; 1990 goto again; 1991 } 1992 } 1993 } 1994 1995 return tx; 1996 } 1997 1998 static void ops_complete_reconstruct(void *stripe_head_ref) 1999 { 2000 struct stripe_head *sh = stripe_head_ref; 2001 int disks = sh->disks; 2002 int pd_idx = sh->pd_idx; 2003 int qd_idx = sh->qd_idx; 2004 int i; 2005 bool fua = false, sync = false, discard = false; 2006 2007 pr_debug("%s: stripe %llu\n", __func__, 2008 (unsigned long long)sh->sector); 2009 2010 for (i = disks; i--; ) { 2011 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 2012 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 2013 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 2014 } 2015 2016 for (i = disks; i--; ) { 2017 struct r5dev *dev = &sh->dev[i]; 2018 2019 if (dev->written || i == pd_idx || i == qd_idx) { 2020 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { 2021 set_bit(R5_UPTODATE, &dev->flags); 2022 if (test_bit(STRIPE_EXPAND_READY, &sh->state)) 2023 set_bit(R5_Expanded, &dev->flags); 2024 } 2025 if (fua) 2026 set_bit(R5_WantFUA, &dev->flags); 2027 if (sync) 2028 set_bit(R5_SyncIO, &dev->flags); 2029 } 2030 } 2031 2032 if (sh->reconstruct_state == reconstruct_state_drain_run) 2033 sh->reconstruct_state = reconstruct_state_drain_result; 2034 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 2035 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 2036 else { 2037 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 2038 sh->reconstruct_state = reconstruct_state_result; 2039 } 2040 2041 set_bit(STRIPE_HANDLE, &sh->state); 2042 raid5_release_stripe(sh); 2043 } 2044 2045 static void 2046 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 2047 struct dma_async_tx_descriptor *tx) 2048 { 2049 int disks = sh->disks; 2050 struct page **xor_srcs; 2051 unsigned int *off_srcs; 2052 struct async_submit_ctl submit; 2053 int count, pd_idx = sh->pd_idx, i; 2054 struct page *xor_dest; 2055 unsigned int off_dest; 2056 int prexor = 0; 2057 unsigned long flags; 2058 int j = 0; 2059 struct stripe_head *head_sh = sh; 2060 int last_stripe; 2061 2062 pr_debug("%s: stripe %llu\n", __func__, 2063 (unsigned long long)sh->sector); 2064 2065 for (i = 0; i < sh->disks; i++) { 2066 if (pd_idx == i) 2067 continue; 2068 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 2069 break; 2070 } 2071 if (i >= sh->disks) { 2072 atomic_inc(&sh->count); 2073 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 2074 ops_complete_reconstruct(sh); 2075 return; 2076 } 2077 again: 2078 count = 0; 2079 xor_srcs = to_addr_page(percpu, j); 2080 off_srcs = to_addr_offs(sh, percpu); 2081 /* check if prexor is active which means only process blocks 2082 * that are part of a read-modify-write (written) 2083 */ 2084 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 2085 prexor = 1; 2086 off_dest = off_srcs[count] = sh->dev[pd_idx].offset; 2087 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 2088 for (i = disks; i--; ) { 2089 struct r5dev *dev = &sh->dev[i]; 2090 if (head_sh->dev[i].written || 2091 test_bit(R5_InJournal, &head_sh->dev[i].flags)) { 2092 off_srcs[count] = dev->offset; 2093 xor_srcs[count++] = dev->page; 2094 } 2095 } 2096 } else { 2097 xor_dest = sh->dev[pd_idx].page; 2098 off_dest = sh->dev[pd_idx].offset; 2099 for (i = disks; i--; ) { 2100 struct r5dev *dev = &sh->dev[i]; 2101 if (i != pd_idx) { 2102 off_srcs[count] = dev->offset; 2103 xor_srcs[count++] = dev->page; 2104 } 2105 } 2106 } 2107 2108 /* 1/ if we prexor'd then the dest is reused as a source 2109 * 2/ if we did not prexor then we are redoing the parity 2110 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 2111 * for the synchronous xor case 2112 */ 2113 last_stripe = !head_sh->batch_head || 2114 list_first_entry(&sh->batch_list, 2115 struct stripe_head, batch_list) == head_sh; 2116 if (last_stripe) { 2117 flags = ASYNC_TX_ACK | 2118 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 2119 2120 atomic_inc(&head_sh->count); 2121 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 2122 to_addr_conv(sh, percpu, j)); 2123 } else { 2124 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 2125 init_async_submit(&submit, flags, tx, NULL, NULL, 2126 to_addr_conv(sh, percpu, j)); 2127 } 2128 2129 if (unlikely(count == 1)) 2130 tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], 2131 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 2132 else 2133 tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, 2134 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 2135 if (!last_stripe) { 2136 j++; 2137 sh = list_first_entry(&sh->batch_list, struct stripe_head, 2138 batch_list); 2139 goto again; 2140 } 2141 } 2142 2143 static void 2144 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 2145 struct dma_async_tx_descriptor *tx) 2146 { 2147 struct async_submit_ctl submit; 2148 struct page **blocks; 2149 unsigned int *offs; 2150 int count, i, j = 0; 2151 struct stripe_head *head_sh = sh; 2152 int last_stripe; 2153 int synflags; 2154 unsigned long txflags; 2155 2156 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 2157 2158 for (i = 0; i < sh->disks; i++) { 2159 if (sh->pd_idx == i || sh->qd_idx == i) 2160 continue; 2161 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 2162 break; 2163 } 2164 if (i >= sh->disks) { 2165 atomic_inc(&sh->count); 2166 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 2167 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 2168 ops_complete_reconstruct(sh); 2169 return; 2170 } 2171 2172 again: 2173 blocks = to_addr_page(percpu, j); 2174 offs = to_addr_offs(sh, percpu); 2175 2176 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 2177 synflags = SYNDROME_SRC_WRITTEN; 2178 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 2179 } else { 2180 synflags = SYNDROME_SRC_ALL; 2181 txflags = ASYNC_TX_ACK; 2182 } 2183 2184 count = set_syndrome_sources(blocks, offs, sh, synflags); 2185 last_stripe = !head_sh->batch_head || 2186 list_first_entry(&sh->batch_list, 2187 struct stripe_head, batch_list) == head_sh; 2188 2189 if (last_stripe) { 2190 atomic_inc(&head_sh->count); 2191 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 2192 head_sh, to_addr_conv(sh, percpu, j)); 2193 } else 2194 init_async_submit(&submit, 0, tx, NULL, NULL, 2195 to_addr_conv(sh, percpu, j)); 2196 tx = async_gen_syndrome(blocks, offs, count+2, 2197 RAID5_STRIPE_SIZE(sh->raid_conf), &submit); 2198 if (!last_stripe) { 2199 j++; 2200 sh = list_first_entry(&sh->batch_list, struct stripe_head, 2201 batch_list); 2202 goto again; 2203 } 2204 } 2205 2206 static void ops_complete_check(void *stripe_head_ref) 2207 { 2208 struct stripe_head *sh = stripe_head_ref; 2209 2210 pr_debug("%s: stripe %llu\n", __func__, 2211 (unsigned long long)sh->sector); 2212 2213 sh->check_state = check_state_check_result; 2214 set_bit(STRIPE_HANDLE, &sh->state); 2215 raid5_release_stripe(sh); 2216 } 2217 2218 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 2219 { 2220 int disks = sh->disks; 2221 int pd_idx = sh->pd_idx; 2222 int qd_idx = sh->qd_idx; 2223 struct page *xor_dest; 2224 unsigned int off_dest; 2225 struct page **xor_srcs = to_addr_page(percpu, 0); 2226 unsigned int *off_srcs = to_addr_offs(sh, percpu); 2227 struct dma_async_tx_descriptor *tx; 2228 struct async_submit_ctl submit; 2229 int count; 2230 int i; 2231 2232 pr_debug("%s: stripe %llu\n", __func__, 2233 (unsigned long long)sh->sector); 2234 2235 BUG_ON(sh->batch_head); 2236 count = 0; 2237 xor_dest = sh->dev[pd_idx].page; 2238 off_dest = sh->dev[pd_idx].offset; 2239 off_srcs[count] = off_dest; 2240 xor_srcs[count++] = xor_dest; 2241 for (i = disks; i--; ) { 2242 if (i == pd_idx || i == qd_idx) 2243 continue; 2244 off_srcs[count] = sh->dev[i].offset; 2245 xor_srcs[count++] = sh->dev[i].page; 2246 } 2247 2248 init_async_submit(&submit, 0, NULL, NULL, NULL, 2249 to_addr_conv(sh, percpu, 0)); 2250 tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, 2251 RAID5_STRIPE_SIZE(sh->raid_conf), 2252 &sh->ops.zero_sum_result, &submit); 2253 2254 atomic_inc(&sh->count); 2255 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2256 tx = async_trigger_callback(&submit); 2257 } 2258 2259 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2260 { 2261 struct page **srcs = to_addr_page(percpu, 0); 2262 unsigned int *offs = to_addr_offs(sh, percpu); 2263 struct async_submit_ctl submit; 2264 int count; 2265 2266 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2267 (unsigned long long)sh->sector, checkp); 2268 2269 BUG_ON(sh->batch_head); 2270 count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL); 2271 if (!checkp) 2272 srcs[count] = NULL; 2273 2274 atomic_inc(&sh->count); 2275 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2276 sh, to_addr_conv(sh, percpu, 0)); 2277 async_syndrome_val(srcs, offs, count+2, 2278 RAID5_STRIPE_SIZE(sh->raid_conf), 2279 &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit); 2280 } 2281 2282 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2283 { 2284 int overlap_clear = 0, i, disks = sh->disks; 2285 struct dma_async_tx_descriptor *tx = NULL; 2286 struct r5conf *conf = sh->raid_conf; 2287 int level = conf->level; 2288 struct raid5_percpu *percpu; 2289 2290 local_lock(&conf->percpu->lock); 2291 percpu = this_cpu_ptr(conf->percpu); 2292 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2293 ops_run_biofill(sh); 2294 overlap_clear++; 2295 } 2296 2297 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2298 if (level < 6) 2299 tx = ops_run_compute5(sh, percpu); 2300 else { 2301 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2302 tx = ops_run_compute6_1(sh, percpu); 2303 else 2304 tx = ops_run_compute6_2(sh, percpu); 2305 } 2306 /* terminate the chain if reconstruct is not set to be run */ 2307 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2308 async_tx_ack(tx); 2309 } 2310 2311 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2312 if (level < 6) 2313 tx = ops_run_prexor5(sh, percpu, tx); 2314 else 2315 tx = ops_run_prexor6(sh, percpu, tx); 2316 } 2317 2318 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2319 tx = ops_run_partial_parity(sh, percpu, tx); 2320 2321 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2322 tx = ops_run_biodrain(sh, tx); 2323 overlap_clear++; 2324 } 2325 2326 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2327 if (level < 6) 2328 ops_run_reconstruct5(sh, percpu, tx); 2329 else 2330 ops_run_reconstruct6(sh, percpu, tx); 2331 } 2332 2333 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2334 if (sh->check_state == check_state_run) 2335 ops_run_check_p(sh, percpu); 2336 else if (sh->check_state == check_state_run_q) 2337 ops_run_check_pq(sh, percpu, 0); 2338 else if (sh->check_state == check_state_run_pq) 2339 ops_run_check_pq(sh, percpu, 1); 2340 else 2341 BUG(); 2342 } 2343 2344 if (overlap_clear && !sh->batch_head) { 2345 for (i = disks; i--; ) { 2346 struct r5dev *dev = &sh->dev[i]; 2347 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2348 wake_up_bit(&dev->flags, R5_Overlap); 2349 } 2350 } 2351 local_unlock(&conf->percpu->lock); 2352 } 2353 2354 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) 2355 { 2356 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2357 kfree(sh->pages); 2358 #endif 2359 if (sh->ppl_page) 2360 __free_page(sh->ppl_page); 2361 kmem_cache_free(sc, sh); 2362 } 2363 2364 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2365 int disks, struct r5conf *conf) 2366 { 2367 struct stripe_head *sh; 2368 2369 sh = kmem_cache_zalloc(sc, gfp); 2370 if (sh) { 2371 spin_lock_init(&sh->stripe_lock); 2372 spin_lock_init(&sh->batch_lock); 2373 INIT_LIST_HEAD(&sh->batch_list); 2374 INIT_LIST_HEAD(&sh->lru); 2375 INIT_LIST_HEAD(&sh->r5c); 2376 INIT_LIST_HEAD(&sh->log_list); 2377 atomic_set(&sh->count, 1); 2378 sh->raid_conf = conf; 2379 sh->log_start = MaxSector; 2380 2381 if (raid5_has_ppl(conf)) { 2382 sh->ppl_page = alloc_page(gfp); 2383 if (!sh->ppl_page) { 2384 free_stripe(sc, sh); 2385 return NULL; 2386 } 2387 } 2388 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2389 if (init_stripe_shared_pages(sh, conf, disks)) { 2390 free_stripe(sc, sh); 2391 return NULL; 2392 } 2393 #endif 2394 } 2395 return sh; 2396 } 2397 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2398 { 2399 struct stripe_head *sh; 2400 2401 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); 2402 if (!sh) 2403 return 0; 2404 2405 if (grow_buffers(sh, gfp)) { 2406 shrink_buffers(sh); 2407 free_stripe(conf->slab_cache, sh); 2408 return 0; 2409 } 2410 sh->hash_lock_index = 2411 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2412 /* we just created an active stripe so... */ 2413 atomic_inc(&conf->active_stripes); 2414 2415 raid5_release_stripe(sh); 2416 WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1); 2417 return 1; 2418 } 2419 2420 static int grow_stripes(struct r5conf *conf, int num) 2421 { 2422 struct kmem_cache *sc; 2423 size_t namelen = sizeof(conf->cache_name[0]); 2424 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2425 2426 if (mddev_is_dm(conf->mddev)) 2427 snprintf(conf->cache_name[0], namelen, 2428 "raid%d-%p", conf->level, conf->mddev); 2429 else 2430 snprintf(conf->cache_name[0], namelen, 2431 "raid%d-%s", conf->level, mdname(conf->mddev)); 2432 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); 2433 2434 conf->active_name = 0; 2435 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2436 struct_size_t(struct stripe_head, dev, devs), 2437 0, 0, NULL); 2438 if (!sc) 2439 return 1; 2440 conf->slab_cache = sc; 2441 conf->pool_size = devs; 2442 while (num--) 2443 if (!grow_one_stripe(conf, GFP_KERNEL)) 2444 return 1; 2445 2446 return 0; 2447 } 2448 2449 /** 2450 * scribble_alloc - allocate percpu scribble buffer for required size 2451 * of the scribble region 2452 * @percpu: from for_each_present_cpu() of the caller 2453 * @num: total number of disks in the array 2454 * @cnt: scribble objs count for required size of the scribble region 2455 * 2456 * The scribble buffer size must be enough to contain: 2457 * 1/ a struct page pointer for each device in the array +2 2458 * 2/ room to convert each entry in (1) to its corresponding dma 2459 * (dma_map_page()) or page (page_address()) address. 2460 * 2461 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2462 * calculate over all devices (not just the data blocks), using zeros in place 2463 * of the P and Q blocks. 2464 */ 2465 static int scribble_alloc(struct raid5_percpu *percpu, 2466 int num, int cnt) 2467 { 2468 size_t obj_size = 2469 sizeof(struct page *) * (num + 2) + 2470 sizeof(addr_conv_t) * (num + 2) + 2471 sizeof(unsigned int) * (num + 2); 2472 void *scribble; 2473 2474 /* 2475 * If here is in raid array suspend context, it is in memalloc noio 2476 * context as well, there is no potential recursive memory reclaim 2477 * I/Os with the GFP_KERNEL flag. 2478 */ 2479 scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL); 2480 if (!scribble) 2481 return -ENOMEM; 2482 2483 kvfree(percpu->scribble); 2484 2485 percpu->scribble = scribble; 2486 percpu->scribble_obj_size = obj_size; 2487 return 0; 2488 } 2489 2490 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2491 { 2492 unsigned long cpu; 2493 int err = 0; 2494 2495 /* Never shrink. */ 2496 if (conf->scribble_disks >= new_disks && 2497 conf->scribble_sectors >= new_sectors) 2498 return 0; 2499 2500 raid5_quiesce(conf->mddev, true); 2501 cpus_read_lock(); 2502 2503 for_each_present_cpu(cpu) { 2504 struct raid5_percpu *percpu; 2505 2506 percpu = per_cpu_ptr(conf->percpu, cpu); 2507 err = scribble_alloc(percpu, new_disks, 2508 new_sectors / RAID5_STRIPE_SECTORS(conf)); 2509 if (err) 2510 break; 2511 } 2512 2513 cpus_read_unlock(); 2514 raid5_quiesce(conf->mddev, false); 2515 2516 if (!err) { 2517 conf->scribble_disks = new_disks; 2518 conf->scribble_sectors = new_sectors; 2519 } 2520 return err; 2521 } 2522 2523 static int resize_stripes(struct r5conf *conf, int newsize) 2524 { 2525 /* Make all the stripes able to hold 'newsize' devices. 2526 * New slots in each stripe get 'page' set to a new page. 2527 * 2528 * This happens in stages: 2529 * 1/ create a new kmem_cache and allocate the required number of 2530 * stripe_heads. 2531 * 2/ gather all the old stripe_heads and transfer the pages across 2532 * to the new stripe_heads. This will have the side effect of 2533 * freezing the array as once all stripe_heads have been collected, 2534 * no IO will be possible. Old stripe heads are freed once their 2535 * pages have been transferred over, and the old kmem_cache is 2536 * freed when all stripes are done. 2537 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 2538 * we simple return a failure status - no need to clean anything up. 2539 * 4/ allocate new pages for the new slots in the new stripe_heads. 2540 * If this fails, we don't bother trying the shrink the 2541 * stripe_heads down again, we just leave them as they are. 2542 * As each stripe_head is processed the new one is released into 2543 * active service. 2544 * 2545 * Once step2 is started, we cannot afford to wait for a write, 2546 * so we use GFP_NOIO allocations. 2547 */ 2548 struct stripe_head *osh, *nsh; 2549 LIST_HEAD(newstripes); 2550 struct disk_info *ndisks; 2551 int err = 0; 2552 struct kmem_cache *sc; 2553 int i; 2554 int hash, cnt; 2555 2556 md_allow_write(conf->mddev); 2557 2558 /* Step 1 */ 2559 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2560 struct_size_t(struct stripe_head, dev, newsize), 2561 0, 0, NULL); 2562 if (!sc) 2563 return -ENOMEM; 2564 2565 /* Need to ensure auto-resizing doesn't interfere */ 2566 mutex_lock(&conf->cache_size_mutex); 2567 2568 for (i = conf->max_nr_stripes; i; i--) { 2569 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); 2570 if (!nsh) 2571 break; 2572 2573 list_add(&nsh->lru, &newstripes); 2574 } 2575 if (i) { 2576 /* didn't get enough, give up */ 2577 while (!list_empty(&newstripes)) { 2578 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2579 list_del(&nsh->lru); 2580 free_stripe(sc, nsh); 2581 } 2582 kmem_cache_destroy(sc); 2583 mutex_unlock(&conf->cache_size_mutex); 2584 return -ENOMEM; 2585 } 2586 /* Step 2 - Must use GFP_NOIO now. 2587 * OK, we have enough stripes, start collecting inactive 2588 * stripes and copying them over 2589 */ 2590 hash = 0; 2591 cnt = 0; 2592 list_for_each_entry(nsh, &newstripes, lru) { 2593 lock_device_hash_lock(conf, hash); 2594 wait_event_cmd(conf->wait_for_stripe, 2595 !list_empty(conf->inactive_list + hash), 2596 unlock_device_hash_lock(conf, hash), 2597 lock_device_hash_lock(conf, hash)); 2598 osh = get_free_stripe(conf, hash); 2599 unlock_device_hash_lock(conf, hash); 2600 2601 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2602 for (i = 0; i < osh->nr_pages; i++) { 2603 nsh->pages[i] = osh->pages[i]; 2604 osh->pages[i] = NULL; 2605 } 2606 #endif 2607 for(i=0; i<conf->pool_size; i++) { 2608 nsh->dev[i].page = osh->dev[i].page; 2609 nsh->dev[i].orig_page = osh->dev[i].page; 2610 nsh->dev[i].offset = osh->dev[i].offset; 2611 } 2612 nsh->hash_lock_index = hash; 2613 free_stripe(conf->slab_cache, osh); 2614 cnt++; 2615 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2616 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2617 hash++; 2618 cnt = 0; 2619 } 2620 } 2621 kmem_cache_destroy(conf->slab_cache); 2622 2623 /* Step 3. 2624 * At this point, we are holding all the stripes so the array 2625 * is completely stalled, so now is a good time to resize 2626 * conf->disks and the scribble region 2627 */ 2628 ndisks = kzalloc_objs(struct disk_info, newsize, GFP_NOIO); 2629 if (ndisks) { 2630 for (i = 0; i < conf->pool_size; i++) 2631 ndisks[i] = conf->disks[i]; 2632 2633 for (i = conf->pool_size; i < newsize; i++) { 2634 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2635 if (!ndisks[i].extra_page) 2636 err = -ENOMEM; 2637 } 2638 2639 if (err) { 2640 for (i = conf->pool_size; i < newsize; i++) 2641 if (ndisks[i].extra_page) 2642 put_page(ndisks[i].extra_page); 2643 kfree(ndisks); 2644 } else { 2645 kfree(conf->disks); 2646 conf->disks = ndisks; 2647 } 2648 } else 2649 err = -ENOMEM; 2650 2651 conf->slab_cache = sc; 2652 conf->active_name = 1-conf->active_name; 2653 2654 /* Step 4, return new stripes to service */ 2655 while(!list_empty(&newstripes)) { 2656 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2657 list_del_init(&nsh->lru); 2658 2659 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2660 for (i = 0; i < nsh->nr_pages; i++) { 2661 if (nsh->pages[i]) 2662 continue; 2663 nsh->pages[i] = alloc_page(GFP_NOIO); 2664 if (!nsh->pages[i]) 2665 err = -ENOMEM; 2666 } 2667 2668 for (i = conf->raid_disks; i < newsize; i++) { 2669 if (nsh->dev[i].page) 2670 continue; 2671 nsh->dev[i].page = raid5_get_dev_page(nsh, i); 2672 nsh->dev[i].orig_page = nsh->dev[i].page; 2673 nsh->dev[i].offset = raid5_get_page_offset(nsh, i); 2674 } 2675 #else 2676 for (i=conf->raid_disks; i < newsize; i++) 2677 if (nsh->dev[i].page == NULL) { 2678 struct page *p = alloc_page(GFP_NOIO); 2679 nsh->dev[i].page = p; 2680 nsh->dev[i].orig_page = p; 2681 nsh->dev[i].offset = 0; 2682 if (!p) 2683 err = -ENOMEM; 2684 } 2685 #endif 2686 raid5_release_stripe(nsh); 2687 } 2688 /* critical section pass, GFP_NOIO no longer needed */ 2689 2690 if (!err) 2691 conf->pool_size = newsize; 2692 mutex_unlock(&conf->cache_size_mutex); 2693 2694 return err; 2695 } 2696 2697 static int drop_one_stripe(struct r5conf *conf) 2698 { 2699 struct stripe_head *sh; 2700 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2701 2702 spin_lock_irq(conf->hash_locks + hash); 2703 sh = get_free_stripe(conf, hash); 2704 spin_unlock_irq(conf->hash_locks + hash); 2705 if (!sh) 2706 return 0; 2707 BUG_ON(atomic_read(&sh->count)); 2708 shrink_buffers(sh); 2709 free_stripe(conf->slab_cache, sh); 2710 atomic_dec(&conf->active_stripes); 2711 WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1); 2712 return 1; 2713 } 2714 2715 static void shrink_stripes(struct r5conf *conf) 2716 { 2717 while (conf->max_nr_stripes && 2718 drop_one_stripe(conf)) 2719 ; 2720 2721 kmem_cache_destroy(conf->slab_cache); 2722 conf->slab_cache = NULL; 2723 } 2724 2725 static void raid5_end_read_request(struct bio * bi) 2726 { 2727 struct stripe_head *sh = bi->bi_private; 2728 struct r5conf *conf = sh->raid_conf; 2729 int disks = sh->disks, i; 2730 struct md_rdev *rdev = NULL; 2731 sector_t s; 2732 2733 for (i=0 ; i<disks; i++) 2734 if (bi == &sh->dev[i].req) 2735 break; 2736 2737 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2738 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2739 bi->bi_status); 2740 if (i == disks) { 2741 BUG(); 2742 return; 2743 } 2744 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2745 /* If replacement finished while this request was outstanding, 2746 * 'replacement' might be NULL already. 2747 * In that case it moved down to 'rdev'. 2748 * rdev is not removed until all requests are finished. 2749 */ 2750 rdev = conf->disks[i].replacement; 2751 if (!rdev) 2752 rdev = conf->disks[i].rdev; 2753 2754 if (use_new_offset(conf, sh)) 2755 s = sh->sector + rdev->new_data_offset; 2756 else 2757 s = sh->sector + rdev->data_offset; 2758 if (!bi->bi_status) { 2759 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2760 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2761 /* Note that this cannot happen on a 2762 * replacement device. We just fail those on 2763 * any error 2764 */ 2765 pr_info_ratelimited( 2766 "md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n", 2767 mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf), 2768 (unsigned long long)s, 2769 rdev->bdev); 2770 atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors); 2771 clear_bit(R5_ReadError, &sh->dev[i].flags); 2772 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2773 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2774 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2775 2776 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2777 /* 2778 * end read for a page in journal, this 2779 * must be preparing for prexor in rmw 2780 */ 2781 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2782 2783 if (atomic_read(&rdev->read_errors)) 2784 atomic_set(&rdev->read_errors, 0); 2785 } else { 2786 int retry = 0; 2787 int set_bad = 0; 2788 2789 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2790 if (!(bi->bi_status == BLK_STS_PROTECTION)) 2791 atomic_inc(&rdev->read_errors); 2792 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2793 pr_warn_ratelimited( 2794 "md/raid:%s: read error on replacement device (sector %llu on %pg).\n", 2795 mdname(conf->mddev), 2796 (unsigned long long)s, 2797 rdev->bdev); 2798 else if (conf->mddev->degraded >= conf->max_degraded) { 2799 set_bad = 1; 2800 pr_warn_ratelimited( 2801 "md/raid:%s: read error not correctable (sector %llu on %pg).\n", 2802 mdname(conf->mddev), 2803 (unsigned long long)s, 2804 rdev->bdev); 2805 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2806 /* Oh, no!!! */ 2807 set_bad = 1; 2808 pr_warn_ratelimited( 2809 "md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n", 2810 mdname(conf->mddev), 2811 (unsigned long long)s, 2812 rdev->bdev); 2813 } else if (atomic_read(&rdev->read_errors) 2814 > conf->max_nr_stripes) { 2815 if (!test_bit(Faulty, &rdev->flags)) { 2816 pr_warn("md/raid:%s: %d read_errors > %d stripes\n", 2817 mdname(conf->mddev), 2818 atomic_read(&rdev->read_errors), 2819 conf->max_nr_stripes); 2820 pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n", 2821 mdname(conf->mddev), rdev->bdev); 2822 } 2823 } else 2824 retry = 1; 2825 if (set_bad && test_bit(In_sync, &rdev->flags) 2826 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2827 retry = 1; 2828 if (retry) 2829 if (sh->qd_idx >= 0 && sh->pd_idx == i) 2830 set_bit(R5_ReadError, &sh->dev[i].flags); 2831 else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2832 set_bit(R5_ReadError, &sh->dev[i].flags); 2833 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2834 } else 2835 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2836 else { 2837 clear_bit(R5_ReadError, &sh->dev[i].flags); 2838 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2839 if (!(set_bad && test_bit(In_sync, &rdev->flags))) 2840 rdev_set_badblocks(rdev, sh->sector, 2841 RAID5_STRIPE_SECTORS(conf), 0); 2842 } 2843 } 2844 rdev_dec_pending(rdev, conf->mddev); 2845 bio_uninit(bi); 2846 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2847 set_bit(STRIPE_HANDLE, &sh->state); 2848 raid5_release_stripe(sh); 2849 } 2850 2851 static void raid5_end_write_request(struct bio *bi) 2852 { 2853 struct stripe_head *sh = bi->bi_private; 2854 struct r5conf *conf = sh->raid_conf; 2855 int disks = sh->disks, i; 2856 struct md_rdev *rdev; 2857 int replacement = 0; 2858 2859 for (i = 0 ; i < disks; i++) { 2860 if (bi == &sh->dev[i].req) { 2861 rdev = conf->disks[i].rdev; 2862 break; 2863 } 2864 if (bi == &sh->dev[i].rreq) { 2865 rdev = conf->disks[i].replacement; 2866 if (rdev) 2867 replacement = 1; 2868 else 2869 /* rdev was removed and 'replacement' 2870 * replaced it. rdev is not removed 2871 * until all requests are finished. 2872 */ 2873 rdev = conf->disks[i].rdev; 2874 break; 2875 } 2876 } 2877 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2878 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2879 bi->bi_status); 2880 if (i == disks) { 2881 BUG(); 2882 return; 2883 } 2884 2885 if (replacement) { 2886 if (bi->bi_status) 2887 md_error(conf->mddev, rdev); 2888 else if (rdev_has_badblock(rdev, sh->sector, 2889 RAID5_STRIPE_SECTORS(conf))) 2890 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2891 } else { 2892 if (bi->bi_status) { 2893 set_bit(WriteErrorSeen, &rdev->flags); 2894 set_bit(R5_WriteError, &sh->dev[i].flags); 2895 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2896 set_bit(MD_RECOVERY_NEEDED, 2897 &rdev->mddev->recovery); 2898 } else if (rdev_has_badblock(rdev, sh->sector, 2899 RAID5_STRIPE_SECTORS(conf))) { 2900 set_bit(R5_MadeGood, &sh->dev[i].flags); 2901 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2902 /* That was a successful write so make 2903 * sure it looks like we already did 2904 * a re-write. 2905 */ 2906 set_bit(R5_ReWrite, &sh->dev[i].flags); 2907 } 2908 } 2909 rdev_dec_pending(rdev, conf->mddev); 2910 2911 if (sh->batch_head && bi->bi_status && !replacement) 2912 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2913 2914 bio_uninit(bi); 2915 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2916 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2917 set_bit(STRIPE_HANDLE, &sh->state); 2918 2919 if (sh->batch_head && sh != sh->batch_head) 2920 raid5_release_stripe(sh->batch_head); 2921 raid5_release_stripe(sh); 2922 } 2923 2924 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2925 { 2926 struct r5conf *conf = mddev->private; 2927 unsigned long flags; 2928 pr_debug("raid456: error called\n"); 2929 2930 pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n", 2931 mdname(mddev), rdev->bdev); 2932 2933 spin_lock_irqsave(&conf->device_lock, flags); 2934 set_bit(Faulty, &rdev->flags); 2935 clear_bit(In_sync, &rdev->flags); 2936 mddev->degraded = raid5_calc_degraded(conf); 2937 2938 if (has_failed(conf)) { 2939 set_bit(MD_BROKEN, &conf->mddev->flags); 2940 2941 pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n", 2942 mdname(mddev), mddev->degraded, conf->raid_disks); 2943 } else { 2944 pr_crit("md/raid:%s: Operation continuing on %d devices.\n", 2945 mdname(mddev), conf->raid_disks - mddev->degraded); 2946 } 2947 2948 spin_unlock_irqrestore(&conf->device_lock, flags); 2949 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2950 2951 set_bit(Blocked, &rdev->flags); 2952 set_mask_bits(&mddev->sb_flags, 0, 2953 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2954 r5c_update_on_rdev_error(mddev, rdev); 2955 } 2956 2957 /* 2958 * Input: a 'big' sector number, 2959 * Output: index of the data and parity disk, and the sector # in them. 2960 */ 2961 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2962 int previous, int *dd_idx, 2963 struct stripe_head *sh) 2964 { 2965 sector_t stripe, stripe2; 2966 sector_t chunk_number; 2967 unsigned int chunk_offset; 2968 int pd_idx, qd_idx; 2969 int ddf_layout = 0; 2970 sector_t new_sector; 2971 int algorithm = previous ? conf->prev_algo 2972 : conf->algorithm; 2973 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2974 : conf->chunk_sectors; 2975 int raid_disks = previous ? conf->previous_raid_disks 2976 : conf->raid_disks; 2977 int data_disks = raid_disks - conf->max_degraded; 2978 2979 /* First compute the information on this sector */ 2980 2981 /* 2982 * Compute the chunk number and the sector offset inside the chunk 2983 */ 2984 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2985 chunk_number = r_sector; 2986 2987 /* 2988 * Compute the stripe number 2989 */ 2990 stripe = chunk_number; 2991 *dd_idx = sector_div(stripe, data_disks); 2992 stripe2 = stripe; 2993 /* 2994 * Select the parity disk based on the user selected algorithm. 2995 */ 2996 pd_idx = qd_idx = -1; 2997 switch(conf->level) { 2998 case 4: 2999 pd_idx = data_disks; 3000 break; 3001 case 5: 3002 switch (algorithm) { 3003 case ALGORITHM_LEFT_ASYMMETRIC: 3004 pd_idx = data_disks - sector_div(stripe2, raid_disks); 3005 if (*dd_idx >= pd_idx) 3006 (*dd_idx)++; 3007 break; 3008 case ALGORITHM_RIGHT_ASYMMETRIC: 3009 pd_idx = sector_div(stripe2, raid_disks); 3010 if (*dd_idx >= pd_idx) 3011 (*dd_idx)++; 3012 break; 3013 case ALGORITHM_LEFT_SYMMETRIC: 3014 pd_idx = data_disks - sector_div(stripe2, raid_disks); 3015 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 3016 break; 3017 case ALGORITHM_RIGHT_SYMMETRIC: 3018 pd_idx = sector_div(stripe2, raid_disks); 3019 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 3020 break; 3021 case ALGORITHM_PARITY_0: 3022 pd_idx = 0; 3023 (*dd_idx)++; 3024 break; 3025 case ALGORITHM_PARITY_N: 3026 pd_idx = data_disks; 3027 break; 3028 default: 3029 BUG(); 3030 } 3031 break; 3032 case 6: 3033 3034 switch (algorithm) { 3035 case ALGORITHM_LEFT_ASYMMETRIC: 3036 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 3037 qd_idx = pd_idx + 1; 3038 if (pd_idx == raid_disks-1) { 3039 (*dd_idx)++; /* Q D D D P */ 3040 qd_idx = 0; 3041 } else if (*dd_idx >= pd_idx) 3042 (*dd_idx) += 2; /* D D P Q D */ 3043 break; 3044 case ALGORITHM_RIGHT_ASYMMETRIC: 3045 pd_idx = sector_div(stripe2, raid_disks); 3046 qd_idx = pd_idx + 1; 3047 if (pd_idx == raid_disks-1) { 3048 (*dd_idx)++; /* Q D D D P */ 3049 qd_idx = 0; 3050 } else if (*dd_idx >= pd_idx) 3051 (*dd_idx) += 2; /* D D P Q D */ 3052 break; 3053 case ALGORITHM_LEFT_SYMMETRIC: 3054 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 3055 qd_idx = (pd_idx + 1) % raid_disks; 3056 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 3057 break; 3058 case ALGORITHM_RIGHT_SYMMETRIC: 3059 pd_idx = sector_div(stripe2, raid_disks); 3060 qd_idx = (pd_idx + 1) % raid_disks; 3061 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 3062 break; 3063 3064 case ALGORITHM_PARITY_0: 3065 pd_idx = 0; 3066 qd_idx = 1; 3067 (*dd_idx) += 2; 3068 break; 3069 case ALGORITHM_PARITY_N: 3070 pd_idx = data_disks; 3071 qd_idx = data_disks + 1; 3072 break; 3073 3074 case ALGORITHM_ROTATING_ZERO_RESTART: 3075 /* Exactly the same as RIGHT_ASYMMETRIC, but or 3076 * of blocks for computing Q is different. 3077 */ 3078 pd_idx = sector_div(stripe2, raid_disks); 3079 qd_idx = pd_idx + 1; 3080 if (pd_idx == raid_disks-1) { 3081 (*dd_idx)++; /* Q D D D P */ 3082 qd_idx = 0; 3083 } else if (*dd_idx >= pd_idx) 3084 (*dd_idx) += 2; /* D D P Q D */ 3085 ddf_layout = 1; 3086 break; 3087 3088 case ALGORITHM_ROTATING_N_RESTART: 3089 /* Same a left_asymmetric, by first stripe is 3090 * D D D P Q rather than 3091 * Q D D D P 3092 */ 3093 stripe2 += 1; 3094 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 3095 qd_idx = pd_idx + 1; 3096 if (pd_idx == raid_disks-1) { 3097 (*dd_idx)++; /* Q D D D P */ 3098 qd_idx = 0; 3099 } else if (*dd_idx >= pd_idx) 3100 (*dd_idx) += 2; /* D D P Q D */ 3101 ddf_layout = 1; 3102 break; 3103 3104 case ALGORITHM_ROTATING_N_CONTINUE: 3105 /* Same as left_symmetric but Q is before P */ 3106 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 3107 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 3108 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 3109 ddf_layout = 1; 3110 break; 3111 3112 case ALGORITHM_LEFT_ASYMMETRIC_6: 3113 /* RAID5 left_asymmetric, with Q on last device */ 3114 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 3115 if (*dd_idx >= pd_idx) 3116 (*dd_idx)++; 3117 qd_idx = raid_disks - 1; 3118 break; 3119 3120 case ALGORITHM_RIGHT_ASYMMETRIC_6: 3121 pd_idx = sector_div(stripe2, raid_disks-1); 3122 if (*dd_idx >= pd_idx) 3123 (*dd_idx)++; 3124 qd_idx = raid_disks - 1; 3125 break; 3126 3127 case ALGORITHM_LEFT_SYMMETRIC_6: 3128 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 3129 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 3130 qd_idx = raid_disks - 1; 3131 break; 3132 3133 case ALGORITHM_RIGHT_SYMMETRIC_6: 3134 pd_idx = sector_div(stripe2, raid_disks-1); 3135 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 3136 qd_idx = raid_disks - 1; 3137 break; 3138 3139 case ALGORITHM_PARITY_0_6: 3140 pd_idx = 0; 3141 (*dd_idx)++; 3142 qd_idx = raid_disks - 1; 3143 break; 3144 3145 default: 3146 BUG(); 3147 } 3148 break; 3149 } 3150 3151 if (sh) { 3152 sh->pd_idx = pd_idx; 3153 sh->qd_idx = qd_idx; 3154 sh->ddf_layout = ddf_layout; 3155 } 3156 /* 3157 * Finally, compute the new sector number 3158 */ 3159 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 3160 return new_sector; 3161 } 3162 3163 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 3164 { 3165 struct r5conf *conf = sh->raid_conf; 3166 int raid_disks = sh->disks; 3167 int data_disks = raid_disks - conf->max_degraded; 3168 sector_t new_sector = sh->sector, check; 3169 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 3170 : conf->chunk_sectors; 3171 int algorithm = previous ? conf->prev_algo 3172 : conf->algorithm; 3173 sector_t stripe; 3174 int chunk_offset; 3175 sector_t chunk_number; 3176 int dummy1, dd_idx = i; 3177 sector_t r_sector; 3178 struct stripe_head sh2; 3179 3180 chunk_offset = sector_div(new_sector, sectors_per_chunk); 3181 stripe = new_sector; 3182 3183 if (i == sh->pd_idx) 3184 return 0; 3185 switch(conf->level) { 3186 case 4: break; 3187 case 5: 3188 switch (algorithm) { 3189 case ALGORITHM_LEFT_ASYMMETRIC: 3190 case ALGORITHM_RIGHT_ASYMMETRIC: 3191 if (i > sh->pd_idx) 3192 i--; 3193 break; 3194 case ALGORITHM_LEFT_SYMMETRIC: 3195 case ALGORITHM_RIGHT_SYMMETRIC: 3196 if (i < sh->pd_idx) 3197 i += raid_disks; 3198 i -= (sh->pd_idx + 1); 3199 break; 3200 case ALGORITHM_PARITY_0: 3201 i -= 1; 3202 break; 3203 case ALGORITHM_PARITY_N: 3204 break; 3205 default: 3206 BUG(); 3207 } 3208 break; 3209 case 6: 3210 if (i == sh->qd_idx) 3211 return 0; /* It is the Q disk */ 3212 switch (algorithm) { 3213 case ALGORITHM_LEFT_ASYMMETRIC: 3214 case ALGORITHM_RIGHT_ASYMMETRIC: 3215 case ALGORITHM_ROTATING_ZERO_RESTART: 3216 case ALGORITHM_ROTATING_N_RESTART: 3217 if (sh->pd_idx == raid_disks-1) 3218 i--; /* Q D D D P */ 3219 else if (i > sh->pd_idx) 3220 i -= 2; /* D D P Q D */ 3221 break; 3222 case ALGORITHM_LEFT_SYMMETRIC: 3223 case ALGORITHM_RIGHT_SYMMETRIC: 3224 if (sh->pd_idx == raid_disks-1) 3225 i--; /* Q D D D P */ 3226 else { 3227 /* D D P Q D */ 3228 if (i < sh->pd_idx) 3229 i += raid_disks; 3230 i -= (sh->pd_idx + 2); 3231 } 3232 break; 3233 case ALGORITHM_PARITY_0: 3234 i -= 2; 3235 break; 3236 case ALGORITHM_PARITY_N: 3237 break; 3238 case ALGORITHM_ROTATING_N_CONTINUE: 3239 /* Like left_symmetric, but P is before Q */ 3240 if (sh->pd_idx == 0) 3241 i--; /* P D D D Q */ 3242 else { 3243 /* D D Q P D */ 3244 if (i < sh->pd_idx) 3245 i += raid_disks; 3246 i -= (sh->pd_idx + 1); 3247 } 3248 break; 3249 case ALGORITHM_LEFT_ASYMMETRIC_6: 3250 case ALGORITHM_RIGHT_ASYMMETRIC_6: 3251 if (i > sh->pd_idx) 3252 i--; 3253 break; 3254 case ALGORITHM_LEFT_SYMMETRIC_6: 3255 case ALGORITHM_RIGHT_SYMMETRIC_6: 3256 if (i < sh->pd_idx) 3257 i += data_disks + 1; 3258 i -= (sh->pd_idx + 1); 3259 break; 3260 case ALGORITHM_PARITY_0_6: 3261 i -= 1; 3262 break; 3263 default: 3264 BUG(); 3265 } 3266 break; 3267 } 3268 3269 chunk_number = stripe * data_disks + i; 3270 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3271 3272 check = raid5_compute_sector(conf, r_sector, 3273 previous, &dummy1, &sh2); 3274 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3275 || sh2.qd_idx != sh->qd_idx) { 3276 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3277 mdname(conf->mddev)); 3278 return 0; 3279 } 3280 return r_sector; 3281 } 3282 3283 /* 3284 * There are cases where we want handle_stripe_dirtying() and 3285 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3286 * 3287 * This function checks whether we want to delay the towrite. Specifically, 3288 * we delay the towrite when: 3289 * 3290 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3291 * stripe has data in journal (for other devices). 3292 * 3293 * In this case, when reading data for the non-overwrite dev, it is 3294 * necessary to handle complex rmw of write back cache (prexor with 3295 * orig_page, and xor with page). To keep read path simple, we would 3296 * like to flush data in journal to RAID disks first, so complex rmw 3297 * is handled in the write patch (handle_stripe_dirtying). 3298 * 3299 * 2. when journal space is critical (R5C_LOG_CRITICAL=1) 3300 * 3301 * It is important to be able to flush all stripes in raid5-cache. 3302 * Therefore, we need reserve some space on the journal device for 3303 * these flushes. If flush operation includes pending writes to the 3304 * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe 3305 * for the flush out. If we exclude these pending writes from flush 3306 * operation, we only need (conf->max_degraded + 1) pages per stripe. 3307 * Therefore, excluding pending writes in these cases enables more 3308 * efficient use of the journal device. 3309 * 3310 * Note: To make sure the stripe makes progress, we only delay 3311 * towrite for stripes with data already in journal (injournal > 0). 3312 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 3313 * no_space_stripes list. 3314 * 3315 * 3. during journal failure 3316 * In journal failure, we try to flush all cached data to raid disks 3317 * based on data in stripe cache. The array is read-only to upper 3318 * layers, so we would skip all pending writes. 3319 * 3320 */ 3321 static inline bool delay_towrite(struct r5conf *conf, 3322 struct r5dev *dev, 3323 struct stripe_head_state *s) 3324 { 3325 /* case 1 above */ 3326 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3327 !test_bit(R5_Insync, &dev->flags) && s->injournal) 3328 return true; 3329 /* case 2 above */ 3330 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 3331 s->injournal > 0) 3332 return true; 3333 /* case 3 above */ 3334 if (s->log_failed && s->injournal) 3335 return true; 3336 return false; 3337 } 3338 3339 static void 3340 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 3341 int rcw, int expand) 3342 { 3343 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 3344 struct r5conf *conf = sh->raid_conf; 3345 int level = conf->level; 3346 3347 if (rcw) { 3348 /* 3349 * In some cases, handle_stripe_dirtying initially decided to 3350 * run rmw and allocates extra page for prexor. However, rcw is 3351 * cheaper later on. We need to free the extra page now, 3352 * because we won't be able to do that in ops_complete_prexor(). 3353 */ 3354 r5c_release_extra_page(sh); 3355 3356 for (i = disks; i--; ) { 3357 struct r5dev *dev = &sh->dev[i]; 3358 3359 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3360 set_bit(R5_LOCKED, &dev->flags); 3361 set_bit(R5_Wantdrain, &dev->flags); 3362 if (!expand) 3363 clear_bit(R5_UPTODATE, &dev->flags); 3364 s->locked++; 3365 } else if (test_bit(R5_InJournal, &dev->flags)) { 3366 set_bit(R5_LOCKED, &dev->flags); 3367 s->locked++; 3368 } 3369 } 3370 /* if we are not expanding this is a proper write request, and 3371 * there will be bios with new data to be drained into the 3372 * stripe cache 3373 */ 3374 if (!expand) { 3375 if (!s->locked) 3376 /* False alarm, nothing to do */ 3377 return; 3378 sh->reconstruct_state = reconstruct_state_drain_run; 3379 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3380 } else 3381 sh->reconstruct_state = reconstruct_state_run; 3382 3383 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3384 3385 if (s->locked + conf->max_degraded == disks) 3386 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3387 atomic_inc(&conf->pending_full_writes); 3388 } else { 3389 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3390 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3391 BUG_ON(level == 6 && 3392 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3393 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3394 3395 for (i = disks; i--; ) { 3396 struct r5dev *dev = &sh->dev[i]; 3397 if (i == pd_idx || i == qd_idx) 3398 continue; 3399 3400 if (dev->towrite && 3401 (test_bit(R5_UPTODATE, &dev->flags) || 3402 test_bit(R5_Wantcompute, &dev->flags))) { 3403 set_bit(R5_Wantdrain, &dev->flags); 3404 set_bit(R5_LOCKED, &dev->flags); 3405 clear_bit(R5_UPTODATE, &dev->flags); 3406 s->locked++; 3407 } else if (test_bit(R5_InJournal, &dev->flags)) { 3408 set_bit(R5_LOCKED, &dev->flags); 3409 s->locked++; 3410 } 3411 } 3412 if (!s->locked) 3413 /* False alarm - nothing to do */ 3414 return; 3415 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3416 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3417 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3418 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3419 } 3420 3421 /* keep the parity disk(s) locked while asynchronous operations 3422 * are in flight 3423 */ 3424 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3425 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3426 s->locked++; 3427 3428 if (level == 6) { 3429 int qd_idx = sh->qd_idx; 3430 struct r5dev *dev = &sh->dev[qd_idx]; 3431 3432 set_bit(R5_LOCKED, &dev->flags); 3433 clear_bit(R5_UPTODATE, &dev->flags); 3434 s->locked++; 3435 } 3436 3437 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && 3438 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3439 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3440 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3441 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3442 3443 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3444 __func__, (unsigned long long)sh->sector, 3445 s->locked, s->ops_request); 3446 } 3447 3448 static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, 3449 int dd_idx, int forwrite) 3450 { 3451 struct r5conf *conf = sh->raid_conf; 3452 struct bio **bip; 3453 3454 pr_debug("checking bi b#%llu to stripe s#%llu\n", 3455 bi->bi_iter.bi_sector, sh->sector); 3456 3457 /* Don't allow new IO added to stripes in batch list */ 3458 if (sh->batch_head) 3459 return true; 3460 3461 if (forwrite) 3462 bip = &sh->dev[dd_idx].towrite; 3463 else 3464 bip = &sh->dev[dd_idx].toread; 3465 3466 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3467 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3468 return true; 3469 bip = &(*bip)->bi_next; 3470 } 3471 3472 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3473 return true; 3474 3475 if (forwrite && raid5_has_ppl(conf)) { 3476 /* 3477 * With PPL only writes to consecutive data chunks within a 3478 * stripe are allowed because for a single stripe_head we can 3479 * only have one PPL entry at a time, which describes one data 3480 * range. Not really an overlap, but R5_Overlap can be 3481 * used to handle this. 3482 */ 3483 sector_t sector; 3484 sector_t first = 0; 3485 sector_t last = 0; 3486 int count = 0; 3487 int i; 3488 3489 for (i = 0; i < sh->disks; i++) { 3490 if (i != sh->pd_idx && 3491 (i == dd_idx || sh->dev[i].towrite)) { 3492 sector = sh->dev[i].sector; 3493 if (count == 0 || sector < first) 3494 first = sector; 3495 if (sector > last) 3496 last = sector; 3497 count++; 3498 } 3499 } 3500 3501 if (first + conf->chunk_sectors * (count - 1) != last) 3502 return true; 3503 } 3504 3505 return false; 3506 } 3507 3508 static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, 3509 int dd_idx, int forwrite, int previous) 3510 { 3511 struct r5conf *conf = sh->raid_conf; 3512 struct bio **bip; 3513 int firstwrite = 0; 3514 3515 if (forwrite) { 3516 bip = &sh->dev[dd_idx].towrite; 3517 if (!*bip) 3518 firstwrite = 1; 3519 } else { 3520 bip = &sh->dev[dd_idx].toread; 3521 } 3522 3523 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) 3524 bip = &(*bip)->bi_next; 3525 3526 if (!forwrite || previous) 3527 clear_bit(STRIPE_BATCH_READY, &sh->state); 3528 3529 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3530 if (*bip) 3531 bi->bi_next = *bip; 3532 *bip = bi; 3533 bio_inc_remaining(bi); 3534 md_write_inc(conf->mddev, bi); 3535 3536 if (forwrite) { 3537 /* check if page is covered */ 3538 sector_t sector = sh->dev[dd_idx].sector; 3539 for (bi=sh->dev[dd_idx].towrite; 3540 sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) && 3541 bi && bi->bi_iter.bi_sector <= sector; 3542 bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) { 3543 if (bio_end_sector(bi) >= sector) 3544 sector = bio_end_sector(bi); 3545 } 3546 if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf)) 3547 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3548 sh->overwrite_disks++; 3549 } 3550 3551 pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n", 3552 (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, 3553 sh->dev[dd_idx].sector); 3554 3555 if (conf->mddev->bitmap && firstwrite && !sh->batch_head) { 3556 sh->bm_seq = conf->seq_flush+1; 3557 set_bit(STRIPE_BIT_DELAY, &sh->state); 3558 } 3559 } 3560 3561 /* 3562 * Each stripe/dev can have one or more bios attached. 3563 * toread/towrite point to the first in a chain. 3564 * The bi_next chain must be in order. 3565 */ 3566 static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi, 3567 int dd_idx, int forwrite, int previous) 3568 { 3569 spin_lock_irq(&sh->stripe_lock); 3570 3571 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { 3572 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3573 spin_unlock_irq(&sh->stripe_lock); 3574 return false; 3575 } 3576 3577 __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); 3578 spin_unlock_irq(&sh->stripe_lock); 3579 return true; 3580 } 3581 3582 static void end_reshape(struct r5conf *conf); 3583 3584 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3585 struct stripe_head *sh) 3586 { 3587 int sectors_per_chunk = 3588 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3589 int dd_idx; 3590 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3591 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3592 3593 raid5_compute_sector(conf, 3594 stripe * (disks - conf->max_degraded) 3595 *sectors_per_chunk + chunk_offset, 3596 previous, 3597 &dd_idx, sh); 3598 } 3599 3600 static void 3601 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3602 struct stripe_head_state *s, int disks) 3603 { 3604 int i; 3605 BUG_ON(sh->batch_head); 3606 for (i = disks; i--; ) { 3607 struct bio *bi; 3608 3609 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3610 struct md_rdev *rdev = conf->disks[i].rdev; 3611 3612 if (rdev && test_bit(In_sync, &rdev->flags) && 3613 !test_bit(Faulty, &rdev->flags)) 3614 atomic_inc(&rdev->nr_pending); 3615 else 3616 rdev = NULL; 3617 if (rdev) { 3618 rdev_set_badblocks(rdev, 3619 sh->sector, 3620 RAID5_STRIPE_SECTORS(conf), 3621 0); 3622 rdev_dec_pending(rdev, conf->mddev); 3623 } 3624 } 3625 spin_lock_irq(&sh->stripe_lock); 3626 /* fail all writes first */ 3627 bi = sh->dev[i].towrite; 3628 sh->dev[i].towrite = NULL; 3629 sh->overwrite_disks = 0; 3630 spin_unlock_irq(&sh->stripe_lock); 3631 3632 log_stripe_write_finished(sh); 3633 3634 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3635 wake_up_bit(&sh->dev[i].flags, R5_Overlap); 3636 3637 while (bi && bi->bi_iter.bi_sector < 3638 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { 3639 struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector); 3640 3641 md_write_end(conf->mddev); 3642 bio_io_error(bi); 3643 bi = nextbi; 3644 } 3645 /* and fail all 'written' */ 3646 bi = sh->dev[i].written; 3647 sh->dev[i].written = NULL; 3648 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3649 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3650 sh->dev[i].page = sh->dev[i].orig_page; 3651 } 3652 3653 while (bi && bi->bi_iter.bi_sector < 3654 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { 3655 struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); 3656 3657 md_write_end(conf->mddev); 3658 bio_io_error(bi); 3659 bi = bi2; 3660 } 3661 3662 /* fail any reads if this device is non-operational and 3663 * the data has not reached the cache yet. 3664 */ 3665 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3666 s->failed > conf->max_degraded && 3667 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3668 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3669 spin_lock_irq(&sh->stripe_lock); 3670 bi = sh->dev[i].toread; 3671 sh->dev[i].toread = NULL; 3672 spin_unlock_irq(&sh->stripe_lock); 3673 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3674 wake_up_bit(&sh->dev[i].flags, R5_Overlap); 3675 if (bi) 3676 s->to_read--; 3677 while (bi && bi->bi_iter.bi_sector < 3678 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { 3679 struct bio *nextbi = 3680 r5_next_bio(conf, bi, sh->dev[i].sector); 3681 3682 bio_io_error(bi); 3683 bi = nextbi; 3684 } 3685 } 3686 /* If we were in the middle of a write the parity block might 3687 * still be locked - so just clear all R5_LOCKED flags 3688 */ 3689 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3690 } 3691 s->to_write = 0; 3692 s->written = 0; 3693 3694 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3695 if (atomic_dec_and_test(&conf->pending_full_writes)) 3696 md_wakeup_thread(conf->mddev->thread); 3697 } 3698 3699 static void 3700 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3701 struct stripe_head_state *s) 3702 { 3703 int abort = 0; 3704 int i; 3705 3706 BUG_ON(sh->batch_head); 3707 clear_bit(STRIPE_SYNCING, &sh->state); 3708 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3709 wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); 3710 s->syncing = 0; 3711 s->replacing = 0; 3712 /* There is nothing more to do for sync/check/repair. 3713 * Don't even need to abort as that is handled elsewhere 3714 * if needed, and not always wanted e.g. if there is a known 3715 * bad block here. 3716 * For recover/replace we need to record a bad block on all 3717 * non-sync devices, or abort the recovery 3718 */ 3719 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3720 /* During recovery devices cannot be removed, so 3721 * locking and refcounting of rdevs is not needed 3722 */ 3723 for (i = 0; i < conf->raid_disks; i++) { 3724 struct md_rdev *rdev = conf->disks[i].rdev; 3725 3726 if (rdev 3727 && !test_bit(Faulty, &rdev->flags) 3728 && !test_bit(In_sync, &rdev->flags) 3729 && !rdev_set_badblocks(rdev, sh->sector, 3730 RAID5_STRIPE_SECTORS(conf), 0)) 3731 abort = 1; 3732 rdev = conf->disks[i].replacement; 3733 3734 if (rdev 3735 && !test_bit(Faulty, &rdev->flags) 3736 && !test_bit(In_sync, &rdev->flags) 3737 && !rdev_set_badblocks(rdev, sh->sector, 3738 RAID5_STRIPE_SECTORS(conf), 0)) 3739 abort = 1; 3740 } 3741 } 3742 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); 3743 3744 if (abort) 3745 md_sync_error(conf->mddev); 3746 } 3747 3748 static int want_replace(struct stripe_head *sh, int disk_idx) 3749 { 3750 struct md_rdev *rdev; 3751 int rv = 0; 3752 3753 rdev = sh->raid_conf->disks[disk_idx].replacement; 3754 if (rdev 3755 && !test_bit(Faulty, &rdev->flags) 3756 && !test_bit(In_sync, &rdev->flags) 3757 && (rdev->recovery_offset <= sh->sector 3758 || rdev->mddev->resync_offset <= sh->sector)) 3759 rv = 1; 3760 return rv; 3761 } 3762 3763 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3764 int disk_idx, int disks) 3765 { 3766 struct r5dev *dev = &sh->dev[disk_idx]; 3767 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3768 &sh->dev[s->failed_num[1]] }; 3769 struct mddev *mddev = sh->raid_conf->mddev; 3770 bool force_rcw = false; 3771 int i; 3772 3773 if (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW || 3774 (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && 3775 !mddev->bitmap_ops->blocks_synced(mddev, sh->sector))) 3776 force_rcw = true; 3777 3778 if (test_bit(R5_LOCKED, &dev->flags) || 3779 test_bit(R5_UPTODATE, &dev->flags)) 3780 /* No point reading this as we already have it or have 3781 * decided to get it. 3782 */ 3783 return 0; 3784 3785 if (dev->toread || 3786 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3787 /* We need this block to directly satisfy a request */ 3788 return 1; 3789 3790 if (s->syncing || s->expanding || 3791 (s->replacing && want_replace(sh, disk_idx))) 3792 /* When syncing, or expanding we read everything. 3793 * When replacing, we need the replaced block. 3794 */ 3795 return 1; 3796 3797 if ((s->failed >= 1 && fdev[0]->toread) || 3798 (s->failed >= 2 && fdev[1]->toread)) 3799 /* If we want to read from a failed device, then 3800 * we need to actually read every other device. 3801 */ 3802 return 1; 3803 3804 /* Sometimes neither read-modify-write nor reconstruct-write 3805 * cycles can work. In those cases we read every block we 3806 * can. Then the parity-update is certain to have enough to 3807 * work with. 3808 * This can only be a problem when we need to write something, 3809 * and some device has failed. If either of those tests 3810 * fail we need look no further. 3811 */ 3812 if (!s->failed || !s->to_write) 3813 return 0; 3814 3815 if (test_bit(R5_Insync, &dev->flags) && 3816 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3817 /* Pre-reads at not permitted until after short delay 3818 * to gather multiple requests. However if this 3819 * device is no Insync, the block could only be computed 3820 * and there is no need to delay that. 3821 */ 3822 return 0; 3823 3824 for (i = 0; i < s->failed && i < 2; i++) { 3825 if (fdev[i]->towrite && 3826 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3827 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3828 /* If we have a partial write to a failed 3829 * device, then we will need to reconstruct 3830 * the content of that device, so all other 3831 * devices must be read. 3832 */ 3833 return 1; 3834 3835 if (s->failed >= 2 && 3836 (fdev[i]->towrite || 3837 s->failed_num[i] == sh->pd_idx || 3838 s->failed_num[i] == sh->qd_idx) && 3839 !test_bit(R5_UPTODATE, &fdev[i]->flags)) 3840 /* In max degraded raid6, If the failed disk is P, Q, 3841 * or we want to read the failed disk, we need to do 3842 * reconstruct-write. 3843 */ 3844 force_rcw = true; 3845 } 3846 3847 /* If we are forced to do a reconstruct-write, because parity 3848 * cannot be trusted and we are currently recovering it, there 3849 * is extra need to be careful. 3850 * If one of the devices that we would need to read, because 3851 * it is not being overwritten (and maybe not written at all) 3852 * is missing/faulty, then we need to read everything we can. 3853 */ 3854 if (!force_rcw && 3855 sh->sector < sh->raid_conf->mddev->resync_offset) 3856 /* reconstruct-write isn't being forced */ 3857 return 0; 3858 for (i = 0; i < s->failed && i < 2; i++) { 3859 if (s->failed_num[i] != sh->pd_idx && 3860 s->failed_num[i] != sh->qd_idx && 3861 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3862 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3863 return 1; 3864 } 3865 3866 return 0; 3867 } 3868 3869 /* fetch_block - checks the given member device to see if its data needs 3870 * to be read or computed to satisfy a request. 3871 * 3872 * Returns 1 when no more member devices need to be checked, otherwise returns 3873 * 0 to tell the loop in handle_stripe_fill to continue 3874 */ 3875 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3876 int disk_idx, int disks) 3877 { 3878 struct r5dev *dev = &sh->dev[disk_idx]; 3879 3880 /* is the data in this block needed, and can we get it? */ 3881 if (need_this_block(sh, s, disk_idx, disks)) { 3882 /* we would like to get this block, possibly by computing it, 3883 * otherwise read it if the backing disk is insync 3884 */ 3885 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3886 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3887 BUG_ON(sh->batch_head); 3888 3889 /* 3890 * In the raid6 case if the only non-uptodate disk is P 3891 * then we already trusted P to compute the other failed 3892 * drives. It is safe to compute rather than re-read P. 3893 * In other cases we only compute blocks from failed 3894 * devices, otherwise check/repair might fail to detect 3895 * a real inconsistency. 3896 */ 3897 3898 if ((s->uptodate == disks - 1) && 3899 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || 3900 (s->failed && (disk_idx == s->failed_num[0] || 3901 disk_idx == s->failed_num[1])))) { 3902 /* have disk failed, and we're requested to fetch it; 3903 * do compute it 3904 */ 3905 pr_debug("Computing stripe %llu block %d\n", 3906 (unsigned long long)sh->sector, disk_idx); 3907 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3908 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3909 set_bit(R5_Wantcompute, &dev->flags); 3910 sh->ops.target = disk_idx; 3911 sh->ops.target2 = -1; /* no 2nd target */ 3912 s->req_compute = 1; 3913 /* Careful: from this point on 'uptodate' is in the eye 3914 * of raid_run_ops which services 'compute' operations 3915 * before writes. R5_Wantcompute flags a block that will 3916 * be R5_UPTODATE by the time it is needed for a 3917 * subsequent operation. 3918 */ 3919 s->uptodate++; 3920 return 1; 3921 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3922 /* Computing 2-failure is *very* expensive; only 3923 * do it if failed >= 2 3924 */ 3925 int other; 3926 for (other = disks; other--; ) { 3927 if (other == disk_idx) 3928 continue; 3929 if (!test_bit(R5_UPTODATE, 3930 &sh->dev[other].flags)) 3931 break; 3932 } 3933 BUG_ON(other < 0); 3934 if (test_bit(R5_LOCKED, &sh->dev[other].flags)) 3935 return 0; 3936 pr_debug("Computing stripe %llu blocks %d,%d\n", 3937 (unsigned long long)sh->sector, 3938 disk_idx, other); 3939 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3940 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3941 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3942 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3943 sh->ops.target = disk_idx; 3944 sh->ops.target2 = other; 3945 s->uptodate += 2; 3946 s->req_compute = 1; 3947 return 1; 3948 } else if (test_bit(R5_Insync, &dev->flags)) { 3949 set_bit(R5_LOCKED, &dev->flags); 3950 set_bit(R5_Wantread, &dev->flags); 3951 s->locked++; 3952 pr_debug("Reading block %d (sync=%d)\n", 3953 disk_idx, s->syncing); 3954 } 3955 } 3956 3957 return 0; 3958 } 3959 3960 /* 3961 * handle_stripe_fill - read or compute data to satisfy pending requests. 3962 */ 3963 static void handle_stripe_fill(struct stripe_head *sh, 3964 struct stripe_head_state *s, 3965 int disks) 3966 { 3967 int i; 3968 3969 /* look for blocks to read/compute, skip this if a compute 3970 * is already in flight, or if the stripe contents are in the 3971 * midst of changing due to a write 3972 */ 3973 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3974 !sh->reconstruct_state) { 3975 3976 /* 3977 * For degraded stripe with data in journal, do not handle 3978 * read requests yet, instead, flush the stripe to raid 3979 * disks first, this avoids handling complex rmw of write 3980 * back cache (prexor with orig_page, and then xor with 3981 * page) in the read path 3982 */ 3983 if (s->to_read && s->injournal && s->failed) { 3984 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3985 r5c_make_stripe_write_out(sh); 3986 goto out; 3987 } 3988 3989 for (i = disks; i--; ) 3990 if (fetch_block(sh, s, i, disks)) 3991 break; 3992 } 3993 out: 3994 set_bit(STRIPE_HANDLE, &sh->state); 3995 } 3996 3997 static void break_stripe_batch_list(struct stripe_head *head_sh, 3998 unsigned long handle_flags); 3999 /* handle_stripe_clean_event 4000 * any written block on an uptodate or failed drive can be returned. 4001 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 4002 * never LOCKED, so we don't need to test 'failed' directly. 4003 */ 4004 static void handle_stripe_clean_event(struct r5conf *conf, 4005 struct stripe_head *sh, int disks) 4006 { 4007 int i; 4008 struct r5dev *dev; 4009 int discard_pending = 0; 4010 struct stripe_head *head_sh = sh; 4011 bool do_endio = false; 4012 4013 for (i = disks; i--; ) 4014 if (sh->dev[i].written) { 4015 dev = &sh->dev[i]; 4016 if (!test_bit(R5_LOCKED, &dev->flags) && 4017 (test_bit(R5_UPTODATE, &dev->flags) || 4018 test_bit(R5_Discard, &dev->flags) || 4019 test_bit(R5_SkipCopy, &dev->flags))) { 4020 /* We can return any write requests */ 4021 struct bio *wbi, *wbi2; 4022 pr_debug("Return write for disc %d\n", i); 4023 if (test_and_clear_bit(R5_Discard, &dev->flags)) 4024 clear_bit(R5_UPTODATE, &dev->flags); 4025 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 4026 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 4027 } 4028 do_endio = true; 4029 4030 returnbi: 4031 dev->page = dev->orig_page; 4032 wbi = dev->written; 4033 dev->written = NULL; 4034 while (wbi && wbi->bi_iter.bi_sector < 4035 dev->sector + RAID5_STRIPE_SECTORS(conf)) { 4036 wbi2 = r5_next_bio(conf, wbi, dev->sector); 4037 md_write_end(conf->mddev); 4038 bio_endio(wbi); 4039 wbi = wbi2; 4040 } 4041 4042 if (head_sh->batch_head) { 4043 sh = list_first_entry(&sh->batch_list, 4044 struct stripe_head, 4045 batch_list); 4046 if (sh != head_sh) { 4047 dev = &sh->dev[i]; 4048 goto returnbi; 4049 } 4050 } 4051 sh = head_sh; 4052 dev = &sh->dev[i]; 4053 } else if (test_bit(R5_Discard, &dev->flags)) 4054 discard_pending = 1; 4055 } 4056 4057 log_stripe_write_finished(sh); 4058 4059 if (!discard_pending && 4060 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 4061 int hash; 4062 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 4063 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4064 if (sh->qd_idx >= 0) { 4065 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 4066 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 4067 } 4068 /* now that discard is done we can proceed with any sync */ 4069 clear_bit(STRIPE_DISCARD, &sh->state); 4070 /* 4071 * SCSI discard will change some bio fields and the stripe has 4072 * no updated data, so remove it from hash list and the stripe 4073 * will be reinitialized 4074 */ 4075 unhash: 4076 hash = sh->hash_lock_index; 4077 spin_lock_irq(conf->hash_locks + hash); 4078 remove_hash(sh); 4079 spin_unlock_irq(conf->hash_locks + hash); 4080 if (head_sh->batch_head) { 4081 sh = list_first_entry(&sh->batch_list, 4082 struct stripe_head, batch_list); 4083 if (sh != head_sh) 4084 goto unhash; 4085 } 4086 sh = head_sh; 4087 4088 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 4089 set_bit(STRIPE_HANDLE, &sh->state); 4090 4091 } 4092 4093 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 4094 if (atomic_dec_and_test(&conf->pending_full_writes)) 4095 md_wakeup_thread(conf->mddev->thread); 4096 4097 if (head_sh->batch_head && do_endio) 4098 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 4099 } 4100 4101 /* 4102 * For RMW in write back cache, we need extra page in prexor to store the 4103 * old data. This page is stored in dev->orig_page. 4104 * 4105 * This function checks whether we have data for prexor. The exact logic 4106 * is: 4107 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 4108 */ 4109 static inline bool uptodate_for_rmw(struct r5dev *dev) 4110 { 4111 return (test_bit(R5_UPTODATE, &dev->flags)) && 4112 (!test_bit(R5_InJournal, &dev->flags) || 4113 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 4114 } 4115 4116 static int handle_stripe_dirtying(struct r5conf *conf, 4117 struct stripe_head *sh, 4118 struct stripe_head_state *s, 4119 int disks) 4120 { 4121 int rmw = 0, rcw = 0, i; 4122 struct mddev *mddev = conf->mddev; 4123 sector_t resync_offset = mddev->resync_offset; 4124 4125 /* Check whether resync is now happening or should start. 4126 * If yes, then the array is dirty (after unclean shutdown or 4127 * initial creation), so parity in some stripes might be inconsistent. 4128 * In this case, we need to always do reconstruct-write, to ensure 4129 * that in case of drive failure or read-error correction, we 4130 * generate correct data from the parity. 4131 */ 4132 if (conf->rmw_level == PARITY_DISABLE_RMW || 4133 (resync_offset < MaxSector && sh->sector >= resync_offset && 4134 s->failed == 0)) { 4135 /* Calculate the real rcw later - for now make it 4136 * look like rcw is cheaper 4137 */ 4138 rcw = 1; rmw = 2; 4139 pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", 4140 conf->rmw_level, (unsigned long long)resync_offset, 4141 (unsigned long long)sh->sector); 4142 } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && 4143 !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) { 4144 /* The initial recover is not done, must read everything */ 4145 rcw = 1; rmw = 2; 4146 pr_debug("force RCW by lazy recovery, sh->sector=%llu\n", 4147 sh->sector); 4148 } else for (i = disks; i--; ) { 4149 /* would I have to read this buffer for read_modify_write */ 4150 struct r5dev *dev = &sh->dev[i]; 4151 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 4152 i == sh->pd_idx || i == sh->qd_idx || 4153 test_bit(R5_InJournal, &dev->flags)) && 4154 !test_bit(R5_LOCKED, &dev->flags) && 4155 !(uptodate_for_rmw(dev) || 4156 test_bit(R5_Wantcompute, &dev->flags))) { 4157 if (test_bit(R5_Insync, &dev->flags)) 4158 rmw++; 4159 else 4160 rmw += 2*disks; /* cannot read it */ 4161 } 4162 /* Would I have to read this buffer for reconstruct_write */ 4163 if (!test_bit(R5_OVERWRITE, &dev->flags) && 4164 i != sh->pd_idx && i != sh->qd_idx && 4165 !test_bit(R5_LOCKED, &dev->flags) && 4166 !(test_bit(R5_UPTODATE, &dev->flags) || 4167 test_bit(R5_Wantcompute, &dev->flags))) { 4168 if (test_bit(R5_Insync, &dev->flags)) 4169 rcw++; 4170 else 4171 rcw += 2*disks; 4172 } 4173 } 4174 4175 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 4176 (unsigned long long)sh->sector, sh->state, rmw, rcw); 4177 set_bit(STRIPE_HANDLE, &sh->state); 4178 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 4179 /* prefer read-modify-write, but need to get some data */ 4180 mddev_add_trace_msg(mddev, "raid5 rmw %llu %d", 4181 sh->sector, rmw); 4182 4183 for (i = disks; i--; ) { 4184 struct r5dev *dev = &sh->dev[i]; 4185 if (test_bit(R5_InJournal, &dev->flags) && 4186 dev->page == dev->orig_page && 4187 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 4188 /* alloc page for prexor */ 4189 struct page *p = alloc_page(GFP_NOIO); 4190 4191 if (p) { 4192 dev->orig_page = p; 4193 continue; 4194 } 4195 4196 /* 4197 * alloc_page() failed, try use 4198 * disk_info->extra_page 4199 */ 4200 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 4201 &conf->cache_state)) { 4202 r5c_use_extra_page(sh); 4203 break; 4204 } 4205 4206 /* extra_page in use, add to delayed_list */ 4207 set_bit(STRIPE_DELAYED, &sh->state); 4208 s->waiting_extra_page = 1; 4209 return -EAGAIN; 4210 } 4211 } 4212 4213 for (i = disks; i--; ) { 4214 struct r5dev *dev = &sh->dev[i]; 4215 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 4216 i == sh->pd_idx || i == sh->qd_idx || 4217 test_bit(R5_InJournal, &dev->flags)) && 4218 !test_bit(R5_LOCKED, &dev->flags) && 4219 !(uptodate_for_rmw(dev) || 4220 test_bit(R5_Wantcompute, &dev->flags)) && 4221 test_bit(R5_Insync, &dev->flags)) { 4222 if (test_bit(STRIPE_PREREAD_ACTIVE, 4223 &sh->state)) { 4224 pr_debug("Read_old block %d for r-m-w\n", 4225 i); 4226 set_bit(R5_LOCKED, &dev->flags); 4227 set_bit(R5_Wantread, &dev->flags); 4228 s->locked++; 4229 } else 4230 set_bit(STRIPE_DELAYED, &sh->state); 4231 } 4232 } 4233 } 4234 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 4235 /* want reconstruct write, but need to get some data */ 4236 int qread =0; 4237 rcw = 0; 4238 for (i = disks; i--; ) { 4239 struct r5dev *dev = &sh->dev[i]; 4240 if (!test_bit(R5_OVERWRITE, &dev->flags) && 4241 i != sh->pd_idx && i != sh->qd_idx && 4242 !test_bit(R5_LOCKED, &dev->flags) && 4243 !(test_bit(R5_UPTODATE, &dev->flags) || 4244 test_bit(R5_Wantcompute, &dev->flags))) { 4245 rcw++; 4246 if (test_bit(R5_Insync, &dev->flags) && 4247 test_bit(STRIPE_PREREAD_ACTIVE, 4248 &sh->state)) { 4249 pr_debug("Read_old block " 4250 "%d for Reconstruct\n", i); 4251 set_bit(R5_LOCKED, &dev->flags); 4252 set_bit(R5_Wantread, &dev->flags); 4253 s->locked++; 4254 qread++; 4255 } else 4256 set_bit(STRIPE_DELAYED, &sh->state); 4257 } 4258 } 4259 if (rcw && !mddev_is_dm(mddev)) 4260 blk_add_trace_msg(mddev->gendisk->queue, 4261 "raid5 rcw %llu %d %d %d", 4262 (unsigned long long)sh->sector, rcw, qread, 4263 test_bit(STRIPE_DELAYED, &sh->state)); 4264 } 4265 4266 if (rcw > disks && rmw > disks && 4267 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4268 set_bit(STRIPE_DELAYED, &sh->state); 4269 4270 /* now if nothing is locked, and if we have enough data, 4271 * we can start a write request 4272 */ 4273 /* since handle_stripe can be called at any time we need to handle the 4274 * case where a compute block operation has been submitted and then a 4275 * subsequent call wants to start a write request. raid_run_ops only 4276 * handles the case where compute block and reconstruct are requested 4277 * simultaneously. If this is not the case then new writes need to be 4278 * held off until the compute completes. 4279 */ 4280 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4281 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4282 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4283 schedule_reconstruction(sh, s, rcw == 0, 0); 4284 return 0; 4285 } 4286 4287 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4288 struct stripe_head_state *s, int disks) 4289 { 4290 struct r5dev *dev = NULL; 4291 4292 BUG_ON(sh->batch_head); 4293 set_bit(STRIPE_HANDLE, &sh->state); 4294 4295 switch (sh->check_state) { 4296 case check_state_idle: 4297 /* start a new check operation if there are no failures */ 4298 if (s->failed == 0) { 4299 BUG_ON(s->uptodate != disks); 4300 sh->check_state = check_state_run; 4301 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4302 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4303 s->uptodate--; 4304 break; 4305 } 4306 dev = &sh->dev[s->failed_num[0]]; 4307 fallthrough; 4308 case check_state_compute_result: 4309 sh->check_state = check_state_idle; 4310 if (!dev) 4311 dev = &sh->dev[sh->pd_idx]; 4312 4313 /* check that a write has not made the stripe insync */ 4314 if (test_bit(STRIPE_INSYNC, &sh->state)) 4315 break; 4316 4317 /* either failed parity check, or recovery is happening */ 4318 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4319 BUG_ON(s->uptodate != disks); 4320 4321 set_bit(R5_LOCKED, &dev->flags); 4322 s->locked++; 4323 set_bit(R5_Wantwrite, &dev->flags); 4324 4325 set_bit(STRIPE_INSYNC, &sh->state); 4326 break; 4327 case check_state_run: 4328 break; /* we will be called again upon completion */ 4329 case check_state_check_result: 4330 sh->check_state = check_state_idle; 4331 4332 /* if a failure occurred during the check operation, leave 4333 * STRIPE_INSYNC not set and let the stripe be handled again 4334 */ 4335 if (s->failed) 4336 break; 4337 4338 /* handle a successful check operation, if parity is correct 4339 * we are done. Otherwise update the mismatch count and repair 4340 * parity if !MD_RECOVERY_CHECK 4341 */ 4342 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4343 /* parity is correct (on disc, 4344 * not in buffer any more) 4345 */ 4346 set_bit(STRIPE_INSYNC, &sh->state); 4347 else { 4348 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); 4349 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4350 /* don't try to repair!! */ 4351 set_bit(STRIPE_INSYNC, &sh->state); 4352 pr_warn_ratelimited("%s: mismatch sector in range " 4353 "%llu-%llu\n", mdname(conf->mddev), 4354 (unsigned long long) sh->sector, 4355 (unsigned long long) sh->sector + 4356 RAID5_STRIPE_SECTORS(conf)); 4357 } else { 4358 sh->check_state = check_state_compute_run; 4359 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4360 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4361 set_bit(R5_Wantcompute, 4362 &sh->dev[sh->pd_idx].flags); 4363 sh->ops.target = sh->pd_idx; 4364 sh->ops.target2 = -1; 4365 s->uptodate++; 4366 } 4367 } 4368 break; 4369 case check_state_compute_run: 4370 break; 4371 default: 4372 pr_err("%s: unknown check_state: %d sector: %llu\n", 4373 __func__, sh->check_state, 4374 (unsigned long long) sh->sector); 4375 BUG(); 4376 } 4377 } 4378 4379 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4380 struct stripe_head_state *s, 4381 int disks) 4382 { 4383 int pd_idx = sh->pd_idx; 4384 int qd_idx = sh->qd_idx; 4385 struct r5dev *dev; 4386 4387 BUG_ON(sh->batch_head); 4388 set_bit(STRIPE_HANDLE, &sh->state); 4389 4390 BUG_ON(s->failed > 2); 4391 4392 /* Want to check and possibly repair P and Q. 4393 * However there could be one 'failed' device, in which 4394 * case we can only check one of them, possibly using the 4395 * other to generate missing data 4396 */ 4397 4398 switch (sh->check_state) { 4399 case check_state_idle: 4400 /* start a new check operation if there are < 2 failures */ 4401 if (s->failed == s->q_failed) { 4402 /* The only possible failed device holds Q, so it 4403 * makes sense to check P (If anything else were failed, 4404 * we would have used P to recreate it). 4405 */ 4406 sh->check_state = check_state_run; 4407 } 4408 if (!s->q_failed && s->failed < 2) { 4409 /* Q is not failed, and we didn't use it to generate 4410 * anything, so it makes sense to check it 4411 */ 4412 if (sh->check_state == check_state_run) 4413 sh->check_state = check_state_run_pq; 4414 else 4415 sh->check_state = check_state_run_q; 4416 } 4417 4418 /* discard potentially stale zero_sum_result */ 4419 sh->ops.zero_sum_result = 0; 4420 4421 if (sh->check_state == check_state_run) { 4422 /* async_xor_zero_sum destroys the contents of P */ 4423 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4424 s->uptodate--; 4425 } 4426 if (sh->check_state >= check_state_run && 4427 sh->check_state <= check_state_run_pq) { 4428 /* async_syndrome_zero_sum preserves P and Q, so 4429 * no need to mark them !uptodate here 4430 */ 4431 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4432 break; 4433 } 4434 4435 /* we have 2-disk failure */ 4436 BUG_ON(s->failed != 2); 4437 fallthrough; 4438 case check_state_compute_result: 4439 sh->check_state = check_state_idle; 4440 4441 /* check that a write has not made the stripe insync */ 4442 if (test_bit(STRIPE_INSYNC, &sh->state)) 4443 break; 4444 4445 /* now write out any block on a failed drive, 4446 * or P or Q if they were recomputed 4447 */ 4448 dev = NULL; 4449 if (s->failed == 2) { 4450 dev = &sh->dev[s->failed_num[1]]; 4451 s->locked++; 4452 set_bit(R5_LOCKED, &dev->flags); 4453 set_bit(R5_Wantwrite, &dev->flags); 4454 } 4455 if (s->failed >= 1) { 4456 dev = &sh->dev[s->failed_num[0]]; 4457 s->locked++; 4458 set_bit(R5_LOCKED, &dev->flags); 4459 set_bit(R5_Wantwrite, &dev->flags); 4460 } 4461 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4462 dev = &sh->dev[pd_idx]; 4463 s->locked++; 4464 set_bit(R5_LOCKED, &dev->flags); 4465 set_bit(R5_Wantwrite, &dev->flags); 4466 } 4467 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4468 dev = &sh->dev[qd_idx]; 4469 s->locked++; 4470 set_bit(R5_LOCKED, &dev->flags); 4471 set_bit(R5_Wantwrite, &dev->flags); 4472 } 4473 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags), 4474 "%s: disk%td not up to date\n", 4475 mdname(conf->mddev), 4476 dev - (struct r5dev *) &sh->dev)) { 4477 clear_bit(R5_LOCKED, &dev->flags); 4478 clear_bit(R5_Wantwrite, &dev->flags); 4479 s->locked--; 4480 } 4481 4482 set_bit(STRIPE_INSYNC, &sh->state); 4483 break; 4484 case check_state_run: 4485 case check_state_run_q: 4486 case check_state_run_pq: 4487 break; /* we will be called again upon completion */ 4488 case check_state_check_result: 4489 sh->check_state = check_state_idle; 4490 4491 /* handle a successful check operation, if parity is correct 4492 * we are done. Otherwise update the mismatch count and repair 4493 * parity if !MD_RECOVERY_CHECK 4494 */ 4495 if (sh->ops.zero_sum_result == 0) { 4496 /* both parities are correct */ 4497 if (!s->failed) 4498 set_bit(STRIPE_INSYNC, &sh->state); 4499 else { 4500 /* in contrast to the raid5 case we can validate 4501 * parity, but still have a failure to write 4502 * back 4503 */ 4504 sh->check_state = check_state_compute_result; 4505 /* Returning at this point means that we may go 4506 * off and bring p and/or q uptodate again so 4507 * we make sure to check zero_sum_result again 4508 * to verify if p or q need writeback 4509 */ 4510 } 4511 } else { 4512 atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); 4513 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4514 /* don't try to repair!! */ 4515 set_bit(STRIPE_INSYNC, &sh->state); 4516 pr_warn_ratelimited("%s: mismatch sector in range " 4517 "%llu-%llu\n", mdname(conf->mddev), 4518 (unsigned long long) sh->sector, 4519 (unsigned long long) sh->sector + 4520 RAID5_STRIPE_SECTORS(conf)); 4521 } else { 4522 int *target = &sh->ops.target; 4523 4524 sh->ops.target = -1; 4525 sh->ops.target2 = -1; 4526 sh->check_state = check_state_compute_run; 4527 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4528 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4529 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4530 set_bit(R5_Wantcompute, 4531 &sh->dev[pd_idx].flags); 4532 *target = pd_idx; 4533 target = &sh->ops.target2; 4534 s->uptodate++; 4535 } 4536 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4537 set_bit(R5_Wantcompute, 4538 &sh->dev[qd_idx].flags); 4539 *target = qd_idx; 4540 s->uptodate++; 4541 } 4542 } 4543 } 4544 break; 4545 case check_state_compute_run: 4546 break; 4547 default: 4548 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4549 __func__, sh->check_state, 4550 (unsigned long long) sh->sector); 4551 BUG(); 4552 } 4553 } 4554 4555 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4556 { 4557 int i; 4558 4559 /* We have read all the blocks in this stripe and now we need to 4560 * copy some of them into a target stripe for expand. 4561 */ 4562 struct dma_async_tx_descriptor *tx = NULL; 4563 BUG_ON(sh->batch_head); 4564 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4565 for (i = 0; i < sh->disks; i++) 4566 if (i != sh->pd_idx && i != sh->qd_idx) { 4567 int dd_idx, j; 4568 struct stripe_head *sh2; 4569 struct async_submit_ctl submit; 4570 4571 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4572 sector_t s = raid5_compute_sector(conf, bn, 0, 4573 &dd_idx, NULL); 4574 sh2 = raid5_get_active_stripe(conf, NULL, s, 4575 R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); 4576 if (sh2 == NULL) 4577 /* so far only the early blocks of this stripe 4578 * have been requested. When later blocks 4579 * get requested, we will try again 4580 */ 4581 continue; 4582 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4583 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4584 /* must have already done this block */ 4585 raid5_release_stripe(sh2); 4586 continue; 4587 } 4588 4589 /* place all the copies on one channel */ 4590 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4591 tx = async_memcpy(sh2->dev[dd_idx].page, 4592 sh->dev[i].page, sh2->dev[dd_idx].offset, 4593 sh->dev[i].offset, RAID5_STRIPE_SIZE(conf), 4594 &submit); 4595 4596 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4597 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4598 for (j = 0; j < conf->raid_disks; j++) 4599 if (j != sh2->pd_idx && 4600 j != sh2->qd_idx && 4601 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4602 break; 4603 if (j == conf->raid_disks) { 4604 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4605 set_bit(STRIPE_HANDLE, &sh2->state); 4606 } 4607 raid5_release_stripe(sh2); 4608 4609 } 4610 /* done submitting copies, wait for them to complete */ 4611 async_tx_quiesce(&tx); 4612 } 4613 4614 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4615 { 4616 struct r5conf *conf = sh->raid_conf; 4617 int disks = sh->disks; 4618 struct r5dev *dev; 4619 int i; 4620 int do_recovery = 0; 4621 4622 memset(s, 0, sizeof(*s)); 4623 4624 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4625 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4626 s->failed_num[0] = -1; 4627 s->failed_num[1] = -1; 4628 s->log_failed = r5l_log_disk_error(conf); 4629 4630 /* Now to look around and see what can be done */ 4631 for (i=disks; i--; ) { 4632 struct md_rdev *rdev; 4633 int is_bad = 0; 4634 4635 dev = &sh->dev[i]; 4636 4637 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4638 i, dev->flags, 4639 dev->toread, dev->towrite, dev->written); 4640 /* maybe we can reply to a read 4641 * 4642 * new wantfill requests are only permitted while 4643 * ops_complete_biofill is guaranteed to be inactive 4644 */ 4645 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4646 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4647 set_bit(R5_Wantfill, &dev->flags); 4648 4649 /* now count some things */ 4650 if (test_bit(R5_LOCKED, &dev->flags)) 4651 s->locked++; 4652 if (test_bit(R5_UPTODATE, &dev->flags)) 4653 s->uptodate++; 4654 if (test_bit(R5_Wantcompute, &dev->flags)) { 4655 s->compute++; 4656 BUG_ON(s->compute > 2); 4657 } 4658 4659 if (test_bit(R5_Wantfill, &dev->flags)) 4660 s->to_fill++; 4661 else if (dev->toread) 4662 s->to_read++; 4663 if (dev->towrite) { 4664 s->to_write++; 4665 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4666 s->non_overwrite++; 4667 } 4668 if (dev->written) 4669 s->written++; 4670 /* Prefer to use the replacement for reads, but only 4671 * if it is recovered enough and has no bad blocks. 4672 */ 4673 rdev = conf->disks[i].replacement; 4674 if (rdev && !test_bit(Faulty, &rdev->flags) && 4675 rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && 4676 !rdev_has_badblock(rdev, sh->sector, 4677 RAID5_STRIPE_SECTORS(conf))) 4678 set_bit(R5_ReadRepl, &dev->flags); 4679 else { 4680 if (rdev && !test_bit(Faulty, &rdev->flags)) 4681 set_bit(R5_NeedReplace, &dev->flags); 4682 else 4683 clear_bit(R5_NeedReplace, &dev->flags); 4684 rdev = conf->disks[i].rdev; 4685 clear_bit(R5_ReadRepl, &dev->flags); 4686 } 4687 if (rdev && test_bit(Faulty, &rdev->flags)) 4688 rdev = NULL; 4689 if (rdev) { 4690 is_bad = rdev_has_badblock(rdev, sh->sector, 4691 RAID5_STRIPE_SECTORS(conf)); 4692 if (s->blocked_rdev == NULL) { 4693 if (is_bad < 0) 4694 set_bit(BlockedBadBlocks, &rdev->flags); 4695 if (rdev_blocked(rdev)) { 4696 s->blocked_rdev = rdev; 4697 atomic_inc(&rdev->nr_pending); 4698 } 4699 } 4700 } 4701 clear_bit(R5_Insync, &dev->flags); 4702 if (!rdev) 4703 /* Not in-sync */; 4704 else if (is_bad) { 4705 /* also not in-sync */ 4706 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4707 test_bit(R5_UPTODATE, &dev->flags)) { 4708 /* treat as in-sync, but with a read error 4709 * which we can now try to correct 4710 */ 4711 set_bit(R5_Insync, &dev->flags); 4712 set_bit(R5_ReadError, &dev->flags); 4713 } 4714 } else if (test_bit(In_sync, &rdev->flags)) 4715 set_bit(R5_Insync, &dev->flags); 4716 else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= 4717 rdev->recovery_offset) { 4718 /* 4719 * in sync if: 4720 * - normal IO, or 4721 * - resync IO that is not lazy recovery 4722 * 4723 * For lazy recovery, we have to mark the rdev without 4724 * In_sync as failed, to build initial xor data. 4725 */ 4726 if (!test_bit(STRIPE_SYNCING, &sh->state) || 4727 !test_bit(MD_RECOVERY_LAZY_RECOVER, 4728 &conf->mddev->recovery)) 4729 set_bit(R5_Insync, &dev->flags); 4730 } else if (test_bit(R5_UPTODATE, &dev->flags) && 4731 test_bit(R5_Expanded, &dev->flags)) 4732 /* If we've reshaped into here, we assume it is Insync. 4733 * We will shortly update recovery_offset to make 4734 * it official. 4735 */ 4736 set_bit(R5_Insync, &dev->flags); 4737 4738 if (test_bit(R5_WriteError, &dev->flags)) { 4739 /* This flag does not apply to '.replacement' 4740 * only to .rdev, so make sure to check that*/ 4741 struct md_rdev *rdev2 = conf->disks[i].rdev; 4742 4743 if (rdev2 == rdev) 4744 clear_bit(R5_Insync, &dev->flags); 4745 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4746 s->handle_bad_blocks = 1; 4747 atomic_inc(&rdev2->nr_pending); 4748 } else 4749 clear_bit(R5_WriteError, &dev->flags); 4750 } 4751 if (test_bit(R5_MadeGood, &dev->flags)) { 4752 /* This flag does not apply to '.replacement' 4753 * only to .rdev, so make sure to check that*/ 4754 struct md_rdev *rdev2 = conf->disks[i].rdev; 4755 4756 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4757 s->handle_bad_blocks = 1; 4758 atomic_inc(&rdev2->nr_pending); 4759 } else 4760 clear_bit(R5_MadeGood, &dev->flags); 4761 } 4762 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4763 struct md_rdev *rdev2 = conf->disks[i].replacement; 4764 4765 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4766 s->handle_bad_blocks = 1; 4767 atomic_inc(&rdev2->nr_pending); 4768 } else 4769 clear_bit(R5_MadeGoodRepl, &dev->flags); 4770 } 4771 if (!test_bit(R5_Insync, &dev->flags)) { 4772 /* The ReadError flag will just be confusing now */ 4773 clear_bit(R5_ReadError, &dev->flags); 4774 clear_bit(R5_ReWrite, &dev->flags); 4775 } 4776 if (test_bit(R5_ReadError, &dev->flags)) 4777 clear_bit(R5_Insync, &dev->flags); 4778 if (!test_bit(R5_Insync, &dev->flags)) { 4779 if (s->failed < 2) 4780 s->failed_num[s->failed] = i; 4781 s->failed++; 4782 if (rdev && !test_bit(Faulty, &rdev->flags)) 4783 do_recovery = 1; 4784 else if (!rdev) { 4785 rdev = conf->disks[i].replacement; 4786 if (rdev && !test_bit(Faulty, &rdev->flags)) 4787 do_recovery = 1; 4788 } 4789 } 4790 4791 if (test_bit(R5_InJournal, &dev->flags)) 4792 s->injournal++; 4793 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4794 s->just_cached++; 4795 } 4796 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4797 /* If there is a failed device being replaced, 4798 * we must be recovering. 4799 * else if we are after resync_offset, we must be syncing 4800 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4801 * else we can only be replacing 4802 * sync and recovery both need to read all devices, and so 4803 * use the same flag. 4804 */ 4805 if (do_recovery || 4806 sh->sector >= conf->mddev->resync_offset || 4807 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4808 s->syncing = 1; 4809 else 4810 s->replacing = 1; 4811 } 4812 } 4813 4814 /* 4815 * Return '1' if this is a member of batch, or '0' if it is a lone stripe or 4816 * a head which can now be handled. 4817 */ 4818 static int clear_batch_ready(struct stripe_head *sh) 4819 { 4820 struct stripe_head *tmp; 4821 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4822 return (sh->batch_head && sh->batch_head != sh); 4823 spin_lock(&sh->stripe_lock); 4824 if (!sh->batch_head) { 4825 spin_unlock(&sh->stripe_lock); 4826 return 0; 4827 } 4828 4829 /* 4830 * this stripe could be added to a batch list before we check 4831 * BATCH_READY, skips it 4832 */ 4833 if (sh->batch_head != sh) { 4834 spin_unlock(&sh->stripe_lock); 4835 return 1; 4836 } 4837 spin_lock(&sh->batch_lock); 4838 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4839 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4840 spin_unlock(&sh->batch_lock); 4841 spin_unlock(&sh->stripe_lock); 4842 4843 /* 4844 * BATCH_READY is cleared, no new stripes can be added. 4845 * batch_list can be accessed without lock 4846 */ 4847 return 0; 4848 } 4849 4850 static void break_stripe_batch_list(struct stripe_head *head_sh, 4851 unsigned long handle_flags) 4852 { 4853 struct stripe_head *sh, *next; 4854 int i; 4855 unsigned long state; 4856 4857 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4858 4859 list_del_init(&sh->batch_list); 4860 4861 state = READ_ONCE(sh->state); 4862 WARN_ONCE(state & ((1 << STRIPE_ACTIVE) | 4863 (1 << STRIPE_SYNCING) | 4864 (1 << STRIPE_REPLACED) | 4865 (1 << STRIPE_DELAYED) | 4866 (1 << STRIPE_BIT_DELAY) | 4867 (1 << STRIPE_FULL_WRITE) | 4868 (1 << STRIPE_BIOFILL_RUN) | 4869 (1 << STRIPE_COMPUTE_RUN) | 4870 (1 << STRIPE_DISCARD) | 4871 (1 << STRIPE_BATCH_READY) | 4872 (1 << STRIPE_BATCH_ERR)), 4873 "stripe state: %lx\n", state); 4874 4875 state = READ_ONCE(head_sh->state); 4876 WARN_ONCE(state & ((1 << STRIPE_DISCARD) | 4877 (1 << STRIPE_REPLACED)), 4878 "head stripe state: %lx\n", state); 4879 4880 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4881 (1 << STRIPE_PREREAD_ACTIVE) | 4882 (1 << STRIPE_ON_UNPLUG_LIST)), 4883 state & (1 << STRIPE_INSYNC)); 4884 4885 sh->check_state = head_sh->check_state; 4886 sh->reconstruct_state = head_sh->reconstruct_state; 4887 spin_lock_irq(&sh->stripe_lock); 4888 for (i = 0; i < sh->disks; i++) { 4889 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4890 wake_up_bit(&sh->dev[i].flags, R5_Overlap); 4891 sh->dev[i].flags = READ_ONCE(head_sh->dev[i].flags) & 4892 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4893 } 4894 sh->batch_head = NULL; 4895 spin_unlock_irq(&sh->stripe_lock); 4896 4897 state = READ_ONCE(sh->state); 4898 if (handle_flags == 0 || (state & handle_flags)) 4899 set_bit(STRIPE_HANDLE, &sh->state); 4900 raid5_release_stripe(sh); 4901 } 4902 spin_lock_irq(&head_sh->stripe_lock); 4903 for (i = 0; i < head_sh->disks; i++) 4904 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4905 wake_up_bit(&head_sh->dev[i].flags, R5_Overlap); 4906 head_sh->batch_head = NULL; 4907 spin_unlock_irq(&head_sh->stripe_lock); 4908 4909 state = READ_ONCE(head_sh->state); 4910 if (state & handle_flags) 4911 set_bit(STRIPE_HANDLE, &head_sh->state); 4912 } 4913 4914 /* 4915 * handle_stripe - do things to a stripe. 4916 * 4917 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4918 * state of various bits to see what needs to be done. 4919 * Possible results: 4920 * return some read requests which now have data 4921 * return some write requests which are safely on storage 4922 * schedule a read on some buffers 4923 * schedule a write of some buffers 4924 * return confirmation of parity correctness 4925 */ 4926 static void handle_stripe(struct stripe_head *sh) 4927 { 4928 struct stripe_head_state s; 4929 struct r5conf *conf = sh->raid_conf; 4930 int i; 4931 int prexor; 4932 int disks = sh->disks; 4933 struct r5dev *pdev, *qdev; 4934 4935 clear_bit(STRIPE_HANDLE, &sh->state); 4936 4937 /* 4938 * handle_stripe should not continue handle the batched stripe, only 4939 * the head of batch list or lone stripe can continue. Otherwise we 4940 * could see break_stripe_batch_list warns about the STRIPE_ACTIVE 4941 * is set for the batched stripe. 4942 */ 4943 if (clear_batch_ready(sh)) 4944 return; 4945 4946 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4947 /* already being handled, ensure it gets handled 4948 * again when current action finishes */ 4949 set_bit(STRIPE_HANDLE, &sh->state); 4950 return; 4951 } 4952 4953 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4954 break_stripe_batch_list(sh, 0); 4955 4956 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4957 spin_lock(&sh->stripe_lock); 4958 /* 4959 * Cannot process 'sync' concurrently with 'discard'. 4960 * Flush data in r5cache before 'sync'. 4961 */ 4962 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 4963 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && 4964 !test_bit(STRIPE_DISCARD, &sh->state) && 4965 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4966 set_bit(STRIPE_SYNCING, &sh->state); 4967 clear_bit(STRIPE_INSYNC, &sh->state); 4968 clear_bit(STRIPE_REPLACED, &sh->state); 4969 } 4970 spin_unlock(&sh->stripe_lock); 4971 } 4972 clear_bit(STRIPE_DELAYED, &sh->state); 4973 4974 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4975 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 4976 (unsigned long long)sh->sector, sh->state, 4977 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4978 sh->check_state, sh->reconstruct_state); 4979 4980 analyse_stripe(sh, &s); 4981 4982 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4983 goto finish; 4984 4985 if (s.handle_bad_blocks || 4986 (md_is_rdwr(conf->mddev) && 4987 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags))) { 4988 set_bit(STRIPE_HANDLE, &sh->state); 4989 goto finish; 4990 } 4991 4992 if (unlikely(s.blocked_rdev)) { 4993 if (s.syncing || s.expanding || s.expanded || 4994 s.replacing || s.to_write || s.written) { 4995 set_bit(STRIPE_HANDLE, &sh->state); 4996 goto finish; 4997 } 4998 /* There is nothing for the blocked_rdev to block */ 4999 rdev_dec_pending(s.blocked_rdev, conf->mddev); 5000 s.blocked_rdev = NULL; 5001 } 5002 5003 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 5004 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 5005 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 5006 } 5007 5008 pr_debug("locked=%d uptodate=%d to_read=%d" 5009 " to_write=%d failed=%d failed_num=%d,%d\n", 5010 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 5011 s.failed_num[0], s.failed_num[1]); 5012 /* 5013 * check if the array has lost more than max_degraded devices and, 5014 * if so, some requests might need to be failed. 5015 * 5016 * When journal device failed (log_failed), we will only process 5017 * the stripe if there is data need write to raid disks 5018 */ 5019 if (s.failed > conf->max_degraded || 5020 (s.log_failed && s.injournal == 0)) { 5021 sh->check_state = 0; 5022 sh->reconstruct_state = 0; 5023 break_stripe_batch_list(sh, 0); 5024 if (s.to_read+s.to_write+s.written) 5025 handle_failed_stripe(conf, sh, &s, disks); 5026 if (s.syncing + s.replacing) 5027 handle_failed_sync(conf, sh, &s); 5028 } 5029 5030 /* Now we check to see if any write operations have recently 5031 * completed 5032 */ 5033 prexor = 0; 5034 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 5035 prexor = 1; 5036 if (sh->reconstruct_state == reconstruct_state_drain_result || 5037 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 5038 sh->reconstruct_state = reconstruct_state_idle; 5039 5040 /* All the 'written' buffers and the parity block are ready to 5041 * be written back to disk 5042 */ 5043 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 5044 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 5045 BUG_ON(sh->qd_idx >= 0 && 5046 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 5047 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 5048 for (i = disks; i--; ) { 5049 struct r5dev *dev = &sh->dev[i]; 5050 if (test_bit(R5_LOCKED, &dev->flags) && 5051 (i == sh->pd_idx || i == sh->qd_idx || 5052 dev->written || test_bit(R5_InJournal, 5053 &dev->flags))) { 5054 pr_debug("Writing block %d\n", i); 5055 set_bit(R5_Wantwrite, &dev->flags); 5056 if (prexor) 5057 continue; 5058 if (s.failed > 1) 5059 continue; 5060 if (!test_bit(R5_Insync, &dev->flags) || 5061 ((i == sh->pd_idx || i == sh->qd_idx) && 5062 s.failed == 0)) 5063 set_bit(STRIPE_INSYNC, &sh->state); 5064 } 5065 } 5066 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5067 s.dec_preread_active = 1; 5068 } 5069 5070 /* 5071 * might be able to return some write requests if the parity blocks 5072 * are safe, or on a failed drive 5073 */ 5074 pdev = &sh->dev[sh->pd_idx]; 5075 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 5076 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 5077 qdev = &sh->dev[sh->qd_idx]; 5078 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 5079 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 5080 || conf->level < 6; 5081 5082 if (s.written && 5083 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 5084 && !test_bit(R5_LOCKED, &pdev->flags) 5085 && (test_bit(R5_UPTODATE, &pdev->flags) || 5086 test_bit(R5_Discard, &pdev->flags))))) && 5087 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 5088 && !test_bit(R5_LOCKED, &qdev->flags) 5089 && (test_bit(R5_UPTODATE, &qdev->flags) || 5090 test_bit(R5_Discard, &qdev->flags)))))) 5091 handle_stripe_clean_event(conf, sh, disks); 5092 5093 if (s.just_cached) 5094 r5c_handle_cached_data_endio(conf, sh, disks); 5095 log_stripe_write_finished(sh); 5096 5097 /* Now we might consider reading some blocks, either to check/generate 5098 * parity, or to satisfy requests 5099 * or to load a block that is being partially written. 5100 */ 5101 if (s.to_read || s.non_overwrite 5102 || (s.to_write && s.failed) 5103 || (s.syncing && (s.uptodate + s.compute < disks)) 5104 || s.replacing 5105 || s.expanding) 5106 handle_stripe_fill(sh, &s, disks); 5107 5108 /* 5109 * When the stripe finishes full journal write cycle (write to journal 5110 * and raid disk), this is the clean up procedure so it is ready for 5111 * next operation. 5112 */ 5113 r5c_finish_stripe_write_out(conf, sh, &s); 5114 5115 /* 5116 * Now to consider new write requests, cache write back and what else, 5117 * if anything should be read. We do not handle new writes when: 5118 * 1/ A 'write' operation (copy+xor) is already in flight. 5119 * 2/ A 'check' operation is in flight, as it may clobber the parity 5120 * block. 5121 * 3/ A r5c cache log write is in flight. 5122 */ 5123 5124 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 5125 if (!r5c_is_writeback(conf->log)) { 5126 if (s.to_write) 5127 handle_stripe_dirtying(conf, sh, &s, disks); 5128 } else { /* write back cache */ 5129 int ret = 0; 5130 5131 /* First, try handle writes in caching phase */ 5132 if (s.to_write) 5133 ret = r5c_try_caching_write(conf, sh, &s, 5134 disks); 5135 /* 5136 * If caching phase failed: ret == -EAGAIN 5137 * OR 5138 * stripe under reclaim: !caching && injournal 5139 * 5140 * fall back to handle_stripe_dirtying() 5141 */ 5142 if (ret == -EAGAIN || 5143 /* stripe under reclaim: !caching && injournal */ 5144 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 5145 s.injournal > 0)) { 5146 ret = handle_stripe_dirtying(conf, sh, &s, 5147 disks); 5148 if (ret == -EAGAIN) 5149 goto finish; 5150 } 5151 } 5152 } 5153 5154 /* maybe we need to check and possibly fix the parity for this stripe 5155 * Any reads will already have been scheduled, so we just see if enough 5156 * data is available. The parity check is held off while parity 5157 * dependent operations are in flight. 5158 */ 5159 if (sh->check_state || 5160 (s.syncing && s.locked == 0 && 5161 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 5162 !test_bit(STRIPE_INSYNC, &sh->state))) { 5163 if (conf->level == 6) 5164 handle_parity_checks6(conf, sh, &s, disks); 5165 else 5166 handle_parity_checks5(conf, sh, &s, disks); 5167 } 5168 5169 if ((s.replacing || s.syncing) && s.locked == 0 5170 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 5171 && !test_bit(STRIPE_REPLACED, &sh->state)) { 5172 /* Write out to replacement devices where possible */ 5173 for (i = 0; i < conf->raid_disks; i++) 5174 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 5175 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 5176 set_bit(R5_WantReplace, &sh->dev[i].flags); 5177 set_bit(R5_LOCKED, &sh->dev[i].flags); 5178 s.locked++; 5179 } 5180 if (s.replacing) 5181 set_bit(STRIPE_INSYNC, &sh->state); 5182 set_bit(STRIPE_REPLACED, &sh->state); 5183 } 5184 if ((s.syncing || s.replacing) && s.locked == 0 && 5185 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 5186 test_bit(STRIPE_INSYNC, &sh->state)) { 5187 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); 5188 clear_bit(STRIPE_SYNCING, &sh->state); 5189 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 5190 wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); 5191 } 5192 5193 /* If the failed drives are just a ReadError, then we might need 5194 * to progress the repair/check process 5195 */ 5196 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 5197 for (i = 0; i < s.failed; i++) { 5198 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 5199 if (test_bit(R5_ReadError, &dev->flags) 5200 && !test_bit(R5_LOCKED, &dev->flags) 5201 && test_bit(R5_UPTODATE, &dev->flags) 5202 ) { 5203 if (!test_bit(R5_ReWrite, &dev->flags)) { 5204 set_bit(R5_Wantwrite, &dev->flags); 5205 set_bit(R5_ReWrite, &dev->flags); 5206 } else 5207 /* let's read it back */ 5208 set_bit(R5_Wantread, &dev->flags); 5209 set_bit(R5_LOCKED, &dev->flags); 5210 s.locked++; 5211 } 5212 } 5213 5214 /* Finish reconstruct operations initiated by the expansion process */ 5215 if (sh->reconstruct_state == reconstruct_state_result) { 5216 struct stripe_head *sh_src 5217 = raid5_get_active_stripe(conf, NULL, sh->sector, 5218 R5_GAS_PREVIOUS | R5_GAS_NOBLOCK | 5219 R5_GAS_NOQUIESCE); 5220 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 5221 /* sh cannot be written until sh_src has been read. 5222 * so arrange for sh to be delayed a little 5223 */ 5224 set_bit(STRIPE_DELAYED, &sh->state); 5225 set_bit(STRIPE_HANDLE, &sh->state); 5226 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 5227 &sh_src->state)) 5228 atomic_inc(&conf->preread_active_stripes); 5229 raid5_release_stripe(sh_src); 5230 goto finish; 5231 } 5232 if (sh_src) 5233 raid5_release_stripe(sh_src); 5234 5235 sh->reconstruct_state = reconstruct_state_idle; 5236 clear_bit(STRIPE_EXPANDING, &sh->state); 5237 for (i = conf->raid_disks; i--; ) { 5238 set_bit(R5_Wantwrite, &sh->dev[i].flags); 5239 set_bit(R5_LOCKED, &sh->dev[i].flags); 5240 s.locked++; 5241 } 5242 } 5243 5244 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 5245 !sh->reconstruct_state) { 5246 /* Need to write out all blocks after computing parity */ 5247 sh->disks = conf->raid_disks; 5248 stripe_set_idx(sh->sector, conf, 0, sh); 5249 schedule_reconstruction(sh, &s, 1, 1); 5250 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 5251 clear_bit(STRIPE_EXPAND_READY, &sh->state); 5252 atomic_dec(&conf->reshape_stripes); 5253 wake_up(&conf->wait_for_reshape); 5254 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); 5255 } 5256 5257 if (s.expanding && s.locked == 0 && 5258 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 5259 handle_stripe_expansion(conf, sh); 5260 5261 finish: 5262 /* wait for this device to become unblocked */ 5263 if (unlikely(s.blocked_rdev)) { 5264 if (conf->mddev->external) 5265 md_wait_for_blocked_rdev(s.blocked_rdev, 5266 conf->mddev); 5267 else 5268 /* Internal metadata will immediately 5269 * be written by raid5d, so we don't 5270 * need to wait here. 5271 */ 5272 rdev_dec_pending(s.blocked_rdev, 5273 conf->mddev); 5274 } 5275 5276 if (s.handle_bad_blocks) 5277 for (i = disks; i--; ) { 5278 struct md_rdev *rdev; 5279 struct r5dev *dev = &sh->dev[i]; 5280 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 5281 /* We own a safe reference to the rdev */ 5282 rdev = conf->disks[i].rdev; 5283 rdev_set_badblocks(rdev, sh->sector, 5284 RAID5_STRIPE_SECTORS(conf), 0); 5285 rdev_dec_pending(rdev, conf->mddev); 5286 } 5287 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 5288 rdev = conf->disks[i].rdev; 5289 rdev_clear_badblocks(rdev, sh->sector, 5290 RAID5_STRIPE_SECTORS(conf), 0); 5291 rdev_dec_pending(rdev, conf->mddev); 5292 } 5293 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 5294 rdev = conf->disks[i].replacement; 5295 if (!rdev) 5296 /* rdev have been moved down */ 5297 rdev = conf->disks[i].rdev; 5298 rdev_clear_badblocks(rdev, sh->sector, 5299 RAID5_STRIPE_SECTORS(conf), 0); 5300 rdev_dec_pending(rdev, conf->mddev); 5301 } 5302 } 5303 5304 if (s.ops_request) 5305 raid_run_ops(sh, s.ops_request); 5306 5307 ops_run_io(sh, &s); 5308 5309 if (s.dec_preread_active) { 5310 /* We delay this until after ops_run_io so that if make_request 5311 * is waiting on a flush, it won't continue until the writes 5312 * have actually been submitted. 5313 */ 5314 atomic_dec(&conf->preread_active_stripes); 5315 if (atomic_read(&conf->preread_active_stripes) < 5316 IO_THRESHOLD) 5317 md_wakeup_thread(conf->mddev->thread); 5318 } 5319 5320 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5321 } 5322 5323 static void raid5_activate_delayed(struct r5conf *conf) 5324 __must_hold(&conf->device_lock) 5325 { 5326 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5327 while (!list_empty(&conf->delayed_list)) { 5328 struct list_head *l = conf->delayed_list.next; 5329 struct stripe_head *sh; 5330 sh = list_entry(l, struct stripe_head, lru); 5331 list_del_init(l); 5332 clear_bit(STRIPE_DELAYED, &sh->state); 5333 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5334 atomic_inc(&conf->preread_active_stripes); 5335 list_add_tail(&sh->lru, &conf->hold_list); 5336 raid5_wakeup_stripe_thread(sh); 5337 } 5338 } 5339 } 5340 5341 static void activate_bit_delay(struct r5conf *conf, 5342 struct list_head *temp_inactive_list) 5343 __must_hold(&conf->device_lock) 5344 { 5345 struct list_head head; 5346 list_add(&head, &conf->bitmap_list); 5347 list_del_init(&conf->bitmap_list); 5348 while (!list_empty(&head)) { 5349 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5350 int hash; 5351 list_del_init(&sh->lru); 5352 atomic_inc(&sh->count); 5353 hash = sh->hash_lock_index; 5354 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5355 } 5356 } 5357 5358 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5359 { 5360 struct r5conf *conf = mddev->private; 5361 sector_t sector = bio->bi_iter.bi_sector; 5362 unsigned int chunk_sectors; 5363 unsigned int bio_sectors = bio_sectors(bio); 5364 5365 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5366 return chunk_sectors >= 5367 ((sector & (chunk_sectors - 1)) + bio_sectors); 5368 } 5369 5370 /* 5371 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5372 * later sampled by raid5d. 5373 */ 5374 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 5375 { 5376 unsigned long flags; 5377 5378 spin_lock_irqsave(&conf->device_lock, flags); 5379 5380 bi->bi_next = conf->retry_read_aligned_list; 5381 conf->retry_read_aligned_list = bi; 5382 5383 spin_unlock_irqrestore(&conf->device_lock, flags); 5384 md_wakeup_thread(conf->mddev->thread); 5385 } 5386 5387 static struct bio *remove_bio_from_retry(struct r5conf *conf, 5388 unsigned int *offset) 5389 { 5390 struct bio *bi; 5391 5392 bi = conf->retry_read_aligned; 5393 if (bi) { 5394 *offset = conf->retry_read_offset; 5395 conf->retry_read_aligned = NULL; 5396 return bi; 5397 } 5398 bi = conf->retry_read_aligned_list; 5399 if(bi) { 5400 conf->retry_read_aligned_list = bi->bi_next; 5401 bi->bi_next = NULL; 5402 *offset = 0; 5403 } 5404 5405 return bi; 5406 } 5407 5408 /* 5409 * The "raid5_align_endio" should check if the read succeeded and if it 5410 * did, call bio_endio on the original bio (having bio_put the new bio 5411 * first). 5412 * If the read failed.. 5413 */ 5414 static void raid5_align_endio(struct bio *bi) 5415 { 5416 struct bio *raid_bi = bi->bi_private; 5417 struct md_rdev *rdev = (void *)raid_bi->bi_next; 5418 struct mddev *mddev = rdev->mddev; 5419 struct r5conf *conf = mddev->private; 5420 blk_status_t error = bi->bi_status; 5421 5422 bio_put(bi); 5423 raid_bi->bi_next = NULL; 5424 rdev_dec_pending(rdev, conf->mddev); 5425 5426 if (!error) { 5427 bio_endio(raid_bi); 5428 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5429 wake_up(&conf->wait_for_quiescent); 5430 return; 5431 } 5432 5433 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5434 5435 add_bio_to_retry(raid_bi, conf); 5436 } 5437 5438 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5439 { 5440 struct r5conf *conf = mddev->private; 5441 struct bio *align_bio; 5442 struct md_rdev *rdev; 5443 sector_t sector, end_sector; 5444 int dd_idx; 5445 bool did_inc; 5446 5447 if (!in_chunk_boundary(mddev, raid_bio)) { 5448 pr_debug("%s: non aligned\n", __func__); 5449 return 0; 5450 } 5451 5452 sector = raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 0, 5453 &dd_idx, NULL); 5454 end_sector = sector + bio_sectors(raid_bio); 5455 5456 if (r5c_big_stripe_cached(conf, sector)) 5457 return 0; 5458 5459 rdev = conf->disks[dd_idx].replacement; 5460 if (!rdev || test_bit(Faulty, &rdev->flags) || 5461 rdev->recovery_offset < end_sector) { 5462 rdev = conf->disks[dd_idx].rdev; 5463 if (!rdev) 5464 return 0; 5465 if (test_bit(Faulty, &rdev->flags) || 5466 !(test_bit(In_sync, &rdev->flags) || 5467 rdev->recovery_offset >= end_sector)) 5468 return 0; 5469 } 5470 5471 atomic_inc(&rdev->nr_pending); 5472 5473 if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) { 5474 rdev_dec_pending(rdev, mddev); 5475 return 0; 5476 } 5477 5478 md_account_bio(mddev, &raid_bio); 5479 raid_bio->bi_next = (void *)rdev; 5480 5481 align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, 5482 &mddev->bio_set); 5483 align_bio->bi_end_io = raid5_align_endio; 5484 align_bio->bi_private = raid_bio; 5485 align_bio->bi_iter.bi_sector = sector; 5486 5487 /* No reshape active, so we can trust rdev->data_offset */ 5488 align_bio->bi_iter.bi_sector += rdev->data_offset; 5489 5490 did_inc = false; 5491 if (conf->quiesce == 0) { 5492 atomic_inc(&conf->active_aligned_reads); 5493 did_inc = true; 5494 } 5495 /* need a memory barrier to detect the race with raid5_quiesce() */ 5496 if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) { 5497 /* quiesce is in progress, so we need to undo io activation and wait 5498 * for it to finish 5499 */ 5500 if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads)) 5501 wake_up(&conf->wait_for_quiescent); 5502 spin_lock_irq(&conf->device_lock); 5503 wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, 5504 conf->device_lock); 5505 atomic_inc(&conf->active_aligned_reads); 5506 spin_unlock_irq(&conf->device_lock); 5507 } 5508 5509 mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector); 5510 submit_bio_noacct(align_bio); 5511 return 1; 5512 } 5513 5514 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5515 { 5516 sector_t sector = raid_bio->bi_iter.bi_sector; 5517 unsigned chunk_sects = mddev->chunk_sectors; 5518 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5519 5520 if (sectors < bio_sectors(raid_bio)) { 5521 struct r5conf *conf = mddev->private; 5522 5523 raid_bio = bio_submit_split_bioset(raid_bio, sectors, 5524 &conf->bio_split); 5525 if (!raid_bio) 5526 return NULL; 5527 } 5528 5529 if (!raid5_read_one_chunk(mddev, raid_bio)) 5530 return raid_bio; 5531 5532 return NULL; 5533 } 5534 5535 /* __get_priority_stripe - get the next stripe to process 5536 * 5537 * Full stripe writes are allowed to pass preread active stripes up until 5538 * the bypass_threshold is exceeded. In general the bypass_count 5539 * increments when the handle_list is handled before the hold_list; however, it 5540 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5541 * stripe with in flight i/o. The bypass_count will be reset when the 5542 * head of the hold_list has changed, i.e. the head was promoted to the 5543 * handle_list. 5544 */ 5545 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5546 __must_hold(&conf->device_lock) 5547 { 5548 struct stripe_head *sh, *tmp; 5549 struct list_head *handle_list = NULL; 5550 struct r5worker_group *wg; 5551 bool second_try = !r5c_is_writeback(conf->log) && 5552 !r5l_log_disk_error(conf); 5553 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || 5554 r5l_log_disk_error(conf); 5555 5556 again: 5557 wg = NULL; 5558 sh = NULL; 5559 if (conf->worker_cnt_per_group == 0) { 5560 handle_list = try_loprio ? &conf->loprio_list : 5561 &conf->handle_list; 5562 } else if (group != ANY_GROUP) { 5563 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5564 &conf->worker_groups[group].handle_list; 5565 wg = &conf->worker_groups[group]; 5566 } else { 5567 int i; 5568 for (i = 0; i < conf->group_cnt; i++) { 5569 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5570 &conf->worker_groups[i].handle_list; 5571 wg = &conf->worker_groups[i]; 5572 if (!list_empty(handle_list)) 5573 break; 5574 } 5575 } 5576 5577 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5578 __func__, 5579 list_empty(handle_list) ? "empty" : "busy", 5580 list_empty(&conf->hold_list) ? "empty" : "busy", 5581 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5582 5583 if (!list_empty(handle_list)) { 5584 sh = list_entry(handle_list->next, typeof(*sh), lru); 5585 5586 if (list_empty(&conf->hold_list)) 5587 conf->bypass_count = 0; 5588 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5589 if (conf->hold_list.next == conf->last_hold) 5590 conf->bypass_count++; 5591 else { 5592 conf->last_hold = conf->hold_list.next; 5593 conf->bypass_count -= conf->bypass_threshold; 5594 if (conf->bypass_count < 0) 5595 conf->bypass_count = 0; 5596 } 5597 } 5598 } else if (!list_empty(&conf->hold_list) && 5599 ((conf->bypass_threshold && 5600 conf->bypass_count > conf->bypass_threshold) || 5601 atomic_read(&conf->pending_full_writes) == 0)) { 5602 5603 list_for_each_entry(tmp, &conf->hold_list, lru) { 5604 if (conf->worker_cnt_per_group == 0 || 5605 group == ANY_GROUP || 5606 !cpu_online(tmp->cpu) || 5607 cpu_to_group(tmp->cpu) == group) { 5608 sh = tmp; 5609 break; 5610 } 5611 } 5612 5613 if (sh) { 5614 conf->bypass_count -= conf->bypass_threshold; 5615 if (conf->bypass_count < 0) 5616 conf->bypass_count = 0; 5617 } 5618 wg = NULL; 5619 } 5620 5621 if (!sh) { 5622 if (second_try) 5623 return NULL; 5624 second_try = true; 5625 try_loprio = !try_loprio; 5626 goto again; 5627 } 5628 5629 if (wg) { 5630 wg->stripes_cnt--; 5631 sh->group = NULL; 5632 } 5633 list_del_init(&sh->lru); 5634 BUG_ON(atomic_inc_return(&sh->count) != 1); 5635 return sh; 5636 } 5637 5638 struct raid5_plug_cb { 5639 struct blk_plug_cb cb; 5640 struct list_head list; 5641 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5642 }; 5643 5644 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5645 { 5646 struct raid5_plug_cb *cb = container_of( 5647 blk_cb, struct raid5_plug_cb, cb); 5648 struct stripe_head *sh; 5649 struct mddev *mddev = cb->cb.data; 5650 struct r5conf *conf = mddev->private; 5651 int cnt = 0; 5652 int hash; 5653 5654 if (cb->list.next && !list_empty(&cb->list)) { 5655 spin_lock_irq(&conf->device_lock); 5656 while (!list_empty(&cb->list)) { 5657 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5658 list_del_init(&sh->lru); 5659 /* 5660 * avoid race release_stripe_plug() sees 5661 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5662 * is still in our list 5663 */ 5664 smp_mb__before_atomic(); 5665 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5666 /* 5667 * STRIPE_ON_RELEASE_LIST could be set here. In that 5668 * case, the count is always > 1 here 5669 */ 5670 hash = sh->hash_lock_index; 5671 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5672 cnt++; 5673 } 5674 spin_unlock_irq(&conf->device_lock); 5675 } 5676 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5677 NR_STRIPE_HASH_LOCKS); 5678 if (!mddev_is_dm(mddev)) 5679 trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule); 5680 kfree(cb); 5681 } 5682 5683 static void release_stripe_plug(struct mddev *mddev, 5684 struct stripe_head *sh) 5685 { 5686 struct blk_plug_cb *blk_cb = blk_check_plugged( 5687 raid5_unplug, mddev, 5688 sizeof(struct raid5_plug_cb)); 5689 struct raid5_plug_cb *cb; 5690 5691 if (!blk_cb) { 5692 raid5_release_stripe(sh); 5693 return; 5694 } 5695 5696 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5697 5698 if (cb->list.next == NULL) { 5699 int i; 5700 INIT_LIST_HEAD(&cb->list); 5701 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5702 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5703 } 5704 5705 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5706 list_add_tail(&sh->lru, &cb->list); 5707 else 5708 raid5_release_stripe(sh); 5709 } 5710 5711 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5712 { 5713 struct r5conf *conf = mddev->private; 5714 sector_t logical_sector, last_sector; 5715 sector_t first_stripe, last_stripe; 5716 struct stripe_head *sh; 5717 struct bvec_iter bi_iter; 5718 struct bio *orig_bi = bi; 5719 int stripe_sectors; 5720 5721 /* We need to handle this when io_uring supports discard/trim */ 5722 if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT)) 5723 return; 5724 5725 if (mddev->reshape_position != MaxSector) 5726 /* Skip discard while reshape is happening */ 5727 return; 5728 5729 if (!raid5_discard_limits(mddev, bi)) 5730 return; 5731 5732 stripe_sectors = conf->chunk_sectors * 5733 (conf->raid_disks - conf->max_degraded); 5734 first_stripe = DIV_ROUND_UP_SECTOR_T(bi->bi_iter.bi_sector, 5735 stripe_sectors); 5736 last_stripe = bio_end_sector(bi); 5737 sector_div(last_stripe, stripe_sectors); 5738 5739 if (first_stripe >= last_stripe) { 5740 bio_endio(bi); 5741 return; 5742 } 5743 5744 bi_iter = bi->bi_iter; 5745 bi->bi_iter.bi_sector = first_stripe * stripe_sectors; 5746 bi->bi_iter.bi_size = ((last_stripe - first_stripe) * 5747 stripe_sectors) << 9; 5748 md_account_bio(mddev, &bi); 5749 orig_bi->bi_iter = bi_iter; 5750 bi->bi_iter = bi_iter; 5751 bi->bi_next = NULL; 5752 5753 if (mddev->bitmap_id == ID_LLBITMAP && 5754 conf->raid5_discard_unsupported) { 5755 bio_endio(bi); 5756 return; 5757 } 5758 5759 logical_sector = first_stripe * conf->chunk_sectors; 5760 last_sector = last_stripe * conf->chunk_sectors; 5761 5762 for (; logical_sector < last_sector; 5763 logical_sector += RAID5_STRIPE_SECTORS(conf)) { 5764 DEFINE_WAIT(w); 5765 int d; 5766 again: 5767 sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); 5768 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5769 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5770 raid5_release_stripe(sh); 5771 wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap, 5772 TASK_UNINTERRUPTIBLE); 5773 goto again; 5774 } 5775 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5776 spin_lock_irq(&sh->stripe_lock); 5777 for (d = 0; d < conf->raid_disks; d++) { 5778 if (d == sh->pd_idx || d == sh->qd_idx) 5779 continue; 5780 if (sh->dev[d].towrite || sh->dev[d].toread) { 5781 set_bit(R5_Overlap, &sh->dev[d].flags); 5782 spin_unlock_irq(&sh->stripe_lock); 5783 raid5_release_stripe(sh); 5784 wait_on_bit(&sh->dev[d].flags, R5_Overlap, 5785 TASK_UNINTERRUPTIBLE); 5786 goto again; 5787 } 5788 } 5789 set_bit(STRIPE_DISCARD, &sh->state); 5790 sh->overwrite_disks = 0; 5791 for (d = 0; d < conf->raid_disks; d++) { 5792 if (d == sh->pd_idx || d == sh->qd_idx) 5793 continue; 5794 sh->dev[d].towrite = bi; 5795 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5796 bio_inc_remaining(bi); 5797 md_write_inc(mddev, bi); 5798 sh->overwrite_disks++; 5799 } 5800 spin_unlock_irq(&sh->stripe_lock); 5801 if (conf->mddev->bitmap) { 5802 sh->bm_seq = conf->seq_flush + 1; 5803 set_bit(STRIPE_BIT_DELAY, &sh->state); 5804 } 5805 5806 set_bit(STRIPE_HANDLE, &sh->state); 5807 clear_bit(STRIPE_DELAYED, &sh->state); 5808 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5809 atomic_inc(&conf->preread_active_stripes); 5810 release_stripe_plug(mddev, sh); 5811 } 5812 5813 bio_endio(bi); 5814 } 5815 5816 static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, 5817 sector_t reshape_sector) 5818 { 5819 return mddev->reshape_backwards ? sector < reshape_sector : 5820 sector >= reshape_sector; 5821 } 5822 5823 static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min, 5824 sector_t max, sector_t reshape_sector) 5825 { 5826 return mddev->reshape_backwards ? max < reshape_sector : 5827 min >= reshape_sector; 5828 } 5829 5830 static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf, 5831 struct stripe_head *sh) 5832 { 5833 sector_t max_sector = 0, min_sector = MaxSector; 5834 bool ret = false; 5835 int dd_idx; 5836 5837 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { 5838 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 5839 continue; 5840 5841 min_sector = min(min_sector, sh->dev[dd_idx].sector); 5842 max_sector = max(max_sector, sh->dev[dd_idx].sector); 5843 } 5844 5845 spin_lock_irq(&conf->device_lock); 5846 5847 if (!range_ahead_of_reshape(mddev, min_sector, max_sector, 5848 conf->reshape_progress)) 5849 /* mismatch, need to try again */ 5850 ret = true; 5851 5852 spin_unlock_irq(&conf->device_lock); 5853 5854 return ret; 5855 } 5856 5857 static int add_all_stripe_bios(struct r5conf *conf, 5858 struct stripe_request_ctx *ctx, struct stripe_head *sh, 5859 struct bio *bi, int forwrite, int previous) 5860 { 5861 int dd_idx; 5862 5863 spin_lock_irq(&sh->stripe_lock); 5864 5865 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { 5866 struct r5dev *dev = &sh->dev[dd_idx]; 5867 5868 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 5869 continue; 5870 5871 if (dev->sector < ctx->first_sector || 5872 dev->sector >= ctx->last_sector) 5873 continue; 5874 5875 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { 5876 set_bit(R5_Overlap, &dev->flags); 5877 spin_unlock_irq(&sh->stripe_lock); 5878 raid5_release_stripe(sh); 5879 /* release batch_last before wait to avoid risk of deadlock */ 5880 if (ctx->batch_last) { 5881 raid5_release_stripe(ctx->batch_last); 5882 ctx->batch_last = NULL; 5883 } 5884 md_wakeup_thread(conf->mddev->thread); 5885 wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE); 5886 return 0; 5887 } 5888 } 5889 5890 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { 5891 struct r5dev *dev = &sh->dev[dd_idx]; 5892 5893 if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 5894 continue; 5895 5896 if (dev->sector < ctx->first_sector || 5897 dev->sector >= ctx->last_sector) 5898 continue; 5899 5900 __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); 5901 clear_bit((dev->sector - ctx->first_sector) >> 5902 RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); 5903 } 5904 5905 spin_unlock_irq(&sh->stripe_lock); 5906 return 1; 5907 } 5908 5909 enum reshape_loc { 5910 LOC_NO_RESHAPE, 5911 LOC_AHEAD_OF_RESHAPE, 5912 LOC_INSIDE_RESHAPE, 5913 LOC_BEHIND_RESHAPE, 5914 }; 5915 5916 static enum reshape_loc get_reshape_loc(struct mddev *mddev, 5917 struct r5conf *conf, sector_t logical_sector) 5918 { 5919 sector_t reshape_progress, reshape_safe; 5920 5921 if (likely(conf->reshape_progress == MaxSector)) 5922 return LOC_NO_RESHAPE; 5923 /* 5924 * Spinlock is needed as reshape_progress may be 5925 * 64bit on a 32bit platform, and so it might be 5926 * possible to see a half-updated value 5927 * Of course reshape_progress could change after 5928 * the lock is dropped, so once we get a reference 5929 * to the stripe that we think it is, we will have 5930 * to check again. 5931 */ 5932 spin_lock_irq(&conf->device_lock); 5933 reshape_progress = conf->reshape_progress; 5934 reshape_safe = conf->reshape_safe; 5935 spin_unlock_irq(&conf->device_lock); 5936 if (reshape_progress == MaxSector) 5937 return LOC_NO_RESHAPE; 5938 if (ahead_of_reshape(mddev, logical_sector, reshape_progress)) 5939 return LOC_AHEAD_OF_RESHAPE; 5940 if (ahead_of_reshape(mddev, logical_sector, reshape_safe)) 5941 return LOC_INSIDE_RESHAPE; 5942 return LOC_BEHIND_RESHAPE; 5943 } 5944 5945 static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset, 5946 unsigned long *sectors) 5947 { 5948 struct r5conf *conf = mddev->private; 5949 sector_t start = *offset; 5950 sector_t end = start + *sectors; 5951 sector_t prev_start = start; 5952 sector_t prev_end = end; 5953 int sectors_per_chunk; 5954 enum reshape_loc loc; 5955 int dd_idx; 5956 5957 sectors_per_chunk = conf->chunk_sectors * 5958 (conf->raid_disks - conf->max_degraded); 5959 start = round_down(start, sectors_per_chunk); 5960 end = round_up(end, sectors_per_chunk); 5961 5962 start = raid5_compute_sector(conf, start, 0, &dd_idx, NULL); 5963 end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL); 5964 5965 /* 5966 * For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make 5967 * progress, hence it's the same as LOC_BEHIND_RESHAPE. 5968 */ 5969 loc = get_reshape_loc(mddev, conf, prev_start); 5970 if (likely(loc != LOC_AHEAD_OF_RESHAPE)) { 5971 *offset = start; 5972 *sectors = end - start; 5973 return; 5974 } 5975 5976 sectors_per_chunk = conf->prev_chunk_sectors * 5977 (conf->previous_raid_disks - conf->max_degraded); 5978 prev_start = round_down(prev_start, sectors_per_chunk); 5979 prev_end = round_down(prev_end, sectors_per_chunk); 5980 5981 prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL); 5982 prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL); 5983 5984 /* 5985 * for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO 5986 * is handled in make_stripe_request(), we can't know this here hence 5987 * we set bits for both. 5988 */ 5989 *offset = min(start, prev_start); 5990 *sectors = max(end, prev_end) - *offset; 5991 } 5992 5993 static enum stripe_result make_stripe_request(struct mddev *mddev, 5994 struct r5conf *conf, struct stripe_request_ctx *ctx, 5995 sector_t logical_sector, struct bio *bi) 5996 { 5997 const int rw = bio_data_dir(bi); 5998 enum stripe_result ret; 5999 struct stripe_head *sh; 6000 enum reshape_loc loc; 6001 sector_t new_sector; 6002 int previous = 0, flags = 0; 6003 int seq, dd_idx; 6004 6005 seq = read_seqcount_begin(&conf->gen_lock); 6006 loc = get_reshape_loc(mddev, conf, logical_sector); 6007 if (loc == LOC_INSIDE_RESHAPE) { 6008 ret = STRIPE_SCHEDULE_AND_RETRY; 6009 goto out; 6010 } 6011 if (loc == LOC_AHEAD_OF_RESHAPE) 6012 previous = 1; 6013 6014 new_sector = raid5_compute_sector(conf, logical_sector, previous, 6015 &dd_idx, NULL); 6016 pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, 6017 new_sector, logical_sector); 6018 6019 if (previous) 6020 flags |= R5_GAS_PREVIOUS; 6021 if (bi->bi_opf & REQ_RAHEAD) 6022 flags |= R5_GAS_NOBLOCK; 6023 sh = raid5_get_active_stripe(conf, ctx, new_sector, flags); 6024 if (unlikely(!sh)) { 6025 /* cannot get stripe, just give-up */ 6026 bi->bi_status = BLK_STS_IOERR; 6027 return STRIPE_FAIL; 6028 } 6029 6030 if (unlikely(previous) && 6031 stripe_ahead_of_reshape(mddev, conf, sh)) { 6032 /* 6033 * Expansion moved on while waiting for a stripe. 6034 * Expansion could still move past after this 6035 * test, but as we are holding a reference to 6036 * 'sh', we know that if that happens, 6037 * STRIPE_EXPANDING will get set and the expansion 6038 * won't proceed until we finish with the stripe. 6039 */ 6040 ret = STRIPE_SCHEDULE_AND_RETRY; 6041 goto out_release; 6042 } 6043 6044 if (read_seqcount_retry(&conf->gen_lock, seq)) { 6045 /* Might have got the wrong stripe_head by accident */ 6046 ret = STRIPE_RETRY; 6047 goto out_release; 6048 } 6049 6050 if (test_bit(STRIPE_EXPANDING, &sh->state)) { 6051 md_wakeup_thread(mddev->thread); 6052 ret = STRIPE_SCHEDULE_AND_RETRY; 6053 goto out_release; 6054 } 6055 6056 if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { 6057 ret = STRIPE_RETRY; 6058 goto out; 6059 } 6060 6061 if (stripe_can_batch(sh)) { 6062 stripe_add_to_batch_list(conf, sh, ctx->batch_last); 6063 if (ctx->batch_last) 6064 raid5_release_stripe(ctx->batch_last); 6065 atomic_inc(&sh->count); 6066 ctx->batch_last = sh; 6067 } 6068 6069 if (ctx->do_flush) { 6070 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 6071 /* we only need flush for one stripe */ 6072 ctx->do_flush = false; 6073 } 6074 6075 set_bit(STRIPE_HANDLE, &sh->state); 6076 clear_bit(STRIPE_DELAYED, &sh->state); 6077 if ((!sh->batch_head || sh == sh->batch_head) && 6078 (bi->bi_opf & REQ_SYNC) && 6079 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 6080 atomic_inc(&conf->preread_active_stripes); 6081 6082 release_stripe_plug(mddev, sh); 6083 return STRIPE_SUCCESS; 6084 6085 out_release: 6086 raid5_release_stripe(sh); 6087 out: 6088 if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) { 6089 if (!mddev_is_dm(mddev) || 6090 test_bit(MD_DM_SUSPENDING, &mddev->flags)) { 6091 bi->bi_status = BLK_STS_RESOURCE; 6092 ret = STRIPE_WAIT_RESHAPE; 6093 } 6094 pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress"); 6095 } 6096 return ret; 6097 } 6098 6099 /* 6100 * If the bio covers multiple data disks, find sector within the bio that has 6101 * the lowest chunk offset in the first chunk. 6102 */ 6103 static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, 6104 struct bio *bi) 6105 { 6106 int sectors_per_chunk = conf->chunk_sectors; 6107 int raid_disks = conf->raid_disks; 6108 int dd_idx; 6109 struct stripe_head sh; 6110 unsigned int chunk_offset; 6111 sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); 6112 sector_t sector; 6113 6114 /* We pass in fake stripe_head to get back parity disk numbers */ 6115 sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, &sh); 6116 chunk_offset = sector_div(sector, sectors_per_chunk); 6117 if (sectors_per_chunk - chunk_offset >= bio_sectors(bi)) 6118 return r_sector; 6119 /* 6120 * Bio crosses to the next data disk. Check whether it's in the same 6121 * chunk. 6122 */ 6123 dd_idx++; 6124 while (dd_idx == sh.pd_idx || dd_idx == sh.qd_idx) 6125 dd_idx++; 6126 if (dd_idx >= raid_disks) 6127 return r_sector; 6128 return r_sector + sectors_per_chunk - chunk_offset; 6129 } 6130 6131 static bool raid5_make_request(struct mddev *mddev, struct bio * bi) 6132 { 6133 DEFINE_WAIT_FUNC(wait, woken_wake_function); 6134 struct r5conf *conf = mddev->private; 6135 const int rw = bio_data_dir(bi); 6136 struct stripe_request_ctx *ctx; 6137 sector_t logical_sector; 6138 enum stripe_result res; 6139 int s, stripe_cnt; 6140 bool on_wq; 6141 6142 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 6143 int ret = log_handle_flush_request(conf, bi); 6144 6145 if (ret == 0) 6146 return true; 6147 if (ret == -ENODEV) { 6148 if (md_flush_request(mddev, bi)) 6149 return true; 6150 } 6151 /* ret == -EAGAIN, fallback */ 6152 } 6153 6154 md_write_start(mddev, bi); 6155 /* 6156 * If array is degraded, better not do chunk aligned read because 6157 * later we might have to read it again in order to reconstruct 6158 * data on failed drives. 6159 */ 6160 if (rw == READ && mddev->degraded == 0 && 6161 mddev->reshape_position == MaxSector) { 6162 bi = chunk_aligned_read(mddev, bi); 6163 if (!bi) 6164 return true; 6165 } 6166 6167 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 6168 make_discard_request(mddev, bi); 6169 md_write_end(mddev); 6170 return true; 6171 } 6172 6173 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); 6174 bi->bi_next = NULL; 6175 6176 ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO); 6177 memset(ctx, 0, conf->ctx_size); 6178 ctx->first_sector = logical_sector; 6179 ctx->last_sector = bio_end_sector(bi); 6180 /* 6181 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 6182 * we need to flush journal device 6183 */ 6184 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) 6185 ctx->do_flush = true; 6186 6187 stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx->last_sector - logical_sector, 6188 RAID5_STRIPE_SECTORS(conf)); 6189 bitmap_set(ctx->sectors_to_do, 0, stripe_cnt); 6190 6191 pr_debug("raid456: %s, logical %llu to %llu\n", __func__, 6192 bi->bi_iter.bi_sector, ctx->last_sector); 6193 6194 /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ 6195 if ((bi->bi_opf & REQ_NOWAIT) && 6196 get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) { 6197 bio_wouldblock_error(bi); 6198 if (rw == WRITE) 6199 md_write_end(mddev); 6200 mempool_free(ctx, conf->ctx_pool); 6201 return true; 6202 } 6203 md_account_bio(mddev, &bi); 6204 6205 /* 6206 * Lets start with the stripe with the lowest chunk offset in the first 6207 * chunk. That has the best chances of creating IOs adjacent to 6208 * previous IOs in case of sequential IO and thus creates the most 6209 * sequential IO pattern. We don't bother with the optimization when 6210 * reshaping as the performance benefit is not worth the complexity. 6211 */ 6212 if (likely(conf->reshape_progress == MaxSector)) { 6213 logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); 6214 on_wq = false; 6215 } else { 6216 add_wait_queue(&conf->wait_for_reshape, &wait); 6217 on_wq = true; 6218 } 6219 s = (logical_sector - ctx->first_sector) >> RAID5_STRIPE_SHIFT(conf); 6220 6221 while (1) { 6222 res = make_stripe_request(mddev, conf, ctx, logical_sector, 6223 bi); 6224 if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE) 6225 break; 6226 6227 if (res == STRIPE_RETRY) 6228 continue; 6229 6230 if (res == STRIPE_SCHEDULE_AND_RETRY) { 6231 WARN_ON_ONCE(!on_wq); 6232 /* 6233 * Must release the reference to batch_last before 6234 * scheduling and waiting for work to be done, 6235 * otherwise the batch_last stripe head could prevent 6236 * raid5_activate_delayed() from making progress 6237 * and thus deadlocking. 6238 */ 6239 if (ctx->batch_last) { 6240 raid5_release_stripe(ctx->batch_last); 6241 ctx->batch_last = NULL; 6242 } 6243 6244 wait_woken(&wait, TASK_UNINTERRUPTIBLE, 6245 MAX_SCHEDULE_TIMEOUT); 6246 continue; 6247 } 6248 6249 s = find_next_bit_wrap(ctx->sectors_to_do, stripe_cnt, s); 6250 if (s == stripe_cnt) 6251 break; 6252 6253 logical_sector = ctx->first_sector + 6254 (s << RAID5_STRIPE_SHIFT(conf)); 6255 } 6256 if (unlikely(on_wq)) 6257 remove_wait_queue(&conf->wait_for_reshape, &wait); 6258 6259 if (ctx->batch_last) 6260 raid5_release_stripe(ctx->batch_last); 6261 6262 if (rw == WRITE) 6263 md_write_end(mddev); 6264 6265 mempool_free(ctx, conf->ctx_pool); 6266 if (res == STRIPE_WAIT_RESHAPE) { 6267 DECLARE_COMPLETION_ONSTACK(done); 6268 WRITE_ONCE(bi->bi_private, &done); 6269 6270 bio_endio(bi); 6271 6272 wait_for_completion(&done); 6273 return false; 6274 } 6275 6276 bio_endio(bi); 6277 return true; 6278 } 6279 6280 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 6281 6282 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 6283 { 6284 /* reshaping is quite different to recovery/resync so it is 6285 * handled quite separately ... here. 6286 * 6287 * On each call to sync_request, we gather one chunk worth of 6288 * destination stripes and flag them as expanding. 6289 * Then we find all the source stripes and request reads. 6290 * As the reads complete, handle_stripe will copy the data 6291 * into the destination stripe and release that stripe. 6292 */ 6293 struct r5conf *conf = mddev->private; 6294 struct stripe_head *sh; 6295 struct md_rdev *rdev; 6296 sector_t first_sector, last_sector; 6297 int raid_disks = conf->previous_raid_disks; 6298 int data_disks = raid_disks - conf->max_degraded; 6299 int new_data_disks = conf->raid_disks - conf->max_degraded; 6300 int i; 6301 int dd_idx; 6302 sector_t writepos, readpos, safepos; 6303 sector_t stripe_addr; 6304 int reshape_sectors; 6305 struct list_head stripes; 6306 sector_t retn; 6307 6308 if (sector_nr == 0) { 6309 /* If restarting in the middle, skip the initial sectors */ 6310 if (mddev->reshape_backwards && 6311 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 6312 sector_nr = raid5_size(mddev, 0, 0) 6313 - conf->reshape_progress; 6314 } else if (mddev->reshape_backwards && 6315 conf->reshape_progress == MaxSector) { 6316 /* shouldn't happen, but just in case, finish up.*/ 6317 sector_nr = MaxSector; 6318 } else if (!mddev->reshape_backwards && 6319 conf->reshape_progress > 0) 6320 sector_nr = conf->reshape_progress; 6321 sector_div(sector_nr, new_data_disks); 6322 if (sector_nr) { 6323 mddev->curr_resync_completed = sector_nr; 6324 sysfs_notify_dirent_safe(mddev->sysfs_completed); 6325 *skipped = 1; 6326 retn = sector_nr; 6327 goto finish; 6328 } 6329 } 6330 6331 /* We need to process a full chunk at a time. 6332 * If old and new chunk sizes differ, we need to process the 6333 * largest of these 6334 */ 6335 6336 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 6337 6338 /* We update the metadata at least every 10 seconds, or when 6339 * the data about to be copied would over-write the source of 6340 * the data at the front of the range. i.e. one new_stripe 6341 * along from reshape_progress new_maps to after where 6342 * reshape_safe old_maps to 6343 */ 6344 writepos = conf->reshape_progress; 6345 sector_div(writepos, new_data_disks); 6346 readpos = conf->reshape_progress; 6347 sector_div(readpos, data_disks); 6348 safepos = conf->reshape_safe; 6349 sector_div(safepos, data_disks); 6350 if (mddev->reshape_backwards) { 6351 if (WARN_ON(writepos < reshape_sectors)) 6352 return MaxSector; 6353 6354 writepos -= reshape_sectors; 6355 readpos += reshape_sectors; 6356 safepos += reshape_sectors; 6357 } else { 6358 writepos += reshape_sectors; 6359 /* readpos and safepos are worst-case calculations. 6360 * A negative number is overly pessimistic, and causes 6361 * obvious problems for unsigned storage. So clip to 0. 6362 */ 6363 readpos -= min_t(sector_t, reshape_sectors, readpos); 6364 safepos -= min_t(sector_t, reshape_sectors, safepos); 6365 } 6366 6367 /* Having calculated the 'writepos' possibly use it 6368 * to set 'stripe_addr' which is where we will write to. 6369 */ 6370 if (mddev->reshape_backwards) { 6371 if (WARN_ON(conf->reshape_progress == 0)) 6372 return MaxSector; 6373 6374 stripe_addr = writepos; 6375 if (WARN_ON((mddev->dev_sectors & 6376 ~((sector_t)reshape_sectors - 1)) - 6377 reshape_sectors - stripe_addr != sector_nr)) 6378 return MaxSector; 6379 } else { 6380 if (WARN_ON(writepos != sector_nr + reshape_sectors)) 6381 return MaxSector; 6382 6383 stripe_addr = sector_nr; 6384 } 6385 6386 /* 'writepos' is the most advanced device address we might write. 6387 * 'readpos' is the least advanced device address we might read. 6388 * 'safepos' is the least address recorded in the metadata as having 6389 * been reshaped. 6390 * If there is a min_offset_diff, these are adjusted either by 6391 * increasing the safepos/readpos if diff is negative, or 6392 * increasing writepos if diff is positive. 6393 * If 'readpos' is then behind 'writepos', there is no way that we can 6394 * ensure safety in the face of a crash - that must be done by userspace 6395 * making a backup of the data. So in that case there is no particular 6396 * rush to update metadata. 6397 * Otherwise if 'safepos' is behind 'writepos', then we really need to 6398 * update the metadata to advance 'safepos' to match 'readpos' so that 6399 * we can be safe in the event of a crash. 6400 * So we insist on updating metadata if safepos is behind writepos and 6401 * readpos is beyond writepos. 6402 * In any case, update the metadata every 10 seconds. 6403 * Maybe that number should be configurable, but I'm not sure it is 6404 * worth it.... maybe it could be a multiple of safemode_delay??? 6405 */ 6406 if (conf->min_offset_diff < 0) { 6407 safepos += -conf->min_offset_diff; 6408 readpos += -conf->min_offset_diff; 6409 } else 6410 writepos += conf->min_offset_diff; 6411 6412 if ((mddev->reshape_backwards 6413 ? (safepos > writepos && readpos < writepos) 6414 : (safepos < writepos && readpos > writepos)) || 6415 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 6416 /* Cannot proceed until we've updated the superblock... */ 6417 wait_event(conf->wait_for_reshape, 6418 atomic_read(&conf->reshape_stripes)==0 6419 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6420 if (atomic_read(&conf->reshape_stripes) != 0) 6421 return 0; 6422 mddev->reshape_position = conf->reshape_progress; 6423 mddev->curr_resync_completed = sector_nr; 6424 if (!mddev->reshape_backwards) 6425 /* Can update recovery_offset */ 6426 rdev_for_each(rdev, mddev) 6427 if (rdev->raid_disk >= 0 && 6428 !test_bit(Journal, &rdev->flags) && 6429 !test_bit(In_sync, &rdev->flags) && 6430 rdev->recovery_offset < sector_nr) 6431 rdev->recovery_offset = sector_nr; 6432 6433 conf->reshape_checkpoint = jiffies; 6434 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6435 md_wakeup_thread(mddev->thread); 6436 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 6437 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6438 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6439 return 0; 6440 spin_lock_irq(&conf->device_lock); 6441 conf->reshape_safe = mddev->reshape_position; 6442 spin_unlock_irq(&conf->device_lock); 6443 wake_up(&conf->wait_for_reshape); 6444 sysfs_notify_dirent_safe(mddev->sysfs_completed); 6445 } 6446 6447 INIT_LIST_HEAD(&stripes); 6448 for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { 6449 int j; 6450 int skipped_disk = 0; 6451 sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i, 6452 R5_GAS_NOQUIESCE); 6453 set_bit(STRIPE_EXPANDING, &sh->state); 6454 atomic_inc(&conf->reshape_stripes); 6455 /* If any of this stripe is beyond the end of the old 6456 * array, then we need to zero those blocks 6457 */ 6458 for (j=sh->disks; j--;) { 6459 sector_t s; 6460 if (j == sh->pd_idx) 6461 continue; 6462 if (conf->level == 6 && 6463 j == sh->qd_idx) 6464 continue; 6465 s = raid5_compute_blocknr(sh, j, 0); 6466 if (s < raid5_size(mddev, 0, 0)) { 6467 skipped_disk = 1; 6468 continue; 6469 } 6470 memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf)); 6471 set_bit(R5_Expanded, &sh->dev[j].flags); 6472 set_bit(R5_UPTODATE, &sh->dev[j].flags); 6473 } 6474 if (!skipped_disk) { 6475 set_bit(STRIPE_EXPAND_READY, &sh->state); 6476 set_bit(STRIPE_HANDLE, &sh->state); 6477 } 6478 list_add(&sh->lru, &stripes); 6479 } 6480 spin_lock_irq(&conf->device_lock); 6481 if (mddev->reshape_backwards) 6482 conf->reshape_progress -= reshape_sectors * new_data_disks; 6483 else 6484 conf->reshape_progress += reshape_sectors * new_data_disks; 6485 spin_unlock_irq(&conf->device_lock); 6486 /* Ok, those stripe are ready. We can start scheduling 6487 * reads on the source stripes. 6488 * The source stripes are determined by mapping the first and last 6489 * block on the destination stripes. 6490 */ 6491 first_sector = 6492 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 6493 1, &dd_idx, NULL); 6494 last_sector = 6495 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 6496 * new_data_disks - 1), 6497 1, &dd_idx, NULL); 6498 if (last_sector >= mddev->dev_sectors) 6499 last_sector = mddev->dev_sectors - 1; 6500 while (first_sector <= last_sector) { 6501 sh = raid5_get_active_stripe(conf, NULL, first_sector, 6502 R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE); 6503 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 6504 set_bit(STRIPE_HANDLE, &sh->state); 6505 raid5_release_stripe(sh); 6506 first_sector += RAID5_STRIPE_SECTORS(conf); 6507 } 6508 /* Now that the sources are clearly marked, we can release 6509 * the destination stripes 6510 */ 6511 while (!list_empty(&stripes)) { 6512 sh = list_entry(stripes.next, struct stripe_head, lru); 6513 list_del_init(&sh->lru); 6514 raid5_release_stripe(sh); 6515 } 6516 /* If this takes us to the resync_max point where we have to pause, 6517 * then we need to write out the superblock. 6518 */ 6519 sector_nr += reshape_sectors; 6520 retn = reshape_sectors; 6521 finish: 6522 if (mddev->curr_resync_completed > mddev->resync_max || 6523 (sector_nr - mddev->curr_resync_completed) * 2 6524 >= mddev->resync_max - mddev->curr_resync_completed) { 6525 /* Cannot proceed until we've updated the superblock... */ 6526 wait_event(conf->wait_for_reshape, 6527 atomic_read(&conf->reshape_stripes) == 0 6528 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6529 if (atomic_read(&conf->reshape_stripes) != 0) 6530 goto ret; 6531 mddev->reshape_position = conf->reshape_progress; 6532 mddev->curr_resync_completed = sector_nr; 6533 if (!mddev->reshape_backwards) 6534 /* Can update recovery_offset */ 6535 rdev_for_each(rdev, mddev) 6536 if (rdev->raid_disk >= 0 && 6537 !test_bit(Journal, &rdev->flags) && 6538 !test_bit(In_sync, &rdev->flags) && 6539 rdev->recovery_offset < sector_nr) 6540 rdev->recovery_offset = sector_nr; 6541 conf->reshape_checkpoint = jiffies; 6542 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6543 md_wakeup_thread(mddev->thread); 6544 wait_event(mddev->sb_wait, 6545 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 6546 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6547 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6548 goto ret; 6549 spin_lock_irq(&conf->device_lock); 6550 conf->reshape_safe = mddev->reshape_position; 6551 spin_unlock_irq(&conf->device_lock); 6552 wake_up(&conf->wait_for_reshape); 6553 sysfs_notify_dirent_safe(mddev->sysfs_completed); 6554 } 6555 ret: 6556 return retn; 6557 } 6558 6559 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 6560 sector_t max_sector, int *skipped) 6561 { 6562 struct r5conf *conf = mddev->private; 6563 struct stripe_head *sh; 6564 sector_t sync_blocks; 6565 bool still_degraded = false; 6566 int i; 6567 6568 if (sector_nr >= max_sector) { 6569 /* just being told to finish up .. nothing much to do */ 6570 6571 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 6572 end_reshape(conf); 6573 return 0; 6574 } 6575 6576 if (mddev->curr_resync < max_sector) /* aborted */ 6577 md_bitmap_end_sync(mddev, mddev->curr_resync, 6578 &sync_blocks); 6579 else /* completed sync */ 6580 conf->fullsync = 0; 6581 if (md_bitmap_enabled(mddev, false)) 6582 mddev->bitmap_ops->close_sync(mddev); 6583 6584 return 0; 6585 } 6586 6587 /* Allow raid5_quiesce to complete */ 6588 wait_event(conf->wait_for_reshape, conf->quiesce != 2); 6589 6590 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6591 return reshape_request(mddev, sector_nr, skipped); 6592 6593 /* No need to check resync_max as we never do more than one 6594 * stripe, and as resync_max will always be on a chunk boundary, 6595 * if the check in md_do_sync didn't fire, there is no chance 6596 * of overstepping resync_max here 6597 */ 6598 6599 /* if there is too many failed drives and we are trying 6600 * to resync, then assert that we are finished, because there is 6601 * nothing we can do. 6602 */ 6603 if (mddev->degraded >= conf->max_degraded && 6604 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6605 sector_t rv = mddev->dev_sectors - sector_nr; 6606 *skipped = 1; 6607 return rv; 6608 } 6609 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6610 !conf->fullsync && 6611 !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && 6612 sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { 6613 /* we can skip this block, and probably more */ 6614 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); 6615 *skipped = 1; 6616 /* keep things rounded to whole stripes */ 6617 return sync_blocks * RAID5_STRIPE_SECTORS(conf); 6618 } 6619 6620 if (md_bitmap_enabled(mddev, false)) 6621 mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); 6622 6623 sh = raid5_get_active_stripe(conf, NULL, sector_nr, 6624 R5_GAS_NOBLOCK); 6625 if (sh == NULL) { 6626 sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0); 6627 /* make sure we don't swamp the stripe cache if someone else 6628 * is trying to get access 6629 */ 6630 schedule_timeout_uninterruptible(1); 6631 } 6632 /* Need to check if array will still be degraded after recovery/resync 6633 * Note in case of > 1 drive failures it's possible we're rebuilding 6634 * one drive while leaving another faulty drive in array. 6635 */ 6636 for (i = 0; i < conf->raid_disks; i++) { 6637 struct md_rdev *rdev = conf->disks[i].rdev; 6638 6639 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6640 still_degraded = true; 6641 } 6642 6643 md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded); 6644 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6645 set_bit(STRIPE_HANDLE, &sh->state); 6646 6647 raid5_release_stripe(sh); 6648 6649 return RAID5_STRIPE_SECTORS(conf); 6650 } 6651 6652 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, 6653 unsigned int offset) 6654 { 6655 /* We may not be able to submit a whole bio at once as there 6656 * may not be enough stripe_heads available. 6657 * We cannot pre-allocate enough stripe_heads as we may need 6658 * more than exist in the cache (if we allow ever large chunks). 6659 * So we do one stripe head at a time and record in 6660 * ->bi_hw_segments how many have been done. 6661 * 6662 * We *know* that this entire raid_bio is in one chunk, so 6663 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 6664 */ 6665 struct stripe_head *sh; 6666 int dd_idx; 6667 sector_t sector, logical_sector, last_sector; 6668 int scnt = 0; 6669 int handled = 0; 6670 6671 logical_sector = raid_bio->bi_iter.bi_sector & 6672 ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); 6673 sector = raid5_compute_sector(conf, logical_sector, 6674 0, &dd_idx, NULL); 6675 last_sector = bio_end_sector(raid_bio); 6676 6677 for (; logical_sector < last_sector; 6678 logical_sector += RAID5_STRIPE_SECTORS(conf), 6679 sector += RAID5_STRIPE_SECTORS(conf), 6680 scnt++) { 6681 6682 if (scnt < offset) 6683 /* already done this stripe */ 6684 continue; 6685 6686 sh = raid5_get_active_stripe(conf, NULL, sector, 6687 R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE); 6688 if (!sh) { 6689 /* failed to get a stripe - must wait */ 6690 conf->retry_read_aligned = raid_bio; 6691 conf->retry_read_offset = scnt; 6692 return handled; 6693 } 6694 6695 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6696 int hash; 6697 6698 spin_lock_irq(&conf->device_lock); 6699 hash = sh->hash_lock_index; 6700 __release_stripe(conf, sh, 6701 &conf->temp_inactive_list[hash]); 6702 spin_unlock_irq(&conf->device_lock); 6703 conf->retry_read_aligned = raid_bio; 6704 conf->retry_read_offset = scnt; 6705 return handled; 6706 } 6707 6708 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6709 handle_stripe(sh); 6710 raid5_release_stripe(sh); 6711 handled++; 6712 } 6713 6714 bio_endio(raid_bio); 6715 6716 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6717 wake_up(&conf->wait_for_quiescent); 6718 return handled; 6719 } 6720 6721 static int handle_active_stripes(struct r5conf *conf, int group, 6722 struct r5worker *worker, 6723 struct list_head *temp_inactive_list) 6724 __must_hold(&conf->device_lock) 6725 { 6726 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6727 int i, batch_size = 0, hash; 6728 bool release_inactive = false; 6729 6730 while (batch_size < MAX_STRIPE_BATCH && 6731 (sh = __get_priority_stripe(conf, group)) != NULL) 6732 batch[batch_size++] = sh; 6733 6734 if (batch_size == 0) { 6735 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6736 if (!list_empty(temp_inactive_list + i)) 6737 break; 6738 if (i == NR_STRIPE_HASH_LOCKS) { 6739 spin_unlock_irq(&conf->device_lock); 6740 log_flush_stripe_to_raid(conf); 6741 spin_lock_irq(&conf->device_lock); 6742 return batch_size; 6743 } 6744 release_inactive = true; 6745 } 6746 spin_unlock_irq(&conf->device_lock); 6747 6748 release_inactive_stripe_list(conf, temp_inactive_list, 6749 NR_STRIPE_HASH_LOCKS); 6750 6751 r5l_flush_stripe_to_raid(conf->log); 6752 if (release_inactive) { 6753 spin_lock_irq(&conf->device_lock); 6754 return 0; 6755 } 6756 6757 for (i = 0; i < batch_size; i++) 6758 handle_stripe(batch[i]); 6759 log_write_stripe_run(conf); 6760 6761 cond_resched(); 6762 6763 spin_lock_irq(&conf->device_lock); 6764 for (i = 0; i < batch_size; i++) { 6765 hash = batch[i]->hash_lock_index; 6766 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6767 } 6768 return batch_size; 6769 } 6770 6771 static void raid5_do_work(struct work_struct *work) 6772 { 6773 struct r5worker *worker = container_of(work, struct r5worker, work); 6774 struct r5worker_group *group = worker->group; 6775 struct r5conf *conf = group->conf; 6776 struct mddev *mddev = conf->mddev; 6777 int group_id = group - conf->worker_groups; 6778 int handled; 6779 struct blk_plug plug; 6780 6781 pr_debug("+++ raid5worker active\n"); 6782 6783 blk_start_plug(&plug); 6784 handled = 0; 6785 spin_lock_irq(&conf->device_lock); 6786 while (1) { 6787 int batch_size, released; 6788 6789 released = release_stripe_list(conf, worker->temp_inactive_list); 6790 6791 batch_size = handle_active_stripes(conf, group_id, worker, 6792 worker->temp_inactive_list); 6793 worker->working = false; 6794 if (!batch_size && !released) 6795 break; 6796 handled += batch_size; 6797 wait_event_lock_irq(mddev->sb_wait, 6798 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6799 conf->device_lock); 6800 } 6801 pr_debug("%d stripes handled\n", handled); 6802 6803 spin_unlock_irq(&conf->device_lock); 6804 6805 flush_deferred_bios(conf); 6806 6807 r5l_flush_stripe_to_raid(conf->log); 6808 6809 async_tx_issue_pending_all(); 6810 blk_finish_plug(&plug); 6811 6812 pr_debug("--- raid5worker inactive\n"); 6813 } 6814 6815 /* 6816 * This is our raid5 kernel thread. 6817 * 6818 * We scan the hash table for stripes which can be handled now. 6819 * During the scan, completed stripes are saved for us by the interrupt 6820 * handler, so that they will not have to wait for our next wakeup. 6821 */ 6822 static void raid5d(struct md_thread *thread) 6823 { 6824 struct mddev *mddev = thread->mddev; 6825 struct r5conf *conf = mddev->private; 6826 int handled; 6827 struct blk_plug plug; 6828 6829 pr_debug("+++ raid5d active\n"); 6830 6831 md_check_recovery(mddev); 6832 6833 blk_start_plug(&plug); 6834 handled = 0; 6835 spin_lock_irq(&conf->device_lock); 6836 while (1) { 6837 struct bio *bio; 6838 int batch_size, released; 6839 unsigned int offset; 6840 6841 if (md_is_rdwr(mddev) && 6842 test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6843 break; 6844 6845 released = release_stripe_list(conf, conf->temp_inactive_list); 6846 if (released) 6847 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6848 6849 if ( 6850 !list_empty(&conf->bitmap_list)) { 6851 /* Now is a good time to flush some bitmap updates */ 6852 conf->seq_flush++; 6853 spin_unlock_irq(&conf->device_lock); 6854 if (md_bitmap_enabled(mddev, true)) 6855 mddev->bitmap_ops->unplug(mddev, true); 6856 spin_lock_irq(&conf->device_lock); 6857 conf->seq_write = conf->seq_flush; 6858 activate_bit_delay(conf, conf->temp_inactive_list); 6859 } 6860 raid5_activate_delayed(conf); 6861 6862 while ((bio = remove_bio_from_retry(conf, &offset))) { 6863 int ok; 6864 spin_unlock_irq(&conf->device_lock); 6865 ok = retry_aligned_read(conf, bio, offset); 6866 spin_lock_irq(&conf->device_lock); 6867 if (!ok) 6868 break; 6869 handled++; 6870 } 6871 6872 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6873 conf->temp_inactive_list); 6874 if (!batch_size && !released) 6875 break; 6876 handled += batch_size; 6877 6878 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6879 spin_unlock_irq(&conf->device_lock); 6880 md_check_recovery(mddev); 6881 spin_lock_irq(&conf->device_lock); 6882 } 6883 } 6884 pr_debug("%d stripes handled\n", handled); 6885 6886 spin_unlock_irq(&conf->device_lock); 6887 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6888 mutex_trylock(&conf->cache_size_mutex)) { 6889 grow_one_stripe(conf, __GFP_NOWARN); 6890 /* Set flag even if allocation failed. This helps 6891 * slow down allocation requests when mem is short 6892 */ 6893 set_bit(R5_DID_ALLOC, &conf->cache_state); 6894 mutex_unlock(&conf->cache_size_mutex); 6895 } 6896 6897 flush_deferred_bios(conf); 6898 6899 r5l_flush_stripe_to_raid(conf->log); 6900 6901 async_tx_issue_pending_all(); 6902 blk_finish_plug(&plug); 6903 6904 pr_debug("--- raid5d inactive\n"); 6905 } 6906 6907 static ssize_t 6908 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6909 { 6910 struct r5conf *conf; 6911 int ret = 0; 6912 spin_lock(&mddev->lock); 6913 conf = mddev->private; 6914 if (conf) 6915 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6916 spin_unlock(&mddev->lock); 6917 return ret; 6918 } 6919 6920 int 6921 raid5_set_cache_size(struct mddev *mddev, int size) 6922 { 6923 int result = 0; 6924 struct r5conf *conf = mddev->private; 6925 6926 if (size <= 16 || size > 32768) 6927 return -EINVAL; 6928 6929 WRITE_ONCE(conf->min_nr_stripes, size); 6930 mutex_lock(&conf->cache_size_mutex); 6931 while (size < conf->max_nr_stripes && 6932 drop_one_stripe(conf)) 6933 ; 6934 mutex_unlock(&conf->cache_size_mutex); 6935 6936 md_allow_write(mddev); 6937 6938 mutex_lock(&conf->cache_size_mutex); 6939 while (size > conf->max_nr_stripes) 6940 if (!grow_one_stripe(conf, GFP_KERNEL)) { 6941 WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes); 6942 result = -ENOMEM; 6943 break; 6944 } 6945 mutex_unlock(&conf->cache_size_mutex); 6946 6947 return result; 6948 } 6949 EXPORT_SYMBOL(raid5_set_cache_size); 6950 6951 static ssize_t 6952 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6953 { 6954 struct r5conf *conf; 6955 unsigned long new; 6956 int err; 6957 6958 if (len >= PAGE_SIZE) 6959 return -EINVAL; 6960 if (kstrtoul(page, 10, &new)) 6961 return -EINVAL; 6962 err = mddev_lock(mddev); 6963 if (err) 6964 return err; 6965 conf = mddev->private; 6966 if (!conf) 6967 err = -ENODEV; 6968 else 6969 err = raid5_set_cache_size(mddev, new); 6970 mddev_unlock(mddev); 6971 6972 return err ?: len; 6973 } 6974 6975 static struct md_sysfs_entry 6976 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6977 raid5_show_stripe_cache_size, 6978 raid5_store_stripe_cache_size); 6979 6980 static ssize_t 6981 raid5_show_rmw_level(struct mddev *mddev, char *page) 6982 { 6983 struct r5conf *conf = mddev->private; 6984 if (conf) 6985 return sprintf(page, "%d\n", conf->rmw_level); 6986 else 6987 return 0; 6988 } 6989 6990 static ssize_t 6991 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6992 { 6993 struct r5conf *conf = mddev->private; 6994 unsigned long new; 6995 6996 if (!conf) 6997 return -ENODEV; 6998 6999 if (len >= PAGE_SIZE) 7000 return -EINVAL; 7001 7002 if (kstrtoul(page, 10, &new)) 7003 return -EINVAL; 7004 7005 if (new != PARITY_DISABLE_RMW && !raid6_can_xor_syndrome()) 7006 return -EINVAL; 7007 7008 if (new != PARITY_DISABLE_RMW && 7009 new != PARITY_ENABLE_RMW && 7010 new != PARITY_PREFER_RMW) 7011 return -EINVAL; 7012 7013 conf->rmw_level = new; 7014 return len; 7015 } 7016 7017 static struct md_sysfs_entry 7018 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 7019 raid5_show_rmw_level, 7020 raid5_store_rmw_level); 7021 7022 static ssize_t 7023 raid5_show_stripe_size(struct mddev *mddev, char *page) 7024 { 7025 struct r5conf *conf; 7026 int ret = 0; 7027 7028 spin_lock(&mddev->lock); 7029 conf = mddev->private; 7030 if (conf) 7031 ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf)); 7032 spin_unlock(&mddev->lock); 7033 return ret; 7034 } 7035 7036 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 7037 static ssize_t 7038 raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) 7039 { 7040 struct r5conf *conf; 7041 unsigned long new; 7042 int err; 7043 int size; 7044 7045 if (len >= PAGE_SIZE) 7046 return -EINVAL; 7047 if (kstrtoul(page, 10, &new)) 7048 return -EINVAL; 7049 7050 /* 7051 * The value should not be bigger than PAGE_SIZE. It requires to 7052 * be multiple of DEFAULT_STRIPE_SIZE and the value should be power 7053 * of two. 7054 */ 7055 if (new % DEFAULT_STRIPE_SIZE != 0 || 7056 new > PAGE_SIZE || new == 0 || 7057 new != roundup_pow_of_two(new)) 7058 return -EINVAL; 7059 7060 err = mddev_suspend_and_lock(mddev); 7061 if (err) 7062 return err; 7063 7064 conf = mddev->private; 7065 if (!conf) { 7066 err = -ENODEV; 7067 goto out_unlock; 7068 } 7069 7070 if (new == conf->stripe_size) 7071 goto out_unlock; 7072 7073 pr_debug("md/raid: change stripe_size from %lu to %lu\n", 7074 conf->stripe_size, new); 7075 7076 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7077 mddev->reshape_position != MaxSector || mddev->sysfs_active) { 7078 err = -EBUSY; 7079 goto out_unlock; 7080 } 7081 7082 mutex_lock(&conf->cache_size_mutex); 7083 size = conf->max_nr_stripes; 7084 7085 shrink_stripes(conf); 7086 7087 conf->stripe_size = new; 7088 conf->stripe_shift = ilog2(new) - 9; 7089 conf->stripe_sectors = new >> 9; 7090 if (grow_stripes(conf, size)) { 7091 pr_warn("md/raid:%s: couldn't allocate buffers\n", 7092 mdname(mddev)); 7093 err = -ENOMEM; 7094 } 7095 mutex_unlock(&conf->cache_size_mutex); 7096 7097 out_unlock: 7098 mddev_unlock_and_resume(mddev); 7099 return err ?: len; 7100 } 7101 7102 static struct md_sysfs_entry 7103 raid5_stripe_size = __ATTR(stripe_size, 0644, 7104 raid5_show_stripe_size, 7105 raid5_store_stripe_size); 7106 #else 7107 static struct md_sysfs_entry 7108 raid5_stripe_size = __ATTR(stripe_size, 0444, 7109 raid5_show_stripe_size, 7110 NULL); 7111 #endif 7112 7113 static ssize_t 7114 raid5_show_preread_threshold(struct mddev *mddev, char *page) 7115 { 7116 struct r5conf *conf; 7117 int ret = 0; 7118 spin_lock(&mddev->lock); 7119 conf = mddev->private; 7120 if (conf) 7121 ret = sprintf(page, "%d\n", conf->bypass_threshold); 7122 spin_unlock(&mddev->lock); 7123 return ret; 7124 } 7125 7126 static ssize_t 7127 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 7128 { 7129 struct r5conf *conf; 7130 unsigned long new; 7131 int err; 7132 7133 if (len >= PAGE_SIZE) 7134 return -EINVAL; 7135 if (kstrtoul(page, 10, &new)) 7136 return -EINVAL; 7137 7138 err = mddev_lock(mddev); 7139 if (err) 7140 return err; 7141 conf = mddev->private; 7142 if (!conf) 7143 err = -ENODEV; 7144 else if (new > conf->min_nr_stripes) 7145 err = -EINVAL; 7146 else 7147 conf->bypass_threshold = new; 7148 mddev_unlock(mddev); 7149 return err ?: len; 7150 } 7151 7152 static struct md_sysfs_entry 7153 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 7154 S_IRUGO | S_IWUSR, 7155 raid5_show_preread_threshold, 7156 raid5_store_preread_threshold); 7157 7158 static ssize_t 7159 raid5_show_skip_copy(struct mddev *mddev, char *page) 7160 { 7161 struct r5conf *conf; 7162 int ret = 0; 7163 spin_lock(&mddev->lock); 7164 conf = mddev->private; 7165 if (conf) 7166 ret = sprintf(page, "%d\n", conf->skip_copy); 7167 spin_unlock(&mddev->lock); 7168 return ret; 7169 } 7170 7171 static ssize_t 7172 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 7173 { 7174 struct r5conf *conf; 7175 unsigned long new; 7176 int err; 7177 7178 if (len >= PAGE_SIZE) 7179 return -EINVAL; 7180 if (kstrtoul(page, 10, &new)) 7181 return -EINVAL; 7182 new = !!new; 7183 7184 err = mddev_suspend_and_lock(mddev); 7185 if (err) 7186 return err; 7187 conf = mddev->private; 7188 if (!conf) 7189 err = -ENODEV; 7190 else if (new != conf->skip_copy) { 7191 struct request_queue *q = mddev->gendisk->queue; 7192 struct queue_limits lim = queue_limits_start_update(q); 7193 7194 conf->skip_copy = new; 7195 if (new) 7196 lim.features |= BLK_FEAT_STABLE_WRITES; 7197 else 7198 lim.features &= ~BLK_FEAT_STABLE_WRITES; 7199 err = queue_limits_commit_update(q, &lim); 7200 } 7201 mddev_unlock_and_resume(mddev); 7202 return err ?: len; 7203 } 7204 7205 static struct md_sysfs_entry 7206 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 7207 raid5_show_skip_copy, 7208 raid5_store_skip_copy); 7209 7210 static ssize_t 7211 stripe_cache_active_show(struct mddev *mddev, char *page) 7212 { 7213 struct r5conf *conf = mddev->private; 7214 if (conf) 7215 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 7216 else 7217 return 0; 7218 } 7219 7220 static struct md_sysfs_entry 7221 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 7222 7223 static ssize_t 7224 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 7225 { 7226 struct r5conf *conf; 7227 int ret = 0; 7228 spin_lock(&mddev->lock); 7229 conf = mddev->private; 7230 if (conf) 7231 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 7232 spin_unlock(&mddev->lock); 7233 return ret; 7234 } 7235 7236 static int alloc_thread_groups(struct r5conf *conf, int cnt, 7237 int *group_cnt, 7238 struct r5worker_group **worker_groups); 7239 static ssize_t 7240 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 7241 { 7242 struct r5conf *conf; 7243 unsigned int new; 7244 int err; 7245 struct r5worker_group *new_groups, *old_groups; 7246 int group_cnt; 7247 7248 if (len >= PAGE_SIZE) 7249 return -EINVAL; 7250 if (kstrtouint(page, 10, &new)) 7251 return -EINVAL; 7252 /* 8192 should be big enough */ 7253 if (new > 8192) 7254 return -EINVAL; 7255 7256 err = mddev_suspend_and_lock(mddev); 7257 if (err) 7258 return err; 7259 conf = mddev->private; 7260 if (!conf) { 7261 mddev_unlock_and_resume(mddev); 7262 return -ENODEV; 7263 } 7264 raid5_quiesce(mddev, true); 7265 7266 if (new != conf->worker_cnt_per_group) { 7267 old_groups = conf->worker_groups; 7268 if (old_groups) 7269 flush_workqueue(raid5_wq); 7270 7271 err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); 7272 if (!err) { 7273 spin_lock_irq(&conf->device_lock); 7274 conf->group_cnt = group_cnt; 7275 conf->worker_cnt_per_group = new; 7276 conf->worker_groups = new_groups; 7277 spin_unlock_irq(&conf->device_lock); 7278 7279 if (old_groups) 7280 kfree(old_groups[0].workers); 7281 kfree(old_groups); 7282 } 7283 } 7284 7285 raid5_quiesce(mddev, false); 7286 mddev_unlock_and_resume(mddev); 7287 7288 return err ?: len; 7289 } 7290 7291 static struct md_sysfs_entry 7292 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 7293 raid5_show_group_thread_cnt, 7294 raid5_store_group_thread_cnt); 7295 7296 static struct attribute *raid5_attrs[] = { 7297 &raid5_stripecache_size.attr, 7298 &raid5_stripecache_active.attr, 7299 &raid5_preread_bypass_threshold.attr, 7300 &raid5_group_thread_cnt.attr, 7301 &raid5_skip_copy.attr, 7302 &raid5_rmw_level.attr, 7303 &raid5_stripe_size.attr, 7304 &r5c_journal_mode.attr, 7305 &ppl_write_hint.attr, 7306 NULL, 7307 }; 7308 static const struct attribute_group raid5_attrs_group = { 7309 .name = NULL, 7310 .attrs = raid5_attrs, 7311 }; 7312 7313 static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, 7314 struct r5worker_group **worker_groups) 7315 { 7316 int i, j, k; 7317 ssize_t size; 7318 struct r5worker *workers; 7319 7320 if (cnt == 0) { 7321 *group_cnt = 0; 7322 *worker_groups = NULL; 7323 return 0; 7324 } 7325 *group_cnt = num_possible_nodes(); 7326 size = sizeof(struct r5worker) * cnt; 7327 workers = kcalloc(size, *group_cnt, GFP_NOIO); 7328 *worker_groups = kzalloc_objs(struct r5worker_group, *group_cnt, 7329 GFP_NOIO); 7330 if (!*worker_groups || !workers) { 7331 kfree(workers); 7332 kfree(*worker_groups); 7333 return -ENOMEM; 7334 } 7335 7336 for (i = 0; i < *group_cnt; i++) { 7337 struct r5worker_group *group; 7338 7339 group = &(*worker_groups)[i]; 7340 INIT_LIST_HEAD(&group->handle_list); 7341 INIT_LIST_HEAD(&group->loprio_list); 7342 group->conf = conf; 7343 group->workers = workers + i * cnt; 7344 7345 for (j = 0; j < cnt; j++) { 7346 struct r5worker *worker = group->workers + j; 7347 worker->group = group; 7348 INIT_WORK(&worker->work, raid5_do_work); 7349 7350 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 7351 INIT_LIST_HEAD(worker->temp_inactive_list + k); 7352 } 7353 } 7354 7355 return 0; 7356 } 7357 7358 static void free_thread_groups(struct r5conf *conf) 7359 { 7360 if (conf->worker_groups) 7361 kfree(conf->worker_groups[0].workers); 7362 kfree(conf->worker_groups); 7363 conf->worker_groups = NULL; 7364 } 7365 7366 static sector_t 7367 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 7368 { 7369 struct r5conf *conf = mddev->private; 7370 7371 if (!sectors) 7372 sectors = mddev->dev_sectors; 7373 if (!raid_disks) 7374 /* size is defined by the smallest of previous and new size */ 7375 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 7376 7377 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7378 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 7379 return sectors * (raid_disks - conf->max_degraded); 7380 } 7381 7382 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 7383 { 7384 safe_put_page(percpu->spare_page); 7385 percpu->spare_page = NULL; 7386 kvfree(percpu->scribble); 7387 percpu->scribble = NULL; 7388 } 7389 7390 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 7391 { 7392 if (conf->level == 6 && !percpu->spare_page) { 7393 percpu->spare_page = alloc_page(GFP_KERNEL); 7394 if (!percpu->spare_page) 7395 return -ENOMEM; 7396 } 7397 7398 if (scribble_alloc(percpu, 7399 max(conf->raid_disks, 7400 conf->previous_raid_disks), 7401 max(conf->chunk_sectors, 7402 conf->prev_chunk_sectors) 7403 / RAID5_STRIPE_SECTORS(conf))) { 7404 free_scratch_buffer(conf, percpu); 7405 return -ENOMEM; 7406 } 7407 7408 local_lock_init(&percpu->lock); 7409 return 0; 7410 } 7411 7412 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 7413 { 7414 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 7415 7416 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 7417 return 0; 7418 } 7419 7420 static void raid5_free_percpu(struct r5conf *conf) 7421 { 7422 if (!conf->percpu) 7423 return; 7424 7425 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 7426 free_percpu(conf->percpu); 7427 } 7428 7429 static void free_conf(struct r5conf *conf) 7430 { 7431 int i; 7432 7433 log_exit(conf); 7434 7435 shrinker_free(conf->shrinker); 7436 free_thread_groups(conf); 7437 shrink_stripes(conf); 7438 raid5_free_percpu(conf); 7439 for (i = 0; i < conf->pool_size; i++) 7440 if (conf->disks[i].extra_page) 7441 put_page(conf->disks[i].extra_page); 7442 kfree(conf->disks); 7443 bioset_exit(&conf->bio_split); 7444 kfree(conf->stripe_hashtbl); 7445 kfree(conf->pending_data); 7446 7447 mempool_destroy(conf->ctx_pool); 7448 7449 kfree(conf); 7450 } 7451 7452 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 7453 { 7454 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 7455 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 7456 7457 if (alloc_scratch_buffer(conf, percpu)) { 7458 pr_warn("%s: failed memory allocation for cpu%u\n", 7459 __func__, cpu); 7460 return -ENOMEM; 7461 } 7462 return 0; 7463 } 7464 7465 static int raid5_alloc_percpu(struct r5conf *conf) 7466 { 7467 int err = 0; 7468 7469 conf->percpu = alloc_percpu(struct raid5_percpu); 7470 if (!conf->percpu) 7471 return -ENOMEM; 7472 7473 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 7474 if (!err) { 7475 conf->scribble_disks = max(conf->raid_disks, 7476 conf->previous_raid_disks); 7477 conf->scribble_sectors = max(conf->chunk_sectors, 7478 conf->prev_chunk_sectors); 7479 } 7480 return err; 7481 } 7482 7483 static unsigned long raid5_cache_scan(struct shrinker *shrink, 7484 struct shrink_control *sc) 7485 { 7486 struct r5conf *conf = shrink->private_data; 7487 unsigned long ret = SHRINK_STOP; 7488 7489 if (mutex_trylock(&conf->cache_size_mutex)) { 7490 ret= 0; 7491 while (ret < sc->nr_to_scan && 7492 conf->max_nr_stripes > conf->min_nr_stripes) { 7493 if (drop_one_stripe(conf) == 0) { 7494 ret = SHRINK_STOP; 7495 break; 7496 } 7497 ret++; 7498 } 7499 mutex_unlock(&conf->cache_size_mutex); 7500 } 7501 return ret; 7502 } 7503 7504 static unsigned long raid5_cache_count(struct shrinker *shrink, 7505 struct shrink_control *sc) 7506 { 7507 struct r5conf *conf = shrink->private_data; 7508 int max_stripes = READ_ONCE(conf->max_nr_stripes); 7509 int min_stripes = READ_ONCE(conf->min_nr_stripes); 7510 7511 if (max_stripes < min_stripes) 7512 /* unlikely, but not impossible */ 7513 return 0; 7514 return max_stripes - min_stripes; 7515 } 7516 7517 static struct r5conf *setup_conf(struct mddev *mddev) 7518 { 7519 struct r5conf *conf; 7520 int raid_disk, memory, max_disks; 7521 struct md_rdev *rdev; 7522 struct disk_info *disk; 7523 char pers_name[6]; 7524 int i; 7525 int group_cnt; 7526 struct r5worker_group *new_group; 7527 int ret = -ENOMEM; 7528 7529 if (mddev->new_level != 5 7530 && mddev->new_level != 4 7531 && mddev->new_level != 6) { 7532 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 7533 mdname(mddev), mddev->new_level); 7534 return ERR_PTR(-EIO); 7535 } 7536 if ((mddev->new_level == 5 7537 && !algorithm_valid_raid5(mddev->new_layout)) || 7538 (mddev->new_level == 6 7539 && !algorithm_valid_raid6(mddev->new_layout))) { 7540 pr_warn("md/raid:%s: layout %d not supported\n", 7541 mdname(mddev), mddev->new_layout); 7542 return ERR_PTR(-EIO); 7543 } 7544 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 7545 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 7546 mdname(mddev), mddev->raid_disks); 7547 return ERR_PTR(-EINVAL); 7548 } 7549 7550 if (!mddev->new_chunk_sectors || 7551 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 7552 !is_power_of_2(mddev->new_chunk_sectors)) { 7553 pr_warn("md/raid:%s: invalid chunk size %d\n", 7554 mdname(mddev), mddev->new_chunk_sectors << 9); 7555 return ERR_PTR(-EINVAL); 7556 } 7557 7558 conf = kzalloc_obj(struct r5conf); 7559 if (conf == NULL) 7560 goto abort; 7561 7562 #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 7563 conf->stripe_size = DEFAULT_STRIPE_SIZE; 7564 conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9; 7565 conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9; 7566 #endif 7567 INIT_LIST_HEAD(&conf->free_list); 7568 INIT_LIST_HEAD(&conf->pending_list); 7569 conf->pending_data = kzalloc_objs(struct r5pending_data, PENDING_IO_MAX); 7570 if (!conf->pending_data) 7571 goto abort; 7572 for (i = 0; i < PENDING_IO_MAX; i++) 7573 list_add(&conf->pending_data[i].sibling, &conf->free_list); 7574 /* Don't enable multi-threading by default*/ 7575 if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { 7576 conf->group_cnt = group_cnt; 7577 conf->worker_cnt_per_group = 0; 7578 conf->worker_groups = new_group; 7579 } else 7580 goto abort; 7581 spin_lock_init(&conf->device_lock); 7582 seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock); 7583 mutex_init(&conf->cache_size_mutex); 7584 7585 init_waitqueue_head(&conf->wait_for_quiescent); 7586 init_waitqueue_head(&conf->wait_for_stripe); 7587 init_waitqueue_head(&conf->wait_for_reshape); 7588 INIT_LIST_HEAD(&conf->handle_list); 7589 INIT_LIST_HEAD(&conf->loprio_list); 7590 INIT_LIST_HEAD(&conf->hold_list); 7591 INIT_LIST_HEAD(&conf->delayed_list); 7592 INIT_LIST_HEAD(&conf->bitmap_list); 7593 init_llist_head(&conf->released_stripes); 7594 atomic_set(&conf->active_stripes, 0); 7595 atomic_set(&conf->preread_active_stripes, 0); 7596 atomic_set(&conf->active_aligned_reads, 0); 7597 spin_lock_init(&conf->pending_bios_lock); 7598 conf->batch_bio_dispatch = true; 7599 rdev_for_each(rdev, mddev) { 7600 if (test_bit(Journal, &rdev->flags)) 7601 continue; 7602 if (!bdev_rot(rdev->bdev)) { 7603 conf->batch_bio_dispatch = false; 7604 break; 7605 } 7606 } 7607 7608 conf->bypass_threshold = BYPASS_THRESHOLD; 7609 conf->raid_disks = mddev->raid_disks; 7610 if (mddev->reshape_position == MaxSector) 7611 conf->previous_raid_disks = mddev->raid_disks; 7612 else 7613 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 7614 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 7615 7616 conf->disks = kzalloc_objs(struct disk_info, max_disks); 7617 7618 if (!conf->disks) 7619 goto abort; 7620 7621 for (i = 0; i < max_disks; i++) { 7622 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 7623 if (!conf->disks[i].extra_page) 7624 goto abort; 7625 } 7626 7627 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 7628 if (ret) 7629 goto abort; 7630 conf->mddev = mddev; 7631 7632 ret = -ENOMEM; 7633 conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL); 7634 if (!conf->stripe_hashtbl) 7635 goto abort; 7636 7637 /* We init hash_locks[0] separately to that it can be used 7638 * as the reference lock in the spin_lock_nest_lock() call 7639 * in lock_all_device_hash_locks_irq in order to convince 7640 * lockdep that we know what we are doing. 7641 */ 7642 spin_lock_init(conf->hash_locks); 7643 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 7644 spin_lock_init(conf->hash_locks + i); 7645 7646 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 7647 INIT_LIST_HEAD(conf->inactive_list + i); 7648 7649 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 7650 INIT_LIST_HEAD(conf->temp_inactive_list + i); 7651 7652 atomic_set(&conf->r5c_cached_full_stripes, 0); 7653 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 7654 atomic_set(&conf->r5c_cached_partial_stripes, 0); 7655 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 7656 atomic_set(&conf->r5c_flushing_full_stripes, 0); 7657 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 7658 7659 conf->level = mddev->new_level; 7660 conf->chunk_sectors = mddev->new_chunk_sectors; 7661 ret = raid5_alloc_percpu(conf); 7662 if (ret) 7663 goto abort; 7664 7665 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 7666 7667 ret = -EIO; 7668 rdev_for_each(rdev, mddev) { 7669 raid_disk = rdev->raid_disk; 7670 if (raid_disk >= max_disks 7671 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 7672 continue; 7673 disk = conf->disks + raid_disk; 7674 7675 if (test_bit(Replacement, &rdev->flags)) { 7676 if (disk->replacement) 7677 goto abort; 7678 disk->replacement = rdev; 7679 } else { 7680 if (disk->rdev) 7681 goto abort; 7682 disk->rdev = rdev; 7683 } 7684 7685 if (test_bit(In_sync, &rdev->flags)) { 7686 pr_info("md/raid:%s: device %pg operational as raid disk %d\n", 7687 mdname(mddev), rdev->bdev, raid_disk); 7688 } else if (rdev->saved_raid_disk != raid_disk) 7689 /* Cannot rely on bitmap to complete recovery */ 7690 conf->fullsync = 1; 7691 } 7692 7693 conf->level = mddev->new_level; 7694 if (conf->level == 6) { 7695 conf->max_degraded = 2; 7696 if (raid6_can_xor_syndrome()) 7697 conf->rmw_level = PARITY_ENABLE_RMW; 7698 else 7699 conf->rmw_level = PARITY_DISABLE_RMW; 7700 } else { 7701 conf->max_degraded = 1; 7702 conf->rmw_level = PARITY_ENABLE_RMW; 7703 } 7704 conf->algorithm = mddev->new_layout; 7705 conf->reshape_progress = mddev->reshape_position; 7706 if (conf->reshape_progress != MaxSector) { 7707 conf->prev_chunk_sectors = mddev->chunk_sectors; 7708 conf->prev_algo = mddev->layout; 7709 } else { 7710 conf->prev_chunk_sectors = conf->chunk_sectors; 7711 conf->prev_algo = conf->algorithm; 7712 } 7713 7714 conf->min_nr_stripes = NR_STRIPES; 7715 if (mddev->reshape_position != MaxSector) { 7716 int stripes = max_t(int, 7717 ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4, 7718 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4); 7719 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7720 if (conf->min_nr_stripes != NR_STRIPES) 7721 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7722 mdname(mddev), conf->min_nr_stripes); 7723 } 7724 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7725 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7726 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7727 if (grow_stripes(conf, conf->min_nr_stripes)) { 7728 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 7729 mdname(mddev), memory); 7730 ret = -ENOMEM; 7731 goto abort; 7732 } else 7733 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7734 /* 7735 * Losing a stripe head costs more than the time to refill it, 7736 * it reduces the queue depth and so can hurt throughput. 7737 * So set it rather large, scaled by number of devices. 7738 */ 7739 conf->shrinker = shrinker_alloc(0, "md-raid5:%s", mdname(mddev)); 7740 if (!conf->shrinker) { 7741 ret = -ENOMEM; 7742 pr_warn("md/raid:%s: couldn't allocate shrinker.\n", 7743 mdname(mddev)); 7744 goto abort; 7745 } 7746 7747 conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7748 conf->shrinker->scan_objects = raid5_cache_scan; 7749 conf->shrinker->count_objects = raid5_cache_count; 7750 conf->shrinker->batch = 128; 7751 conf->shrinker->private_data = conf; 7752 7753 shrinker_register(conf->shrinker); 7754 7755 sprintf(pers_name, "raid%d", mddev->new_level); 7756 rcu_assign_pointer(conf->thread, 7757 md_register_thread(raid5d, mddev, pers_name)); 7758 if (!conf->thread) { 7759 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7760 mdname(mddev)); 7761 ret = -ENOMEM; 7762 goto abort; 7763 } 7764 7765 return conf; 7766 7767 abort: 7768 if (conf) 7769 free_conf(conf); 7770 return ERR_PTR(ret); 7771 } 7772 7773 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7774 { 7775 switch (algo) { 7776 case ALGORITHM_PARITY_0: 7777 if (raid_disk < max_degraded) 7778 return 1; 7779 break; 7780 case ALGORITHM_PARITY_N: 7781 if (raid_disk >= raid_disks - max_degraded) 7782 return 1; 7783 break; 7784 case ALGORITHM_PARITY_0_6: 7785 if (raid_disk == 0 || 7786 raid_disk == raid_disks - 1) 7787 return 1; 7788 break; 7789 case ALGORITHM_LEFT_ASYMMETRIC_6: 7790 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7791 case ALGORITHM_LEFT_SYMMETRIC_6: 7792 case ALGORITHM_RIGHT_SYMMETRIC_6: 7793 if (raid_disk == raid_disks - 1) 7794 return 1; 7795 } 7796 return 0; 7797 } 7798 7799 static int raid5_create_ctx_pool(struct r5conf *conf) 7800 { 7801 struct stripe_request_ctx *ctx; 7802 int size; 7803 7804 if (mddev_is_dm(conf->mddev)) 7805 size = BITS_TO_LONGS(RAID5_MAX_REQ_STRIPES); 7806 else 7807 size = BITS_TO_LONGS( 7808 queue_max_hw_sectors(conf->mddev->gendisk->queue) >> 7809 RAID5_STRIPE_SHIFT(conf)); 7810 7811 conf->ctx_size = struct_size(ctx, sectors_to_do, size); 7812 conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, 7813 conf->ctx_size); 7814 7815 return conf->ctx_pool ? 0 : -ENOMEM; 7816 } 7817 7818 static int raid5_set_limits(struct mddev *mddev) 7819 { 7820 struct r5conf *conf = mddev->private; 7821 struct queue_limits lim; 7822 int data_disks, stripe; 7823 struct md_rdev *rdev; 7824 7825 /* 7826 * The read-ahead size must cover two whole stripes, which is 7827 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices. 7828 */ 7829 data_disks = conf->previous_raid_disks - conf->max_degraded; 7830 7831 /* 7832 * We can only discard a whole stripe. It doesn't make sense to 7833 * discard data disk but write parity disk 7834 */ 7835 stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); 7836 7837 md_init_stacking_limits(&lim); 7838 lim.logical_block_size = mddev->logical_block_size; 7839 lim.io_min = mddev->chunk_sectors << 9; 7840 lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); 7841 lim.chunk_sectors = lim.io_opt >> 9; 7842 lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; 7843 lim.discard_granularity = stripe; 7844 lim.max_write_zeroes_sectors = 0; 7845 lim.max_hw_wzeroes_unmap_sectors = 0; 7846 mddev_stack_rdev_limits(mddev, &lim, 0); 7847 rdev_for_each(rdev, mddev) 7848 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, 7849 mddev->gendisk->disk_name); 7850 7851 if (!devices_handle_discard_safely || 7852 lim.max_discard_sectors < (stripe >> 9) || 7853 lim.discard_granularity < stripe) 7854 conf->raid5_discard_unsupported = true; 7855 else 7856 conf->raid5_discard_unsupported = false; 7857 7858 /* 7859 * Requests require having a bitmap for each stripe. 7860 * Limit the max sectors based on this. 7861 */ 7862 lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf); 7863 if ((lim.max_hw_sectors << 9) < lim.io_opt) 7864 lim.max_hw_sectors = lim.io_opt >> 9; 7865 lim.max_hw_discard_sectors = UINT_MAX; 7866 7867 /* No restrictions on the number of segments in the request */ 7868 lim.max_segments = USHRT_MAX; 7869 7870 return queue_limits_set(mddev->gendisk->queue, &lim); 7871 } 7872 7873 static int raid5_run(struct mddev *mddev) 7874 { 7875 struct r5conf *conf; 7876 int dirty_parity_disks = 0; 7877 struct md_rdev *rdev; 7878 struct md_rdev *journal_dev = NULL; 7879 sector_t reshape_offset = 0; 7880 int i; 7881 long long min_offset_diff = 0; 7882 int first = 1; 7883 int ret = -EIO; 7884 7885 if (mddev->resync_offset != MaxSector) 7886 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7887 mdname(mddev)); 7888 7889 rdev_for_each(rdev, mddev) { 7890 long long diff; 7891 7892 if (test_bit(Journal, &rdev->flags)) { 7893 journal_dev = rdev; 7894 continue; 7895 } 7896 if (rdev->raid_disk < 0) 7897 continue; 7898 diff = (rdev->new_data_offset - rdev->data_offset); 7899 if (first) { 7900 min_offset_diff = diff; 7901 first = 0; 7902 } else if (mddev->reshape_backwards && 7903 diff < min_offset_diff) 7904 min_offset_diff = diff; 7905 else if (!mddev->reshape_backwards && 7906 diff > min_offset_diff) 7907 min_offset_diff = diff; 7908 } 7909 7910 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && 7911 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { 7912 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", 7913 mdname(mddev)); 7914 return -EINVAL; 7915 } 7916 7917 if (mddev->reshape_position != MaxSector) { 7918 /* Check that we can continue the reshape. 7919 * Difficulties arise if the stripe we would write to 7920 * next is at or after the stripe we would read from next. 7921 * For a reshape that changes the number of devices, this 7922 * is only possible for a very short time, and mdadm makes 7923 * sure that time appears to have past before assembling 7924 * the array. So we fail if that time hasn't passed. 7925 * For a reshape that keeps the number of devices the same 7926 * mdadm must be monitoring the reshape can keeping the 7927 * critical areas read-only and backed up. It will start 7928 * the array in read-only mode, so we check for that. 7929 */ 7930 sector_t here_new, here_old; 7931 int old_disks; 7932 int max_degraded = (mddev->level == 6 ? 2 : 1); 7933 int chunk_sectors; 7934 int new_data_disks; 7935 7936 if (journal_dev) { 7937 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7938 mdname(mddev)); 7939 return -EINVAL; 7940 } 7941 7942 if (mddev->new_level != mddev->level) { 7943 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7944 mdname(mddev)); 7945 return -EINVAL; 7946 } 7947 old_disks = mddev->raid_disks - mddev->delta_disks; 7948 /* reshape_position must be on a new-stripe boundary, and one 7949 * further up in new geometry must map after here in old 7950 * geometry. 7951 * If the chunk sizes are different, then as we perform reshape 7952 * in units of the largest of the two, reshape_position needs 7953 * be a multiple of the largest chunk size times new data disks. 7954 */ 7955 here_new = mddev->reshape_position; 7956 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 7957 new_data_disks = mddev->raid_disks - max_degraded; 7958 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7959 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7960 mdname(mddev)); 7961 return -EINVAL; 7962 } 7963 reshape_offset = here_new * chunk_sectors; 7964 /* here_new is the stripe we will write to */ 7965 here_old = mddev->reshape_position; 7966 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 7967 /* here_old is the first stripe that we might need to read 7968 * from */ 7969 if (mddev->delta_disks == 0) { 7970 /* We cannot be sure it is safe to start an in-place 7971 * reshape. It is only safe if user-space is monitoring 7972 * and taking constant backups. 7973 * mdadm always starts a situation like this in 7974 * readonly mode so it can take control before 7975 * allowing any writes. So just check for that. 7976 */ 7977 if (abs(min_offset_diff) >= mddev->chunk_sectors && 7978 abs(min_offset_diff) >= mddev->new_chunk_sectors) 7979 /* not really in-place - so OK */; 7980 else if (mddev->ro == 0) { 7981 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7982 mdname(mddev)); 7983 return -EINVAL; 7984 } 7985 } else if (mddev->reshape_backwards 7986 ? (here_new * chunk_sectors + min_offset_diff <= 7987 here_old * chunk_sectors) 7988 : (here_new * chunk_sectors >= 7989 here_old * chunk_sectors + (-min_offset_diff))) { 7990 /* Reading from the same stripe as writing to - bad */ 7991 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7992 mdname(mddev)); 7993 return -EINVAL; 7994 } 7995 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7996 /* OK, we should be able to continue; */ 7997 } else { 7998 BUG_ON(mddev->level != mddev->new_level); 7999 BUG_ON(mddev->layout != mddev->new_layout); 8000 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 8001 BUG_ON(mddev->delta_disks != 0); 8002 } 8003 8004 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 8005 test_bit(MD_HAS_PPL, &mddev->flags)) { 8006 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 8007 mdname(mddev)); 8008 clear_bit(MD_HAS_PPL, &mddev->flags); 8009 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); 8010 } 8011 8012 if (mddev->private == NULL) 8013 conf = setup_conf(mddev); 8014 else 8015 conf = mddev->private; 8016 8017 if (IS_ERR(conf)) 8018 return PTR_ERR(conf); 8019 8020 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 8021 if (!journal_dev) { 8022 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 8023 mdname(mddev)); 8024 mddev->ro = 1; 8025 set_disk_ro(mddev->gendisk, 1); 8026 } else if (mddev->resync_offset == MaxSector) 8027 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 8028 } 8029 8030 conf->min_offset_diff = min_offset_diff; 8031 rcu_assign_pointer(mddev->thread, conf->thread); 8032 rcu_assign_pointer(conf->thread, NULL); 8033 mddev->private = conf; 8034 8035 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 8036 i++) { 8037 rdev = conf->disks[i].rdev; 8038 if (!rdev) 8039 continue; 8040 if (conf->disks[i].replacement && 8041 conf->reshape_progress != MaxSector) { 8042 /* replacements and reshape simply do not mix. */ 8043 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 8044 goto abort; 8045 } 8046 if (test_bit(In_sync, &rdev->flags)) 8047 continue; 8048 /* This disc is not fully in-sync. However if it 8049 * just stored parity (beyond the recovery_offset), 8050 * when we don't need to be concerned about the 8051 * array being dirty. 8052 * When reshape goes 'backwards', we never have 8053 * partially completed devices, so we only need 8054 * to worry about reshape going forwards. 8055 */ 8056 /* Hack because v0.91 doesn't store recovery_offset properly. */ 8057 if (mddev->major_version == 0 && 8058 mddev->minor_version > 90) 8059 rdev->recovery_offset = reshape_offset; 8060 8061 if (rdev->recovery_offset < reshape_offset) { 8062 /* We need to check old and new layout */ 8063 if (!only_parity(rdev->raid_disk, 8064 conf->algorithm, 8065 conf->raid_disks, 8066 conf->max_degraded)) 8067 continue; 8068 } 8069 if (!only_parity(rdev->raid_disk, 8070 conf->prev_algo, 8071 conf->previous_raid_disks, 8072 conf->max_degraded)) 8073 continue; 8074 dirty_parity_disks++; 8075 } 8076 8077 /* 8078 * 0 for a fully functional array, 1 or 2 for a degraded array. 8079 */ 8080 mddev->degraded = raid5_calc_degraded(conf); 8081 8082 if (has_failed(conf)) { 8083 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 8084 mdname(mddev), mddev->degraded, conf->raid_disks); 8085 goto abort; 8086 } 8087 8088 /* device size must be a multiple of chunk size */ 8089 mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1); 8090 mddev->resync_max_sectors = mddev->dev_sectors; 8091 8092 if (mddev->degraded > dirty_parity_disks && 8093 mddev->resync_offset != MaxSector) { 8094 if (test_bit(MD_HAS_PPL, &mddev->flags)) 8095 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 8096 mdname(mddev)); 8097 else if (mddev->ok_start_degraded) 8098 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 8099 mdname(mddev)); 8100 else { 8101 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 8102 mdname(mddev)); 8103 goto abort; 8104 } 8105 } 8106 8107 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 8108 mdname(mddev), conf->level, 8109 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 8110 mddev->new_layout); 8111 8112 print_raid5_conf(conf); 8113 8114 if (conf->reshape_progress != MaxSector) { 8115 conf->reshape_safe = conf->reshape_progress; 8116 atomic_set(&conf->reshape_stripes, 0); 8117 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8118 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8119 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8120 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8121 } 8122 8123 /* Ok, everything is just fine now */ 8124 if (mddev->to_remove == &raid5_attrs_group) 8125 mddev->to_remove = NULL; 8126 else if (mddev->kobj.sd && 8127 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 8128 pr_warn("raid5: failed to create sysfs attributes for %s\n", 8129 mdname(mddev)); 8130 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 8131 8132 if (!mddev_is_dm(mddev)) { 8133 ret = raid5_set_limits(mddev); 8134 if (ret) 8135 goto abort; 8136 } 8137 8138 ret = raid5_create_ctx_pool(conf); 8139 if (ret) 8140 goto abort; 8141 8142 ret = log_init(conf, journal_dev, raid5_has_ppl(conf)); 8143 if (ret) 8144 goto abort; 8145 8146 return 0; 8147 abort: 8148 md_unregister_thread(mddev, &mddev->thread); 8149 print_raid5_conf(conf); 8150 free_conf(conf); 8151 mddev->private = NULL; 8152 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 8153 return ret; 8154 } 8155 8156 static void raid5_free(struct mddev *mddev, void *priv) 8157 { 8158 struct r5conf *conf = priv; 8159 8160 free_conf(conf); 8161 mddev->to_remove = &raid5_attrs_group; 8162 } 8163 8164 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 8165 { 8166 struct r5conf *conf = mddev->private; 8167 int i; 8168 8169 lockdep_assert_held(&mddev->lock); 8170 8171 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 8172 conf->chunk_sectors / 2, mddev->layout); 8173 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 8174 for (i = 0; i < conf->raid_disks; i++) { 8175 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); 8176 8177 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 8178 } 8179 seq_printf (seq, "]"); 8180 } 8181 8182 static void print_raid5_conf(struct r5conf *conf) 8183 { 8184 struct md_rdev *rdev; 8185 int i; 8186 8187 pr_debug("RAID conf printout:\n"); 8188 if (!conf) { 8189 pr_debug("(conf==NULL)\n"); 8190 return; 8191 } 8192 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 8193 conf->raid_disks, 8194 conf->raid_disks - conf->mddev->degraded); 8195 8196 for (i = 0; i < conf->raid_disks; i++) { 8197 rdev = conf->disks[i].rdev; 8198 if (rdev) 8199 pr_debug(" disk %d, o:%d, dev:%pg\n", 8200 i, !test_bit(Faulty, &rdev->flags), 8201 rdev->bdev); 8202 } 8203 } 8204 8205 static int raid5_spare_active(struct mddev *mddev) 8206 { 8207 int i; 8208 struct r5conf *conf = mddev->private; 8209 struct md_rdev *rdev, *replacement; 8210 int count = 0; 8211 unsigned long flags; 8212 8213 for (i = 0; i < conf->raid_disks; i++) { 8214 rdev = conf->disks[i].rdev; 8215 replacement = conf->disks[i].replacement; 8216 if (replacement 8217 && replacement->recovery_offset == MaxSector 8218 && !test_bit(Faulty, &replacement->flags) 8219 && !test_and_set_bit(In_sync, &replacement->flags)) { 8220 /* Replacement has just become active. */ 8221 if (!rdev 8222 || !test_and_clear_bit(In_sync, &rdev->flags)) 8223 count++; 8224 if (rdev) { 8225 /* Replaced device not technically faulty, 8226 * but we need to be sure it gets removed 8227 * and never re-added. 8228 */ 8229 set_bit(Faulty, &rdev->flags); 8230 sysfs_notify_dirent_safe( 8231 rdev->sysfs_state); 8232 } 8233 sysfs_notify_dirent_safe(replacement->sysfs_state); 8234 } else if (rdev 8235 && rdev->recovery_offset == MaxSector 8236 && !test_bit(Faulty, &rdev->flags) 8237 && !test_and_set_bit(In_sync, &rdev->flags)) { 8238 count++; 8239 sysfs_notify_dirent_safe(rdev->sysfs_state); 8240 } 8241 } 8242 spin_lock_irqsave(&conf->device_lock, flags); 8243 mddev->degraded = raid5_calc_degraded(conf); 8244 spin_unlock_irqrestore(&conf->device_lock, flags); 8245 print_raid5_conf(conf); 8246 return count; 8247 } 8248 8249 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 8250 { 8251 struct r5conf *conf = mddev->private; 8252 int err = 0; 8253 int number = rdev->raid_disk; 8254 struct md_rdev **rdevp; 8255 struct disk_info *p; 8256 struct md_rdev *tmp; 8257 8258 print_raid5_conf(conf); 8259 if (test_bit(Journal, &rdev->flags) && conf->log) { 8260 /* 8261 * we can't wait pending write here, as this is called in 8262 * raid5d, wait will deadlock. 8263 * neilb: there is no locking about new writes here, 8264 * so this cannot be safe. 8265 */ 8266 if (atomic_read(&conf->active_stripes) || 8267 atomic_read(&conf->r5c_cached_full_stripes) || 8268 atomic_read(&conf->r5c_cached_partial_stripes)) { 8269 return -EBUSY; 8270 } 8271 log_exit(conf); 8272 return 0; 8273 } 8274 if (unlikely(number >= conf->pool_size)) 8275 return 0; 8276 p = conf->disks + number; 8277 if (rdev == p->rdev) 8278 rdevp = &p->rdev; 8279 else if (rdev == p->replacement) 8280 rdevp = &p->replacement; 8281 else 8282 return 0; 8283 8284 if (number >= conf->raid_disks && 8285 conf->reshape_progress == MaxSector) 8286 clear_bit(In_sync, &rdev->flags); 8287 8288 if (test_bit(In_sync, &rdev->flags) || 8289 atomic_read(&rdev->nr_pending)) { 8290 err = -EBUSY; 8291 goto abort; 8292 } 8293 /* Only remove non-faulty devices if recovery 8294 * isn't possible. 8295 */ 8296 if (!test_bit(Faulty, &rdev->flags) && 8297 !has_failed(conf) && 8298 (!p->replacement || p->replacement == rdev) && 8299 number < conf->raid_disks) { 8300 err = -EBUSY; 8301 goto abort; 8302 } 8303 WRITE_ONCE(*rdevp, NULL); 8304 if (!err) { 8305 err = log_modify(conf, rdev, false); 8306 if (err) 8307 goto abort; 8308 } 8309 8310 tmp = p->replacement; 8311 if (tmp) { 8312 /* We must have just cleared 'rdev' */ 8313 WRITE_ONCE(p->rdev, tmp); 8314 clear_bit(Replacement, &tmp->flags); 8315 WRITE_ONCE(p->replacement, NULL); 8316 8317 if (!err) 8318 err = log_modify(conf, tmp, true); 8319 } 8320 8321 clear_bit(WantReplacement, &rdev->flags); 8322 abort: 8323 8324 print_raid5_conf(conf); 8325 return err; 8326 } 8327 8328 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 8329 { 8330 struct r5conf *conf = mddev->private; 8331 int ret, err = -EEXIST; 8332 int disk; 8333 struct disk_info *p; 8334 struct md_rdev *tmp; 8335 int first = 0; 8336 int last = conf->raid_disks - 1; 8337 8338 if (test_bit(Journal, &rdev->flags)) { 8339 if (conf->log) 8340 return -EBUSY; 8341 8342 rdev->raid_disk = 0; 8343 /* 8344 * The array is in readonly mode if journal is missing, so no 8345 * write requests running. We should be safe 8346 */ 8347 ret = log_init(conf, rdev, false); 8348 if (ret) 8349 return ret; 8350 8351 ret = r5l_start(conf->log); 8352 if (ret) 8353 return ret; 8354 8355 return 0; 8356 } 8357 8358 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 8359 /* no point adding a device */ 8360 return -EINVAL; 8361 8362 if (rdev->raid_disk >= 0) 8363 first = last = rdev->raid_disk; 8364 8365 /* 8366 * find the disk ... but prefer rdev->saved_raid_disk 8367 * if possible. 8368 */ 8369 if (rdev->saved_raid_disk >= first && 8370 rdev->saved_raid_disk <= last && 8371 conf->disks[rdev->saved_raid_disk].rdev == NULL) 8372 first = rdev->saved_raid_disk; 8373 8374 for (disk = first; disk <= last; disk++) { 8375 p = conf->disks + disk; 8376 if (p->rdev == NULL) { 8377 clear_bit(In_sync, &rdev->flags); 8378 rdev->raid_disk = disk; 8379 if (rdev->saved_raid_disk != disk) 8380 conf->fullsync = 1; 8381 WRITE_ONCE(p->rdev, rdev); 8382 8383 err = log_modify(conf, rdev, true); 8384 8385 goto out; 8386 } 8387 } 8388 for (disk = first; disk <= last; disk++) { 8389 p = conf->disks + disk; 8390 tmp = p->rdev; 8391 if (test_bit(WantReplacement, &tmp->flags) && 8392 mddev->reshape_position == MaxSector && 8393 p->replacement == NULL) { 8394 clear_bit(In_sync, &rdev->flags); 8395 set_bit(Replacement, &rdev->flags); 8396 rdev->raid_disk = disk; 8397 err = 0; 8398 conf->fullsync = 1; 8399 WRITE_ONCE(p->replacement, rdev); 8400 break; 8401 } 8402 } 8403 out: 8404 print_raid5_conf(conf); 8405 return err; 8406 } 8407 8408 static int raid5_resize(struct mddev *mddev, sector_t sectors) 8409 { 8410 /* no resync is happening, and there is enough space 8411 * on all devices, so we can resize. 8412 * We need to make sure resync covers any new space. 8413 * If the array is shrinking we should possibly wait until 8414 * any io in the removed space completes, but it hardly seems 8415 * worth it. 8416 */ 8417 sector_t newsize; 8418 struct r5conf *conf = mddev->private; 8419 8420 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 8421 return -EINVAL; 8422 sectors &= ~((sector_t)conf->chunk_sectors - 1); 8423 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 8424 if (mddev->external_size && 8425 mddev->array_sectors > newsize) 8426 return -EINVAL; 8427 8428 if (md_bitmap_enabled(mddev, false)) { 8429 int ret = mddev->bitmap_ops->resize(mddev, sectors, 0); 8430 8431 if (ret) 8432 return ret; 8433 } 8434 8435 md_set_array_sectors(mddev, newsize); 8436 if (sectors > mddev->dev_sectors && 8437 mddev->resync_offset > mddev->dev_sectors) { 8438 mddev->resync_offset = mddev->dev_sectors; 8439 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8440 } 8441 mddev->dev_sectors = sectors; 8442 mddev->resync_max_sectors = sectors; 8443 return 0; 8444 } 8445 8446 static int check_stripe_cache(struct mddev *mddev) 8447 { 8448 /* Can only proceed if there are plenty of stripe_heads. 8449 * We need a minimum of one full stripe,, and for sensible progress 8450 * it is best to have about 4 times that. 8451 * If we require 4 times, then the default 256 4K stripe_heads will 8452 * allow for chunk sizes up to 256K, which is probably OK. 8453 * If the chunk size is greater, user-space should request more 8454 * stripe_heads first. 8455 */ 8456 struct r5conf *conf = mddev->private; 8457 if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 8458 > conf->min_nr_stripes || 8459 ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 8460 > conf->min_nr_stripes) { 8461 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", 8462 mdname(mddev), 8463 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 8464 / RAID5_STRIPE_SIZE(conf))*4); 8465 return 0; 8466 } 8467 return 1; 8468 } 8469 8470 static int check_reshape(struct mddev *mddev) 8471 { 8472 struct r5conf *conf = mddev->private; 8473 8474 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 8475 return -EINVAL; 8476 if (mddev->delta_disks == 0 && 8477 mddev->new_layout == mddev->layout && 8478 mddev->new_chunk_sectors == mddev->chunk_sectors) 8479 return 0; /* nothing to do */ 8480 if (has_failed(conf)) 8481 return -EINVAL; 8482 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 8483 /* We might be able to shrink, but the devices must 8484 * be made bigger first. 8485 * For raid6, 4 is the minimum size. 8486 * Otherwise 2 is the minimum 8487 */ 8488 int min = 2; 8489 if (mddev->level == 6) 8490 min = 4; 8491 if (mddev->raid_disks + mddev->delta_disks < min) 8492 return -EINVAL; 8493 } 8494 8495 if (!check_stripe_cache(mddev)) 8496 return -ENOSPC; 8497 8498 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 8499 mddev->delta_disks > 0) 8500 if (resize_chunks(conf, 8501 conf->previous_raid_disks 8502 + max(0, mddev->delta_disks), 8503 max(mddev->new_chunk_sectors, 8504 mddev->chunk_sectors) 8505 ) < 0) 8506 return -ENOMEM; 8507 8508 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) 8509 return 0; /* never bother to shrink */ 8510 return resize_stripes(conf, (conf->previous_raid_disks 8511 + mddev->delta_disks)); 8512 } 8513 8514 static int raid5_start_reshape(struct mddev *mddev) 8515 { 8516 struct r5conf *conf = mddev->private; 8517 struct md_rdev *rdev; 8518 int spares = 0; 8519 int i; 8520 unsigned long flags; 8521 8522 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 8523 return -EBUSY; 8524 8525 if (!check_stripe_cache(mddev)) 8526 return -ENOSPC; 8527 8528 if (has_failed(conf)) 8529 return -EINVAL; 8530 8531 /* raid5 can't handle concurrent reshape and recovery */ 8532 if (mddev->resync_offset < MaxSector) 8533 return -EBUSY; 8534 for (i = 0; i < conf->raid_disks; i++) 8535 if (conf->disks[i].replacement) 8536 return -EBUSY; 8537 8538 rdev_for_each(rdev, mddev) { 8539 if (!test_bit(In_sync, &rdev->flags) 8540 && !test_bit(Faulty, &rdev->flags)) 8541 spares++; 8542 } 8543 8544 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 8545 /* Not enough devices even to make a degraded array 8546 * of that size 8547 */ 8548 return -EINVAL; 8549 8550 /* Refuse to reduce size of the array. Any reductions in 8551 * array size must be through explicit setting of array_size 8552 * attribute. 8553 */ 8554 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 8555 < mddev->array_sectors) { 8556 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 8557 mdname(mddev)); 8558 return -EINVAL; 8559 } 8560 8561 atomic_set(&conf->reshape_stripes, 0); 8562 spin_lock_irq(&conf->device_lock); 8563 write_seqcount_begin(&conf->gen_lock); 8564 conf->previous_raid_disks = conf->raid_disks; 8565 conf->raid_disks += mddev->delta_disks; 8566 conf->prev_chunk_sectors = conf->chunk_sectors; 8567 conf->chunk_sectors = mddev->new_chunk_sectors; 8568 conf->prev_algo = conf->algorithm; 8569 conf->algorithm = mddev->new_layout; 8570 conf->generation++; 8571 /* Code that selects data_offset needs to see the generation update 8572 * if reshape_progress has been set - so a memory barrier needed. 8573 */ 8574 smp_mb(); 8575 if (mddev->reshape_backwards) 8576 conf->reshape_progress = raid5_size(mddev, 0, 0); 8577 else 8578 conf->reshape_progress = 0; 8579 conf->reshape_safe = conf->reshape_progress; 8580 write_seqcount_end(&conf->gen_lock); 8581 spin_unlock_irq(&conf->device_lock); 8582 8583 /* Now make sure any requests that proceeded on the assumption 8584 * the reshape wasn't running - like Discard or Read - have 8585 * completed. 8586 */ 8587 raid5_quiesce(mddev, true); 8588 raid5_quiesce(mddev, false); 8589 8590 /* Add some new drives, as many as will fit. 8591 * We know there are enough to make the newly sized array work. 8592 * Don't add devices if we are reducing the number of 8593 * devices in the array. This is because it is not possible 8594 * to correctly record the "partially reconstructed" state of 8595 * such devices during the reshape and confusion could result. 8596 */ 8597 if (mddev->delta_disks >= 0) { 8598 rdev_for_each(rdev, mddev) 8599 if (rdev->raid_disk < 0 && 8600 !test_bit(Faulty, &rdev->flags)) { 8601 if (raid5_add_disk(mddev, rdev) == 0) { 8602 if (rdev->raid_disk 8603 >= conf->previous_raid_disks) 8604 set_bit(In_sync, &rdev->flags); 8605 else 8606 rdev->recovery_offset = 0; 8607 8608 /* Failure here is OK */ 8609 sysfs_link_rdev(mddev, rdev); 8610 } 8611 } else if (rdev->raid_disk >= conf->previous_raid_disks 8612 && !test_bit(Faulty, &rdev->flags)) { 8613 /* This is a spare that was manually added */ 8614 set_bit(In_sync, &rdev->flags); 8615 } 8616 8617 /* When a reshape changes the number of devices, 8618 * ->degraded is measured against the larger of the 8619 * pre and post number of devices. 8620 */ 8621 spin_lock_irqsave(&conf->device_lock, flags); 8622 mddev->degraded = raid5_calc_degraded(conf); 8623 spin_unlock_irqrestore(&conf->device_lock, flags); 8624 } 8625 mddev->raid_disks = conf->raid_disks; 8626 mddev->reshape_position = conf->reshape_progress; 8627 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8628 8629 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8630 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8631 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8632 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8633 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8634 conf->reshape_checkpoint = jiffies; 8635 md_new_event(); 8636 return 0; 8637 } 8638 8639 /* This is called from the reshape thread and should make any 8640 * changes needed in 'conf' 8641 */ 8642 static void end_reshape(struct r5conf *conf) 8643 { 8644 8645 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 8646 struct md_rdev *rdev; 8647 8648 spin_lock_irq(&conf->device_lock); 8649 conf->previous_raid_disks = conf->raid_disks; 8650 md_finish_reshape(conf->mddev); 8651 smp_wmb(); 8652 conf->reshape_progress = MaxSector; 8653 conf->mddev->reshape_position = MaxSector; 8654 rdev_for_each(rdev, conf->mddev) 8655 if (rdev->raid_disk >= 0 && 8656 !test_bit(Journal, &rdev->flags) && 8657 !test_bit(In_sync, &rdev->flags)) 8658 rdev->recovery_offset = MaxSector; 8659 spin_unlock_irq(&conf->device_lock); 8660 wake_up(&conf->wait_for_reshape); 8661 8662 mddev_update_io_opt(conf->mddev, 8663 conf->raid_disks - conf->max_degraded); 8664 } 8665 } 8666 8667 /* This is called from the raid5d thread with mddev_lock held. 8668 * It makes config changes to the device. 8669 */ 8670 static void raid5_finish_reshape(struct mddev *mddev) 8671 { 8672 struct r5conf *conf = mddev->private; 8673 struct md_rdev *rdev; 8674 8675 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8676 8677 if (mddev->delta_disks <= 0) { 8678 int d; 8679 spin_lock_irq(&conf->device_lock); 8680 mddev->degraded = raid5_calc_degraded(conf); 8681 spin_unlock_irq(&conf->device_lock); 8682 for (d = conf->raid_disks ; 8683 d < conf->raid_disks - mddev->delta_disks; 8684 d++) { 8685 rdev = conf->disks[d].rdev; 8686 if (rdev) 8687 clear_bit(In_sync, &rdev->flags); 8688 rdev = conf->disks[d].replacement; 8689 if (rdev) 8690 clear_bit(In_sync, &rdev->flags); 8691 } 8692 } 8693 mddev->layout = conf->algorithm; 8694 mddev->chunk_sectors = conf->chunk_sectors; 8695 mddev->reshape_position = MaxSector; 8696 mddev->delta_disks = 0; 8697 mddev->reshape_backwards = 0; 8698 } 8699 } 8700 8701 static void raid5_quiesce(struct mddev *mddev, int quiesce) 8702 { 8703 struct r5conf *conf = mddev->private; 8704 8705 if (quiesce) { 8706 /* stop all writes */ 8707 lock_all_device_hash_locks_irq(conf); 8708 /* '2' tells resync/reshape to pause so that all 8709 * active stripes can drain 8710 */ 8711 r5c_flush_cache(conf, INT_MAX); 8712 /* need a memory barrier to make sure read_one_chunk() sees 8713 * quiesce started and reverts to slow (locked) path. 8714 */ 8715 smp_store_release(&conf->quiesce, 2); 8716 wait_event_cmd(conf->wait_for_quiescent, 8717 atomic_read(&conf->active_stripes) == 0 && 8718 atomic_read(&conf->active_aligned_reads) == 0, 8719 unlock_all_device_hash_locks_irq(conf), 8720 lock_all_device_hash_locks_irq(conf)); 8721 conf->quiesce = 1; 8722 unlock_all_device_hash_locks_irq(conf); 8723 /* allow reshape to continue */ 8724 wake_up(&conf->wait_for_reshape); 8725 } else { 8726 /* re-enable writes */ 8727 lock_all_device_hash_locks_irq(conf); 8728 conf->quiesce = 0; 8729 wake_up(&conf->wait_for_quiescent); 8730 wake_up(&conf->wait_for_reshape); 8731 unlock_all_device_hash_locks_irq(conf); 8732 } 8733 log_quiesce(conf, quiesce); 8734 } 8735 8736 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8737 { 8738 struct r0conf *raid0_conf = mddev->private; 8739 sector_t sectors; 8740 8741 /* for raid0 takeover only one zone is supported */ 8742 if (raid0_conf->nr_strip_zones > 1) { 8743 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8744 mdname(mddev)); 8745 return ERR_PTR(-EINVAL); 8746 } 8747 8748 sectors = raid0_conf->strip_zone[0].zone_end; 8749 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8750 mddev->dev_sectors = sectors; 8751 mddev->new_level = level; 8752 mddev->new_layout = ALGORITHM_PARITY_N; 8753 mddev->new_chunk_sectors = mddev->chunk_sectors; 8754 mddev->raid_disks += 1; 8755 mddev->delta_disks = 1; 8756 /* make sure it will be not marked as dirty */ 8757 mddev->resync_offset = MaxSector; 8758 8759 return setup_conf(mddev); 8760 } 8761 8762 static void *raid5_takeover_raid1(struct mddev *mddev) 8763 { 8764 int chunksect; 8765 void *ret; 8766 8767 if (mddev->raid_disks != 2 || 8768 mddev->degraded > 1) 8769 return ERR_PTR(-EINVAL); 8770 8771 /* Should check if there are write-behind devices? */ 8772 8773 chunksect = 64*2; /* 64K by default */ 8774 8775 /* The array must be an exact multiple of chunksize */ 8776 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8777 chunksect >>= 1; 8778 8779 if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private)) 8780 /* array size does not allow a suitable chunk size */ 8781 return ERR_PTR(-EINVAL); 8782 8783 mddev->new_level = 5; 8784 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8785 mddev->new_chunk_sectors = chunksect; 8786 8787 ret = setup_conf(mddev); 8788 if (!IS_ERR(ret)) 8789 mddev_clear_unsupported_flags(mddev, 8790 UNSUPPORTED_MDDEV_FLAGS); 8791 return ret; 8792 } 8793 8794 static void *raid5_takeover_raid6(struct mddev *mddev) 8795 { 8796 int new_layout; 8797 8798 switch (mddev->layout) { 8799 case ALGORITHM_LEFT_ASYMMETRIC_6: 8800 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8801 break; 8802 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8803 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8804 break; 8805 case ALGORITHM_LEFT_SYMMETRIC_6: 8806 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8807 break; 8808 case ALGORITHM_RIGHT_SYMMETRIC_6: 8809 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8810 break; 8811 case ALGORITHM_PARITY_0_6: 8812 new_layout = ALGORITHM_PARITY_0; 8813 break; 8814 case ALGORITHM_PARITY_N: 8815 new_layout = ALGORITHM_PARITY_N; 8816 break; 8817 default: 8818 return ERR_PTR(-EINVAL); 8819 } 8820 mddev->new_level = 5; 8821 mddev->new_layout = new_layout; 8822 mddev->delta_disks = -1; 8823 mddev->raid_disks -= 1; 8824 return setup_conf(mddev); 8825 } 8826 8827 static int raid5_check_reshape(struct mddev *mddev) 8828 { 8829 /* For a 2-drive array, the layout and chunk size can be changed 8830 * immediately as not restriping is needed. 8831 * For larger arrays we record the new value - after validation 8832 * to be used by a reshape pass. 8833 */ 8834 struct r5conf *conf = mddev->private; 8835 int new_chunk = mddev->new_chunk_sectors; 8836 8837 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8838 return -EINVAL; 8839 if (new_chunk > 0) { 8840 if (!is_power_of_2(new_chunk)) 8841 return -EINVAL; 8842 if (new_chunk < (PAGE_SIZE>>9)) 8843 return -EINVAL; 8844 if (mddev->array_sectors & (new_chunk-1)) 8845 /* not factor of array size */ 8846 return -EINVAL; 8847 } 8848 8849 /* They look valid */ 8850 8851 if (mddev->raid_disks == 2) { 8852 /* can make the change immediately */ 8853 if (mddev->new_layout >= 0) { 8854 conf->algorithm = mddev->new_layout; 8855 mddev->layout = mddev->new_layout; 8856 } 8857 if (new_chunk > 0) { 8858 conf->chunk_sectors = new_chunk ; 8859 mddev->chunk_sectors = new_chunk; 8860 } 8861 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8862 md_wakeup_thread(mddev->thread); 8863 } 8864 return check_reshape(mddev); 8865 } 8866 8867 static int raid6_check_reshape(struct mddev *mddev) 8868 { 8869 int new_chunk = mddev->new_chunk_sectors; 8870 8871 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8872 return -EINVAL; 8873 if (new_chunk > 0) { 8874 if (!is_power_of_2(new_chunk)) 8875 return -EINVAL; 8876 if (new_chunk < (PAGE_SIZE >> 9)) 8877 return -EINVAL; 8878 if (mddev->array_sectors & (new_chunk-1)) 8879 /* not factor of array size */ 8880 return -EINVAL; 8881 } 8882 8883 /* They look valid */ 8884 return check_reshape(mddev); 8885 } 8886 8887 static void *raid5_takeover(struct mddev *mddev) 8888 { 8889 /* raid5 can take over: 8890 * raid0 - if there is only one strip zone - make it a raid4 layout 8891 * raid1 - if there are two drives. We need to know the chunk size 8892 * raid4 - trivial - just use a raid4 layout. 8893 * raid6 - Providing it is a *_6 layout 8894 */ 8895 if (mddev->level == 0) 8896 return raid45_takeover_raid0(mddev, 5); 8897 if (mddev->level == 1) 8898 return raid5_takeover_raid1(mddev); 8899 if (mddev->level == 4) { 8900 mddev->new_layout = ALGORITHM_PARITY_N; 8901 mddev->new_level = 5; 8902 return setup_conf(mddev); 8903 } 8904 if (mddev->level == 6) 8905 return raid5_takeover_raid6(mddev); 8906 8907 return ERR_PTR(-EINVAL); 8908 } 8909 8910 static void *raid4_takeover(struct mddev *mddev) 8911 { 8912 /* raid4 can take over: 8913 * raid0 - if there is only one strip zone 8914 * raid5 - if layout is right 8915 */ 8916 if (mddev->level == 0) 8917 return raid45_takeover_raid0(mddev, 4); 8918 if (mddev->level == 5 && 8919 mddev->layout == ALGORITHM_PARITY_N) { 8920 mddev->new_layout = 0; 8921 mddev->new_level = 4; 8922 return setup_conf(mddev); 8923 } 8924 return ERR_PTR(-EINVAL); 8925 } 8926 8927 static struct md_personality raid5_personality; 8928 8929 static void *raid6_takeover(struct mddev *mddev) 8930 { 8931 /* Currently can only take over a raid5. We map the 8932 * personality to an equivalent raid6 personality 8933 * with the Q block at the end. 8934 */ 8935 int new_layout; 8936 8937 if (mddev->pers != &raid5_personality) 8938 return ERR_PTR(-EINVAL); 8939 if (mddev->degraded > 1) 8940 return ERR_PTR(-EINVAL); 8941 if (mddev->raid_disks > 253) 8942 return ERR_PTR(-EINVAL); 8943 if (mddev->raid_disks < 3) 8944 return ERR_PTR(-EINVAL); 8945 8946 switch (mddev->layout) { 8947 case ALGORITHM_LEFT_ASYMMETRIC: 8948 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8949 break; 8950 case ALGORITHM_RIGHT_ASYMMETRIC: 8951 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8952 break; 8953 case ALGORITHM_LEFT_SYMMETRIC: 8954 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8955 break; 8956 case ALGORITHM_RIGHT_SYMMETRIC: 8957 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8958 break; 8959 case ALGORITHM_PARITY_0: 8960 new_layout = ALGORITHM_PARITY_0_6; 8961 break; 8962 case ALGORITHM_PARITY_N: 8963 new_layout = ALGORITHM_PARITY_N; 8964 break; 8965 default: 8966 return ERR_PTR(-EINVAL); 8967 } 8968 mddev->new_level = 6; 8969 mddev->new_layout = new_layout; 8970 mddev->delta_disks = 1; 8971 mddev->raid_disks += 1; 8972 return setup_conf(mddev); 8973 } 8974 8975 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) 8976 { 8977 struct r5conf *conf; 8978 int err; 8979 8980 err = mddev_suspend_and_lock(mddev); 8981 if (err) 8982 return err; 8983 conf = mddev->private; 8984 if (!conf) { 8985 mddev_unlock_and_resume(mddev); 8986 return -ENODEV; 8987 } 8988 8989 if (strncmp(buf, "ppl", 3) == 0) { 8990 /* ppl only works with RAID 5 */ 8991 if (!raid5_has_ppl(conf) && conf->level == 5) { 8992 err = log_init(conf, NULL, true); 8993 if (!err) { 8994 err = resize_stripes(conf, conf->pool_size); 8995 if (err) 8996 log_exit(conf); 8997 } 8998 } else 8999 err = -EINVAL; 9000 } else if (strncmp(buf, "resync", 6) == 0) { 9001 if (raid5_has_ppl(conf)) { 9002 log_exit(conf); 9003 err = resize_stripes(conf, conf->pool_size); 9004 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && 9005 r5l_log_disk_error(conf)) { 9006 bool journal_dev_exists = false; 9007 struct md_rdev *rdev; 9008 9009 rdev_for_each(rdev, mddev) 9010 if (test_bit(Journal, &rdev->flags)) { 9011 journal_dev_exists = true; 9012 break; 9013 } 9014 9015 if (!journal_dev_exists) 9016 clear_bit(MD_HAS_JOURNAL, &mddev->flags); 9017 else /* need remove journal device first */ 9018 err = -EBUSY; 9019 } else 9020 err = -EINVAL; 9021 } else { 9022 err = -EINVAL; 9023 } 9024 9025 if (!err) 9026 md_update_sb(mddev, 1); 9027 9028 mddev_unlock_and_resume(mddev); 9029 9030 return err; 9031 } 9032 9033 static int raid5_start(struct mddev *mddev) 9034 { 9035 struct r5conf *conf = mddev->private; 9036 9037 return r5l_start(conf->log); 9038 } 9039 9040 /* 9041 * This is only used for dm-raid456, caller already frozen sync_thread, hence 9042 * if rehsape is still in progress, io that is waiting for reshape can never be 9043 * done now, hence wake up and handle those IO. 9044 */ 9045 static void raid5_prepare_suspend(struct mddev *mddev) 9046 { 9047 struct r5conf *conf = mddev->private; 9048 9049 wake_up(&conf->wait_for_reshape); 9050 } 9051 9052 static struct md_personality raid6_personality = 9053 { 9054 .head = { 9055 .type = MD_PERSONALITY, 9056 .id = ID_RAID6, 9057 .name = "raid6", 9058 .owner = THIS_MODULE, 9059 }, 9060 9061 .make_request = raid5_make_request, 9062 .run = raid5_run, 9063 .start = raid5_start, 9064 .free = raid5_free, 9065 .status = raid5_status, 9066 .error_handler = raid5_error, 9067 .hot_add_disk = raid5_add_disk, 9068 .hot_remove_disk= raid5_remove_disk, 9069 .spare_active = raid5_spare_active, 9070 .sync_request = raid5_sync_request, 9071 .resize = raid5_resize, 9072 .size = raid5_size, 9073 .check_reshape = raid6_check_reshape, 9074 .start_reshape = raid5_start_reshape, 9075 .finish_reshape = raid5_finish_reshape, 9076 .quiesce = raid5_quiesce, 9077 .takeover = raid6_takeover, 9078 .change_consistency_policy = raid5_change_consistency_policy, 9079 .prepare_suspend = raid5_prepare_suspend, 9080 .bitmap_sector = raid5_bitmap_sector, 9081 }; 9082 static struct md_personality raid5_personality = 9083 { 9084 .head = { 9085 .type = MD_PERSONALITY, 9086 .id = ID_RAID5, 9087 .name = "raid5", 9088 .owner = THIS_MODULE, 9089 }, 9090 9091 .make_request = raid5_make_request, 9092 .run = raid5_run, 9093 .start = raid5_start, 9094 .free = raid5_free, 9095 .status = raid5_status, 9096 .error_handler = raid5_error, 9097 .hot_add_disk = raid5_add_disk, 9098 .hot_remove_disk= raid5_remove_disk, 9099 .spare_active = raid5_spare_active, 9100 .sync_request = raid5_sync_request, 9101 .resize = raid5_resize, 9102 .size = raid5_size, 9103 .check_reshape = raid5_check_reshape, 9104 .start_reshape = raid5_start_reshape, 9105 .finish_reshape = raid5_finish_reshape, 9106 .quiesce = raid5_quiesce, 9107 .takeover = raid5_takeover, 9108 .change_consistency_policy = raid5_change_consistency_policy, 9109 .prepare_suspend = raid5_prepare_suspend, 9110 .bitmap_sector = raid5_bitmap_sector, 9111 }; 9112 9113 static struct md_personality raid4_personality = 9114 { 9115 .head = { 9116 .type = MD_PERSONALITY, 9117 .id = ID_RAID4, 9118 .name = "raid4", 9119 .owner = THIS_MODULE, 9120 }, 9121 9122 .make_request = raid5_make_request, 9123 .run = raid5_run, 9124 .start = raid5_start, 9125 .free = raid5_free, 9126 .status = raid5_status, 9127 .error_handler = raid5_error, 9128 .hot_add_disk = raid5_add_disk, 9129 .hot_remove_disk= raid5_remove_disk, 9130 .spare_active = raid5_spare_active, 9131 .sync_request = raid5_sync_request, 9132 .resize = raid5_resize, 9133 .size = raid5_size, 9134 .check_reshape = raid5_check_reshape, 9135 .start_reshape = raid5_start_reshape, 9136 .finish_reshape = raid5_finish_reshape, 9137 .quiesce = raid5_quiesce, 9138 .takeover = raid4_takeover, 9139 .change_consistency_policy = raid5_change_consistency_policy, 9140 .prepare_suspend = raid5_prepare_suspend, 9141 .bitmap_sector = raid5_bitmap_sector, 9142 }; 9143 9144 static int __init raid5_init(void) 9145 { 9146 int ret; 9147 9148 raid5_wq = alloc_workqueue("raid5wq", 9149 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_SYSFS, 0); 9150 if (!raid5_wq) 9151 return -ENOMEM; 9152 9153 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 9154 "md/raid5:prepare", 9155 raid456_cpu_up_prepare, 9156 raid456_cpu_dead); 9157 if (ret) 9158 goto err_destroy_wq; 9159 9160 ret = register_md_submodule(&raid6_personality.head); 9161 if (ret) 9162 goto err_cpuhp_remove; 9163 9164 ret = register_md_submodule(&raid5_personality.head); 9165 if (ret) 9166 goto err_unregister_raid6; 9167 9168 ret = register_md_submodule(&raid4_personality.head); 9169 if (ret) 9170 goto err_unregister_raid5; 9171 9172 return 0; 9173 9174 err_unregister_raid5: 9175 unregister_md_submodule(&raid5_personality.head); 9176 err_unregister_raid6: 9177 unregister_md_submodule(&raid6_personality.head); 9178 err_cpuhp_remove: 9179 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 9180 err_destroy_wq: 9181 destroy_workqueue(raid5_wq); 9182 return ret; 9183 } 9184 9185 static void __exit raid5_exit(void) 9186 { 9187 unregister_md_submodule(&raid6_personality.head); 9188 unregister_md_submodule(&raid5_personality.head); 9189 unregister_md_submodule(&raid4_personality.head); 9190 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 9191 destroy_workqueue(raid5_wq); 9192 } 9193 9194 module_init(raid5_init); 9195 module_exit(raid5_exit); 9196 MODULE_LICENSE("GPL"); 9197 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 9198 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 9199 MODULE_ALIAS("md-raid5"); 9200 MODULE_ALIAS("md-raid4"); 9201 MODULE_ALIAS("md-level-5"); 9202 MODULE_ALIAS("md-level-4"); 9203 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 9204 MODULE_ALIAS("md-raid6"); 9205 MODULE_ALIAS("md-level-6"); 9206 9207 /* This used to be two separate modules, they were: */ 9208 MODULE_ALIAS("raid5"); 9209 MODULE_ALIAS("raid6"); 9210