/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *
 * RAID-5 management functions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/raid5.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
#include <linux/kthread.h>
#include <asm/atomic.h>

#include <linux/raid/bitmap.h>

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
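/*
 * For illustration only: with PAGE_SIZE == 4096, STRIPE_SIZE is 4096 bytes,
 * STRIPE_SHIFT is 3 and STRIPE_SECTORS is 8, so one stripe_head covers eight
 * 512-byte sectors per device.  (sect >> STRIPE_SHIFT) numbers the stripes
 * and HASH_MASK (NR_HASH is a power of two) folds that into a hash bucket.
 */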

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
/*
 * The following can be used to debug the driver
 */
#define RAID5_DEBUG	0
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

#define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x)))
#if RAID5_DEBUG
#define inline
#define __inline__
#endif

static void print_raid5_conf (raid5_conf_t *conf);

static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count)) {
		if (!list_empty(&sh->lru))
			BUG();
		if (atomic_read(&conf->active_stripes)==0)
			BUG();
		if (test_bit(STRIPE_HANDLE, &sh->state)) {
			if (test_bit(STRIPE_DELAYED, &sh->state))
				list_add_tail(&sh->lru, &conf->delayed_list);
			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
				 conf->seq_write == sh->bm_seq)
				list_add_tail(&sh->lru, &conf->bitmap_list);
			else {
				clear_bit(STRIPE_BIT_DELAY, &sh->state);
				list_add_tail(&sh->lru, &conf->handle_list);
			}
			md_wakeup_thread(conf->mddev->thread);
		} else {
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
			atomic_dec(&conf->active_stripes);
			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
				list_add_tail(&sh->lru, &conf->inactive_list);
				wake_up(&conf->wait_for_stripe);
			}
		}
	}
}
static void release_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	__release_stripe(conf, sh);
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);

	CHECK_DEVLOCK();
	hlist_add_head(&sh->hash, hp);
}

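/*
 * Stripe lifecycle: a stripe_head with count == 0 sits on the inactive_list.
 * get_active_stripe() pulls it off, re-initialises it for a new sector and
 * raises the count.  When the last reference is dropped, __release_stripe()
 * either queues the stripe for raid5d (handle_list, delayed_list or
 * bitmap_list, if STRIPE_HANDLE is set) or returns it to the inactive_list.
 * All of these transitions happen under conf->device_lock.
 */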
/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	CHECK_DEVLOCK();
	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}

static void shrink_buffers(struct stripe_head *sh, int num)
{
	struct page *p;
	int i;

	for (i=0; i<num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh, int num)
{
	int i;

	for (i=0; i<num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block (struct stripe_head *sh, int i);

static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i;

	if (atomic_read(&sh->count) != 0)
		BUG();
	if (test_bit(STRIPE_HANDLE, &sh->state))
		BUG();

	CHECK_DEVLOCK();
	PRINTK("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->sector = sector;
	sh->pd_idx = pd_idx;
	sh->state = 0;

	sh->disks = disks;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk("sector=%llx i=%d %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			BUG();
		}
		dev->flags = 0;
		raid5_build_block(sh, i);
	}
	insert_hash(conf, sh);
}

static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
{
	struct stripe_head *sh;
	struct hlist_node *hn;

	CHECK_DEVLOCK();
	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->disks == disks)
			return sh;
	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

static void unplug_slaves(mddev_t *mddev);
static void raid5_unplug_device(request_queue_t *q);

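/*
 * Find or allocate the stripe covering 'sector'.  On success the stripe is
 * returned with sh->count raised and the caller must release_stripe() it.
 * Unless 'noblock' is set this may sleep, waiting for the array to leave the
 * quiesced state or for an inactive stripe to become available.
 */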
static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
					     int pd_idx, int noblock)
{
	struct stripe_head *sh;

	PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0,
				    conf->device_lock, /* nothing */);
		sh = __find_stripe(conf, sector, disks);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock,
						    unplug_slaves(conf->mddev)
					);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, pd_idx, disks);
		} else {
			if (atomic_read(&sh->count)) {
				if (!list_empty(&sh->lru))
					BUG();
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (!list_empty(&sh->lru))
					list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}

static int grow_one_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;
	memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
	sh->raid_conf = conf;
	spin_lock_init(&sh->lock);

	if (grow_buffers(sh, conf->raid_disks)) {
		shrink_buffers(sh, conf->raid_disks);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	sh->disks = conf->raid_disks;
	/* we just created an active stripe so... */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

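/*
 * Note the idiom above: the new stripe is created with count == 1 and
 * active_stripes raised, then handed to release_stripe(), which drops the
 * count back to zero and files it on the inactive_list via the normal path.
 */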
static int grow_stripes(raid5_conf_t *conf, int num)
{
	kmem_cache_t *sc;
	int devs = conf->raid_disks;

	sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
	sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--) {
		if (!grow_one_stripe(conf))
			return 1;
	}
	return 0;
}

#ifdef CONFIG_MD_RAID5_RESHAPE
static int resize_stripes(raid5_conf_t *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	int err = 0;
	kmem_cache_t *sc;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));

		nsh->raid_conf = conf;
		spin_lock_init(&nsh->lock);

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(nsh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock,
				    unplug_slaves(conf->mddev)
			);
		osh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);
		atomic_set(&nsh->count, 1);
		for(i=0; i<conf->pool_size; i++)
			nsh->dev[i].page = osh->dev[i].page;
		for( ; i<newsize; i++)
			nsh->dev[i].page = NULL;
		kmem_cache_free(conf->slab_cache, osh);
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks.
	 */
	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
	if (ndisks) {
		for (i=0; i<conf->raid_disks; i++)
			ndisks[i] = conf->disks[i];
		kfree(conf->disks);
		conf->disks = ndisks;
	} else
		err = -ENOMEM;

	/* Step 4, return new stripes to service */
	while(!list_empty(&newstripes)) {
		nsh = list_entry(newstripes.next, struct stripe_head, lru);
		list_del_init(&nsh->lru);
		for (i=conf->raid_disks; i < newsize; i++)
			if (nsh->dev[i].page == NULL) {
				struct page *p = alloc_page(GFP_NOIO);
				nsh->dev[i].page = p;
				if (!p)
					err = -ENOMEM;
			}
		release_stripe(nsh);
	}
	/* critical section passed, GFP_NOIO no longer needed */

	conf->slab_cache = sc;
	conf->active_name = 1-conf->active_name;
	conf->pool_size = newsize;
	return err;
}
#endif

static int drop_one_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh;

	spin_lock_irq(&conf->device_lock);
	sh = get_free_stripe(conf);
	spin_unlock_irq(&conf->device_lock);
	if (!sh)
		return 0;
	if (atomic_read(&sh->count))
		BUG();
	shrink_buffers(sh, conf->pool_size);
	kmem_cache_free(conf->slab_cache, sh);
	atomic_dec(&conf->active_stripes);
	return 1;
}

static void shrink_stripes(raid5_conf_t *conf)
{
	while (drop_one_stripe(conf))
		;

	if (conf->slab_cache)
		kmem_cache_destroy(conf->slab_cache);
	conf->slab_cache = NULL;
}

static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
				  int error)
{
	struct stripe_head *sh = bi->bi_private;
	raid5_conf_t *conf = sh->raid_conf;
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);

	if (bi->bi_size)
		return 1;

	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
	       (unsigned long long)sh->sector, i, atomic_read(&sh->count),
	       uptodate);
	if (i == disks) {
		BUG();
		return 0;
	}

	if (uptodate) {
#if 0
		struct bio *bio;
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		/* we can return a buffer if we bypassed the cache or
		 * if the top buffer is not in highmem.  If there are
		 * multiple buffers, leave the extra work to
		 * handle_stripe
		 */
		buffer = sh->bh_read[i];
		if (buffer &&
		    (!PageHighMem(buffer->b_page)
		     || buffer->b_page == bh->b_page )
			) {
			sh->bh_read[i] = buffer->b_reqnext;
			buffer->b_reqnext = NULL;
		} else
			buffer = NULL;
		spin_unlock_irqrestore(&conf->device_lock, flags);
		if (sh->bh_page[i]==bh->b_page)
			set_buffer_uptodate(bh);
		if (buffer) {
			if (buffer->b_page != bh->b_page)
				memcpy(buffer->b_data, bh->b_data, bh->b_size);
			buffer->b_end_io(buffer, 1);
		}
#else
		set_bit(R5_UPTODATE, &sh->dev[i].flags);
#endif
		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			printk(KERN_INFO "raid5: read error corrected!!\n");
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
		}
		if (atomic_read(&conf->disks[i].rdev->read_errors))
			atomic_set(&conf->disks[i].rdev->read_errors, 0);
	} else {
		int retry = 0;
		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
		atomic_inc(&conf->disks[i].rdev->read_errors);
		if (conf->mddev->degraded)
			printk(KERN_WARNING "raid5: read error not correctable.\n");
		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
			/* Oh, no!!! */
			printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
		else if (atomic_read(&conf->disks[i].rdev->read_errors)
			 > conf->max_nr_stripes)
			printk(KERN_WARNING
			       "raid5: Too many read errors, failing device.\n");
		else
			retry = 1;
		if (retry)
			set_bit(R5_ReadError, &sh->dev[i].flags);
		else {
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
			md_error(conf->mddev, conf->disks[i].rdev);
		}
	}
	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
#if 0
	/* must restore b_page before unlocking buffer... */
	if (sh->bh_page[i] != bh->b_page) {
		bh->b_page = sh->bh_page[i];
		bh->b_data = page_address(bh->b_page);
		clear_buffer_uptodate(bh);
	}
#endif
	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
	return 0;
}

static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
				    int error)
{
	struct stripe_head *sh = bi->bi_private;
	raid5_conf_t *conf = sh->raid_conf;
	int disks = sh->disks, i;
	unsigned long flags;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);

	if (bi->bi_size)
		return 1;

	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
	       (unsigned long long)sh->sector, i, atomic_read(&sh->count),
	       uptodate);
	if (i == disks) {
		BUG();
		return 0;
	}

	spin_lock_irqsave(&conf->device_lock, flags);
	if (!uptodate)
		md_error(conf->mddev, conf->disks[i].rdev);

	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);

	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	__release_stripe(conf, sh);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	return 0;
}


static sector_t compute_blocknr(struct stripe_head *sh, int i);

static void raid5_build_block (struct stripe_head *sh, int i)
{
	struct r5dev *dev = &sh->dev[i];

	bio_init(&dev->req);
	dev->req.bi_io_vec = &dev->vec;
	dev->req.bi_vcnt++;
	dev->req.bi_max_vecs++;
	dev->vec.bv_page = dev->page;
	dev->vec.bv_len = STRIPE_SIZE;
	dev->vec.bv_offset = 0;

	dev->req.bi_sector = sh->sector;
	dev->req.bi_private = sh;

	dev->flags = 0;
	if (i != sh->pd_idx)
		dev->sector = compute_blocknr(sh, i);
}

static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	PRINTK("raid5: error called\n");

	if (!test_bit(Faulty, &rdev->flags)) {
		mddev->sb_dirty = 1;
		if (test_bit(In_sync, &rdev->flags)) {
			conf->working_disks--;
			mddev->degraded++;
			conf->failed_disks++;
			clear_bit(In_sync, &rdev->flags);
			/*
			 * if recovery was running, make sure it aborts.
			 */
			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
		}
		set_bit(Faulty, &rdev->flags);
		printk (KERN_ALERT
			"raid5: Disk failure on %s, disabling device."
			" Operation continuing on %d devices\n",
			bdevname(rdev->bdev,b), conf->working_disks);
	}
}

/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
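/*
 * Worked example (illustrative numbers only): with a 64KiB chunk
 * (sectors_per_chunk == 128), raid_disks == 4, data_disks == 3 and
 * ALGORITHM_LEFT_SYMMETRIC, r_sector 1000 gives chunk_number 7,
 * chunk_offset 104 and stripe 2, so pd_idx = 3 - (2 % 4) = 1 and
 * dd_idx = (1 + 1 + (7 % 3)) % 4 = 3; the block lives at device sector
 * 2*128 + 104 = 360 on disk 3, with parity on disk 1.
 */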
static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
				     unsigned int data_disks, unsigned int * dd_idx,
				     unsigned int * pd_idx, raid5_conf_t *conf)
{
	long stripe;
	unsigned long chunk_number;
	unsigned int chunk_offset;
	sector_t new_sector;
	int sectors_per_chunk = conf->chunk_size >> 9;

	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
	chunk_offset = sector_div(r_sector, sectors_per_chunk);
	chunk_number = r_sector;
	BUG_ON(r_sector != chunk_number);

	/*
	 * Compute the stripe number
	 */
	stripe = chunk_number / data_disks;

	/*
	 * Compute the data disk and parity disk indexes inside the stripe
	 */
	*dd_idx = chunk_number % data_disks;

	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
	if (conf->level == 4)
		*pd_idx = data_disks;
	else switch (conf->algorithm) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		*pd_idx = data_disks - stripe % raid_disks;
		if (*dd_idx >= *pd_idx)
			(*dd_idx)++;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		*pd_idx = stripe % raid_disks;
		if (*dd_idx >= *pd_idx)
			(*dd_idx)++;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		*pd_idx = data_disks - stripe % raid_disks;
		*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		*pd_idx = stripe % raid_disks;
		*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
		break;
	default:
		printk(KERN_ERR "raid5: unsupported algorithm %d\n",
		       conf->algorithm);
	}

	/*
	 * Finally, compute the new sector number
	 */
	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}


static sector_t compute_blocknr(struct stripe_head *sh, int i)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = sh->disks, data_disks = raid_disks - 1;
	sector_t new_sector = sh->sector, check;
	int sectors_per_chunk = conf->chunk_size >> 9;
	sector_t stripe;
	int chunk_offset;
	int chunk_number, dummy1, dummy2, dd_idx = i;
	sector_t r_sector;

	chunk_offset = sector_div(new_sector, sectors_per_chunk);
	stripe = new_sector;
	BUG_ON(new_sector != stripe);


	switch (conf->algorithm) {
	case ALGORITHM_LEFT_ASYMMETRIC:
	case ALGORITHM_RIGHT_ASYMMETRIC:
		if (i > sh->pd_idx)
			i--;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
	case ALGORITHM_RIGHT_SYMMETRIC:
		if (i < sh->pd_idx)
			i += raid_disks;
		i -= (sh->pd_idx + 1);
		break;
	default:
		printk(KERN_ERR "raid5: unsupported algorithm %d\n",
		       conf->algorithm);
	}

	chunk_number = stripe * data_disks + i;
	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;

	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
		printk(KERN_ERR "compute_blocknr: map not correct\n");
		return 0;
	}
	return r_sector;
}



/*
 * Copy data between a page in the stripe cache, and a bio.
 * There are no alignment or size guarantees between the page or the
 * bio except that there is some overlap.
 * All iovecs in the bio must be considered.
 */
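/*
 * page_offset below is the byte offset of the bio's start relative to the
 * stripe page: it goes negative when the bio begins before this
 * stripe+device, in which case b_offset skips the leading part of the bio
 * segment instead.
 */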
static void copy_data(int frombio, struct bio *bio,
		      struct page *page,
		      sector_t sector)
{
	char *pa = page_address(page);
	struct bio_vec *bvl;
	int i;
	int page_offset;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;
	bio_for_each_segment(bvl, bio, i) {
		int len = bio_iovec_idx(bio,i)->bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else clen = len;

		if (clen > 0) {
			char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
			if (frombio)
				memcpy(pa+page_offset, ba+b_offset, clen);
			else
				memcpy(ba+b_offset, pa+page_offset, clen);
			__bio_kunmap_atomic(ba, KM_USER0);
		}
		if (clen < len) /* hit end of page */
			break;
		page_offset +=  len;
	}
}

#define check_xor() 	do { 					\
			   if (count == MAX_XOR_BLOCKS) {	\
				xor_block(count, STRIPE_SIZE, ptr); \
				count = 1;			\
			   }					\
			} while(0)


static void compute_block(struct stripe_head *sh, int dd_idx)
{
	int i, count, disks = sh->disks;
	void *ptr[MAX_XOR_BLOCKS], *p;

	PRINTK("compute_block, stripe %llu, idx %d\n",
	       (unsigned long long)sh->sector, dd_idx);

	ptr[0] = page_address(sh->dev[dd_idx].page);
	memset(ptr[0], 0, STRIPE_SIZE);
	count = 1;
	for (i = disks ; i--; ) {
		if (i == dd_idx)
			continue;
		p = page_address(sh->dev[i].page);
		if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
			ptr[count++] = p;
		else
			printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
			       " not present\n", dd_idx,
			       (unsigned long long)sh->sector, i);

		check_xor();
	}
	if (count != 1)
		xor_block(count, STRIPE_SIZE, ptr);
	set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
}

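/*
 * compute_parity: ptr[0] is always the parity page.  For RECONSTRUCT_WRITE
 * it is zeroed and xor'd with every data block; for READ_MODIFY_WRITE the
 * old parity is xor'd with the old and then the new contents of the blocks
 * being rewritten; for CHECK_PARITY the on-disk parity is xor'd with all
 * data blocks, so the page ends up all zero iff the parity was correct.
 */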
static void compute_parity(struct stripe_head *sh, int method)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
	void *ptr[MAX_XOR_BLOCKS];
	struct bio *chosen;

	PRINTK("compute_parity, stripe %llu, method %d\n",
	       (unsigned long long)sh->sector, method);

	count = 1;
	ptr[0] = page_address(sh->dev[pd_idx].page);
	switch(method) {
	case READ_MODIFY_WRITE:
		if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags))
			BUG();
		for (i=disks ; i-- ;) {
			if (i==pd_idx)
				continue;
			if (sh->dev[i].towrite &&
			    test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
				ptr[count++] = page_address(sh->dev[i].page);
				chosen = sh->dev[i].towrite;
				sh->dev[i].towrite = NULL;

				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
					wake_up(&conf->wait_for_overlap);

				if (sh->dev[i].written) BUG();
				sh->dev[i].written = chosen;
				check_xor();
			}
		}
		break;
	case RECONSTRUCT_WRITE:
		memset(ptr[0], 0, STRIPE_SIZE);
		for (i= disks; i-- ;)
			if (i!=pd_idx && sh->dev[i].towrite) {
				chosen = sh->dev[i].towrite;
				sh->dev[i].towrite = NULL;

				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
					wake_up(&conf->wait_for_overlap);

				if (sh->dev[i].written) BUG();
				sh->dev[i].written = chosen;
			}
		break;
	case CHECK_PARITY:
		break;
	}
	if (count>1) {
		xor_block(count, STRIPE_SIZE, ptr);
		count = 1;
	}

	for (i = disks; i--;)
		if (sh->dev[i].written) {
			sector_t sector = sh->dev[i].sector;
			struct bio *wbi = sh->dev[i].written;
			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
				copy_data(1, wbi, sh->dev[i].page, sector);
				wbi = r5_next_bio(wbi, sector);
			}

			set_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(R5_UPTODATE, &sh->dev[i].flags);
		}

	switch(method) {
	case RECONSTRUCT_WRITE:
	case CHECK_PARITY:
		for (i=disks; i--;)
			if (i != pd_idx) {
				ptr[count++] = page_address(sh->dev[i].page);
				check_xor();
			}
		break;
	case READ_MODIFY_WRITE:
		for (i = disks; i--;)
			if (sh->dev[i].written) {
				ptr[count++] = page_address(sh->dev[i].page);
				check_xor();
			}
	}
	if (count != 1)
		xor_block(count, STRIPE_SIZE, ptr);

	if (method != CHECK_PARITY) {
		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
		set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
	} else
		clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
}

/*
 * Each stripe/dev can have one or more bion attached.
 * toread/towrite point to the first in a chain.
 * The bi_next chain must be in order.
 */
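/*
 * add_stripe_bio() returns 1 once the bio has been linked in.  If the bio
 * overlaps one already queued it returns 0 and sets R5_Overlap, and the
 * caller waits on conf->wait_for_overlap before retrying.
 */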
static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
{
	struct bio **bip;
	raid5_conf_t *conf = sh->raid_conf;
	int firstwrite=0;

	PRINTK("adding bh b#%llu to stripe s#%llu\n",
	       (unsigned long long)bi->bi_sector,
	       (unsigned long long)sh->sector);


	spin_lock(&sh->lock);
	spin_lock_irq(&conf->device_lock);
	if (forwrite) {
		bip = &sh->dev[dd_idx].towrite;
		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
			firstwrite = 1;
	} else
		bip = &sh->dev[dd_idx].toread;
	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
			goto overlap;
		bip = & (*bip)->bi_next;
	}
	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
		goto overlap;

	if (*bip && bi->bi_next && (*bip) != bi->bi_next)
		BUG();
	if (*bip)
		bi->bi_next = *bip;
	*bip = bi;
	bi->bi_phys_segments++;
	spin_unlock_irq(&conf->device_lock);
	spin_unlock(&sh->lock);

	PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
	       (unsigned long long)bi->bi_sector,
	       (unsigned long long)sh->sector, dd_idx);

	if (conf->mddev->bitmap && firstwrite) {
		sh->bm_seq = conf->seq_write;
		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
				  STRIPE_SECTORS, 0);
		set_bit(STRIPE_BIT_DELAY, &sh->state);
	}

	if (forwrite) {
		/* check if page is covered */
		sector_t sector = sh->dev[dd_idx].sector;
		for (bi=sh->dev[dd_idx].towrite;
		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
			     bi && bi->bi_sector <= sector;
		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
			if (bi->bi_sector + (bi->bi_size>>9) >= sector)
				sector = bi->bi_sector + (bi->bi_size>>9);
		}
		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
	}
	return 1;

 overlap:
	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
	spin_unlock_irq(&conf->device_lock);
	spin_unlock(&sh->lock);
	return 0;
}

static void end_reshape(raid5_conf_t *conf);

static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
{
	int sectors_per_chunk = conf->chunk_size >> 9;
	sector_t x = stripe;
	int pd_idx, dd_idx;
	int chunk_offset = sector_div(x, sectors_per_chunk);
	stripe = x;
	raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
			     + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
	return pd_idx;
}


/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe and then examine the state of various bits
 * to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on disc
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 * Parity calculations are done inside the stripe lock
 * buffers are taken off read_list or write_list, and bh_cache buffers
 * get BH_Lock set before the stripe lock is released.
 *
 */

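/*
 * The first pass over the devices below just gathers counters: locked,
 * uptodate, to_read, to_write, written, failed and failed_num.  The rest of
 * handle_stripe() is largely driven by those counts and the per-device
 * flags.
 */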
static void handle_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	int disks = sh->disks;
	struct bio *return_bi= NULL;
	struct bio *bi;
	int i;
	int syncing, expanding, expanded;
	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
	int non_overwrite = 0;
	int failed_num=0;
	struct r5dev *dev;

	PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
	       (unsigned long long)sh->sector, atomic_read(&sh->count),
	       sh->pd_idx);

	spin_lock(&sh->lock);
	clear_bit(STRIPE_HANDLE, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);

	syncing = test_bit(STRIPE_SYNCING, &sh->state);
	expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
	expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
	/* Now to look around and see what can be done */

	rcu_read_lock();
	for (i=disks; i--; ) {
		mdk_rdev_t *rdev;
		dev = &sh->dev[i];
		clear_bit(R5_Insync, &dev->flags);

		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
		       i, dev->flags, dev->toread, dev->towrite, dev->written);
		/* maybe we can reply to a read */
		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
			struct bio *rbi, *rbi2;
			PRINTK("Return read for disc %d\n", i);
			spin_lock_irq(&conf->device_lock);
			rbi = dev->toread;
			dev->toread = NULL;
			if (test_and_clear_bit(R5_Overlap, &dev->flags))
				wake_up(&conf->wait_for_overlap);
			spin_unlock_irq(&conf->device_lock);
			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
				copy_data(0, rbi, dev->page, dev->sector);
				rbi2 = r5_next_bio(rbi, dev->sector);
				spin_lock_irq(&conf->device_lock);
				if (--rbi->bi_phys_segments == 0) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				spin_unlock_irq(&conf->device_lock);
				rbi = rbi2;
			}
		}

		/* now count some things */
		if (test_bit(R5_LOCKED, &dev->flags)) locked++;
		if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;


		if (dev->toread) to_read++;
		if (dev->towrite) {
			to_write++;
			if (!test_bit(R5_OVERWRITE, &dev->flags))
				non_overwrite++;
		}
		if (dev->written) written++;
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
			/* The ReadError flag will just be confusing now */
			clear_bit(R5_ReadError, &dev->flags);
			clear_bit(R5_ReWrite, &dev->flags);
		}
		if (!rdev || !test_bit(In_sync, &rdev->flags)
		    || test_bit(R5_ReadError, &dev->flags)) {
			failed++;
			failed_num = i;
		} else
			set_bit(R5_Insync, &dev->flags);
	}
	rcu_read_unlock();
	PRINTK("locked=%d uptodate=%d to_read=%d"
	       " to_write=%d failed=%d failed_num=%d\n",
	       locked, uptodate, to_read, to_write, failed, failed_num);
	/* check if the array has lost two devices and, if so, some requests might
	 * need to be failed
	 */
	if (failed > 1 && to_read+to_write+written) {
		for (i=disks; i--; ) {
			int bitmap_end = 0;

			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
				mdk_rdev_t *rdev;
				rcu_read_lock();
				rdev = rcu_dereference(conf->disks[i].rdev);
				if (rdev && test_bit(In_sync, &rdev->flags))
					/* multiple read failures in one stripe */
					md_error(conf->mddev, rdev);
				rcu_read_unlock();
			}

			spin_lock_irq(&conf->device_lock);
			/* fail all writes first */
			bi = sh->dev[i].towrite;
			sh->dev[i].towrite = NULL;
			if (bi) { to_write--; bitmap_end = 1; }

			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
				wake_up(&conf->wait_for_overlap);

			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
				clear_bit(BIO_UPTODATE, &bi->bi_flags);
				if (--bi->bi_phys_segments == 0) {
					md_write_end(conf->mddev);
					bi->bi_next = return_bi;
					return_bi = bi;
				}
				bi = nextbi;
			}
			/* and fail all 'written' */
			bi = sh->dev[i].written;
			sh->dev[i].written = NULL;
			if (bi) bitmap_end = 1;
			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
				clear_bit(BIO_UPTODATE, &bi->bi_flags);
				if (--bi->bi_phys_segments == 0) {
					md_write_end(conf->mddev);
					bi->bi_next = return_bi;
					return_bi = bi;
				}
				bi = bi2;
			}

			/* fail any reads if this device is non-operational */
			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
				bi = sh->dev[i].toread;
				sh->dev[i].toread = NULL;
				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
					wake_up(&conf->wait_for_overlap);
				if (bi) to_read--;
				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
					clear_bit(BIO_UPTODATE, &bi->bi_flags);
					if (--bi->bi_phys_segments == 0) {
						bi->bi_next = return_bi;
						return_bi = bi;
					}
					bi = nextbi;
				}
			}
			spin_unlock_irq(&conf->device_lock);
			if (bitmap_end)
				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
						STRIPE_SECTORS, 0, 0);
		}
	}
	if (failed > 1 && syncing) {
		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
		clear_bit(STRIPE_SYNCING, &sh->state);
		syncing = 0;
	}

	/* might be able to return some write requests if the parity block
	 * is safe, or on a failed drive
	 */
	dev = &sh->dev[sh->pd_idx];
	if ( written &&
	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
		test_bit(R5_UPTODATE, &dev->flags))
	       || (failed == 1 && failed_num == sh->pd_idx))
		) {
		/* any written block on an uptodate or failed drive can be returned.
		 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
		 * never LOCKED, so we don't need to test 'failed' directly.
		 */
		for (i=disks; i--; )
			if (sh->dev[i].written) {
				dev = &sh->dev[i];
				if (!test_bit(R5_LOCKED, &dev->flags) &&
				    test_bit(R5_UPTODATE, &dev->flags) ) {
					/* We can return any write requests */
					struct bio *wbi, *wbi2;
					int bitmap_end = 0;
					PRINTK("Return write for disc %d\n", i);
					spin_lock_irq(&conf->device_lock);
					wbi = dev->written;
					dev->written = NULL;
					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
						wbi2 = r5_next_bio(wbi, dev->sector);
						if (--wbi->bi_phys_segments == 0) {
							md_write_end(conf->mddev);
							wbi->bi_next = return_bi;
							return_bi = wbi;
						}
						wbi = wbi2;
					}
					if (dev->towrite == NULL)
						bitmap_end = 1;
					spin_unlock_irq(&conf->device_lock);
					if (bitmap_end)
						bitmap_endwrite(conf->mddev->bitmap, sh->sector,
								STRIPE_SECTORS,
								!test_bit(STRIPE_DEGRADED, &sh->state), 0);
				}
			}
	}

	/* Now we might consider reading some blocks, either to check/generate
	 * parity, or to satisfy requests
	 * or to load a block that is being partially written.
	 */
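	/* If every block except one is already up to date, the missing block
	 * can be computed by xor'ing the others rather than read from disk;
	 * that is the uptodate == disks-1 case below.
	 */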
	if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
		for (i=disks; i--;) {
			dev = &sh->dev[i];
			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
			    (dev->toread ||
			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
			     syncing ||
			     expanding ||
			     (failed && (sh->dev[failed_num].toread ||
					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
				    )
				) {
				/* we would like to get this block, possibly
				 * by computing it, but we might not be able to
				 */
				if (uptodate == disks-1) {
					PRINTK("Computing block %d\n", i);
					compute_block(sh, i);
					uptodate++;
				} else if (test_bit(R5_Insync, &dev->flags)) {
					set_bit(R5_LOCKED, &dev->flags);
					set_bit(R5_Wantread, &dev->flags);
#if 0
					/* if I am just reading this block and we don't have
					   a failed drive, or any pending writes then sidestep the cache */
					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
					    ! syncing && !failed && !to_write) {
						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;
						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;
					}
#endif
					locked++;
					PRINTK("Reading block %d (sync=%d)\n",
					       i, syncing);
				}
			}
		}
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	/* now to consider writing and what else, if anything should be read */
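	/* rmw counts the reads needed for a read-modify-write (old data of
	 * the blocks being rewritten plus the old parity); rcw counts the
	 * reads needed for a reconstruct-write (every data block that is not
	 * being completely overwritten).  Blocks that cannot be read (device
	 * missing) are weighted with 2*disks so the other method is chosen.
	 */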
	if (to_write) {
		int rmw=0, rcw=0;
		for (i=disks ; i--;) {
			/* would I have to read this buffer for read_modify_write */
			dev = &sh->dev[i];
			if ((dev->towrite || i == sh->pd_idx) &&
			    (!test_bit(R5_LOCKED, &dev->flags)
#if 0
			     || sh->bh_page[i]!=bh->b_page
#endif
				    ) &&
			    !test_bit(R5_UPTODATE, &dev->flags)) {
				if (test_bit(R5_Insync, &dev->flags)
/*				    && !(!mddev->insync && i == sh->pd_idx) */
					)
					rmw++;
				else rmw += 2*disks;  /* cannot read it */
			}
			/* Would I have to read this buffer for reconstruct_write */
			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
			    (!test_bit(R5_LOCKED, &dev->flags)
#if 0
			     || sh->bh_page[i] != bh->b_page
#endif
				    ) &&
			    !test_bit(R5_UPTODATE, &dev->flags)) {
				if (test_bit(R5_Insync, &dev->flags)) rcw++;
				else rcw += 2*disks;
			}
		}
		PRINTK("for sector %llu, rmw=%d rcw=%d\n",
		       (unsigned long long)sh->sector, rmw, rcw);
		set_bit(STRIPE_HANDLE, &sh->state);
		if (rmw < rcw && rmw > 0)
			/* prefer read-modify-write, but need to get some data */
			for (i=disks; i--;) {
				dev = &sh->dev[i];
				if ((dev->towrite || i == sh->pd_idx) &&
				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				    test_bit(R5_Insync, &dev->flags)) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
					{
						PRINTK("Read_old block %d for r-m-w\n", i);
						set_bit(R5_LOCKED, &dev->flags);
						set_bit(R5_Wantread, &dev->flags);
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		if (rcw <= rmw && rcw > 0)
			/* want reconstruct write, but need to get some data */
			for (i=disks; i--;) {
				dev = &sh->dev[i];
				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				    test_bit(R5_Insync, &dev->flags)) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
					{
						PRINTK("Read_old block %d for Reconstruct\n", i);
						set_bit(R5_LOCKED, &dev->flags);
						set_bit(R5_Wantread, &dev->flags);
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		/* now if nothing is locked, and if we have enough data, we can start a write request */
		if (locked == 0 && (rcw == 0 ||rmw == 0) &&
		    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
			PRINTK("Computing parity...\n");
			compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
			/* now every locked buffer is ready to be written */
			for (i=disks; i--;)
				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
					PRINTK("Writing block %d\n", i);
					locked++;
					set_bit(R5_Wantwrite, &sh->dev[i].flags);
					if (!test_bit(R5_Insync, &sh->dev[i].flags)
					    || (i==sh->pd_idx && failed == 0))
						set_bit(STRIPE_INSYNC, &sh->state);
				}
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}

	/* maybe we need to check and possibly fix the parity for this stripe
	 * Any reads will already have been scheduled, so we just see if enough data
	 * is available
	 */
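	/* The check works by xor'ing the on-disk parity with all the data
	 * blocks (compute_parity with CHECK_PARITY); if the parity was
	 * correct the parity page is left all zero, which is what the
	 * word-0 test plus memcmp against the rest of the page verifies.
	 */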
	if (syncing && locked == 0 &&
	    !test_bit(STRIPE_INSYNC, &sh->state)) {
		set_bit(STRIPE_HANDLE, &sh->state);
		if (failed == 0) {
			char *pagea;
			if (uptodate != disks)
				BUG();
			compute_parity(sh, CHECK_PARITY);
			uptodate--;
			pagea = page_address(sh->dev[sh->pd_idx].page);
			if ((*(u32*)pagea) == 0 &&
			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
				/* parity is correct (on disc, not in buffer any more) */
				set_bit(STRIPE_INSYNC, &sh->state);
			} else {
				conf->mddev->resync_mismatches += STRIPE_SECTORS;
				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
					/* don't try to repair!! */
					set_bit(STRIPE_INSYNC, &sh->state);
				else {
					compute_block(sh, sh->pd_idx);
					uptodate++;
				}
			}
		}
		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
			/* either failed parity check, or recovery is happening */
			if (failed==0)
				failed_num = sh->pd_idx;
			dev = &sh->dev[failed_num];
			BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
			BUG_ON(uptodate != disks);

			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantwrite, &dev->flags);
			clear_bit(STRIPE_DEGRADED, &sh->state);
			locked++;
			set_bit(STRIPE_INSYNC, &sh->state);
		}
	}
	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
		clear_bit(STRIPE_SYNCING, &sh->state);
	}

	/* If the failed drive is just a ReadError, then we might need to progress
	 * the repair/check process
	 */
	if (failed == 1 && !conf->mddev->ro &&
	    test_bit(R5_ReadError, &sh->dev[failed_num].flags)
	    && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags)
	    && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)
		) {
		dev = &sh->dev[failed_num];
		if (!test_bit(R5_ReWrite, &dev->flags)) {
			set_bit(R5_Wantwrite, &dev->flags);
			set_bit(R5_ReWrite, &dev->flags);
			set_bit(R5_LOCKED, &dev->flags);
			locked++;
		} else {
			/* let's read it back */
			set_bit(R5_Wantread, &dev->flags);
			set_bit(R5_LOCKED, &dev->flags);
			locked++;
		}
	}

	if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
		/* Need to write out all blocks after computing parity */
		sh->disks = conf->raid_disks;
		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
		compute_parity(sh, RECONSTRUCT_WRITE);
		for (i= conf->raid_disks; i--;) {
			set_bit(R5_LOCKED, &sh->dev[i].flags);
			locked++;
			set_bit(R5_Wantwrite, &sh->dev[i].flags);
		}
		clear_bit(STRIPE_EXPANDING, &sh->state);
	} else if (expanded) {
		clear_bit(STRIPE_EXPAND_READY, &sh->state);
		atomic_dec(&conf->reshape_stripes);
		wake_up(&conf->wait_for_overlap);
		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
	}

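	/* During a reshape, STRIPE_EXPAND_SOURCE marks an old-layout stripe
	 * whose data must be copied out, while STRIPE_EXPAND_READY marks a
	 * new-layout stripe that has received all its blocks and only needs
	 * parity computed and everything written (handled just above).
	 */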
	if (expanding && locked == 0) {
		/* We have read all the blocks in this stripe and now we need to
		 * copy some of them into a target stripe for expand.
		 */
		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
		for (i=0; i< sh->disks; i++)
			if (i != sh->pd_idx) {
				int dd_idx, pd_idx, j;
				struct stripe_head *sh2;

				sector_t bn = compute_blocknr(sh, i);
				sector_t s = raid5_compute_sector(bn, conf->raid_disks,
								  conf->raid_disks-1,
								  &dd_idx, &pd_idx, conf);
				sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
				if (sh2 == NULL)
					/* so far only the early blocks of this stripe
					 * have been requested.  When later blocks
					 * get requested, we will try again
					 */
					continue;
				if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
				   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
					/* must have already done this block */
					release_stripe(sh2);
					continue;
				}
				memcpy(page_address(sh2->dev[dd_idx].page),
				       page_address(sh->dev[i].page),
				       STRIPE_SIZE);
				set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
				set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
				for (j=0; j<conf->raid_disks; j++)
					if (j != sh2->pd_idx &&
					    !test_bit(R5_Expanded, &sh2->dev[j].flags))
						break;
				if (j == conf->raid_disks) {
					set_bit(STRIPE_EXPAND_READY, &sh2->state);
					set_bit(STRIPE_HANDLE, &sh2->state);
				}
				release_stripe(sh2);
			}
	}

	spin_unlock(&sh->lock);

	while ((bi=return_bi)) {
		int bytes = bi->bi_size;

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);
	}
	for (i=disks; i-- ;) {
		int rw;
		struct bio *bi;
		mdk_rdev_t *rdev;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
			rw = 1;
		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = 0;
		else
			continue;

		bi = &sh->dev[i].req;

		bi->bi_rw = rw;
		if (rw)
			bi->bi_end_io = raid5_end_write_request;
		else
			bi->bi_end_io = raid5_end_read_request;

		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();

		if (rdev) {
			if (syncing || expanding || expanded)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			bi->bi_bdev = rdev->bdev;
			PRINTK("for %llu schedule op %ld on disc %d\n",
			       (unsigned long long)sh->sector, bi->bi_rw, i);
			atomic_inc(&sh->count);
			bi->bi_sector = sh->sector + rdev->data_offset;
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_vcnt = 1;
			bi->bi_max_vecs = 1;
			bi->bi_idx = 0;
			bi->bi_io_vec = &sh->dev[i].vec;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			if (rw == WRITE &&
			    test_bit(R5_ReWrite, &sh->dev[i].flags))
				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
			generic_make_request(bi);
		} else {
			if (rw == 1)
				set_bit(STRIPE_DEGRADED, &sh->state);
			PRINTK("skip op %ld on disc %d for sector %llu\n",
			       bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}

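/*
 * Stripes that would need a pre-read before a read-modify-write sit on the
 * delayed_list (STRIPE_DELAYED).  They are only promoted to the handle_list
 * once the number of preread-active stripes drops below IO_THRESHOLD, which
 * gives later full-stripe writes a chance to avoid the pre-read altogether.
 */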
static void raid5_activate_delayed(raid5_conf_t *conf)
{
	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
		while (!list_empty(&conf->delayed_list)) {
			struct list_head *l = conf->delayed_list.next;
			struct stripe_head *sh;
			sh = list_entry(l, struct stripe_head, lru);
			list_del_init(l);
			clear_bit(STRIPE_DELAYED, &sh->state);
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			list_add_tail(&sh->lru, &conf->handle_list);
		}
	}
}

static void activate_bit_delay(raid5_conf_t *conf)
{
	/* device_lock is held */
	struct list_head head;
	list_add(&head, &conf->bitmap_list);
	list_del_init(&conf->bitmap_list);
	while (!list_empty(&head)) {
		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
		list_del_init(&sh->lru);
		atomic_inc(&sh->count);
		__release_stripe(conf, sh);
	}
}

static void unplug_slaves(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);
	int i;

	rcu_read_lock();
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();

			if (r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
}

static void raid5_unplug_device(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);

	if (blk_remove_plug(q)) {
		conf->seq_flush++;
		raid5_activate_delayed(conf);
	}
	md_wakeup_thread(mddev->thread);

	spin_unlock_irqrestore(&conf->device_lock, flags);

	unplug_slaves(mddev);
}

static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	int i, ret = 0;

	rcu_read_lock();
	for (i=0; i<mddev->raid_disks && ret == 0; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue = bdev_get_queue(bdev);

			if (!r_queue->issue_flush_fn)
				ret = -EOPNOTSUPP;
			else {
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
							      error_sector);
				rdev_dec_pending(rdev, mddev);
				rcu_read_lock();
			}
		}
	}
	rcu_read_unlock();
	return ret;
}

static inline void raid5_plug_device(raid5_conf_t *conf)
{
	spin_lock_irq(&conf->device_lock);
	blk_plug_device(conf->mddev->queue);
	spin_unlock_irq(&conf->device_lock);
}

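/*
 * make_request() rounds the bio down to a stripe boundary and then walks it
 * STRIPE_SECTORS at a time, attaching the bio to every stripe_head it
 * touches; bi_phys_segments is borrowed as a count of those attachments so
 * the bio can be completed once the last stripe is done with it.
 */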
static int make_request(request_queue_t *q, struct bio * bi)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	unsigned int dd_idx, pd_idx;
	sector_t new_sector;
	sector_t logical_sector, last_sector;
	struct stripe_head *sh;
	const int rw = bio_data_dir(bi);
	int remaining;

	if (unlikely(bio_barrier(bi))) {
		bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
		return 0;
	}

	md_write_start(mddev, bi);

	disk_stat_inc(mddev->gendisk, ios[rw]);
	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));

	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_next = NULL;
	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */

	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
		DEFINE_WAIT(w);
		int disks;

	retry:
		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
		if (likely(conf->expand_progress == MaxSector))
			disks = conf->raid_disks;
		else {
			/* spinlock is needed as expand_progress may be
			 * 64bit on a 32bit platform, and so it might be
			 * possible to see a half-updated value
			 * Of course expand_progress could change after
			 * the lock is dropped, so once we get a reference
			 * to the stripe that we think it is, we will have
			 * to check again.
			 */
			spin_lock_irq(&conf->device_lock);
			disks = conf->raid_disks;
			if (logical_sector >= conf->expand_progress)
				disks = conf->previous_raid_disks;
			else {
				if (logical_sector >= conf->expand_lo) {
					spin_unlock_irq(&conf->device_lock);
					schedule();
					goto retry;
				}
			}
			spin_unlock_irq(&conf->device_lock);
		}
		new_sector = raid5_compute_sector(logical_sector, disks, disks - 1,
						  &dd_idx, &pd_idx, conf);
		PRINTK("raid5: make_request, sector %llu logical %llu\n",
		       (unsigned long long)new_sector,
		       (unsigned long long)logical_sector);

		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
		if (sh) {
			if (unlikely(conf->expand_progress != MaxSector)) {
				/* expansion might have moved on while waiting for a
				 * stripe, so we must do the range check again.
				 * Expansion could still move past after this
				 * test, but as we are holding a reference to
				 * 'sh', we know that if that happens,
				 *  STRIPE_EXPANDING will get set and the expansion
				 * won't proceed until we finish with the stripe.
				 */
				int must_retry = 0;
				spin_lock_irq(&conf->device_lock);
				if (logical_sector <  conf->expand_progress &&
				    disks == conf->previous_raid_disks)
					/* mismatch, need to try again */
					must_retry = 1;
				spin_unlock_irq(&conf->device_lock);
				if (must_retry) {
					release_stripe(sh);
					goto retry;
				}
			}
			/* FIXME what if we get a false positive because these
			 * are being updated.
			 */
			if (logical_sector >= mddev->suspend_lo &&
			    logical_sector < mddev->suspend_hi) {
				release_stripe(sh);
				schedule();
				goto retry;
			}

			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
				/* Stripe is busy expanding or
				 * add failed due to overlap.  Flush everything
				 * and wait a while
				 */
				raid5_unplug_device(mddev->queue);
				release_stripe(sh);
				schedule();
				goto retry;
			}
			finish_wait(&conf->wait_for_overlap, &w);
			raid5_plug_device(conf);
			handle_stripe(sh);
			release_stripe(sh);
		} else {
			/* cannot get stripe for read-ahead, just give-up */
			clear_bit(BIO_UPTODATE, &bi->bi_flags);
			finish_wait(&conf->wait_for_overlap, &w);
			break;
		}

	}
	spin_lock_irq(&conf->device_lock);
	remaining = --bi->bi_phys_segments;
	spin_unlock_irq(&conf->device_lock);
	if (remaining == 0) {
		int bytes = bi->bi_size;

		if ( bio_data_dir(bi) == WRITE )
			md_write_end(mddev);
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);
	}
	return 0;
}

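/*
 * sync_request() is called repeatedly by the md resync thread.  The cases
 * handled below are: being told to finish up (sector_nr past the end), a
 * reshape (MD_RECOVERY_RESHAPE), and ordinary resync/recovery of one stripe.
 */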
nothing much to do */ 1883 unplug_slaves(mddev); 1884 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 1885 end_reshape(conf); 1886 return 0; 1887 } 1888 1889 if (mddev->curr_resync < max_sector) /* aborted */ 1890 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 1891 &sync_blocks, 1); 1892 else /* completed sync */ 1893 conf->fullsync = 0; 1894 bitmap_close_sync(mddev->bitmap); 1895 1896 return 0; 1897 } 1898 1899 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 1900 /* reshaping is quite different to recovery/resync so it is 1901 * handled quite separately ... here. 1902 * 1903 * On each call to sync_request, we gather one chunk worth of 1904 * destination stripes and flag them as expanding. 1905 * Then we find all the source stripes and request reads. 1906 * As the reads complete, handle_stripe will copy the data 1907 * into the destination stripe and release that stripe. 1908 */ 1909 int i; 1910 int dd_idx; 1911 sector_t writepos, safepos, gap; 1912 1913 if (sector_nr == 0 && 1914 conf->expand_progress != 0) { 1915 /* restarting in the middle, skip the initial sectors */ 1916 sector_nr = conf->expand_progress; 1917 sector_div(sector_nr, conf->raid_disks-1); 1918 *skipped = 1; 1919 return sector_nr; 1920 } 1921 1922 /* we update the metadata when there is more than 3Meg 1923 * in the block range (that is rather arbitrary, should 1924 * probably be time based) or when the data about to be 1925 * copied would over-write the source of the data at 1926 * the front of the range. 1927 * i.e. one new_stripe forward from expand_progress new_maps 1928 * to after where expand_lo old_maps to 1929 */ 1930 writepos = conf->expand_progress + 1931 conf->chunk_size/512*(conf->raid_disks-1); 1932 sector_div(writepos, conf->raid_disks-1); 1933 safepos = conf->expand_lo; 1934 sector_div(safepos, conf->previous_raid_disks-1); 1935 gap = conf->expand_progress - conf->expand_lo; 1936 1937 if (writepos >= safepos || 1938 gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { 1939 /* Cannot proceed until we've updated the superblock...
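* (What follows: wait for every in-flight reshape stripe to drain, record
* expand_progress in mddev->reshape_position, mark the superblock dirty and
* wait for md to write it out, then advance expand_lo so that requests to
* the already-relocated region below it stop being held back in make_request.
* Illustrative numbers only, assuming 5 raid_disks: the gap test above fires
* once expand_progress runs more than (5-1)*3000*2 == 24000 sectors, i.e.
* roughly 3MB per data disk, ahead of expand_lo.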
*/ 1940 wait_event(conf->wait_for_overlap, 1941 atomic_read(&conf->reshape_stripes)==0); 1942 mddev->reshape_position = conf->expand_progress; 1943 mddev->sb_dirty = 1; 1944 md_wakeup_thread(mddev->thread); 1945 wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || 1946 kthread_should_stop()); 1947 spin_lock_irq(&conf->device_lock); 1948 conf->expand_lo = mddev->reshape_position; 1949 spin_unlock_irq(&conf->device_lock); 1950 wake_up(&conf->wait_for_overlap); 1951 } 1952 1953 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { 1954 int j; 1955 int skipped = 0; 1956 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); 1957 sh = get_active_stripe(conf, sector_nr+i, 1958 conf->raid_disks, pd_idx, 0); 1959 set_bit(STRIPE_EXPANDING, &sh->state); 1960 atomic_inc(&conf->reshape_stripes); 1961 /* If any of this stripe is beyond the end of the old 1962 * array, then we need to zero those blocks 1963 */ 1964 for (j=sh->disks; j--;) { 1965 sector_t s; 1966 if (j == sh->pd_idx) 1967 continue; 1968 s = compute_blocknr(sh, j); 1969 if (s < (mddev->array_size<<1)) { 1970 skipped = 1; 1971 continue; 1972 } 1973 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 1974 set_bit(R5_Expanded, &sh->dev[j].flags); 1975 set_bit(R5_UPTODATE, &sh->dev[j].flags); 1976 } 1977 if (!skipped) { 1978 set_bit(STRIPE_EXPAND_READY, &sh->state); 1979 set_bit(STRIPE_HANDLE, &sh->state); 1980 } 1981 release_stripe(sh); 1982 } 1983 spin_lock_irq(&conf->device_lock); 1984 conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1); 1985 spin_unlock_irq(&conf->device_lock); 1986 /* Ok, those stripes are ready. We can start scheduling 1987 * reads on the source stripes. 1988 * The source stripes are determined by mapping the first and last 1989 * block on the destination stripes. 1990 */ 1991 raid_disks = conf->previous_raid_disks; 1992 data_disks = raid_disks - 1; 1993 first_sector = 1994 raid5_compute_sector(sector_nr*(conf->raid_disks-1), 1995 raid_disks, data_disks, 1996 &dd_idx, &pd_idx, conf); 1997 last_sector = 1998 raid5_compute_sector((sector_nr+conf->chunk_size/512) 1999 *(conf->raid_disks-1) -1, 2000 raid_disks, data_disks, 2001 &dd_idx, &pd_idx, conf); 2002 if (last_sector >= (mddev->size<<1)) 2003 last_sector = (mddev->size<<1)-1; 2004 while (first_sector <= last_sector) { 2005 pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); 2006 sh = get_active_stripe(conf, first_sector, 2007 conf->previous_raid_disks, pd_idx, 0); 2008 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2009 set_bit(STRIPE_HANDLE, &sh->state); 2010 release_stripe(sh); 2011 first_sector += STRIPE_SECTORS; 2012 } 2013 return conf->chunk_size>>9; 2014 } 2015 /* if there are 1 or more failed drives and we are trying 2016 * to resync, then assert that we are finished, because there is 2017 * nothing we can do.
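* (Returning the count of remaining sectors with *skipped set makes
* md_do_sync account the rest of the range as done without issuing any
* more I/O; with a member missing there is no redundancy left to check
* or rebuild against.)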
2018 */ 2019 if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2020 sector_t rv = (mddev->size << 1) - sector_nr; 2021 *skipped = 1; 2022 return rv; 2023 } 2024 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 2025 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 2026 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 2027 /* we can skip this block, and probably more */ 2028 sync_blocks /= STRIPE_SECTORS; 2029 *skipped = 1; 2030 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 2031 } 2032 2033 pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); 2034 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); 2035 if (sh == NULL) { 2036 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); 2037 /* make sure we don't swamp the stripe cache if someone else 2038 * is trying to get access 2039 */ 2040 schedule_timeout_uninterruptible(1); 2041 } 2042 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); 2043 spin_lock(&sh->lock); 2044 set_bit(STRIPE_SYNCING, &sh->state); 2045 clear_bit(STRIPE_INSYNC, &sh->state); 2046 spin_unlock(&sh->lock); 2047 2048 handle_stripe(sh); 2049 release_stripe(sh); 2050 2051 return STRIPE_SECTORS; 2052 } 2053 2054 /* 2055 * This is our raid5 kernel thread. 2056 * 2057 * We scan the hash table for stripes which can be handled now. 2058 * During the scan, completed stripes are saved for us by the interrupt 2059 * handler, so that they will not have to wait for our next wakeup. 2060 */ 2061 static void raid5d (mddev_t *mddev) 2062 { 2063 struct stripe_head *sh; 2064 raid5_conf_t *conf = mddev_to_conf(mddev); 2065 int handled; 2066 2067 PRINTK("+++ raid5d active\n"); 2068 2069 md_check_recovery(mddev); 2070 2071 handled = 0; 2072 spin_lock_irq(&conf->device_lock); 2073 while (1) { 2074 struct list_head *first; 2075 2076 if (conf->seq_flush - conf->seq_write > 0) { 2077 int seq = conf->seq_flush; 2078 spin_unlock_irq(&conf->device_lock); 2079 bitmap_unplug(mddev->bitmap); 2080 spin_lock_irq(&conf->device_lock); 2081 conf->seq_write = seq; 2082 activate_bit_delay(conf); 2083 } 2084 2085 if (list_empty(&conf->handle_list) && 2086 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && 2087 !blk_queue_plugged(mddev->queue) && 2088 !list_empty(&conf->delayed_list)) 2089 raid5_activate_delayed(conf); 2090 2091 if (list_empty(&conf->handle_list)) 2092 break; 2093 2094 first = conf->handle_list.next; 2095 sh = list_entry(first, struct stripe_head, lru); 2096 2097 list_del_init(first); 2098 atomic_inc(&sh->count); 2099 if (atomic_read(&sh->count)!= 1) 2100 BUG(); 2101 spin_unlock_irq(&conf->device_lock); 2102 2103 handled++; 2104 handle_stripe(sh); 2105 release_stripe(sh); 2106 2107 spin_lock_irq(&conf->device_lock); 2108 } 2109 PRINTK("%d stripes handled\n", handled); 2110 2111 spin_unlock_irq(&conf->device_lock); 2112 2113 unplug_slaves(mddev); 2114 2115 PRINTK("--- raid5d inactive\n"); 2116 } 2117 2118 static ssize_t 2119 raid5_show_stripe_cache_size(mddev_t *mddev, char *page) 2120 { 2121 raid5_conf_t *conf = mddev_to_conf(mddev); 2122 if (conf) 2123 return sprintf(page, "%d\n", conf->max_nr_stripes); 2124 else 2125 return 0; 2126 } 2127 2128 static ssize_t 2129 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 2130 { 2131 raid5_conf_t *conf = mddev_to_conf(mddev); 2132 char *end; 2133 int new; 2134 if (len >= PAGE_SIZE) 2135 return -EINVAL; 2136 if (!conf) 2137 return -ENODEV; 2138 2139 new = simple_strtoul(page, &end, 10); 2140 if 
(!*page || (*end && *end != '\n') ) 2141 return -EINVAL; 2142 if (new <= 16 || new > 32768) 2143 return -EINVAL; 2144 while (new < conf->max_nr_stripes) { 2145 if (drop_one_stripe(conf)) 2146 conf->max_nr_stripes--; 2147 else 2148 break; 2149 } 2150 while (new > conf->max_nr_stripes) { 2151 if (grow_one_stripe(conf)) 2152 conf->max_nr_stripes++; 2153 else break; 2154 } 2155 return len; 2156 } 2157 2158 static struct md_sysfs_entry 2159 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 2160 raid5_show_stripe_cache_size, 2161 raid5_store_stripe_cache_size); 2162 2163 static ssize_t 2164 stripe_cache_active_show(mddev_t *mddev, char *page) 2165 { 2166 raid5_conf_t *conf = mddev_to_conf(mddev); 2167 if (conf) 2168 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 2169 else 2170 return 0; 2171 } 2172 2173 static struct md_sysfs_entry 2174 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 2175 2176 static struct attribute *raid5_attrs[] = { 2177 &raid5_stripecache_size.attr, 2178 &raid5_stripecache_active.attr, 2179 NULL, 2180 }; 2181 static struct attribute_group raid5_attrs_group = { 2182 .name = NULL, 2183 .attrs = raid5_attrs, 2184 }; 2185 2186 static int run(mddev_t *mddev) 2187 { 2188 raid5_conf_t *conf; 2189 int raid_disk, memory; 2190 mdk_rdev_t *rdev; 2191 struct disk_info *disk; 2192 struct list_head *tmp; 2193 2194 if (mddev->level != 5 && mddev->level != 4) { 2195 printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n", 2196 mdname(mddev), mddev->level); 2197 return -EIO; 2198 } 2199 2200 if (mddev->reshape_position != MaxSector) { 2201 /* Check that we can continue the reshape. 2202 * Currently only disks can change, it must 2203 * increase, and we must be past the point where 2204 * a stripe over-writes itself 2205 */ 2206 sector_t here_new, here_old; 2207 int old_disks; 2208 2209 if (mddev->new_level != mddev->level || 2210 mddev->new_layout != mddev->layout || 2211 mddev->new_chunk != mddev->chunk_size) { 2212 printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", 2213 mdname(mddev)); 2214 return -EINVAL; 2215 } 2216 if (mddev->delta_disks <= 0) { 2217 printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", 2218 mdname(mddev)); 2219 return -EINVAL; 2220 } 2221 old_disks = mddev->raid_disks - mddev->delta_disks; 2222 /* reshape_position must be on a new-stripe boundary, and one 2223 * further up in new geometry must map after here in old geometry. 
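* Illustrative numbers only (values assumed, not taken from this code):
* growing a 64KiB-chunk array from 4 to 5 devices, a stripe in the new
* geometry spans 128*4 == 512 sectors and one in the old geometry
* 128*3 == 384 sectors, so reshape_position must be a multiple of 512;
* a position of 1536 gives here_new == 3 and here_old == 4 (acceptable),
* while 512 gives 1 and 1 and is rejected by the here_new >= here_old
* test below.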
2224 */ 2225 here_new = mddev->reshape_position; 2226 if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { 2227 printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); 2228 return -EINVAL; 2229 } 2230 /* here_new is the stripe we will write to */ 2231 here_old = mddev->reshape_position; 2232 sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); 2233 /* here_old is the first stripe that we might need to read from */ 2234 if (here_new >= here_old) { 2235 /* Reading from the same stripe as writing to - bad */ 2236 printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); 2237 return -EINVAL; 2238 } 2239 printk(KERN_INFO "raid5: reshape will continue\n"); 2240 /* OK, we should be able to continue; */ 2241 } 2242 2243 2244 mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); 2245 if ((conf = mddev->private) == NULL) 2246 goto abort; 2247 if (mddev->reshape_position == MaxSector) { 2248 conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; 2249 } else { 2250 conf->raid_disks = mddev->raid_disks; 2251 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 2252 } 2253 2254 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), 2255 GFP_KERNEL); 2256 if (!conf->disks) 2257 goto abort; 2258 2259 conf->mddev = mddev; 2260 2261 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 2262 goto abort; 2263 2264 spin_lock_init(&conf->device_lock); 2265 init_waitqueue_head(&conf->wait_for_stripe); 2266 init_waitqueue_head(&conf->wait_for_overlap); 2267 INIT_LIST_HEAD(&conf->handle_list); 2268 INIT_LIST_HEAD(&conf->delayed_list); 2269 INIT_LIST_HEAD(&conf->bitmap_list); 2270 INIT_LIST_HEAD(&conf->inactive_list); 2271 atomic_set(&conf->active_stripes, 0); 2272 atomic_set(&conf->preread_active_stripes, 0); 2273 2274 PRINTK("raid5: run(%s) called.\n", mdname(mddev)); 2275 2276 ITERATE_RDEV(mddev,rdev,tmp) { 2277 raid_disk = rdev->raid_disk; 2278 if (raid_disk >= conf->raid_disks 2279 || raid_disk < 0) 2280 continue; 2281 disk = conf->disks + raid_disk; 2282 2283 disk->rdev = rdev; 2284 2285 if (test_bit(In_sync, &rdev->flags)) { 2286 char b[BDEVNAME_SIZE]; 2287 printk(KERN_INFO "raid5: device %s operational as raid" 2288 " disk %d\n", bdevname(rdev->bdev,b), 2289 raid_disk); 2290 conf->working_disks++; 2291 } 2292 } 2293 2294 /* 2295 * 0 for a fully functional array, 1 for a degraded array. 
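* (degraded counts member devices that are missing or not in_sync;
* raid4/5 can survive at most one such device, which is why the checks
* further down abort when degraded > 1 and refuse a dirty degraded
* start unless ok_start_degraded is set.)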
2296 */ 2297 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks; 2298 conf->mddev = mddev; 2299 conf->chunk_size = mddev->chunk_size; 2300 conf->level = mddev->level; 2301 conf->algorithm = mddev->layout; 2302 conf->max_nr_stripes = NR_STRIPES; 2303 conf->expand_progress = mddev->reshape_position; 2304 2305 /* device size must be a multiple of chunk size */ 2306 mddev->size &= ~(mddev->chunk_size/1024 -1); 2307 mddev->resync_max_sectors = mddev->size << 1; 2308 2309 if (!conf->chunk_size || conf->chunk_size % 4) { 2310 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 2311 conf->chunk_size, mdname(mddev)); 2312 goto abort; 2313 } 2314 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { 2315 printk(KERN_ERR 2316 "raid5: unsupported parity algorithm %d for %s\n", 2317 conf->algorithm, mdname(mddev)); 2318 goto abort; 2319 } 2320 if (mddev->degraded > 1) { 2321 printk(KERN_ERR "raid5: not enough operational devices for %s" 2322 " (%d/%d failed)\n", 2323 mdname(mddev), conf->failed_disks, conf->raid_disks); 2324 goto abort; 2325 } 2326 2327 if (mddev->degraded == 1 && 2328 mddev->recovery_cp != MaxSector) { 2329 if (mddev->ok_start_degraded) 2330 printk(KERN_WARNING 2331 "raid5: starting dirty degraded array: %s" 2332 "- data corruption possible.\n", 2333 mdname(mddev)); 2334 else { 2335 printk(KERN_ERR 2336 "raid5: cannot start dirty degraded array for %s\n", 2337 mdname(mddev)); 2338 goto abort; 2339 } 2340 } 2341 2342 { 2343 mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); 2344 if (!mddev->thread) { 2345 printk(KERN_ERR 2346 "raid5: couldn't allocate thread for %s\n", 2347 mdname(mddev)); 2348 goto abort; 2349 } 2350 } 2351 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 2352 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 2353 if (grow_stripes(conf, conf->max_nr_stripes)) { 2354 printk(KERN_ERR 2355 "raid5: couldn't allocate %dkB for buffers\n", memory); 2356 shrink_stripes(conf); 2357 md_unregister_thread(mddev->thread); 2358 goto abort; 2359 } else 2360 printk(KERN_INFO "raid5: allocated %dkB for %s\n", 2361 memory, mdname(mddev)); 2362 2363 if (mddev->degraded == 0) 2364 printk("raid5: raid level %d set %s active with %d out of %d" 2365 " devices, algorithm %d\n", conf->level, mdname(mddev), 2366 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 2367 conf->algorithm); 2368 else 2369 printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 2370 " out of %d devices, algorithm %d\n", conf->level, 2371 mdname(mddev), mddev->raid_disks - mddev->degraded, 2372 mddev->raid_disks, conf->algorithm); 2373 2374 print_raid5_conf(conf); 2375 2376 if (conf->expand_progress != MaxSector) { 2377 printk("...ok start reshape thread\n"); 2378 conf->expand_lo = conf->expand_progress; 2379 atomic_set(&conf->reshape_stripes, 0); 2380 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2381 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2382 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 2383 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 2384 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 2385 "%s_reshape"); 2386 /* FIXME if md_register_thread fails?? 
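* (presumably sync_thread would be left NULL and the reshape would simply
* never restart; raid5_start_reshape() handles the same failure by
* clearing the recovery flags and returning -EAGAIN)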
*/ 2387 md_wakeup_thread(mddev->sync_thread); 2388 2389 } 2390 2391 /* read-ahead size must cover two whole stripes, which is 2392 * 2 * (n-1) * chunksize where 'n' is the number of raid devices 2393 */ 2394 { 2395 int stripe = (mddev->raid_disks-1) * mddev->chunk_size 2396 / PAGE_SIZE; 2397 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 2398 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 2399 } 2400 2401 /* Ok, everything is just fine now */ 2402 sysfs_create_group(&mddev->kobj, &raid5_attrs_group); 2403 2404 mddev->queue->unplug_fn = raid5_unplug_device; 2405 mddev->queue->issue_flush_fn = raid5_issue_flush; 2406 mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); 2407 2408 return 0; 2409 abort: 2410 if (conf) { 2411 print_raid5_conf(conf); 2412 kfree(conf->disks); 2413 kfree(conf->stripe_hashtbl); 2414 kfree(conf); 2415 } 2416 mddev->private = NULL; 2417 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 2418 return -EIO; 2419 } 2420 2421 2422 2423 static int stop(mddev_t *mddev) 2424 { 2425 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 2426 2427 md_unregister_thread(mddev->thread); 2428 mddev->thread = NULL; 2429 shrink_stripes(conf); 2430 kfree(conf->stripe_hashtbl); 2431 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2432 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 2433 kfree(conf->disks); 2434 kfree(conf); 2435 mddev->private = NULL; 2436 return 0; 2437 } 2438 2439 #if RAID5_DEBUG 2440 static void print_sh (struct stripe_head *sh) 2441 { 2442 int i; 2443 2444 printk("sh %llu, pd_idx %d, state %ld.\n", 2445 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 2446 printk("sh %llu, count %d.\n", 2447 (unsigned long long)sh->sector, atomic_read(&sh->count)); 2448 printk("sh %llu, ", (unsigned long long)sh->sector); 2449 for (i = 0; i < sh->disks; i++) { 2450 printk("(cache%d: %p %ld) ", 2451 i, sh->dev[i].page, sh->dev[i].flags); 2452 } 2453 printk("\n"); 2454 } 2455 2456 static void printall (raid5_conf_t *conf) 2457 { 2458 struct stripe_head *sh; 2459 struct hlist_node *hn; 2460 int i; 2461 2462 spin_lock_irq(&conf->device_lock); 2463 for (i = 0; i < NR_HASH; i++) { 2464 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 2465 if (sh->raid_conf != conf) 2466 continue; 2467 print_sh(sh); 2468 } 2469 } 2470 spin_unlock_irq(&conf->device_lock); 2471 } 2472 #endif 2473 2474 static void status (struct seq_file *seq, mddev_t *mddev) 2475 { 2476 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 2477 int i; 2478 2479 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); 2480 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks); 2481 for (i = 0; i < conf->raid_disks; i++) 2482 seq_printf (seq, "%s", 2483 conf->disks[i].rdev && 2484 test_bit(In_sync, &conf->disks[i].rdev->flags) ? 
"U" : "_"); 2485 seq_printf (seq, "]"); 2486 #if RAID5_DEBUG 2487 #define D(x) \ 2488 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x)) 2489 printall(conf); 2490 #endif 2491 } 2492 2493 static void print_raid5_conf (raid5_conf_t *conf) 2494 { 2495 int i; 2496 struct disk_info *tmp; 2497 2498 printk("RAID5 conf printout:\n"); 2499 if (!conf) { 2500 printk("(conf==NULL)\n"); 2501 return; 2502 } 2503 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, 2504 conf->working_disks, conf->failed_disks); 2505 2506 for (i = 0; i < conf->raid_disks; i++) { 2507 char b[BDEVNAME_SIZE]; 2508 tmp = conf->disks + i; 2509 if (tmp->rdev) 2510 printk(" disk %d, o:%d, dev:%s\n", 2511 i, !test_bit(Faulty, &tmp->rdev->flags), 2512 bdevname(tmp->rdev->bdev,b)); 2513 } 2514 } 2515 2516 static int raid5_spare_active(mddev_t *mddev) 2517 { 2518 int i; 2519 raid5_conf_t *conf = mddev->private; 2520 struct disk_info *tmp; 2521 2522 for (i = 0; i < conf->raid_disks; i++) { 2523 tmp = conf->disks + i; 2524 if (tmp->rdev 2525 && !test_bit(Faulty, &tmp->rdev->flags) 2526 && !test_bit(In_sync, &tmp->rdev->flags)) { 2527 mddev->degraded--; 2528 conf->failed_disks--; 2529 conf->working_disks++; 2530 set_bit(In_sync, &tmp->rdev->flags); 2531 } 2532 } 2533 print_raid5_conf(conf); 2534 return 0; 2535 } 2536 2537 static int raid5_remove_disk(mddev_t *mddev, int number) 2538 { 2539 raid5_conf_t *conf = mddev->private; 2540 int err = 0; 2541 mdk_rdev_t *rdev; 2542 struct disk_info *p = conf->disks + number; 2543 2544 print_raid5_conf(conf); 2545 rdev = p->rdev; 2546 if (rdev) { 2547 if (test_bit(In_sync, &rdev->flags) || 2548 atomic_read(&rdev->nr_pending)) { 2549 err = -EBUSY; 2550 goto abort; 2551 } 2552 p->rdev = NULL; 2553 synchronize_rcu(); 2554 if (atomic_read(&rdev->nr_pending)) { 2555 /* lost the race, try later */ 2556 err = -EBUSY; 2557 p->rdev = rdev; 2558 } 2559 } 2560 abort: 2561 2562 print_raid5_conf(conf); 2563 return err; 2564 } 2565 2566 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 2567 { 2568 raid5_conf_t *conf = mddev->private; 2569 int found = 0; 2570 int disk; 2571 struct disk_info *p; 2572 2573 if (mddev->degraded > 1) 2574 /* no point adding a device */ 2575 return 0; 2576 2577 /* 2578 * find the disk ... 2579 */ 2580 for (disk=0; disk < conf->raid_disks; disk++) 2581 if ((p=conf->disks + disk)->rdev == NULL) { 2582 clear_bit(In_sync, &rdev->flags); 2583 rdev->raid_disk = disk; 2584 found = 1; 2585 if (rdev->saved_raid_disk != disk) 2586 conf->fullsync = 1; 2587 rcu_assign_pointer(p->rdev, rdev); 2588 break; 2589 } 2590 print_raid5_conf(conf); 2591 return found; 2592 } 2593 2594 static int raid5_resize(mddev_t *mddev, sector_t sectors) 2595 { 2596 /* no resync is happening, and there is enough space 2597 * on all devices, so we can resize. 2598 * We need to make sure resync covers any new space. 2599 * If the array is shrinking we should possibly wait until 2600 * any io in the removed space completes, but it hardly seems 2601 * worth it. 
2602 */ 2603 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 2604 mddev->array_size = (sectors * (mddev->raid_disks-1))>>1; 2605 set_capacity(mddev->gendisk, mddev->array_size << 1); 2606 mddev->changed = 1; 2607 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 2608 mddev->recovery_cp = mddev->size << 1; 2609 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2610 } 2611 mddev->size = sectors /2; 2612 mddev->resync_max_sectors = sectors; 2613 return 0; 2614 } 2615 2616 #ifdef CONFIG_MD_RAID5_RESHAPE 2617 static int raid5_check_reshape(mddev_t *mddev) 2618 { 2619 raid5_conf_t *conf = mddev_to_conf(mddev); 2620 int err; 2621 2622 if (mddev->delta_disks < 0 || 2623 mddev->new_level != mddev->level) 2624 return -EINVAL; /* Cannot shrink array or change level yet */ 2625 if (mddev->delta_disks == 0) 2626 return 0; /* nothing to do */ 2627 2628 /* Can only proceed if there are plenty of stripe_heads. 2629 * We need a minimum of one full stripe, and for sensible progress 2630 * it is best to have about 4 times that. 2631 * If we require 4 times, then the default 256 4K stripe_heads will 2632 * allow for chunk sizes up to 256K, which is probably OK. 2633 * If the chunk size is greater, user-space should request more 2634 * stripe_heads first. 2635 */ 2636 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || 2637 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { 2638 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 2639 (mddev->chunk_size / STRIPE_SIZE)*4); 2640 return -ENOSPC; 2641 } 2642 2643 err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 2644 if (err) 2645 return err; 2646 2647 /* looks like we might be able to manage this */ 2648 return 0; 2649 } 2650 2651 static int raid5_start_reshape(mddev_t *mddev) 2652 { 2653 raid5_conf_t *conf = mddev_to_conf(mddev); 2654 mdk_rdev_t *rdev; 2655 struct list_head *rtmp; 2656 int spares = 0; 2657 int added_devices = 0; 2658 2659 if (mddev->degraded || 2660 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2661 return -EBUSY; 2662 2663 ITERATE_RDEV(mddev, rdev, rtmp) 2664 if (rdev->raid_disk < 0 && 2665 !test_bit(Faulty, &rdev->flags)) 2666 spares++; 2667 2668 if (spares < mddev->delta_disks-1) 2669 /* Not enough devices even to make a degraded array 2670 * of that size 2671 */ 2672 return -EINVAL; 2673 2674 atomic_set(&conf->reshape_stripes, 0); 2675 spin_lock_irq(&conf->device_lock); 2676 conf->previous_raid_disks = conf->raid_disks; 2677 conf->raid_disks += mddev->delta_disks; 2678 conf->expand_progress = 0; 2679 conf->expand_lo = 0; 2680 spin_unlock_irq(&conf->device_lock); 2681 2682 /* Add some new drives, as many as will fit. 2683 * We know there are enough to make the newly sized array work.
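* (any new slot that cannot be filled leaves the array degraded; that is
* acceptable because the spares check above guarantees at most one
* missing device, and mddev->degraded is recalculated from added_devices
* once the loop below finishes)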
2684 */ 2685 ITERATE_RDEV(mddev, rdev, rtmp) 2686 if (rdev->raid_disk < 0 && 2687 !test_bit(Faulty, &rdev->flags)) { 2688 if (raid5_add_disk(mddev, rdev)) { 2689 char nm[20]; 2690 set_bit(In_sync, &rdev->flags); 2691 conf->working_disks++; 2692 added_devices++; 2693 sprintf(nm, "rd%d", rdev->raid_disk); 2694 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 2695 } else 2696 break; 2697 } 2698 2699 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; 2700 mddev->raid_disks = conf->raid_disks; 2701 mddev->reshape_position = 0; 2702 mddev->sb_dirty = 1; 2703 2704 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2705 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2706 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 2707 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 2708 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 2709 "%s_reshape"); 2710 if (!mddev->sync_thread) { 2711 mddev->recovery = 0; 2712 spin_lock_irq(&conf->device_lock); 2713 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 2714 conf->expand_progress = MaxSector; 2715 spin_unlock_irq(&conf->device_lock); 2716 return -EAGAIN; 2717 } 2718 md_wakeup_thread(mddev->sync_thread); 2719 md_new_event(mddev); 2720 return 0; 2721 } 2722 #endif 2723 2724 static void end_reshape(raid5_conf_t *conf) 2725 { 2726 struct block_device *bdev; 2727 2728 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 2729 conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); 2730 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); 2731 conf->mddev->changed = 1; 2732 2733 bdev = bdget_disk(conf->mddev->gendisk, 0); 2734 if (bdev) { 2735 mutex_lock(&bdev->bd_inode->i_mutex); 2736 i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); 2737 mutex_unlock(&bdev->bd_inode->i_mutex); 2738 bdput(bdev); 2739 } 2740 spin_lock_irq(&conf->device_lock); 2741 conf->expand_progress = MaxSector; 2742 spin_unlock_irq(&conf->device_lock); 2743 conf->mddev->reshape_position = MaxSector; 2744 } 2745 } 2746 2747 static void raid5_quiesce(mddev_t *mddev, int state) 2748 { 2749 raid5_conf_t *conf = mddev_to_conf(mddev); 2750 2751 switch(state) { 2752 case 2: /* resume for a suspend */ 2753 wake_up(&conf->wait_for_overlap); 2754 break; 2755 2756 case 1: /* stop all writes */ 2757 spin_lock_irq(&conf->device_lock); 2758 conf->quiesce = 1; 2759 wait_event_lock_irq(conf->wait_for_stripe, 2760 atomic_read(&conf->active_stripes) == 0, 2761 conf->device_lock, /* nothing */); 2762 spin_unlock_irq(&conf->device_lock); 2763 break; 2764 2765 case 0: /* re-enable writes */ 2766 spin_lock_irq(&conf->device_lock); 2767 conf->quiesce = 0; 2768 wake_up(&conf->wait_for_stripe); 2769 wake_up(&conf->wait_for_overlap); 2770 spin_unlock_irq(&conf->device_lock); 2771 break; 2772 } 2773 } 2774 2775 static struct mdk_personality raid5_personality = 2776 { 2777 .name = "raid5", 2778 .level = 5, 2779 .owner = THIS_MODULE, 2780 .make_request = make_request, 2781 .run = run, 2782 .stop = stop, 2783 .status = status, 2784 .error_handler = error, 2785 .hot_add_disk = raid5_add_disk, 2786 .hot_remove_disk= raid5_remove_disk, 2787 .spare_active = raid5_spare_active, 2788 .sync_request = sync_request, 2789 .resize = raid5_resize, 2790 #ifdef CONFIG_MD_RAID5_RESHAPE 2791 .check_reshape = raid5_check_reshape, 2792 .start_reshape = raid5_start_reshape, 2793 #endif 2794 .quiesce = raid5_quiesce, 2795 }; 2796 2797 static struct mdk_personality raid4_personality = 2798 { 2799 .name = "raid4", 2800 .level = 4, 2801 .owner = 
THIS_MODULE, 2802 .make_request = make_request, 2803 .run = run, 2804 .stop = stop, 2805 .status = status, 2806 .error_handler = error, 2807 .hot_add_disk = raid5_add_disk, 2808 .hot_remove_disk= raid5_remove_disk, 2809 .spare_active = raid5_spare_active, 2810 .sync_request = sync_request, 2811 .resize = raid5_resize, 2812 .quiesce = raid5_quiesce, 2813 }; 2814 2815 static int __init raid5_init(void) 2816 { 2817 register_md_personality(&raid5_personality); 2818 register_md_personality(&raid4_personality); 2819 return 0; 2820 } 2821 2822 static void raid5_exit(void) 2823 { 2824 unregister_md_personality(&raid5_personality); 2825 unregister_md_personality(&raid4_personality); 2826 } 2827 2828 module_init(raid5_init); 2829 module_exit(raid5_exit); 2830 MODULE_LICENSE("GPL"); 2831 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 2832 MODULE_ALIAS("md-raid5"); 2833 MODULE_ALIAS("md-raid4"); 2834 MODULE_ALIAS("md-level-5"); 2835 MODULE_ALIAS("md-level-4"); 2836
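/*
 * Usage note: the stripe_cache_size and stripe_cache_active attributes
 * registered by run() appear under the array's md kobject, typically
 * /sys/block/<mdX>/md/; writing a value in the range 17..32768 to
 * stripe_cache_size grows or shrinks the cache one stripe_head at a time
 * (grow_one_stripe()/drop_one_stripe()), which is how user-space supplies
 * the extra stripe_heads that raid5_check_reshape() may require for large
 * chunk sizes.
 */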