/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
 * bitmapped intelligence in resync:
 *
 * - bitmap marked during normal i/o
 * - bitmap used to skip nondirty blocks during sync
 *
 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
 * - persistent bitmap code
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "dm-bio-list.h"
#include <linux/raid/raid1.h>
#include <linux/raid/bitmap.h>

#define DEBUG 0
#if DEBUG
#define PRINTK(x...) printk(x)
#else
#define PRINTK(x...)
#endif

/*
 * Number of guaranteed r1bios in case of extreme VM load:
 */
#define NR_RAID1_BIOS 256

static mdk_personality_t raid1_personality;

static void unplug_slaves(mddev_t *mddev);


static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct pool_info *pi = data;
	r1bio_t *r1_bio;
	int size = offsetof(r1bio_t, bios[pi->raid_disks]);

	/* allocate a r1bio with room for raid_disks entries in the bios array */
	r1_bio = kmalloc(size, gfp_flags);
	if (r1_bio)
		memset(r1_bio, 0, size);
	else
		unplug_slaves(pi->mddev);

	return r1_bio;
}

static void r1bio_pool_free(void *r1_bio, void *data)
{
	kfree(r1_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct pool_info *pi = data;
	struct page *page;
	r1bio_t *r1_bio;
	struct bio *bio;
	int i, j;

	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
	if (!r1_bio) {
		unplug_slaves(pi->mddev);
		return NULL;
	}

	/*
	 * Allocate bios : 1 for reading, n-1 for writing
	 */
	for (j = pi->raid_disks ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r1_bio->bios[j] = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to
	 * the first bio;
	 */
	bio = r1_bio->bios[0];
	for (i = 0; i < RESYNC_PAGES; i++) {
		page = alloc_page(gfp_flags);
		if (unlikely(!page))
			goto out_free_pages;

		bio->bi_io_vec[i].bv_page = page;
	}

	r1_bio->master_bio = NULL;

	return r1_bio;

out_free_pages:
	for ( ; i > 0 ; i--)
		__free_page(bio->bi_io_vec[i-1].bv_page);
out_free_bio:
	while ( ++j < pi->raid_disks )
		bio_put(r1_bio->bios[j]);
	r1bio_pool_free(r1_bio, data);
	return NULL;
}

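/*
 * Undo r1buf_pool_alloc: release the RESYNC_PAGES data pages that were
 * attached to bios[0] and drop the reference on every per-disk bio.
 */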
static void r1buf_pool_free(void *__r1_bio, void *data)
{
	struct pool_info *pi = data;
	int i;
	r1bio_t *r1bio = __r1_bio;
	struct bio *bio = r1bio->bios[0];

	for (i = 0; i < RESYNC_PAGES; i++) {
		__free_page(bio->bi_io_vec[i].bv_page);
		bio->bi_io_vec[i].bv_page = NULL;
	}
	for (i=0 ; i < pi->raid_disks; i++)
		bio_put(r1bio->bios[i]);

	r1bio_pool_free(r1bio, data);
}

static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
{
	int i;

	for (i = 0; i < conf->raid_disks; i++) {
		struct bio **bio = r1_bio->bios + i;
		if (*bio)
			bio_put(*bio);
		*bio = NULL;
	}
}

static inline void free_r1bio(r1bio_t *r1_bio)
{
	unsigned long flags;

	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);

	put_all_bios(conf, r1_bio);
	mempool_free(r1_bio, conf->r1bio_pool);
}

static inline void put_buf(r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);
	unsigned long flags;

	mempool_free(r1_bio, conf->r1buf_pool);

	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!conf->barrier)
		BUG();
	--conf->barrier;
	wake_up(&conf->wait_resume);
	wake_up(&conf->wait_idle);

	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);
}

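/*
 * Queue an r1bio for the raid1d thread and wake it; used to retry failed
 * reads on another mirror and to schedule the write phase of a resync.
 */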
static void reschedule_retry(r1bio_t *r1_bio)
{
	unsigned long flags;
	mddev_t *mddev = r1_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r1_bio->retry_list, &conf->retry_list);
	spin_unlock_irqrestore(&conf->device_lock, flags);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(r1bio_t *r1_bio)
{
	struct bio *bio = r1_bio->master_bio;

	/* if nobody has done the final endio yet, do it now */
	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
		PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
			(bio_data_dir(bio) == WRITE) ? "write" : "read",
			(unsigned long long) bio->bi_sector,
			(unsigned long long) bio->bi_sector +
				(bio->bi_size >> 9) - 1);

		bio_endio(bio, bio->bi_size,
			test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
	}
	free_r1bio(r1_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int disk, r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	conf->mirrors[disk].head_position =
		r1_bio->sector + (r1_bio->sectors);
}

static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror;
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	if (bio->bi_size)
		return 1;

	mirror = r1_bio->read_disk;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
	else
		/*
		 * Set R1BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R1BIO_Uptodate, &r1_bio->state);

	update_head_pos(mirror, r1_bio);

	/*
	 * we have only one bio on the read side
	 */
	if (uptodate)
		raid_end_bio_io(r1_bio);
	else {
		/*
		 * oops, read error:
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
		reschedule_retry(r1_bio);
	}

	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
	return 0;
}

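/*
 * Completion handler for a single mirrored write: mark the disk faulty on
 * error, account for write-behind targets, and end the master bio once
 * every copy has completed.
 */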
static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror, behind;
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	if (bio->bi_size)
		return 1;

	for (mirror = 0; mirror < conf->raid_disks; mirror++)
		if (r1_bio->bios[mirror] == bio)
			break;

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate) {
		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
		/* an I/O failed, we can't clear the bitmap */
		set_bit(R1BIO_Degraded, &r1_bio->state);
	} else
		/*
		 * Set R1BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R1BIO_Uptodate, &r1_bio->state);

	update_head_pos(mirror, r1_bio);

	behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
	if (behind) {
		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
			atomic_dec(&r1_bio->behind_remaining);

		/* In behind mode, we ACK the master bio once the I/O has safely
		 * reached all non-writemostly disks. Setting the Returned bit
		 * ensures that this gets done only once -- we don't ever want to
		 * return -EIO here, instead we'll wait */

		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
			/* Maybe we can return now */
			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
				struct bio *mbio = r1_bio->master_bio;
				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
				       (unsigned long long) mbio->bi_sector,
				       (unsigned long long) mbio->bi_sector +
				       (mbio->bi_size >> 9) - 1);
				bio_endio(mbio, mbio->bi_size, 0);
			}
		}
	}
	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r1_bio->remaining)) {
		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
			/* free extra copy of the data pages */
			int i = bio->bi_vcnt;
			while (i--)
				__free_page(bio->bi_io_vec[i].bv_page);
		}
		/* clear the bitmap if all writes complete successfully */
		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
				r1_bio->sectors,
				!test_bit(R1BIO_Degraded, &r1_bio->state),
				behind);
		md_write_end(r1_bio->mddev);
		raid_end_bio_io(r1_bio);
	}

	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
	return 0;
}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
	const unsigned long this_sector = r1_bio->sector;
	int new_disk = conf->last_used, disk = new_disk;
	int wonly_disk = -1;
	const int sectors = r1_bio->sectors;
	sector_t new_distance, current_distance;
	mdk_rdev_t *rdev;

	rcu_read_lock();
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
	 * We take the first readable disk when above the resync window.
	 */
 retry:
	if (conf->mddev->recovery_cp < MaxSector &&
	    (this_sector + sectors >= conf->next_resync)) {
		/* Choose the first operational device, for consistency */
		new_disk = 0;

		for (rdev = conf->mirrors[new_disk].rdev;
		     !rdev || !rdev->in_sync
			     || test_bit(WriteMostly, &rdev->flags);
		     rdev = conf->mirrors[++new_disk].rdev) {

			if (rdev && rdev->in_sync)
				wonly_disk = new_disk;

			if (new_disk == conf->raid_disks - 1) {
				new_disk = wonly_disk;
				break;
			}
		}
		goto rb_out;
	}


	/* make sure the disk is operational */
	for (rdev = conf->mirrors[new_disk].rdev;
	     !rdev || !rdev->in_sync ||
		     test_bit(WriteMostly, &rdev->flags);
	     rdev = conf->mirrors[new_disk].rdev) {

		if (rdev && rdev->in_sync)
			wonly_disk = new_disk;

		if (new_disk <= 0)
			new_disk = conf->raid_disks;
		new_disk--;
		if (new_disk == disk) {
			new_disk = wonly_disk;
			break;
		}
	}

	if (new_disk < 0)
		goto rb_out;

	disk = new_disk;
	/* now disk == new_disk == starting point for search */

	/*
	 * Don't change to another disk for sequential reads:
	 */
	if (conf->next_seq_sect == this_sector)
		goto rb_out;
	if (this_sector == conf->mirrors[new_disk].head_position)
		goto rb_out;

	current_distance = abs(this_sector - conf->mirrors[disk].head_position);

	/* Find the disk whose head is closest */

	do {
		if (disk <= 0)
			disk = conf->raid_disks;
		disk--;

		rdev = conf->mirrors[disk].rdev;

		if (!rdev ||
		    !rdev->in_sync ||
		    test_bit(WriteMostly, &rdev->flags))
			continue;

		if (!atomic_read(&rdev->nr_pending)) {
			new_disk = disk;
			break;
		}
		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			new_disk = disk;
		}
	} while (disk != conf->last_used);

 rb_out:


	if (new_disk >= 0) {
		rdev = conf->mirrors[new_disk].rdev;
		if (!rdev)
			goto retry;
		atomic_inc(&rdev->nr_pending);
		if (!rdev->in_sync) {
			/* cannot risk returning a device that failed
			 * before we inc'ed nr_pending
			 */
			atomic_dec(&rdev->nr_pending);
			goto retry;
		}
		conf->next_seq_sect = this_sector + sectors;
		conf->last_used = new_disk;
	}
	rcu_read_unlock();

	return new_disk;
}

static void unplug_slaves(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	rcu_read_lock();
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();

			if (r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
}

static void raid1_unplug(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;

	unplug_slaves(mddev);
	md_wakeup_thread(mddev->thread);
}

static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	int i, ret = 0;

	rcu_read_lock();
	for (i=0; i<mddev->raid_disks && ret == 0; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue = bdev_get_queue(bdev);

			if (!r_queue->issue_flush_fn)
				ret = -EOPNOTSUPP;
			else {
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
							      error_sector);
				rdev_dec_pending(rdev, mddev);
				rcu_read_lock();
			}
		}
	}
	rcu_read_unlock();
	return ret;
}

/*
 * Throttle resync depth, so that we get proper overlapping of
 * requests, but are still able to handle normal requests quickly.
 */
#define RESYNC_DEPTH 32

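/*
 * Raise the resync barrier for the window starting at 'sect'.  The first
 * caller waits for all pending normal I/O to drain; normal I/O submitted
 * while the barrier is up blocks in make_request() until put_buf() drops
 * it again.
 */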
static void device_barrier(conf_t *conf, sector_t sect)
{
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
			    conf->resync_lock, raid1_unplug(conf->mddev->queue));

	if (!conf->barrier++) {
		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
				    conf->resync_lock, raid1_unplug(conf->mddev->queue));
		if (conf->nr_pending)
			BUG();
	}
	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
	conf->next_resync = sect;
	spin_unlock_irq(&conf->resync_lock);
}

/* duplicate the data pages for behind I/O */
static struct page **alloc_behind_pages(struct bio *bio)
{
	int i;
	struct bio_vec *bvec;
	struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
				      GFP_NOIO);
	if (unlikely(!pages))
		goto do_sync_io;

	memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));

	bio_for_each_segment(bvec, bio, i) {
		pages[i] = alloc_page(GFP_NOIO);
		if (unlikely(!pages[i]))
			goto do_sync_io;
		memcpy(kmap(pages[i]) + bvec->bv_offset,
		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
		kunmap(pages[i]);
		kunmap(bvec->bv_page);
	}

	return pages;

do_sync_io:
	if (pages)
		for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
			__free_page(pages[i]);
	kfree(pages);
	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
	return NULL;
}

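/*
 * Main entry point for normal I/O.  Reads are balanced across mirrors by
 * read_balance() and submitted directly; writes are cloned once per
 * working mirror, optionally copied into private "behind" pages so the
 * master bio can be acknowledged before write-mostly devices finish, and
 * queued on pending_bio_list for raid1d to submit after the bitmap has
 * been updated.
 */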
static int make_request(request_queue_t *q, struct bio * bio)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	mirror_info_t *mirror;
	r1bio_t *r1_bio;
	struct bio *read_bio;
	int i, targets = 0, disks;
	mdk_rdev_t *rdev;
	struct bitmap *bitmap = mddev->bitmap;
	unsigned long flags;
	struct bio_list bl;
	struct page **behind_pages = NULL;

	if (unlikely(bio_barrier(bio))) {
		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
		return 0;
	}

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
	md_write_start(mddev, bio); /* wait on superblock update early */

	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);

	if (bio_data_dir(bio)==WRITE) {
		disk_stat_inc(mddev->gendisk, writes);
		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
	} else {
		disk_stat_inc(mddev->gendisk, reads);
		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
	}

	/*
	 * make_request() can abort the operation when READA is being
	 * used and no empty request is available.
	 *
	 */
	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);

	r1_bio->master_bio = bio;
	r1_bio->sectors = bio->bi_size >> 9;
	r1_bio->state = 0;
	r1_bio->mddev = mddev;
	r1_bio->sector = bio->bi_sector;

	if (bio_data_dir(bio) == READ) {
		/*
		 * read balancing logic:
		 */
		int rdisk = read_balance(conf, r1_bio);

		if (rdisk < 0) {
			/* couldn't find anywhere to read from */
			raid_end_bio_io(r1_bio);
			return 0;
		}
		mirror = conf->mirrors + rdisk;

		r1_bio->read_disk = rdisk;

		read_bio = bio_clone(bio, GFP_NOIO);

		r1_bio->bios[rdisk] = read_bio;

		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
		read_bio->bi_bdev = mirror->rdev->bdev;
		read_bio->bi_end_io = raid1_end_read_request;
		read_bio->bi_rw = READ;
		read_bio->bi_private = r1_bio;

		generic_make_request(read_bio);
		return 0;
	}

	/*
	 * WRITE:
	 */
	/* first select target devices under spinlock and
	 * inc refcount on their rdev. Record them by setting
	 * bios[x] to bio
	 */
	disks = conf->raid_disks;
#if 0
	{ static int first=1;
	if (first) printk("First Write sector %llu disks %d\n",
			  (unsigned long long)r1_bio->sector, disks);
	first = 0;
	}
#endif
	rcu_read_lock();
	for (i = 0; i < disks; i++) {
		if ((rdev=conf->mirrors[i].rdev) != NULL &&
		    !rdev->faulty) {
			atomic_inc(&rdev->nr_pending);
			if (rdev->faulty) {
				atomic_dec(&rdev->nr_pending);
				r1_bio->bios[i] = NULL;
			} else
				r1_bio->bios[i] = bio;
			targets++;
		} else
			r1_bio->bios[i] = NULL;
	}
	rcu_read_unlock();

	BUG_ON(targets == 0); /* we never fail the last device */

	if (targets < conf->raid_disks) {
		/* array is degraded, we will not clear the bitmap
		 * on I/O completion (see raid1_end_write_request) */
		set_bit(R1BIO_Degraded, &r1_bio->state);
	}

	/* do behind I/O ? */
	if (bitmap &&
	    atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
	    (behind_pages = alloc_behind_pages(bio)) != NULL)
		set_bit(R1BIO_BehindIO, &r1_bio->state);

	atomic_set(&r1_bio->remaining, 0);
	atomic_set(&r1_bio->behind_remaining, 0);

	bio_list_init(&bl);
	for (i = 0; i < disks; i++) {
		struct bio *mbio;
		if (!r1_bio->bios[i])
			continue;

		mbio = bio_clone(bio, GFP_NOIO);
		r1_bio->bios[i] = mbio;

		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
		mbio->bi_end_io = raid1_end_write_request;
		mbio->bi_rw = WRITE;
		mbio->bi_private = r1_bio;

		if (behind_pages) {
			struct bio_vec *bvec;
			int j;

			/* Yes, I really want the '__' version so that
			 * we clear any unused pointer in the io_vec, rather
			 * than leave them unchanged.  This is important
			 * because when we come to free the pages, we won't
			 * know the original bi_idx, so we just free
			 * them all
			 */
			__bio_for_each_segment(bvec, mbio, j, 0)
				bvec->bv_page = behind_pages[j];
			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
				atomic_inc(&r1_bio->behind_remaining);
		}

		atomic_inc(&r1_bio->remaining);

		bio_list_add(&bl, mbio);
	}
	kfree(behind_pages); /* the behind pages are attached to the bios now */

	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
			  test_bit(R1BIO_BehindIO, &r1_bio->state));
	spin_lock_irqsave(&conf->device_lock, flags);
	bio_list_merge(&conf->pending_bio_list, &bl);
	bio_list_init(&bl);

	blk_plug_device(mddev->queue);
	spin_unlock_irqrestore(&conf->device_lock, flags);

#if 0
	while ((bio = bio_list_pop(&bl)) != NULL)
		generic_make_request(bio);
#endif

	return 0;
}

static void status(struct seq_file *seq, mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
		   conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf(seq, "%s",
			   conf->mirrors[i].rdev &&
			   conf->mirrors[i].rdev->in_sync ? "U" : "_");
	seq_printf(seq, "]");
}


static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	conf_t *conf = mddev_to_conf(mddev);

	/*
	 * If it is not operational, then we have already marked it as dead
	 * else if it is the last working disk, ignore the error, let the
	 * next level up know.
	 * else mark the drive as failed
	 */
	if (rdev->in_sync
	    && conf->working_disks == 1)
		/*
		 * Don't fail the drive, act as though we were just a
		 * normal single drive
		 */
		return;
	if (rdev->in_sync) {
		mddev->degraded++;
		conf->working_disks--;
		/*
		 * if recovery is running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
	}
	rdev->in_sync = 0;
	rdev->faulty = 1;
	mddev->sb_dirty = 1;
	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
	       " Operation continuing on %d devices\n",
	       bdevname(rdev->bdev,b), conf->working_disks);
}

static void print_conf(conf_t *conf)
{
	int i;
	mirror_info_t *tmp;

	printk("RAID1 conf printout:\n");
	if (!conf) {
		printk("(!conf)\n");
		return;
	}
	printk(" --- wd:%d rd:%d\n", conf->working_disks,
	       conf->raid_disks);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->mirrors + i;
		if (tmp->rdev)
			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
			       i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
			       bdevname(tmp->rdev->bdev,b));
	}
}

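/*
 * Resync has finished (or been aborted): wait for any remaining barrier
 * to drop and release the resync buffer pool.
 */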
static void close_sync(conf_t *conf)
{
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
	spin_unlock_irq(&conf->resync_lock);

	if (conf->barrier) BUG();
	if (waitqueue_active(&conf->wait_idle)) BUG();

	mempool_destroy(conf->r1buf_pool);
	conf->r1buf_pool = NULL;
}

static int raid1_spare_active(mddev_t *mddev)
{
	int i;
	conf_t *conf = mddev->private;
	mirror_info_t *tmp;

	/*
	 * Find all failed disks within the RAID1 configuration
	 * and mark them readable
	 */
	for (i = 0; i < conf->raid_disks; i++) {
		tmp = conf->mirrors + i;
		if (tmp->rdev
		    && !tmp->rdev->faulty
		    && !tmp->rdev->in_sync) {
			conf->working_disks++;
			mddev->degraded--;
			tmp->rdev->in_sync = 1;
		}
	}

	print_conf(conf);
	return 0;
}


static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	conf_t *conf = mddev->private;
	int found = 0;
	int mirror = 0;
	mirror_info_t *p;

	if (rdev->saved_raid_disk >= 0 &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	for (mirror=0; mirror < mddev->raid_disks; mirror++)
		if ( !(p=conf->mirrors+mirror)->rdev) {

			blk_queue_stack_limits(mddev->queue,
					       rdev->bdev->bd_disk->queue);
			/* as we don't honour merge_bvec_fn, we must never risk
			 * violating it, so limit ->max_sector to one PAGE, as
			 * a one page request is never in violation.
			 */
			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

			p->head_position = 0;
			rdev->raid_disk = mirror;
			found = 1;
			if (rdev->saved_raid_disk != mirror)
				conf->fullsync = 1;
			p->rdev = rdev;
			break;
		}

	print_conf(conf);
	return found;
}

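/*
 * Remove a mirror from the array.  The rdev must be idle: clear the slot,
 * then synchronize_rcu() and re-check nr_pending to catch a request that
 * raced with the removal; if one did, put the rdev back and return -EBUSY.
 */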
static int raid1_remove_disk(mddev_t *mddev, int number)
{
	conf_t *conf = mddev->private;
	int err = 0;
	mdk_rdev_t *rdev;
	mirror_info_t *p = conf->mirrors+ number;

	print_conf(conf);
	rdev = p->rdev;
	if (rdev) {
		if (rdev->in_sync ||
		    atomic_read(&rdev->nr_pending)) {
			err = -EBUSY;
			goto abort;
		}
		p->rdev = NULL;
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			p->rdev = rdev;
		}
	}
abort:

	print_conf(conf);
	return err;
}


static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	if (bio->bi_size)
		return 1;

	if (r1_bio->bios[r1_bio->read_disk] != bio)
		BUG();
	update_head_pos(r1_bio->read_disk, r1_bio);
	/*
	 * we have read a block, now it needs to be re-written,
	 * or re-read if the read failed.
	 * We don't do much here, just schedule handling by raid1d
	 */
	if (!uptodate) {
		md_error(r1_bio->mddev,
			 conf->mirrors[r1_bio->read_disk].rdev);
	} else
		set_bit(R1BIO_Uptodate, &r1_bio->state);
	rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
	reschedule_retry(r1_bio);
	return 0;
}

static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	mddev_t *mddev = r1_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);
	int i;
	int mirror=0;

	if (bio->bi_size)
		return 1;

	for (i = 0; i < conf->raid_disks; i++)
		if (r1_bio->bios[i] == bio) {
			mirror = i;
			break;
		}
	if (!uptodate)
		md_error(mddev, conf->mirrors[mirror].rdev);

	update_head_pos(mirror, r1_bio);

	if (atomic_dec_and_test(&r1_bio->remaining)) {
		md_done_sync(mddev, r1_bio->sectors, uptodate);
		put_buf(r1_bio);
	}
	rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
	return 0;
}

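/*
 * Called from raid1d once the resync read has completed: write the data
 * read into bios[read_disk] out to every mirror that was set up with
 * end_sync_write, or abort the sync if the read itself failed.
 */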
static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;
	int disks = conf->raid_disks;
	struct bio *bio, *wbio;

	bio = r1_bio->bios[r1_bio->read_disk];

/*
	if (r1_bio->sector == 0) printk("First sync write starts\n");
*/
	/*
	 * schedule writes
	 */
	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
		/*
		 * There is no point trying a read-for-reconstruct as
		 * reconstruct is about to be aborted
		 */
		char b[BDEVNAME_SIZE];
		printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
		       " for block %llu\n",
		       bdevname(bio->bi_bdev,b),
		       (unsigned long long)r1_bio->sector);
		md_done_sync(mddev, r1_bio->sectors, 0);
		put_buf(r1_bio);
		return;
	}

	atomic_set(&r1_bio->remaining, 1);
	for (i = 0; i < disks ; i++) {
		wbio = r1_bio->bios[i];
		if (wbio->bi_end_io != end_sync_write)
			continue;

		atomic_inc(&conf->mirrors[i].rdev->nr_pending);
		atomic_inc(&r1_bio->remaining);
		md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);

		generic_make_request(wbio);
	}

	if (atomic_dec_and_test(&r1_bio->remaining)) {
		/* if we're here, all write(s) have completed, so clean up */
		md_done_sync(mddev, r1_bio->sectors, 1);
		put_buf(r1_bio);
	}
}

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */

static void raid1d(mddev_t *mddev)
{
	r1bio_t *r1_bio;
	struct bio *bio;
	unsigned long flags;
	conf_t *conf = mddev_to_conf(mddev);
	struct list_head *head = &conf->retry_list;
	int unplug=0;
	mdk_rdev_t *rdev;

	md_check_recovery(mddev);

	for (;;) {
		char b[BDEVNAME_SIZE];
		spin_lock_irqsave(&conf->device_lock, flags);

		if (conf->pending_bio_list.head) {
			bio = bio_list_get(&conf->pending_bio_list);
			blk_remove_plug(mddev->queue);
			spin_unlock_irqrestore(&conf->device_lock, flags);
			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
			if (bitmap_unplug(mddev->bitmap) != 0)
				printk("%s: bitmap file write failed!\n", mdname(mddev));

			while (bio) { /* submit pending writes */
				struct bio *next = bio->bi_next;
				bio->bi_next = NULL;
				generic_make_request(bio);
				bio = next;
			}
			unplug = 1;

			continue;
		}

		if (list_empty(head))
			break;
		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
		list_del(head->prev);
		spin_unlock_irqrestore(&conf->device_lock, flags);

		mddev = r1_bio->mddev;
		conf = mddev_to_conf(mddev);
		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
			sync_request_write(mddev, r1_bio);
			unplug = 1;
		} else {
			int disk;
			bio = r1_bio->bios[r1_bio->read_disk];
			if ((disk=read_balance(conf, r1_bio)) == -1) {
				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
				       " read error for block %llu\n",
				       bdevname(bio->bi_bdev,b),
				       (unsigned long long)r1_bio->sector);
				raid_end_bio_io(r1_bio);
			} else {
				r1_bio->bios[r1_bio->read_disk] = NULL;
				r1_bio->read_disk = disk;
				bio_put(bio);
				bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
				r1_bio->bios[r1_bio->read_disk] = bio;
				rdev = conf->mirrors[disk].rdev;
				if (printk_ratelimit())
					printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
					       " another mirror\n",
					       bdevname(rdev->bdev,b),
					       (unsigned long long)r1_bio->sector);
				bio->bi_sector = r1_bio->sector + rdev->data_offset;
				bio->bi_bdev = rdev->bdev;
				bio->bi_end_io = raid1_end_read_request;
				bio->bi_rw = READ;
				bio->bi_private = r1_bio;
				unplug = 1;
				generic_make_request(bio);
			}
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
	if (unplug)
		unplug_slaves(mddev);
}


static int init_resync(conf_t *conf)
{
	int buffs;

	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
	if (conf->r1buf_pool)
		BUG();
	conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
					  conf->poolinfo);
	if (!conf->r1buf_pool)
		return -ENOMEM;
	conf->next_resync = 0;
	return 0;
}

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 */

static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
{
	conf_t *conf = mddev_to_conf(mddev);
	mirror_info_t *mirror;
	r1bio_t *r1_bio;
	struct bio *bio;
	sector_t max_sector, nr_sectors;
	int disk;
	int i;
	int wonly;
	int write_targets = 0;
	int sync_blocks;
	int still_degraded = 0;

	if (!conf->r1buf_pool)
	{
/*
		printk("sync start - bitmap %p\n", mddev->bitmap);
*/
		if (init_resync(conf))
			return 0;
	}

	max_sector = mddev->size << 1;
	if (sector_nr >= max_sector) {
		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunk (there will
		 * only be one in raid1 resync).
		 * We can find the current address in mddev->curr_resync
		 */
		if (mddev->curr_resync < max_sector) /* aborted */
			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
					&sync_blocks, 1);
		else /* completed sync */
			conf->fullsync = 0;

		bitmap_close_sync(mddev->bitmap);
		close_sync(conf);
		return 0;
	}

	/* before building a request, check if we can skip these blocks..
	 * This call to bitmap_start_sync doesn't actually record anything
	 */
	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
	    !conf->fullsync) {
		/* We can skip this block, and probably several more */
		*skipped = 1;
		return sync_blocks;
	}
	/*
	 * If there is non-resync activity waiting for us then
	 * put in a delay to throttle resync.
	 */
	if (!go_faster && waitqueue_active(&conf->wait_resume))
		msleep_interruptible(1000);
	device_barrier(conf, sector_nr + RESYNC_SECTORS);

	/*
	 * If reconstructing, and >1 working disc,
	 * could dedicate one to rebuild and others to
	 * service read requests ..
	 */
	disk = conf->last_used;
	/* make sure disk is operational */
	wonly = disk;
	while (conf->mirrors[disk].rdev == NULL ||
	       !conf->mirrors[disk].rdev->in_sync ||
	       test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
		) {
		if (conf->mirrors[disk].rdev &&
		    conf->mirrors[disk].rdev->in_sync)
			wonly = disk;
		if (disk <= 0)
			disk = conf->raid_disks;
		disk--;
		if (disk == conf->last_used) {
			disk = wonly;
			break;
		}
	}
	conf->last_used = disk;
	atomic_inc(&conf->mirrors[disk].rdev->nr_pending);


	mirror = conf->mirrors + disk;

	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);

	spin_lock_irq(&conf->resync_lock);
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);

	r1_bio->mddev = mddev;
	r1_bio->sector = sector_nr;
	r1_bio->state = 0;
	set_bit(R1BIO_IsSync, &r1_bio->state);
	r1_bio->read_disk = disk;

	for (i=0; i < conf->raid_disks; i++) {
		bio = r1_bio->bios[i];

		/* take from bio_init */
		bio->bi_next = NULL;
		bio->bi_flags |= 1 << BIO_UPTODATE;
		bio->bi_rw = 0;
		bio->bi_vcnt = 0;
		bio->bi_idx = 0;
		bio->bi_phys_segments = 0;
		bio->bi_hw_segments = 0;
		bio->bi_size = 0;
		bio->bi_end_io = NULL;
		bio->bi_private = NULL;

		if (i == disk) {
			bio->bi_rw = READ;
			bio->bi_end_io = end_sync_read;
		} else if (conf->mirrors[i].rdev == NULL ||
			   conf->mirrors[i].rdev->faulty) {
			still_degraded = 1;
			continue;
		} else if (!conf->mirrors[i].rdev->in_sync ||
			   sector_nr + RESYNC_SECTORS > mddev->recovery_cp) {
			bio->bi_rw = WRITE;
			bio->bi_end_io = end_sync_write;
			write_targets ++;
		} else
			/* no need to read or write here */
			continue;
		bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset;
		bio->bi_bdev = conf->mirrors[i].rdev->bdev;
		bio->bi_private = r1_bio;
	}

	if (write_targets == 0) {
		/* There is nowhere to write, so all non-sync
		 * drives must be failed - so we are finished
		 */
		sector_t rv = max_sector - sector_nr;
		*skipped = 1;
		put_buf(r1_bio);
		rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
		return rv;
	}

	nr_sectors = 0;
	sync_blocks = 0;
	do {
		struct page *page;
		int len = PAGE_SIZE;
		if (sector_nr + (len>>9) > max_sector)
			len = (max_sector - sector_nr) << 9;
		if (len == 0)
			break;
		if (sync_blocks == 0) {
			if (!bitmap_start_sync(mddev->bitmap, sector_nr,
					       &sync_blocks, still_degraded) &&
			    !conf->fullsync)
				break;
			if (sync_blocks < (PAGE_SIZE>>9))
				BUG();
			if (len > (sync_blocks<<9))
				len = sync_blocks<<9;
		}

		for (i=0 ; i < conf->raid_disks; i++) {
			bio = r1_bio->bios[i];
			if (bio->bi_end_io) {
				page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page;
				if (bio_add_page(bio, page, len, 0) == 0) {
					/* stop here */
					r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page;
					while (i > 0) {
						i--;
						bio = r1_bio->bios[i];
						if (bio->bi_end_io==NULL)
							continue;
						/* remove last page from this bio */
						bio->bi_vcnt--;
						bio->bi_size -= len;
						bio->bi_flags &= ~(1<< BIO_SEG_VALID);
					}
					goto bio_full;
				}
			}
		}
		nr_sectors += len>>9;
		sector_nr += len>>9;
		sync_blocks -= (len>>9);
	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
 bio_full:
	bio = r1_bio->bios[disk];
	r1_bio->sectors = nr_sectors;

	md_sync_acct(mirror->rdev->bdev, nr_sectors);

	generic_make_request(bio);

	return nr_sectors;
}

static int run(mddev_t *mddev)
{
	conf_t *conf;
	int i, j, disk_idx;
	mirror_info_t *disk;
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	if (mddev->level != 1) {
		printk("raid1: %s: raid level not set to mirroring (%d)\n",
		       mdname(mddev), mddev->level);
		goto out;
	}
	/*
	 * copy the already verified devices into our private RAID1
	 * bookkeeping area. [whatever we allocate in run(),
	 * should be freed in stop()]
	 */
	conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
	mddev->private = conf;
	if (!conf)
		goto out_no_mem;

	memset(conf, 0, sizeof(*conf));
	conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
				GFP_KERNEL);
	if (!conf->mirrors)
		goto out_no_mem;

	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);

	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
	if (!conf->poolinfo)
		goto out_no_mem;
	conf->poolinfo->mddev = mddev;
	conf->poolinfo->raid_disks = mddev->raid_disks;
	conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
					  r1bio_pool_free,
					  conf->poolinfo);
	if (!conf->r1bio_pool)
		goto out_no_mem;

	ITERATE_RDEV(mddev, rdev, tmp) {
		disk_idx = rdev->raid_disk;
		if (disk_idx >= mddev->raid_disks
		    || disk_idx < 0)
			continue;
		disk = conf->mirrors + disk_idx;

		disk->rdev = rdev;

		blk_queue_stack_limits(mddev->queue,
				       rdev->bdev->bd_disk->queue);
		/* as we don't honour merge_bvec_fn, we must never risk
		 * violating it, so limit ->max_sector to one PAGE, as
		 * a one page request is never in violation.
		 */
		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

		disk->head_position = 0;
		if (!rdev->faulty && rdev->in_sync)
			conf->working_disks++;
	}
	conf->raid_disks = mddev->raid_disks;
	conf->mddev = mddev;
	spin_lock_init(&conf->device_lock);
	INIT_LIST_HEAD(&conf->retry_list);
	if (conf->working_disks == 1)
		mddev->recovery_cp = MaxSector;

	spin_lock_init(&conf->resync_lock);
	init_waitqueue_head(&conf->wait_idle);
	init_waitqueue_head(&conf->wait_resume);

	bio_list_init(&conf->pending_bio_list);
	bio_list_init(&conf->flushing_bio_list);

	if (!conf->working_disks) {
		printk(KERN_ERR "raid1: no operational mirrors for %s\n",
		       mdname(mddev));
		goto out_free_conf;
	}

	mddev->degraded = 0;
	for (i = 0; i < conf->raid_disks; i++) {

		disk = conf->mirrors + i;

		if (!disk->rdev) {
			disk->head_position = 0;
			mddev->degraded++;
		}
	}

	/*
	 * find the first working one and use it as a starting point
	 * for read balancing.
	 */
	for (j = 0; j < conf->raid_disks &&
		     (!conf->mirrors[j].rdev ||
		      !conf->mirrors[j].rdev->in_sync) ; j++)
		/* nothing */;
	conf->last_used = j;


	mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
	if (!mddev->thread) {
		printk(KERN_ERR
		       "raid1: couldn't allocate thread for %s\n",
		       mdname(mddev));
		goto out_free_conf;
	}
	if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;

	printk(KERN_INFO
	       "raid1: raid set %s active with %d out of %d mirrors\n",
	       mdname(mddev), mddev->raid_disks - mddev->degraded,
	       mddev->raid_disks);
	/*
	 * Ok, everything is just fine now
	 */
	mddev->array_size = mddev->size;

	mddev->queue->unplug_fn = raid1_unplug;
	mddev->queue->issue_flush_fn = raid1_issue_flush;

	return 0;

out_no_mem:
	printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
	       mdname(mddev));

out_free_conf:
	if (conf) {
		if (conf->r1bio_pool)
			mempool_destroy(conf->r1bio_pool);
		kfree(conf->mirrors);
		kfree(conf->poolinfo);
		kfree(conf);
		mddev->private = NULL;
	}
out:
	return -EIO;
}

static int stop(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	struct bitmap *bitmap = mddev->bitmap;
	int behind_wait = 0;

	/* wait for behind writes to complete */
	while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
		behind_wait++;
		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(HZ); /* wait a second */
		/* need to kick something here to make sure I/O goes? */
	}

	md_unregister_thread(mddev->thread);
	mddev->thread = NULL;
	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
	if (conf->r1bio_pool)
		mempool_destroy(conf->r1bio_pool);
	kfree(conf->mirrors);
	kfree(conf->poolinfo);
	kfree(conf);
	mddev->private = NULL;
	return 0;
}

static int raid1_resize(mddev_t *mddev, sector_t sectors)
{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
	mddev->array_size = sectors>>1;
	set_capacity(mddev->gendisk, mddev->array_size << 1);
	mddev->changed = 1;
	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
		mddev->recovery_cp = mddev->size << 1;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	mddev->size = mddev->array_size;
	mddev->resync_max_sectors = sectors;
	return 0;
}

static int raid1_reshape(mddev_t *mddev, int raid_disks)
{
	/* We need to:
	 * 1/ resize the r1bio_pool
	 * 2/ resize conf->mirrors
	 *
	 * We allocate a new r1bio_pool if we can.
	 * Then raise a device barrier and wait until all IO stops.
	 * Then resize conf->mirrors and swap in the new r1bio pool.
	 *
	 * At the same time, we "pack" the devices so that all the missing
	 * devices have the higher raid_disk numbers.
	 */
	mempool_t *newpool, *oldpool;
	struct pool_info *newpoolinfo;
	mirror_info_t *newmirrors;
	conf_t *conf = mddev_to_conf(mddev);
	int cnt;

	int d, d2;

	if (raid_disks < conf->raid_disks) {
		cnt=0;
		for (d= 0; d < conf->raid_disks; d++)
			if (conf->mirrors[d].rdev)
				cnt++;
		if (cnt > raid_disks)
			return -EBUSY;
	}

	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
	if (!newpoolinfo)
		return -ENOMEM;
	newpoolinfo->mddev = mddev;
	newpoolinfo->raid_disks = raid_disks;

	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
				 r1bio_pool_free, newpoolinfo);
	if (!newpool) {
		kfree(newpoolinfo);
		return -ENOMEM;
	}
	newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
	if (!newmirrors) {
		kfree(newpoolinfo);
		mempool_destroy(newpool);
		return -ENOMEM;
	}
	memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);

	spin_lock_irq(&conf->resync_lock);
	conf->barrier++;
	wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
			    conf->resync_lock, raid1_unplug(mddev->queue));
	spin_unlock_irq(&conf->resync_lock);

	/* ok, everything is stopped */
	oldpool = conf->r1bio_pool;
	conf->r1bio_pool = newpool;

	for (d=d2=0; d < conf->raid_disks; d++)
		if (conf->mirrors[d].rdev) {
			conf->mirrors[d].rdev->raid_disk = d2;
			newmirrors[d2++].rdev = conf->mirrors[d].rdev;
		}
	kfree(conf->mirrors);
	conf->mirrors = newmirrors;
	kfree(conf->poolinfo);
	conf->poolinfo = newpoolinfo;

	mddev->degraded += (raid_disks - conf->raid_disks);
	conf->raid_disks = mddev->raid_disks = raid_disks;

	conf->last_used = 0; /* just make sure it is in-range */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	spin_unlock_irq(&conf->resync_lock);
	wake_up(&conf->wait_resume);
	wake_up(&conf->wait_idle);


	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	mempool_destroy(oldpool);
	return 0;
}

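/*
 * Quiesce the array: state 1 raises the barrier and waits for all pending
 * normal I/O to drain, state 0 drops the barrier again and wakes any
 * waiters.  The bitmap daemon timeout is re-armed on the way out.
 */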
static void raid1_quiesce(mddev_t *mddev, int state)
{
	conf_t *conf = mddev_to_conf(mddev);

	switch(state) {
	case 1:
		spin_lock_irq(&conf->resync_lock);
		conf->barrier++;
		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
				    conf->resync_lock, raid1_unplug(mddev->queue));
		spin_unlock_irq(&conf->resync_lock);
		break;
	case 0:
		spin_lock_irq(&conf->resync_lock);
		conf->barrier--;
		spin_unlock_irq(&conf->resync_lock);
		wake_up(&conf->wait_resume);
		wake_up(&conf->wait_idle);
		break;
	}
	if (mddev->thread) {
		if (mddev->bitmap)
			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
		else
			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
		md_wakeup_thread(mddev->thread);
	}
}


static mdk_personality_t raid1_personality =
{
	.name		= "raid1",
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid1_add_disk,
	.hot_remove_disk= raid1_remove_disk,
	.spare_active	= raid1_spare_active,
	.sync_request	= sync_request,
	.resize		= raid1_resize,
	.reshape	= raid1_reshape,
	.quiesce	= raid1_quiesce,
};

static int __init raid_init(void)
{
	return register_md_personality(RAID1, &raid1_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(RAID1);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-3"); /* RAID1 */