/*
 * Copyright (C) 2003 Sistina Software Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"
#include "dm-io.h"
#include "dm-log.h"
#include "kcopyd.h"

#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

#define DM_MSG_PREFIX "raid1"

static struct workqueue_struct *_kmirrord_wq;
static struct work_struct _kmirrord_work;

static inline void wake(void)
{
        queue_work(_kmirrord_wq, &_kmirrord_work);
}

/*-----------------------------------------------------------------
 * Region hash
 *
 * The mirror splits itself up into discrete regions.  Each
 * region can be in one of three states: clean, dirty,
 * nosync.  There is no need to put clean regions in the hash.
 *
 * In addition to being present in the hash table a region _may_
 * be present on one of three lists.
 *
 *   clean_regions: Regions on this list have no io pending to
 *   them, they are in sync, we are no longer interested in them,
 *   they are dull.  rh_update_states() will remove them from the
 *   hash table.
 *
 *   quiesced_regions: These regions have been spun down, ready
 *   for recovery.  rh_recovery_start() will remove regions from
 *   this list and hand them to kmirrord, which will schedule the
 *   recovery io with kcopyd.
 *
 *   recovered_regions: Regions that kcopyd has successfully
 *   recovered.  rh_update_states() will now schedule any delayed
 *   io, up the recovery_count, and remove the region from the
 *   hash.
 *
 * There are 2 locks:
 *   A rw spin lock 'hash_lock' protects just the hash table,
 *   this is never held in write mode from interrupt context,
 *   which I believe means that we only have to disable irqs when
 *   doing a write lock.
 *
 *   An ordinary spin lock 'region_lock' that protects the three
 *   lists in the region_hash, with the 'state', 'list' and
 *   'bhs_delayed' fields of the regions.  This is used from irq
 *   context, so all other uses will have to suspend local irqs.
 *---------------------------------------------------------------*/
struct mirror_set;
struct region_hash {
        struct mirror_set *ms;
        uint32_t region_size;
        unsigned region_shift;

        /* holds persistent region state */
        struct dirty_log *log;

        /* hash table */
        rwlock_t hash_lock;
        mempool_t *region_pool;
        unsigned int mask;
        unsigned int nr_buckets;
        struct list_head *buckets;

        spinlock_t region_lock;
        struct semaphore recovery_count;
        struct list_head clean_regions;
        struct list_head quiesced_regions;
        struct list_head recovered_regions;
};

enum {
        RH_CLEAN,
        RH_DIRTY,
        RH_NOSYNC,
        RH_RECOVERING
};

struct region {
        struct region_hash *rh;        /* FIXME: can we get rid of this ? */
        region_t key;
        int state;

        struct list_head hash_list;
        struct list_head list;

        atomic_t pending;
        struct bio_list delayed_bios;
};

/*-----------------------------------------------------------------
 * Mirror set structures.
 *---------------------------------------------------------------*/
struct mirror {
        atomic_t error_count;
        struct dm_dev *dev;
        sector_t offset;
};

struct mirror_set {
        struct dm_target *ti;
        struct list_head list;
        struct region_hash rh;
        struct kcopyd_client *kcopyd_client;

        spinlock_t lock;        /* protects the next two lists */
        struct bio_list reads;
        struct bio_list writes;

        /* recovery */
        region_t nr_regions;
        int in_sync;

        struct mirror *default_mirror;        /* Default mirror */

        unsigned int nr_mirrors;
        struct mirror mirror[0];
};

/*
 * Conversion fns
 */
static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
{
        return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
}

static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
{
        return region << rh->region_shift;
}
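
/*
 * Example, with illustrative numbers: for a region_size of, say,
 * 1024 sectors (512KB), region_shift is ffs(1024) - 1 = 10, so a bio
 * at sector ti->begin + 5000 falls in region 4, and region 4 begins
 * at sector 4096 relative to each mirror's offset.
 */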

/* FIXME move this */
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);

#define MIN_REGIONS 64
#define MAX_RECOVERY 1
static int rh_init(struct region_hash *rh, struct mirror_set *ms,
                   struct dirty_log *log, uint32_t region_size,
                   region_t nr_regions)
{
        unsigned int nr_buckets, max_buckets;
        size_t i;

        /*
         * Calculate a suitable number of buckets for our hash
         * table.
         */
        max_buckets = nr_regions >> 6;
        for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
                ;
        nr_buckets >>= 1;

        rh->ms = ms;
        rh->log = log;
        rh->region_size = region_size;
        rh->region_shift = ffs(region_size) - 1;
        rwlock_init(&rh->hash_lock);
        rh->mask = nr_buckets - 1;
        rh->nr_buckets = nr_buckets;

        rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
        if (!rh->buckets) {
                DMERR("unable to allocate region hash memory");
                return -ENOMEM;
        }

        for (i = 0; i < nr_buckets; i++)
                INIT_LIST_HEAD(rh->buckets + i);

        spin_lock_init(&rh->region_lock);
        sema_init(&rh->recovery_count, 0);
        INIT_LIST_HEAD(&rh->clean_regions);
        INIT_LIST_HEAD(&rh->quiesced_regions);
        INIT_LIST_HEAD(&rh->recovered_regions);

        rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
                                                      sizeof(struct region));
        if (!rh->region_pool) {
                vfree(rh->buckets);
                rh->buckets = NULL;
                return -ENOMEM;
        }

        return 0;
}

static void rh_exit(struct region_hash *rh)
{
        unsigned int h;
        struct region *reg, *nreg;

        BUG_ON(!list_empty(&rh->quiesced_regions));
        for (h = 0; h < rh->nr_buckets; h++) {
                list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
                        BUG_ON(atomic_read(&reg->pending));
                        mempool_free(reg, rh->region_pool);
                }
        }

        if (rh->log)
                dm_destroy_dirty_log(rh->log);
        if (rh->region_pool)
                mempool_destroy(rh->region_pool);
        vfree(rh->buckets);
}

#define RH_HASH_MULT 2654435387U

static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
{
        return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
}
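
/*
 * The __rh_* helpers below expect the caller to hold hash_lock:
 * the read lock is enough for a lookup, while inserting a new
 * region needs the write lock (see __rh_alloc()).
 */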
static struct region *__rh_lookup(struct region_hash *rh, region_t region)
{
        struct region *reg;

        list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
                if (reg->key == region)
                        return reg;

        return NULL;
}

static void __rh_insert(struct region_hash *rh, struct region *reg)
{
        unsigned int h = rh_hash(rh, reg->key);
        list_add(&reg->hash_list, rh->buckets + h);
}

static struct region *__rh_alloc(struct region_hash *rh, region_t region)
{
        struct region *reg, *nreg;

        read_unlock(&rh->hash_lock);
        nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
        nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
                RH_CLEAN : RH_NOSYNC;
        nreg->rh = rh;
        nreg->key = region;

        INIT_LIST_HEAD(&nreg->list);

        atomic_set(&nreg->pending, 0);
        bio_list_init(&nreg->delayed_bios);
        write_lock_irq(&rh->hash_lock);

        reg = __rh_lookup(rh, region);
        if (reg)
                /* we lost the race */
                mempool_free(nreg, rh->region_pool);

        else {
                __rh_insert(rh, nreg);
                if (nreg->state == RH_CLEAN) {
                        spin_lock(&rh->region_lock);
                        list_add(&nreg->list, &rh->clean_regions);
                        spin_unlock(&rh->region_lock);
                }
                reg = nreg;
        }
        write_unlock_irq(&rh->hash_lock);
        read_lock(&rh->hash_lock);

        return reg;
}

static inline struct region *__rh_find(struct region_hash *rh, region_t region)
{
        struct region *reg;

        reg = __rh_lookup(rh, region);
        if (!reg)
                reg = __rh_alloc(rh, region);

        return reg;
}

static int rh_state(struct region_hash *rh, region_t region, int may_block)
{
        int r;
        struct region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_lookup(rh, region);
        read_unlock(&rh->hash_lock);

        if (reg)
                return reg->state;

        /*
         * The region wasn't in the hash, so we fall back to the
         * dirty log.
         */
        r = rh->log->type->in_sync(rh->log, region, may_block);

        /*
         * Any error from the dirty log (eg. -EWOULDBLOCK) gets
         * taken as a RH_NOSYNC
         */
        return r == 1 ? RH_CLEAN : RH_NOSYNC;
}

static inline int rh_in_sync(struct region_hash *rh,
                             region_t region, int may_block)
{
        int state = rh_state(rh, region, may_block);
        return state == RH_CLEAN || state == RH_DIRTY;
}

static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
{
        struct bio *bio;

        while ((bio = bio_list_pop(bio_list))) {
                queue_bio(ms, bio, WRITE);
        }
}
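
/*
 * Called from the work function: fold the clean and recovered lists
 * back into the dirty log, drop the corresponding hash entries and
 * re-queue any writes that were delayed while a region recovered.
 */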
static void rh_update_states(struct region_hash *rh)
{
        struct region *reg, *next;

        LIST_HEAD(clean);
        LIST_HEAD(recovered);

        /*
         * Quickly grab the lists.
         */
        write_lock_irq(&rh->hash_lock);
        spin_lock(&rh->region_lock);
        if (!list_empty(&rh->clean_regions)) {
                list_splice(&rh->clean_regions, &clean);
                INIT_LIST_HEAD(&rh->clean_regions);

                list_for_each_entry (reg, &clean, list) {
                        rh->log->type->clear_region(rh->log, reg->key);
                        list_del(&reg->hash_list);
                }
        }

        if (!list_empty(&rh->recovered_regions)) {
                list_splice(&rh->recovered_regions, &recovered);
                INIT_LIST_HEAD(&rh->recovered_regions);

                list_for_each_entry (reg, &recovered, list)
                        list_del(&reg->hash_list);
        }
        spin_unlock(&rh->region_lock);
        write_unlock_irq(&rh->hash_lock);

        /*
         * All the regions on the recovered and clean lists have
         * now been pulled out of the system, so no need to do
         * any more locking.
         */
        list_for_each_entry_safe (reg, next, &recovered, list) {
                rh->log->type->clear_region(rh->log, reg->key);
                rh->log->type->complete_resync_work(rh->log, reg->key, 1);
                dispatch_bios(rh->ms, &reg->delayed_bios);
                up(&rh->recovery_count);
                mempool_free(reg, rh->region_pool);
        }

        if (!list_empty(&recovered))
                rh->log->type->flush(rh->log);

        list_for_each_entry_safe (reg, next, &clean, list)
                mempool_free(reg, rh->region_pool);
}

static void rh_inc(struct region_hash *rh, region_t region)
{
        struct region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, region);

        spin_lock_irq(&rh->region_lock);
        atomic_inc(&reg->pending);

        if (reg->state == RH_CLEAN) {
                reg->state = RH_DIRTY;
                list_del_init(&reg->list);        /* take off the clean list */
                spin_unlock_irq(&rh->region_lock);

                rh->log->type->mark_region(rh->log, reg->key);
        } else
                spin_unlock_irq(&rh->region_lock);

        read_unlock(&rh->hash_lock);
}

static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
{
        struct bio *bio;

        for (bio = bios->head; bio; bio = bio->bi_next)
                rh_inc(rh, bio_to_region(rh, bio));
}

static void rh_dec(struct region_hash *rh, region_t region)
{
        unsigned long flags;
        struct region *reg;
        int should_wake = 0;

        read_lock(&rh->hash_lock);
        reg = __rh_lookup(rh, region);
        read_unlock(&rh->hash_lock);

        spin_lock_irqsave(&rh->region_lock, flags);
        if (atomic_dec_and_test(&reg->pending)) {
                /*
                 * There is no pending I/O for this region.
                 * We can move the region to the corresponding list
                 * for the next action.
                 * At this point, the region is not yet connected to any list.
                 *
                 * If the state is RH_NOSYNC, the region should be
                 * kept off the clean list.
                 * The hash entry for RH_NOSYNC will remain in memory
                 * until the region is recovered or the map is reloaded.
                 */

                /* do nothing for RH_NOSYNC */
                if (reg->state == RH_RECOVERING) {
                        list_add_tail(&reg->list, &rh->quiesced_regions);
                } else if (reg->state == RH_DIRTY) {
                        reg->state = RH_CLEAN;
                        list_add(&reg->list, &rh->clean_regions);
                }
                should_wake = 1;
        }
        spin_unlock_irqrestore(&rh->region_lock, flags);

        if (should_wake)
                wake();
}

/*
 * Starts quiescing a region in preparation for recovery.
 */
static int __rh_recovery_prepare(struct region_hash *rh)
{
        int r;
        struct region *reg;
        region_t region;

        /*
         * Ask the dirty log what's next.
         */
        r = rh->log->type->get_resync_work(rh->log, &region);
        if (r <= 0)
                return r;

        /*
         * Get this region, and start it quiescing by setting the
         * recovering flag.
         */
        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, region);
        read_unlock(&rh->hash_lock);

        spin_lock_irq(&rh->region_lock);
        reg->state = RH_RECOVERING;

        /* Already quiesced ? */
        if (atomic_read(&reg->pending))
                list_del_init(&reg->list);
        else
                list_move(&reg->list, &rh->quiesced_regions);

        spin_unlock_irq(&rh->region_lock);

        return 1;
}

static void rh_recovery_prepare(struct region_hash *rh)
{
        while (!down_trylock(&rh->recovery_count))
                if (__rh_recovery_prepare(rh) <= 0) {
                        up(&rh->recovery_count);
                        break;
                }
}
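
/*
 * A region is recovered in stages:
 *   rh_recovery_prepare() marks it RH_RECOVERING; once its pending
 *   io drains, rh_dec() moves it onto quiesced_regions.
 *   rh_recovery_start() pops a quiesced region for the caller, which
 *   hands it to kcopyd (see recover()).
 *   rh_recovery_end() parks it on recovered_regions, and
 *   rh_update_states() finally informs the dirty log and frees it.
 */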
/*
 * Returns any quiesced regions.
 */
static struct region *rh_recovery_start(struct region_hash *rh)
{
        struct region *reg = NULL;

        spin_lock_irq(&rh->region_lock);
        if (!list_empty(&rh->quiesced_regions)) {
                reg = list_entry(rh->quiesced_regions.next,
                                 struct region, list);
                list_del_init(&reg->list);        /* remove from the quiesced list */
        }
        spin_unlock_irq(&rh->region_lock);

        return reg;
}

/* FIXME: success ignored for now */
static void rh_recovery_end(struct region *reg, int success)
{
        struct region_hash *rh = reg->rh;

        spin_lock_irq(&rh->region_lock);
        list_add(&reg->list, &reg->rh->recovered_regions);
        spin_unlock_irq(&rh->region_lock);

        wake();
}

static void rh_flush(struct region_hash *rh)
{
        rh->log->type->flush(rh->log);
}

static void rh_delay(struct region_hash *rh, struct bio *bio)
{
        struct region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, bio_to_region(rh, bio));
        bio_list_add(&reg->delayed_bios, bio);
        read_unlock(&rh->hash_lock);
}

static void rh_stop_recovery(struct region_hash *rh)
{
        int i;

        /* wait for any recovering regions */
        for (i = 0; i < MAX_RECOVERY; i++)
                down(&rh->recovery_count);
}

static void rh_start_recovery(struct region_hash *rh)
{
        int i;

        for (i = 0; i < MAX_RECOVERY; i++)
                up(&rh->recovery_count);

        wake();
}

/*
 * Every mirror should look like this one.
 */
#define DEFAULT_MIRROR 0

/*
 * This is yucky.  We squirrel the mirror_set struct away inside
 * bi_next for write buffers.  This is safe since the bh
 * doesn't get submitted to the lower levels of the block layer.
 */
static struct mirror_set *bio_get_ms(struct bio *bio)
{
        return (struct mirror_set *) bio->bi_next;
}

static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
{
        bio->bi_next = (struct bio *) ms;
}

/*-----------------------------------------------------------------
 * Recovery.
 *
 * When a mirror is first activated we may find that some regions
 * are in the no-sync state.  We have to recover these by
 * recopying from the default mirror to all the others.
 *---------------------------------------------------------------*/
static void recovery_complete(int read_err, unsigned int write_err,
                              void *context)
{
        struct region *reg = (struct region *) context;

        /* FIXME: better error handling */
        rh_recovery_end(reg, !(read_err || write_err));
}
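
/*
 * region_size is always a power of two (enforced by
 * _check_region_size()), so the final, possibly partial, region is
 * sized with "ti->len & (region_size - 1)" below; e.g. a 10000
 * sector mirror with 1024 sector regions ends with a 784 sector
 * region.
 */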
static int recover(struct mirror_set *ms, struct region *reg)
{
        int r;
        unsigned int i;
        struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
        struct mirror *m;
        unsigned long flags = 0;

        /* fill in the source */
        m = ms->default_mirror;
        from.bdev = m->dev->bdev;
        from.sector = m->offset + region_to_sector(reg->rh, reg->key);
        if (reg->key == (ms->nr_regions - 1)) {
                /*
                 * The final region may be smaller than
                 * region_size.
                 */
                from.count = ms->ti->len & (reg->rh->region_size - 1);
                if (!from.count)
                        from.count = reg->rh->region_size;
        } else
                from.count = reg->rh->region_size;

        /* fill in the destinations */
        for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
                if (&ms->mirror[i] == ms->default_mirror)
                        continue;

                m = ms->mirror + i;
                dest->bdev = m->dev->bdev;
                dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
                dest->count = from.count;
                dest++;
        }

        /* hand to kcopyd */
        set_bit(KCOPYD_IGNORE_ERROR, &flags);
        r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
                        recovery_complete, reg);

        return r;
}

static void do_recovery(struct mirror_set *ms)
{
        int r;
        struct region *reg;
        struct dirty_log *log = ms->rh.log;

        /*
         * Start quiescing some regions.
         */
        rh_recovery_prepare(&ms->rh);

        /*
         * Copy any already quiesced regions.
         */
        while ((reg = rh_recovery_start(&ms->rh))) {
                r = recover(ms, reg);
                if (r)
                        rh_recovery_end(reg, 0);
        }

        /*
         * Update the in sync flag.
         */
        if (!ms->in_sync &&
            (log->type->get_sync_count(log) == ms->nr_regions)) {
                /* the sync is complete */
                dm_table_event(ms->ti->table);
                ms->in_sync = 1;
        }
}

/*-----------------------------------------------------------------
 * Reads
 *---------------------------------------------------------------*/
static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
{
        /* FIXME: add read balancing */
        return ms->default_mirror;
}

/*
 * remap a buffer to a particular mirror.
 */
static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
{
        bio->bi_bdev = m->dev->bdev;
        bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
}

static void do_reads(struct mirror_set *ms, struct bio_list *reads)
{
        region_t region;
        struct bio *bio;
        struct mirror *m;

        while ((bio = bio_list_pop(reads))) {
                region = bio_to_region(&ms->rh, bio);

                /*
                 * We can only read balance if the region is in sync.
                 */
                if (rh_in_sync(&ms->rh, region, 0))
                        m = choose_mirror(ms, bio->bi_sector);
                else
                        m = ms->default_mirror;

                map_bio(ms, m, bio);
                generic_make_request(bio);
        }
}

/*-----------------------------------------------------------------
 * Writes.
 *
 * We do different things with the write io depending on the
 * state of the region that it's in:
 *
 * SYNC:        increment pending, use dm_io to write to *all* mirrors
 * RECOVERING:  delay the io until recovery completes
 * NOSYNC:      increment pending, just write to the default mirror
 *---------------------------------------------------------------*/
static void write_callback(unsigned long error, void *context)
{
        unsigned int i;
        int uptodate = 1;
        struct bio *bio = (struct bio *) context;
        struct mirror_set *ms;

        ms = bio_get_ms(bio);
        bio_set_ms(bio, NULL);

        /*
         * NOTE: We don't decrement the pending count here,
         * instead it is done by the target's endio function.
         * This way we handle both writes to SYNC and NOSYNC
         * regions with the same code.
         */

        if (error) {
                /*
                 * only error the io if all mirrors failed.
                 * FIXME: bogus
                 */
                uptodate = 0;
                for (i = 0; i < ms->nr_mirrors; i++)
                        if (!test_bit(i, &error)) {
                                uptodate = 1;
                                break;
                        }
        }
        bio_endio(bio, bio->bi_size, 0);
}

static void do_write(struct mirror_set *ms, struct bio *bio)
{
        unsigned int i;
        struct io_region io[KCOPYD_MAX_REGIONS+1];
        struct mirror *m;

        for (i = 0; i < ms->nr_mirrors; i++) {
                m = ms->mirror + i;

                io[i].bdev = m->dev->bdev;
                io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
                io[i].count = bio->bi_size >> 9;
        }

        bio_set_ms(bio, ms);
        dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
                         bio->bi_io_vec + bio->bi_idx,
                         write_callback, bio);
}

static void do_writes(struct mirror_set *ms, struct bio_list *writes)
{
        int state;
        struct bio *bio;
        struct bio_list sync, nosync, recover, *this_list = NULL;

        if (!writes->head)
                return;

        /*
         * Classify each write.
         */
        bio_list_init(&sync);
        bio_list_init(&nosync);
        bio_list_init(&recover);

        while ((bio = bio_list_pop(writes))) {
                state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
                switch (state) {
                case RH_CLEAN:
                case RH_DIRTY:
                        this_list = &sync;
                        break;

                case RH_NOSYNC:
                        this_list = &nosync;
                        break;

                case RH_RECOVERING:
                        this_list = &recover;
                        break;
                }

                bio_list_add(this_list, bio);
        }

        /*
         * Increment the pending counts for any regions that will
         * be written to (writes to recover regions are going to
         * be delayed).
         */
        rh_inc_pending(&ms->rh, &sync);
        rh_inc_pending(&ms->rh, &nosync);
        rh_flush(&ms->rh);

        /*
         * Dispatch io.
         */
        while ((bio = bio_list_pop(&sync)))
                do_write(ms, bio);

        while ((bio = bio_list_pop(&recover)))
                rh_delay(&ms->rh, bio);

        while ((bio = bio_list_pop(&nosync))) {
                map_bio(ms, ms->default_mirror, bio);
                generic_make_request(bio);
        }
}

/*-----------------------------------------------------------------
 * kmirrord
 *---------------------------------------------------------------*/
static LIST_HEAD(_mirror_sets);
static DECLARE_RWSEM(_mirror_sets_lock);
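
/*
 * One pass of the kmirrord work function for a single mirror set:
 * snapshot the queued bios under ms->lock, update the region states,
 * kick off any recovery and then issue the reads and writes.  Bios
 * queued after the snapshot are picked up on the next wake().
 */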
static void do_mirror(struct mirror_set *ms)
{
        struct bio_list reads, writes;

        spin_lock(&ms->lock);
        reads = ms->reads;
        writes = ms->writes;
        bio_list_init(&ms->reads);
        bio_list_init(&ms->writes);
        spin_unlock(&ms->lock);

        rh_update_states(&ms->rh);
        do_recovery(ms);
        do_reads(ms, &reads);
        do_writes(ms, &writes);
}

static void do_work(void *ignored)
{
        struct mirror_set *ms;

        down_read(&_mirror_sets_lock);
        list_for_each_entry (ms, &_mirror_sets, list)
                do_mirror(ms);
        up_read(&_mirror_sets_lock);
}

/*-----------------------------------------------------------------
 * Target functions
 *---------------------------------------------------------------*/
static struct mirror_set *alloc_context(unsigned int nr_mirrors,
                                        uint32_t region_size,
                                        struct dm_target *ti,
                                        struct dirty_log *dl)
{
        size_t len;
        struct mirror_set *ms = NULL;

        if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
                return NULL;

        len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);

        ms = kmalloc(len, GFP_KERNEL);
        if (!ms) {
                ti->error = "Cannot allocate mirror context";
                return NULL;
        }

        memset(ms, 0, len);
        spin_lock_init(&ms->lock);

        ms->ti = ti;
        ms->nr_mirrors = nr_mirrors;
        ms->nr_regions = dm_sector_div_up(ti->len, region_size);
        ms->in_sync = 0;
        ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];

        if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
                ti->error = "Error creating dirty region hash";
                kfree(ms);
                return NULL;
        }

        return ms;
}

static void free_context(struct mirror_set *ms, struct dm_target *ti,
                         unsigned int m)
{
        while (m--)
                dm_put_device(ti, ms->mirror[m].dev);

        rh_exit(&ms->rh);
        kfree(ms);
}

static inline int _check_region_size(struct dm_target *ti, uint32_t size)
{
        return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
                 size > ti->len);
}

static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
                      unsigned int mirror, char **argv)
{
        unsigned long long offset;

        if (sscanf(argv[1], "%llu", &offset) != 1) {
                ti->error = "Invalid offset";
                return -EINVAL;
        }

        if (dm_get_device(ti, argv[0], offset, ti->len,
                          dm_table_get_mode(ti->table),
                          &ms->mirror[mirror].dev)) {
                ti->error = "Device lookup failure";
                return -ENXIO;
        }

        ms->mirror[mirror].offset = offset;

        return 0;
}

static int add_mirror_set(struct mirror_set *ms)
{
        down_write(&_mirror_sets_lock);
        list_add_tail(&ms->list, &_mirror_sets);
        up_write(&_mirror_sets_lock);
        wake();

        return 0;
}

static void del_mirror_set(struct mirror_set *ms)
{
        down_write(&_mirror_sets_lock);
        list_del(&ms->list);
        up_write(&_mirror_sets_lock);
}

/*
 * Create dirty log: log_type #log_params <log_params>
 */
static struct dirty_log *create_dirty_log(struct dm_target *ti,
                                          unsigned int argc, char **argv,
                                          unsigned int *args_used)
{
        unsigned int param_count;
        struct dirty_log *dl;

        if (argc < 2) {
                ti->error = "Insufficient mirror log arguments";
                return NULL;
        }

        if (sscanf(argv[1], "%u", &param_count) != 1) {
                ti->error = "Invalid mirror log argument count";
                return NULL;
        }

        *args_used = 2 + param_count;

        if (argc < *args_used) {
                ti->error = "Insufficient mirror log arguments";
                return NULL;
        }

        dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
        if (!dl) {
                ti->error = "Error creating mirror dirty log";
                return NULL;
        }

        if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
                ti->error = "Invalid region size";
                dm_destroy_dirty_log(dl);
                return NULL;
        }

        return dl;
}

/*
 * Construct a mirror mapping:
 *
 * log_type #log_params <log_params>
 * #mirrors [mirror_path offset]{2,}
 *
 * log_type is "core" or "disk"
 * #log_params is between 1 and 3
 */
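/*
 * An illustrative table line (device names and sizes are only
 * examples) for a two-way mirror using a core log with 1024 sector
 * regions:
 *
 *   0 409600 mirror core 1 1024 2 /dev/sda1 0 /dev/sdb1 0
 */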
#define DM_IO_PAGES 64
static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        int r;
        unsigned int nr_mirrors, m, args_used;
        struct mirror_set *ms;
        struct dirty_log *dl;

        dl = create_dirty_log(ti, argc, argv, &args_used);
        if (!dl)
                return -EINVAL;

        argv += args_used;
        argc -= args_used;

        if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
            nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
                ti->error = "Invalid number of mirrors";
                dm_destroy_dirty_log(dl);
                return -EINVAL;
        }

        argv++, argc--;

        if (argc != nr_mirrors * 2) {
                ti->error = "Wrong number of mirror arguments";
                dm_destroy_dirty_log(dl);
                return -EINVAL;
        }

        ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
        if (!ms) {
                dm_destroy_dirty_log(dl);
                return -ENOMEM;
        }

        /* Get the mirror parameter sets */
        for (m = 0; m < nr_mirrors; m++) {
                r = get_mirror(ms, ti, m, argv);
                if (r) {
                        free_context(ms, ti, m);
                        return r;
                }
                argv += 2;
                argc -= 2;
        }

        ti->private = ms;
        ti->split_io = ms->rh.region_size;

        r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
        if (r) {
                free_context(ms, ti, ms->nr_mirrors);
                return r;
        }

        add_mirror_set(ms);
        return 0;
}

static void mirror_dtr(struct dm_target *ti)
{
        struct mirror_set *ms = (struct mirror_set *) ti->private;

        del_mirror_set(ms);
        kcopyd_client_destroy(ms->kcopyd_client);
        free_context(ms, ti, ms->nr_mirrors);
}

static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
{
        int should_wake = 0;
        struct bio_list *bl;

        bl = (rw == WRITE) ? &ms->writes : &ms->reads;
        spin_lock(&ms->lock);
        should_wake = !(bl->head);
        bio_list_add(bl, bio);
        spin_unlock(&ms->lock);

        if (should_wake)
                wake();
}

/*
 * Mirror mapping function
 */
static int mirror_map(struct dm_target *ti, struct bio *bio,
                      union map_info *map_context)
{
        int r, rw = bio_rw(bio);
        struct mirror *m;
        struct mirror_set *ms = ti->private;

        map_context->ll = bio_to_region(&ms->rh, bio);

        if (rw == WRITE) {
                queue_bio(ms, bio, rw);
                return 0;
        }

        r = ms->rh.log->type->in_sync(ms->rh.log,
                                      bio_to_region(&ms->rh, bio), 0);
        if (r < 0 && r != -EWOULDBLOCK)
                return r;

        if (r == -EWOULDBLOCK)        /* FIXME: ugly */
                r = 0;

        /*
         * We don't want to fast track a recovery just for a read
         * ahead.  So we just let it silently fail.
         * FIXME: get rid of this.
         */
        if (!r && rw == READA)
                return -EIO;

        if (!r) {
                /* Pass this io over to the daemon */
                queue_bio(ms, bio, rw);
                return 0;
        }

        m = choose_mirror(ms, bio->bi_sector);
        if (!m)
                return -EIO;

        map_bio(ms, m, bio);
        return 1;
}
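
/*
 * mirror_map() stashed the region number in map_context->ll, so a
 * write's pending count can be dropped here without recomputing it.
 */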
static int mirror_end_io(struct dm_target *ti, struct bio *bio,
                         int error, union map_info *map_context)
{
        int rw = bio_rw(bio);
        struct mirror_set *ms = (struct mirror_set *) ti->private;
        region_t region = map_context->ll;

        /*
         * We need to dec pending if this was a write.
         */
        if (rw == WRITE)
                rh_dec(&ms->rh, region);

        return 0;
}

static void mirror_postsuspend(struct dm_target *ti)
{
        struct mirror_set *ms = (struct mirror_set *) ti->private;
        struct dirty_log *log = ms->rh.log;

        rh_stop_recovery(&ms->rh);
        if (log->type->suspend && log->type->suspend(log))
                /* FIXME: need better error handling */
                DMWARN("log suspend failed");
}

static void mirror_resume(struct dm_target *ti)
{
        struct mirror_set *ms = (struct mirror_set *) ti->private;
        struct dirty_log *log = ms->rh.log;
        if (log->type->resume && log->type->resume(log))
                /* FIXME: need better error handling */
                DMWARN("log resume failed");
        rh_start_recovery(&ms->rh);
}

static int mirror_status(struct dm_target *ti, status_type_t type,
                         char *result, unsigned int maxlen)
{
        unsigned int m, sz;
        struct mirror_set *ms = (struct mirror_set *) ti->private;

        sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);

        switch (type) {
        case STATUSTYPE_INFO:
                DMEMIT("%d ", ms->nr_mirrors);
                for (m = 0; m < ms->nr_mirrors; m++)
                        DMEMIT("%s ", ms->mirror[m].dev->name);

                DMEMIT("%llu/%llu",
                       (unsigned long long)ms->rh.log->type->
                                get_sync_count(ms->rh.log),
                       (unsigned long long)ms->nr_regions);
                break;

        case STATUSTYPE_TABLE:
                DMEMIT("%d ", ms->nr_mirrors);
                for (m = 0; m < ms->nr_mirrors; m++)
                        DMEMIT("%s %llu ", ms->mirror[m].dev->name,
                               (unsigned long long)ms->mirror[m].offset);
        }

        return 0;
}

static struct target_type mirror_target = {
        .name        = "mirror",
        .version     = {1, 0, 2},
        .module      = THIS_MODULE,
        .ctr         = mirror_ctr,
        .dtr         = mirror_dtr,
        .map         = mirror_map,
        .end_io      = mirror_end_io,
        .postsuspend = mirror_postsuspend,
        .resume      = mirror_resume,
        .status      = mirror_status,
};

static int __init dm_mirror_init(void)
{
        int r;

        r = dm_dirty_log_init();
        if (r)
                return r;

        _kmirrord_wq = create_singlethread_workqueue("kmirrord");
        if (!_kmirrord_wq) {
                DMERR("couldn't start kmirrord");
                dm_dirty_log_exit();
                return -ENOMEM;
        }
        INIT_WORK(&_kmirrord_work, do_work, NULL);

        r = dm_register_target(&mirror_target);
        if (r < 0) {
                DMERR("%s: Failed to register mirror target",
                      mirror_target.name);
                dm_dirty_log_exit();
                destroy_workqueue(_kmirrord_wq);
        }

        return r;
}

static void __exit dm_mirror_exit(void)
{
        int r;

        r = dm_unregister_target(&mirror_target);
        if (r < 0)
                DMERR("%s: unregister failed %d", mirror_target.name, r);

        destroy_workqueue(_kmirrord_wq);
        dm_dirty_log_exit();
}

/* Module hooks */
module_init(dm_mirror_init);
module_exit(dm_mirror_exit);

MODULE_DESCRIPTION(DM_NAME " mirror target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");