/*
 * Copyright (C) 2003 Sistina Software Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"
#include "dm-io.h"
#include "dm-log.h"
#include "kcopyd.h"

#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/log2.h>

#define DM_MSG_PREFIX "raid1"
#define DM_IO_PAGES 64

#define DM_RAID1_HANDLE_ERRORS 0x01
#define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)

static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);

/*-----------------------------------------------------------------
 * Region hash
 *
 * The mirror splits itself up into discrete regions.  Each
 * region can be in one of three states: clean, dirty,
 * nosync.  There is no need to put clean regions in the hash.
 *
 * In addition to being present in the hash table a region _may_
 * be present on one of three lists.
 *
 *   clean_regions: Regions on this list have no io pending to
 *   them, they are in sync, we are no longer interested in them,
 *   they are dull.  rh_update_states() will remove them from the
 *   hash table.
 *
 *   quiesced_regions: These regions have been spun down, ready
 *   for recovery.  rh_recovery_start() will remove regions from
 *   this list and hand them to kmirrord, which will schedule the
 *   recovery io with kcopyd.
 *
 *   recovered_regions: Regions that kcopyd has successfully
 *   recovered.  rh_update_states() will now schedule any delayed
 *   io, up the recovery_count, and remove the region from the
 *   hash.
 *
 * There are 2 locks:
 *   A rw spin lock 'hash_lock' protects just the hash table,
 *   this is never held in write mode from interrupt context,
 *   which I believe means that we only have to disable irqs when
 *   doing a write lock.
 *
 *   An ordinary spin lock 'region_lock' that protects the three
 *   lists in the region_hash, with the 'state', 'list' and
 *   'delayed_bios' fields of the regions.  This is used from irq
 *   context, so all other uses will have to suspend local irqs.
 *---------------------------------------------------------------*/
struct mirror_set;
struct region_hash {
	struct mirror_set *ms;
	uint32_t region_size;
	unsigned region_shift;

	/* holds persistent region state */
	struct dirty_log *log;

	/* hash table */
	rwlock_t hash_lock;
	mempool_t *region_pool;
	unsigned int mask;
	unsigned int nr_buckets;
	struct list_head *buckets;

	spinlock_t region_lock;
	atomic_t recovery_in_flight;
	struct semaphore recovery_count;
	struct list_head clean_regions;
	struct list_head quiesced_regions;
	struct list_head recovered_regions;
	struct list_head failed_recovered_regions;
};

enum {
	RH_CLEAN,
	RH_DIRTY,
	RH_NOSYNC,
	RH_RECOVERING
};

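/*
 * Per-region bookkeeping.  'pending' counts writes in flight to the
 * region; 'delayed_bios' holds writes that arrived while the region
 * was recovering and are requeued by complete_resync_work() once
 * recovery of the region finishes.
 */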
struct region {
	struct region_hash *rh;	/* FIXME: can we get rid of this ? */
	region_t key;
	int state;

	struct list_head hash_list;
	struct list_head list;

	atomic_t pending;
	struct bio_list delayed_bios;
};

/*-----------------------------------------------------------------
 * Mirror set structures.
 *---------------------------------------------------------------*/
struct mirror {
	struct mirror_set *ms;
	atomic_t error_count;
	struct dm_dev *dev;
	sector_t offset;
};

struct mirror_set {
	struct dm_target *ti;
	struct list_head list;
	struct region_hash rh;
	struct kcopyd_client *kcopyd_client;
	uint64_t features;

	spinlock_t lock;	/* protects the next two lists */
	struct bio_list reads;
	struct bio_list writes;

	struct dm_io_client *io_client;

	/* recovery */
	region_t nr_regions;
	int in_sync;
	int log_failure;

	struct mirror *default_mirror;	/* Default mirror */

	struct workqueue_struct *kmirrord_wq;
	struct work_struct kmirrord_work;

	unsigned int nr_mirrors;
	struct mirror mirror[0];
};

/*
 * Conversion fns
 */
static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
{
	return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
}

static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
{
	return region << rh->region_shift;
}

static void wake(struct mirror_set *ms)
{
	queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
}

/* FIXME move this */
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);

#define MIN_REGIONS 64
#define MAX_RECOVERY 1
static int rh_init(struct region_hash *rh, struct mirror_set *ms,
		   struct dirty_log *log, uint32_t region_size,
		   region_t nr_regions)
{
	unsigned int nr_buckets, max_buckets;
	size_t i;

	/*
	 * Calculate a suitable number of buckets for our hash
	 * table.
	 */
	max_buckets = nr_regions >> 6;
	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
		;
	nr_buckets >>= 1;

	rh->ms = ms;
	rh->log = log;
	rh->region_size = region_size;
	rh->region_shift = ffs(region_size) - 1;
	rwlock_init(&rh->hash_lock);
	rh->mask = nr_buckets - 1;
	rh->nr_buckets = nr_buckets;

	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
	if (!rh->buckets) {
		DMERR("unable to allocate region hash memory");
		return -ENOMEM;
	}

	for (i = 0; i < nr_buckets; i++)
		INIT_LIST_HEAD(rh->buckets + i);

	spin_lock_init(&rh->region_lock);
	sema_init(&rh->recovery_count, 0);
	atomic_set(&rh->recovery_in_flight, 0);
	INIT_LIST_HEAD(&rh->clean_regions);
	INIT_LIST_HEAD(&rh->quiesced_regions);
	INIT_LIST_HEAD(&rh->recovered_regions);
	INIT_LIST_HEAD(&rh->failed_recovered_regions);

	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
						      sizeof(struct region));
	if (!rh->region_pool) {
		vfree(rh->buckets);
		rh->buckets = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void rh_exit(struct region_hash *rh)
{
	unsigned int h;
	struct region *reg, *nreg;

	BUG_ON(!list_empty(&rh->quiesced_regions));
	for (h = 0; h < rh->nr_buckets; h++) {
		list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
			BUG_ON(atomic_read(&reg->pending));
			mempool_free(reg, rh->region_pool);
		}
	}

	if (rh->log)
		dm_destroy_dirty_log(rh->log);
	if (rh->region_pool)
		mempool_destroy(rh->region_pool);
	vfree(rh->buckets);
}

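/*
 * The bucket count chosen by rh_init() is always a power of two, so
 * the multiplicative hash below can simply shift the scrambled
 * region number and mask it down to a bucket index.
 */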
#define RH_HASH_MULT 2654435387U

static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
{
	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
}

static struct region *__rh_lookup(struct region_hash *rh, region_t region)
{
	struct region *reg;

	list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
		if (reg->key == region)
			return reg;

	return NULL;
}

static void __rh_insert(struct region_hash *rh, struct region *reg)
{
	unsigned int h = rh_hash(rh, reg->key);
	list_add(&reg->hash_list, rh->buckets + h);
}

static struct region *__rh_alloc(struct region_hash *rh, region_t region)
{
	struct region *reg, *nreg;

	read_unlock(&rh->hash_lock);
	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
	if (unlikely(!nreg))
		nreg = kmalloc(sizeof(struct region), GFP_NOIO);
	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
		RH_CLEAN : RH_NOSYNC;
	nreg->rh = rh;
	nreg->key = region;

	INIT_LIST_HEAD(&nreg->list);

	atomic_set(&nreg->pending, 0);
	bio_list_init(&nreg->delayed_bios);
	write_lock_irq(&rh->hash_lock);

	reg = __rh_lookup(rh, region);
	if (reg)
		/* we lost the race */
		mempool_free(nreg, rh->region_pool);
	else {
		__rh_insert(rh, nreg);
		if (nreg->state == RH_CLEAN) {
			spin_lock(&rh->region_lock);
			list_add(&nreg->list, &rh->clean_regions);
			spin_unlock(&rh->region_lock);
		}
		reg = nreg;
	}
	write_unlock_irq(&rh->hash_lock);
	read_lock(&rh->hash_lock);

	return reg;
}

static inline struct region *__rh_find(struct region_hash *rh, region_t region)
{
	struct region *reg;

	reg = __rh_lookup(rh, region);
	if (!reg)
		reg = __rh_alloc(rh, region);

	return reg;
}

static int rh_state(struct region_hash *rh, region_t region, int may_block)
{
	int r;
	struct region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	read_unlock(&rh->hash_lock);

	if (reg)
		return reg->state;

	/*
	 * The region wasn't in the hash, so we fall back to the
	 * dirty log.
	 */
	r = rh->log->type->in_sync(rh->log, region, may_block);

	/*
	 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
	 * taken as a RH_NOSYNC
	 */
	return r == 1 ? RH_CLEAN : RH_NOSYNC;
}

static inline int rh_in_sync(struct region_hash *rh,
			     region_t region, int may_block)
{
	int state = rh_state(rh, region, may_block);
	return state == RH_CLEAN || state == RH_DIRTY;
}

static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list))) {
		queue_bio(ms, bio, WRITE);
	}
}

static void complete_resync_work(struct region *reg, int success)
{
	struct region_hash *rh = reg->rh;

	rh->log->type->set_region_sync(rh->log, reg->key, success);
	dispatch_bios(rh->ms, &reg->delayed_bios);
	if (atomic_dec_and_test(&rh->recovery_in_flight))
		wake_up_all(&_kmirrord_recovery_stopped);
	up(&rh->recovery_count);
}

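/*
 * Reap the lists built up by rh_dec() and rh_recovery_end(): drop
 * clean regions from the hash, finish off recovered (or failed)
 * regions and flush the dirty log.  Runs from the kmirrord work
 * function before any new io is dispatched.
 */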
static void rh_update_states(struct region_hash *rh)
{
	struct region *reg, *next;

	LIST_HEAD(clean);
	LIST_HEAD(recovered);
	LIST_HEAD(failed_recovered);

	/*
	 * Quickly grab the lists.
	 */
	write_lock_irq(&rh->hash_lock);
	spin_lock(&rh->region_lock);
	if (!list_empty(&rh->clean_regions)) {
		list_splice(&rh->clean_regions, &clean);
		INIT_LIST_HEAD(&rh->clean_regions);

		list_for_each_entry(reg, &clean, list)
			list_del(&reg->hash_list);
	}

	if (!list_empty(&rh->recovered_regions)) {
		list_splice(&rh->recovered_regions, &recovered);
		INIT_LIST_HEAD(&rh->recovered_regions);

		list_for_each_entry (reg, &recovered, list)
			list_del(&reg->hash_list);
	}

	if (!list_empty(&rh->failed_recovered_regions)) {
		list_splice(&rh->failed_recovered_regions, &failed_recovered);
		INIT_LIST_HEAD(&rh->failed_recovered_regions);

		list_for_each_entry(reg, &failed_recovered, list)
			list_del(&reg->hash_list);
	}

	spin_unlock(&rh->region_lock);
	write_unlock_irq(&rh->hash_lock);

	/*
	 * All the regions on the recovered and clean lists have
	 * now been pulled out of the system, so no need to do
	 * any more locking.
	 */
	list_for_each_entry_safe (reg, next, &recovered, list) {
		rh->log->type->clear_region(rh->log, reg->key);
		complete_resync_work(reg, 1);
		mempool_free(reg, rh->region_pool);
	}

	list_for_each_entry_safe(reg, next, &failed_recovered, list) {
		complete_resync_work(reg, errors_handled(rh->ms) ? 0 : 1);
		mempool_free(reg, rh->region_pool);
	}

	list_for_each_entry_safe(reg, next, &clean, list) {
		rh->log->type->clear_region(rh->log, reg->key);
		mempool_free(reg, rh->region_pool);
	}

	rh->log->type->flush(rh->log);
}

static void rh_inc(struct region_hash *rh, region_t region)
{
	struct region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, region);

	spin_lock_irq(&rh->region_lock);
	atomic_inc(&reg->pending);

	if (reg->state == RH_CLEAN) {
		reg->state = RH_DIRTY;
		list_del_init(&reg->list);	/* take off the clean list */
		spin_unlock_irq(&rh->region_lock);

		rh->log->type->mark_region(rh->log, reg->key);
	} else
		spin_unlock_irq(&rh->region_lock);

	read_unlock(&rh->hash_lock);
}

static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
{
	struct bio *bio;

	for (bio = bios->head; bio; bio = bio->bi_next)
		rh_inc(rh, bio_to_region(rh, bio));
}

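/*
 * Called from mirror_end_io() when a write to 'region' completes.
 * Once the last pending write drains, the region is moved to the
 * quiesced list (if it was being recovered) or back to the clean
 * list, and kmirrord is woken to act on it.
 */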
static void rh_dec(struct region_hash *rh, region_t region)
{
	unsigned long flags;
	struct region *reg;
	int should_wake = 0;

	read_lock(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	read_unlock(&rh->hash_lock);

	spin_lock_irqsave(&rh->region_lock, flags);
	if (atomic_dec_and_test(&reg->pending)) {
		/*
		 * There is no pending I/O for this region.
		 * We can move the region to corresponding list for next action.
		 * At this point, the region is not yet connected to any list.
		 *
		 * If the state is RH_NOSYNC, the region should be kept off
		 * from clean list.
		 * The hash entry for RH_NOSYNC will remain in memory
		 * until the region is recovered or the map is reloaded.
		 */

		/* do nothing for RH_NOSYNC */
		if (reg->state == RH_RECOVERING) {
			list_add_tail(&reg->list, &rh->quiesced_regions);
		} else if (reg->state == RH_DIRTY) {
			reg->state = RH_CLEAN;
			list_add(&reg->list, &rh->clean_regions);
		}
		should_wake = 1;
	}
	spin_unlock_irqrestore(&rh->region_lock, flags);

	if (should_wake)
		wake(rh->ms);
}

/*
 * Starts quiescing a region in preparation for recovery.
 */
static int __rh_recovery_prepare(struct region_hash *rh)
{
	int r;
	struct region *reg;
	region_t region;

	/*
	 * Ask the dirty log what's next.
	 */
	r = rh->log->type->get_resync_work(rh->log, &region);
	if (r <= 0)
		return r;

	/*
	 * Get this region, and start it quiescing by setting the
	 * recovering flag.
	 */
	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, region);
	read_unlock(&rh->hash_lock);

	spin_lock_irq(&rh->region_lock);
	reg->state = RH_RECOVERING;

	/* Already quiesced ? */
	if (atomic_read(&reg->pending))
		list_del_init(&reg->list);
	else
		list_move(&reg->list, &rh->quiesced_regions);

	spin_unlock_irq(&rh->region_lock);

	return 1;
}

static void rh_recovery_prepare(struct region_hash *rh)
{
	/* Extra reference to avoid race with rh_stop_recovery */
	atomic_inc(&rh->recovery_in_flight);

	while (!down_trylock(&rh->recovery_count)) {
		atomic_inc(&rh->recovery_in_flight);
		if (__rh_recovery_prepare(rh) <= 0) {
			atomic_dec(&rh->recovery_in_flight);
			up(&rh->recovery_count);
			break;
		}
	}

	/* Drop the extra reference */
	if (atomic_dec_and_test(&rh->recovery_in_flight))
		wake_up_all(&_kmirrord_recovery_stopped);
}

/*
 * Returns any quiesced regions.
 */
static struct region *rh_recovery_start(struct region_hash *rh)
{
	struct region *reg = NULL;

	spin_lock_irq(&rh->region_lock);
	if (!list_empty(&rh->quiesced_regions)) {
		reg = list_entry(rh->quiesced_regions.next,
				 struct region, list);
		list_del_init(&reg->list);	/* remove from the quiesced list */
	}
	spin_unlock_irq(&rh->region_lock);

	return reg;
}

static void rh_recovery_end(struct region *reg, int success)
{
	struct region_hash *rh = reg->rh;

	spin_lock_irq(&rh->region_lock);
	if (success)
		list_add(&reg->list, &reg->rh->recovered_regions);
	else {
		reg->state = RH_NOSYNC;
		list_add(&reg->list, &reg->rh->failed_recovered_regions);
	}
	spin_unlock_irq(&rh->region_lock);

	wake(rh->ms);
}

static int rh_flush(struct region_hash *rh)
{
	return rh->log->type->flush(rh->log);
}

static void rh_delay(struct region_hash *rh, struct bio *bio)
{
	struct region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, bio_to_region(rh, bio));
	bio_list_add(&reg->delayed_bios, bio);
	read_unlock(&rh->hash_lock);
}

static void rh_stop_recovery(struct region_hash *rh)
{
	int i;

	/* wait for any recovering regions */
	for (i = 0; i < MAX_RECOVERY; i++)
		down(&rh->recovery_count);
}

static void rh_start_recovery(struct region_hash *rh)
{
	int i;

	for (i = 0; i < MAX_RECOVERY; i++)
		up(&rh->recovery_count);

	wake(rh->ms);
}

/*
 * Every mirror should look like this one.
 */
#define DEFAULT_MIRROR 0

/*
 * This is yucky.  We squirrel the mirror_set struct away inside
 * bi_next for write buffers.  This is safe since the bio
 * doesn't get submitted to the lower levels of the block layer.
 */
static struct mirror_set *bio_get_ms(struct bio *bio)
{
	return (struct mirror_set *) bio->bi_next;
}

static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
{
	bio->bi_next = (struct bio *) ms;
}

/*-----------------------------------------------------------------
 * Recovery.
 *
 * When a mirror is first activated we may find that some regions
 * are in the no-sync state.  We have to recover these by
 * recopying from the default mirror to all the others.
 *---------------------------------------------------------------*/
static void recovery_complete(int read_err, unsigned int write_err,
			      void *context)
{
	struct region *reg = (struct region *) context;

	if (read_err)
		/* Read error means the failure of default mirror. */
		DMERR_LIMIT("Unable to read primary mirror during recovery");

	if (write_err)
		DMERR_LIMIT("Write error during recovery (error = 0x%x)",
			    write_err);

	rh_recovery_end(reg, !(read_err || write_err));
}

static int recover(struct mirror_set *ms, struct region *reg)
{
	int r;
	unsigned int i;
	struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
	struct mirror *m;
	unsigned long flags = 0;

	/* fill in the source */
	m = ms->default_mirror;
	from.bdev = m->dev->bdev;
	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
	if (reg->key == (ms->nr_regions - 1)) {
		/*
		 * The final region may be smaller than
		 * region_size.
		 */
		from.count = ms->ti->len & (reg->rh->region_size - 1);
		if (!from.count)
			from.count = reg->rh->region_size;
	} else
		from.count = reg->rh->region_size;

	/* fill in the destinations */
	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
		if (&ms->mirror[i] == ms->default_mirror)
			continue;

		m = ms->mirror + i;
		dest->bdev = m->dev->bdev;
		dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
		dest->count = from.count;
		dest++;
	}

	/* hand to kcopyd */
	set_bit(KCOPYD_IGNORE_ERROR, &flags);
	r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
			recovery_complete, reg);

	return r;
}

static void do_recovery(struct mirror_set *ms)
{
	int r;
	struct region *reg;
	struct dirty_log *log = ms->rh.log;

	/*
	 * Start quiescing some regions.
	 */
	rh_recovery_prepare(&ms->rh);

	/*
	 * Copy any already quiesced regions.
	 */
	while ((reg = rh_recovery_start(&ms->rh))) {
		r = recover(ms, reg);
		if (r)
			rh_recovery_end(reg, 0);
	}

	/*
	 * Update the in sync flag.
	 */
	if (!ms->in_sync &&
	    (log->type->get_sync_count(log) == ms->nr_regions)) {
		/* the sync is complete */
		dm_table_event(ms->ti->table);
		ms->in_sync = 1;
	}
}

/*-----------------------------------------------------------------
 * Reads
 *---------------------------------------------------------------*/
static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
{
	/* FIXME: add read balancing */
	return ms->default_mirror;
}

/*
 * remap a buffer to a particular mirror.
 */
static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
{
	bio->bi_bdev = m->dev->bdev;
	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
}

static void do_reads(struct mirror_set *ms, struct bio_list *reads)
{
	region_t region;
	struct bio *bio;
	struct mirror *m;

	while ((bio = bio_list_pop(reads))) {
		region = bio_to_region(&ms->rh, bio);

		/*
		 * We can only read balance if the region is in sync.
		 */
		if (rh_in_sync(&ms->rh, region, 1))
			m = choose_mirror(ms, bio->bi_sector);
		else
			m = ms->default_mirror;

		map_bio(ms, m, bio);
		generic_make_request(bio);
	}
}

/*-----------------------------------------------------------------
 * Writes.
 *
 * We do different things with the write io depending on the
 * state of the region that it's in:
 *
 * SYNC:	increment pending, use dm_io to write to *all* mirrors
 * RECOVERING:	delay the io until recovery completes
 * NOSYNC:	increment pending, just write to the default mirror
 *---------------------------------------------------------------*/
static void write_callback(unsigned long error, void *context)
{
	unsigned int i;
	int uptodate = 1;
	struct bio *bio = (struct bio *) context;
	struct mirror_set *ms;

	ms = bio_get_ms(bio);
	bio_set_ms(bio, NULL);

	/*
	 * NOTE: We don't decrement the pending count here,
	 * instead it is done by the targets endio function.
	 * This way we handle both writes to SYNC and NOSYNC
	 * regions with the same code.
	 */

	if (error) {
		/*
		 * only error the io if all mirrors failed.
		 * FIXME: bogus
		 */
		uptodate = 0;
		for (i = 0; i < ms->nr_mirrors; i++)
			if (!test_bit(i, &error)) {
				uptodate = 1;
				break;
			}
	}
	bio_endio(bio, 0);
}

static void do_write(struct mirror_set *ms, struct bio *bio)
{
	unsigned int i;
	struct io_region io[KCOPYD_MAX_REGIONS+1];
	struct mirror *m;
	struct dm_io_request io_req = {
		.bi_rw = WRITE,
		.mem.type = DM_IO_BVEC,
		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
		.notify.fn = write_callback,
		.notify.context = bio,
		.client = ms->io_client,
	};

	for (i = 0; i < ms->nr_mirrors; i++) {
		m = ms->mirror + i;

		io[i].bdev = m->dev->bdev;
		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
		io[i].count = bio->bi_size >> 9;
	}

	bio_set_ms(bio, ms);

	(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
}

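/*
 * Sort the queued writes by region state, bump the pending counts
 * (which marks newly dirtied regions in the log) and flush the log
 * before dispatching.  If the flush fails, ms->log_failure is set
 * and the in-sync writes are completed with -EIO instead of being
 * issued.
 */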
static void do_writes(struct mirror_set *ms, struct bio_list *writes)
{
	int state;
	struct bio *bio;
	struct bio_list sync, nosync, recover, *this_list = NULL;

	if (!writes->head)
		return;

	/*
	 * Classify each write.
	 */
	bio_list_init(&sync);
	bio_list_init(&nosync);
	bio_list_init(&recover);

	while ((bio = bio_list_pop(writes))) {
		state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
		switch (state) {
		case RH_CLEAN:
		case RH_DIRTY:
			this_list = &sync;
			break;

		case RH_NOSYNC:
			this_list = &nosync;
			break;

		case RH_RECOVERING:
			this_list = &recover;
			break;
		}

		bio_list_add(this_list, bio);
	}

	/*
	 * Increment the pending counts for any regions that will
	 * be written to (writes to recover regions are going to
	 * be delayed).
	 */
	rh_inc_pending(&ms->rh, &sync);
	rh_inc_pending(&ms->rh, &nosync);
	ms->log_failure = rh_flush(&ms->rh) ? 1 : 0;

	/*
	 * Dispatch io.
	 */
	if (unlikely(ms->log_failure))
		while ((bio = bio_list_pop(&sync)))
			bio_endio(bio, -EIO);
	else while ((bio = bio_list_pop(&sync)))
		do_write(ms, bio);

	while ((bio = bio_list_pop(&recover)))
		rh_delay(&ms->rh, bio);

	while ((bio = bio_list_pop(&nosync))) {
		map_bio(ms, ms->default_mirror, bio);
		generic_make_request(bio);
	}
}

/*-----------------------------------------------------------------
 * kmirrord
 *---------------------------------------------------------------*/
static void do_mirror(struct work_struct *work)
{
	struct mirror_set *ms = container_of(work, struct mirror_set,
					     kmirrord_work);
	struct bio_list reads, writes;

	spin_lock(&ms->lock);
	reads = ms->reads;
	writes = ms->writes;
	bio_list_init(&ms->reads);
	bio_list_init(&ms->writes);
	spin_unlock(&ms->lock);

	rh_update_states(&ms->rh);
	do_recovery(ms);
	do_reads(ms, &reads);
	do_writes(ms, &writes);
}

/*-----------------------------------------------------------------
 * Target functions
 *---------------------------------------------------------------*/
static struct mirror_set *alloc_context(unsigned int nr_mirrors,
					uint32_t region_size,
					struct dm_target *ti,
					struct dirty_log *dl)
{
	size_t len;
	struct mirror_set *ms = NULL;

	if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
		return NULL;

	len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);

	ms = kzalloc(len, GFP_KERNEL);
	if (!ms) {
		ti->error = "Cannot allocate mirror context";
		return NULL;
	}

	spin_lock_init(&ms->lock);

	ms->ti = ti;
	ms->nr_mirrors = nr_mirrors;
	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
	ms->in_sync = 0;
	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];

	ms->io_client = dm_io_client_create(DM_IO_PAGES);
	if (IS_ERR(ms->io_client)) {
		ti->error = "Error creating dm_io client";
		kfree(ms);
		return NULL;
	}

	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
		ti->error = "Error creating dirty region hash";
		dm_io_client_destroy(ms->io_client);
		kfree(ms);
		return NULL;
	}

	return ms;
}

static void free_context(struct mirror_set *ms, struct dm_target *ti,
			 unsigned int m)
{
	while (m--)
		dm_put_device(ti, ms->mirror[m].dev);

	dm_io_client_destroy(ms->io_client);
	rh_exit(&ms->rh);
	kfree(ms);
}

static inline int _check_region_size(struct dm_target *ti, uint32_t size)
{
	return !(size % (PAGE_SIZE >> 9) || !is_power_of_2(size) ||
		 size > ti->len);
}

static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
		      unsigned int mirror, char **argv)
{
	unsigned long long offset;

	if (sscanf(argv[1], "%llu", &offset) != 1) {
		ti->error = "Invalid offset";
		return -EINVAL;
	}

	if (dm_get_device(ti, argv[0], offset, ti->len,
			  dm_table_get_mode(ti->table),
			  &ms->mirror[mirror].dev)) {
		ti->error = "Device lookup failure";
		return -ENXIO;
	}

	ms->mirror[mirror].ms = ms;
	ms->mirror[mirror].offset = offset;

	return 0;
}

/*
 * Create dirty log: log_type #log_params <log_params>
 */
static struct dirty_log *create_dirty_log(struct dm_target *ti,
					  unsigned int argc, char **argv,
					  unsigned int *args_used)
{
	unsigned int param_count;
	struct dirty_log *dl;

	if (argc < 2) {
		ti->error = "Insufficient mirror log arguments";
		return NULL;
	}

	if (sscanf(argv[1], "%u", &param_count) != 1) {
		ti->error = "Invalid mirror log argument count";
		return NULL;
	}

	*args_used = 2 + param_count;

	if (argc < *args_used) {
		ti->error = "Insufficient mirror log arguments";
		return NULL;
	}

	dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
	if (!dl) {
		ti->error = "Error creating mirror dirty log";
		return NULL;
	}

	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
		ti->error = "Invalid region size";
		dm_destroy_dirty_log(dl);
		return NULL;
	}

	return dl;
}

static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
			  unsigned *args_used)
{
	unsigned num_features;
	struct dm_target *ti = ms->ti;

	*args_used = 0;

	if (!argc)
		return 0;

	if (sscanf(argv[0], "%u", &num_features) != 1) {
		ti->error = "Invalid number of features";
		return -EINVAL;
	}

	argc--;
	argv++;
	(*args_used)++;

	if (num_features > argc) {
		ti->error = "Not enough arguments to support feature count";
		return -EINVAL;
	}

	if (!strcmp("handle_errors", argv[0]))
		ms->features |= DM_RAID1_HANDLE_ERRORS;
	else {
		ti->error = "Unrecognised feature requested";
		return -EINVAL;
	}

	(*args_used)++;

	return 0;
}

/*
 * Construct a mirror mapping:
 *
 * log_type #log_params <log_params>
 * #mirrors [mirror_path offset]{2,}
 * [#features <features>]
 *
 * log_type is "core" or "disk"
 * #log_params is between 1 and 3
 *
 * If present, features must be "handle_errors".
 */
static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	unsigned int nr_mirrors, m, args_used;
	struct mirror_set *ms;
	struct dirty_log *dl;

	dl = create_dirty_log(ti, argc, argv, &args_used);
	if (!dl)
		return -EINVAL;

	argv += args_used;
	argc -= args_used;

	if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
	    nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
		ti->error = "Invalid number of mirrors";
		dm_destroy_dirty_log(dl);
		return -EINVAL;
	}

	argv++, argc--;

	if (argc < nr_mirrors * 2) {
		ti->error = "Too few mirror arguments";
		dm_destroy_dirty_log(dl);
		return -EINVAL;
	}

	ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
	if (!ms) {
		dm_destroy_dirty_log(dl);
		return -ENOMEM;
	}

	/* Get the mirror parameter sets */
	for (m = 0; m < nr_mirrors; m++) {
		r = get_mirror(ms, ti, m, argv);
		if (r) {
			free_context(ms, ti, m);
			return r;
		}
		argv += 2;
		argc -= 2;
	}

	ti->private = ms;
	ti->split_io = ms->rh.region_size;

	ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
	if (!ms->kmirrord_wq) {
		DMERR("couldn't start kmirrord");
		r = -ENOMEM;
		goto err_free_context;
	}
	INIT_WORK(&ms->kmirrord_work, do_mirror);

	r = parse_features(ms, argc, argv, &args_used);
	if (r)
		goto err_destroy_wq;

	argv += args_used;
	argc -= args_used;

	/*
	 * Any read-balancing addition depends on the
	 * DM_RAID1_HANDLE_ERRORS flag being present.
	 * This is because the decision to balance depends
	 * on the sync state of a region.  If the above
	 * flag is not present, we ignore errors; and
	 * the sync state may be inaccurate.
	 */

	if (argc) {
		ti->error = "Too many mirror arguments";
		r = -EINVAL;
		goto err_destroy_wq;
	}

	r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
	if (r)
		goto err_destroy_wq;

	wake(ms);
	return 0;

err_destroy_wq:
	destroy_workqueue(ms->kmirrord_wq);
err_free_context:
	free_context(ms, ti, ms->nr_mirrors);
	return r;
}

static void mirror_dtr(struct dm_target *ti)
{
	struct mirror_set *ms = (struct mirror_set *) ti->private;

	flush_workqueue(ms->kmirrord_wq);
	kcopyd_client_destroy(ms->kcopyd_client);
	destroy_workqueue(ms->kmirrord_wq);
	free_context(ms, ti, ms->nr_mirrors);
}

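/*
 * Hand a bio to kmirrord.  The daemon is only woken when the list
 * goes from empty to non-empty, so a burst of bios triggers a single
 * wakeup.
 */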
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
{
	int should_wake = 0;
	struct bio_list *bl;

	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
	spin_lock(&ms->lock);
	should_wake = !(bl->head);
	bio_list_add(bl, bio);
	spin_unlock(&ms->lock);

	if (should_wake)
		wake(ms);
}

/*
 * Mirror mapping function
 */
static int mirror_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	int r, rw = bio_rw(bio);
	struct mirror *m;
	struct mirror_set *ms = ti->private;

	map_context->ll = bio_to_region(&ms->rh, bio);

	if (rw == WRITE) {
		queue_bio(ms, bio, rw);
		return DM_MAPIO_SUBMITTED;
	}

	r = ms->rh.log->type->in_sync(ms->rh.log,
				      bio_to_region(&ms->rh, bio), 0);
	if (r < 0 && r != -EWOULDBLOCK)
		return r;

	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
		r = DM_MAPIO_SUBMITTED;

	/*
	 * We don't want to fast track a recovery just for a read
	 * ahead.  So we just let it silently fail.
	 * FIXME: get rid of this.
	 */
	if (!r && rw == READA)
		return -EIO;

	if (!r) {
		/* Pass this io over to the daemon */
		queue_bio(ms, bio, rw);
		return DM_MAPIO_SUBMITTED;
	}

	m = choose_mirror(ms, bio->bi_sector);
	if (!m)
		return -EIO;

	map_bio(ms, m, bio);
	return DM_MAPIO_REMAPPED;
}

static int mirror_end_io(struct dm_target *ti, struct bio *bio,
			 int error, union map_info *map_context)
{
	int rw = bio_rw(bio);
	struct mirror_set *ms = (struct mirror_set *) ti->private;
	region_t region = map_context->ll;

	/*
	 * We need to dec pending if this was a write.
	 */
	if (rw == WRITE)
		rh_dec(&ms->rh, region);

	return 0;
}

static void mirror_postsuspend(struct dm_target *ti)
{
	struct mirror_set *ms = (struct mirror_set *) ti->private;
	struct dirty_log *log = ms->rh.log;

	rh_stop_recovery(&ms->rh);

	/* Wait for all I/O we generated to complete */
	wait_event(_kmirrord_recovery_stopped,
		   !atomic_read(&ms->rh.recovery_in_flight));

	if (log->type->postsuspend && log->type->postsuspend(log))
		/* FIXME: need better error handling */
		DMWARN("log suspend failed");
}

static void mirror_resume(struct dm_target *ti)
{
	struct mirror_set *ms = (struct mirror_set *) ti->private;
	struct dirty_log *log = ms->rh.log;

	if (log->type->resume && log->type->resume(log))
		/* FIXME: need better error handling */
		DMWARN("log resume failed");
	rh_start_recovery(&ms->rh);
}

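/*
 * Status reporting.  STATUSTYPE_INFO emits the mirror count, the
 * device names, the in-sync/total region counts and the log status;
 * STATUSTYPE_TABLE re-emits the table line (log parameters, mirror
 * count, device/offset pairs and the optional handle_errors feature).
 */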
static int mirror_status(struct dm_target *ti, status_type_t type,
			 char *result, unsigned int maxlen)
{
	unsigned int m, sz = 0;
	struct mirror_set *ms = (struct mirror_set *) ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%d ", ms->nr_mirrors);
		for (m = 0; m < ms->nr_mirrors; m++)
			DMEMIT("%s ", ms->mirror[m].dev->name);

		DMEMIT("%llu/%llu 0 ",
			(unsigned long long)ms->rh.log->type->
				get_sync_count(ms->rh.log),
			(unsigned long long)ms->nr_regions);

		sz += ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz);

		break;

	case STATUSTYPE_TABLE:
		sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);

		DMEMIT("%d", ms->nr_mirrors);
		for (m = 0; m < ms->nr_mirrors; m++)
			DMEMIT(" %s %llu", ms->mirror[m].dev->name,
				(unsigned long long)ms->mirror[m].offset);

		if (ms->features & DM_RAID1_HANDLE_ERRORS)
			DMEMIT(" 1 handle_errors");
	}

	return 0;
}

static struct target_type mirror_target = {
	.name	 = "mirror",
	.version = {1, 0, 3},
	.module	 = THIS_MODULE,
	.ctr	 = mirror_ctr,
	.dtr	 = mirror_dtr,
	.map	 = mirror_map,
	.end_io	 = mirror_end_io,
	.postsuspend = mirror_postsuspend,
	.resume	 = mirror_resume,
	.status	 = mirror_status,
};

static int __init dm_mirror_init(void)
{
	int r;

	r = dm_dirty_log_init();
	if (r)
		return r;

	r = dm_register_target(&mirror_target);
	if (r < 0) {
		DMERR("Failed to register mirror target");
		dm_dirty_log_exit();
	}

	return r;
}

static void __exit dm_mirror_exit(void)
{
	int r;

	r = dm_unregister_target(&mirror_target);
	if (r < 0)
		DMERR("unregister failed %d", r);

	dm_dirty_log_exit();
}

/* Module hooks */
module_init(dm_mirror_init);
module_exit(dm_mirror_exit);

MODULE_DESCRIPTION(DM_NAME " mirror target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");