1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-bio-list.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/buffer_head.h> 18 #include <linux/mempool.h> 19 #include <linux/slab.h> 20 #include <linux/idr.h> 21 #include <linux/hdreg.h> 22 #include <linux/blktrace_api.h> 23 24 static const char *_name = DM_NAME; 25 26 static unsigned int major = 0; 27 static unsigned int _major = 0; 28 29 /* 30 * One of these is allocated per bio. 31 */ 32 struct dm_io { 33 struct mapped_device *md; 34 int error; 35 struct bio *bio; 36 atomic_t io_count; 37 unsigned long start_time; 38 }; 39 40 /* 41 * One of these is allocated per target within a bio. Hopefully 42 * this will be simplified out one day. 43 */ 44 struct target_io { 45 struct dm_io *io; 46 struct dm_target *ti; 47 union map_info info; 48 }; 49 50 union map_info *dm_get_mapinfo(struct bio *bio) 51 { 52 if (bio && bio->bi_private) 53 return &((struct target_io *)bio->bi_private)->info; 54 return NULL; 55 } 56 57 /* 58 * Bits for the md->flags field. 59 */ 60 #define DMF_BLOCK_IO 0 61 #define DMF_SUSPENDED 1 62 #define DMF_FROZEN 2 63 64 struct mapped_device { 65 struct rw_semaphore io_lock; 66 struct semaphore suspend_lock; 67 rwlock_t map_lock; 68 atomic_t holders; 69 70 unsigned long flags; 71 72 request_queue_t *queue; 73 struct gendisk *disk; 74 char name[16]; 75 76 void *interface_ptr; 77 78 /* 79 * A list of ios that arrived while we were suspended. 80 */ 81 atomic_t pending; 82 wait_queue_head_t wait; 83 struct bio_list deferred; 84 85 /* 86 * The current mapping. 87 */ 88 struct dm_table *map; 89 90 /* 91 * io objects are allocated from here. 92 */ 93 mempool_t *io_pool; 94 mempool_t *tio_pool; 95 96 /* 97 * Event handling. 98 */ 99 atomic_t event_nr; 100 wait_queue_head_t eventq; 101 102 /* 103 * freeze/thaw support require holding onto a super block 104 */ 105 struct super_block *frozen_sb; 106 struct block_device *suspended_bdev; 107 108 /* forced geometry settings */ 109 struct hd_geometry geometry; 110 }; 111 112 #define MIN_IOS 256 113 static kmem_cache_t *_io_cache; 114 static kmem_cache_t *_tio_cache; 115 116 static struct bio_set *dm_set; 117 118 static int __init local_init(void) 119 { 120 int r; 121 122 dm_set = bioset_create(16, 16, 4); 123 if (!dm_set) 124 return -ENOMEM; 125 126 /* allocate a slab for the dm_ios */ 127 _io_cache = kmem_cache_create("dm_io", 128 sizeof(struct dm_io), 0, 0, NULL, NULL); 129 if (!_io_cache) 130 return -ENOMEM; 131 132 /* allocate a slab for the target ios */ 133 _tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io), 134 0, 0, NULL, NULL); 135 if (!_tio_cache) { 136 kmem_cache_destroy(_io_cache); 137 return -ENOMEM; 138 } 139 140 _major = major; 141 r = register_blkdev(_major, _name); 142 if (r < 0) { 143 kmem_cache_destroy(_tio_cache); 144 kmem_cache_destroy(_io_cache); 145 return r; 146 } 147 148 if (!_major) 149 _major = r; 150 151 return 0; 152 } 153 154 static void local_exit(void) 155 { 156 kmem_cache_destroy(_tio_cache); 157 kmem_cache_destroy(_io_cache); 158 159 bioset_free(dm_set); 160 161 if (unregister_blkdev(_major, _name) < 0) 162 DMERR("devfs_unregister_blkdev failed"); 163 164 _major = 0; 165 166 DMINFO("cleaned up"); 167 } 168 169 int (*_inits[])(void) __initdata = { 170 local_init, 171 dm_target_init, 172 dm_linear_init, 173 dm_stripe_init, 174 dm_interface_init, 175 }; 176 177 void (*_exits[])(void) = { 178 local_exit, 179 dm_target_exit, 180 dm_linear_exit, 181 dm_stripe_exit, 182 dm_interface_exit, 183 }; 184 185 static int __init dm_init(void) 186 { 187 const int count = ARRAY_SIZE(_inits); 188 189 int r, i; 190 191 for (i = 0; i < count; i++) { 192 r = _inits[i](); 193 if (r) 194 goto bad; 195 } 196 197 return 0; 198 199 bad: 200 while (i--) 201 _exits[i](); 202 203 return r; 204 } 205 206 static void __exit dm_exit(void) 207 { 208 int i = ARRAY_SIZE(_exits); 209 210 while (i--) 211 _exits[i](); 212 } 213 214 /* 215 * Block device functions 216 */ 217 static int dm_blk_open(struct inode *inode, struct file *file) 218 { 219 struct mapped_device *md; 220 221 md = inode->i_bdev->bd_disk->private_data; 222 dm_get(md); 223 return 0; 224 } 225 226 static int dm_blk_close(struct inode *inode, struct file *file) 227 { 228 struct mapped_device *md; 229 230 md = inode->i_bdev->bd_disk->private_data; 231 dm_put(md); 232 return 0; 233 } 234 235 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 236 { 237 struct mapped_device *md = bdev->bd_disk->private_data; 238 239 return dm_get_geometry(md, geo); 240 } 241 242 static inline struct dm_io *alloc_io(struct mapped_device *md) 243 { 244 return mempool_alloc(md->io_pool, GFP_NOIO); 245 } 246 247 static inline void free_io(struct mapped_device *md, struct dm_io *io) 248 { 249 mempool_free(io, md->io_pool); 250 } 251 252 static inline struct target_io *alloc_tio(struct mapped_device *md) 253 { 254 return mempool_alloc(md->tio_pool, GFP_NOIO); 255 } 256 257 static inline void free_tio(struct mapped_device *md, struct target_io *tio) 258 { 259 mempool_free(tio, md->tio_pool); 260 } 261 262 static void start_io_acct(struct dm_io *io) 263 { 264 struct mapped_device *md = io->md; 265 266 io->start_time = jiffies; 267 268 preempt_disable(); 269 disk_round_stats(dm_disk(md)); 270 preempt_enable(); 271 dm_disk(md)->in_flight = atomic_inc_return(&md->pending); 272 } 273 274 static int end_io_acct(struct dm_io *io) 275 { 276 struct mapped_device *md = io->md; 277 struct bio *bio = io->bio; 278 unsigned long duration = jiffies - io->start_time; 279 int pending; 280 int rw = bio_data_dir(bio); 281 282 preempt_disable(); 283 disk_round_stats(dm_disk(md)); 284 preempt_enable(); 285 dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); 286 287 disk_stat_add(dm_disk(md), ticks[rw], duration); 288 289 return !pending; 290 } 291 292 /* 293 * Add the bio to the list of deferred io. 294 */ 295 static int queue_io(struct mapped_device *md, struct bio *bio) 296 { 297 down_write(&md->io_lock); 298 299 if (!test_bit(DMF_BLOCK_IO, &md->flags)) { 300 up_write(&md->io_lock); 301 return 1; 302 } 303 304 bio_list_add(&md->deferred, bio); 305 306 up_write(&md->io_lock); 307 return 0; /* deferred successfully */ 308 } 309 310 /* 311 * Everyone (including functions in this file), should use this 312 * function to access the md->map field, and make sure they call 313 * dm_table_put() when finished. 314 */ 315 struct dm_table *dm_get_table(struct mapped_device *md) 316 { 317 struct dm_table *t; 318 319 read_lock(&md->map_lock); 320 t = md->map; 321 if (t) 322 dm_table_get(t); 323 read_unlock(&md->map_lock); 324 325 return t; 326 } 327 328 /* 329 * Get the geometry associated with a dm device 330 */ 331 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 332 { 333 *geo = md->geometry; 334 335 return 0; 336 } 337 338 /* 339 * Set the geometry of a device. 340 */ 341 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 342 { 343 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 344 345 if (geo->start > sz) { 346 DMWARN("Start sector is beyond the geometry limits."); 347 return -EINVAL; 348 } 349 350 md->geometry = *geo; 351 352 return 0; 353 } 354 355 /*----------------------------------------------------------------- 356 * CRUD START: 357 * A more elegant soln is in the works that uses the queue 358 * merge fn, unfortunately there are a couple of changes to 359 * the block layer that I want to make for this. So in the 360 * interests of getting something for people to use I give 361 * you this clearly demarcated crap. 362 *---------------------------------------------------------------*/ 363 364 /* 365 * Decrements the number of outstanding ios that a bio has been 366 * cloned into, completing the original io if necc. 367 */ 368 static void dec_pending(struct dm_io *io, int error) 369 { 370 if (error) 371 io->error = error; 372 373 if (atomic_dec_and_test(&io->io_count)) { 374 if (end_io_acct(io)) 375 /* nudge anyone waiting on suspend queue */ 376 wake_up(&io->md->wait); 377 378 blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE); 379 380 bio_endio(io->bio, io->bio->bi_size, io->error); 381 free_io(io->md, io); 382 } 383 } 384 385 static int clone_endio(struct bio *bio, unsigned int done, int error) 386 { 387 int r = 0; 388 struct target_io *tio = bio->bi_private; 389 struct dm_io *io = tio->io; 390 dm_endio_fn endio = tio->ti->type->end_io; 391 392 if (bio->bi_size) 393 return 1; 394 395 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 396 error = -EIO; 397 398 if (endio) { 399 r = endio(tio->ti, bio, error, &tio->info); 400 if (r < 0) 401 error = r; 402 403 else if (r > 0) 404 /* the target wants another shot at the io */ 405 return 1; 406 } 407 408 free_tio(io->md, tio); 409 dec_pending(io, error); 410 bio_put(bio); 411 return r; 412 } 413 414 static sector_t max_io_len(struct mapped_device *md, 415 sector_t sector, struct dm_target *ti) 416 { 417 sector_t offset = sector - ti->begin; 418 sector_t len = ti->len - offset; 419 420 /* 421 * Does the target need to split even further ? 422 */ 423 if (ti->split_io) { 424 sector_t boundary; 425 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 426 - offset; 427 if (len > boundary) 428 len = boundary; 429 } 430 431 return len; 432 } 433 434 static void __map_bio(struct dm_target *ti, struct bio *clone, 435 struct target_io *tio) 436 { 437 int r; 438 sector_t sector; 439 440 /* 441 * Sanity checks. 442 */ 443 BUG_ON(!clone->bi_size); 444 445 clone->bi_end_io = clone_endio; 446 clone->bi_private = tio; 447 448 /* 449 * Map the clone. If r == 0 we don't need to do 450 * anything, the target has assumed ownership of 451 * this io. 452 */ 453 atomic_inc(&tio->io->io_count); 454 sector = clone->bi_sector; 455 r = ti->type->map(ti, clone, &tio->info); 456 if (r > 0) { 457 /* the bio has been remapped so dispatch it */ 458 459 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 460 tio->io->bio->bi_bdev->bd_dev, sector, 461 clone->bi_sector); 462 463 generic_make_request(clone); 464 } 465 466 else if (r < 0) { 467 /* error the io and bail out */ 468 struct dm_io *io = tio->io; 469 free_tio(tio->io->md, tio); 470 dec_pending(io, r); 471 bio_put(clone); 472 } 473 } 474 475 struct clone_info { 476 struct mapped_device *md; 477 struct dm_table *map; 478 struct bio *bio; 479 struct dm_io *io; 480 sector_t sector; 481 sector_t sector_count; 482 unsigned short idx; 483 }; 484 485 static void dm_bio_destructor(struct bio *bio) 486 { 487 bio_free(bio, dm_set); 488 } 489 490 /* 491 * Creates a little bio that is just does part of a bvec. 492 */ 493 static struct bio *split_bvec(struct bio *bio, sector_t sector, 494 unsigned short idx, unsigned int offset, 495 unsigned int len) 496 { 497 struct bio *clone; 498 struct bio_vec *bv = bio->bi_io_vec + idx; 499 500 clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set); 501 clone->bi_destructor = dm_bio_destructor; 502 *clone->bi_io_vec = *bv; 503 504 clone->bi_sector = sector; 505 clone->bi_bdev = bio->bi_bdev; 506 clone->bi_rw = bio->bi_rw; 507 clone->bi_vcnt = 1; 508 clone->bi_size = to_bytes(len); 509 clone->bi_io_vec->bv_offset = offset; 510 clone->bi_io_vec->bv_len = clone->bi_size; 511 512 return clone; 513 } 514 515 /* 516 * Creates a bio that consists of range of complete bvecs. 517 */ 518 static struct bio *clone_bio(struct bio *bio, sector_t sector, 519 unsigned short idx, unsigned short bv_count, 520 unsigned int len) 521 { 522 struct bio *clone; 523 524 clone = bio_clone(bio, GFP_NOIO); 525 clone->bi_sector = sector; 526 clone->bi_idx = idx; 527 clone->bi_vcnt = idx + bv_count; 528 clone->bi_size = to_bytes(len); 529 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 530 531 return clone; 532 } 533 534 static void __clone_and_map(struct clone_info *ci) 535 { 536 struct bio *clone, *bio = ci->bio; 537 struct dm_target *ti = dm_table_find_target(ci->map, ci->sector); 538 sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); 539 struct target_io *tio; 540 541 /* 542 * Allocate a target io object. 543 */ 544 tio = alloc_tio(ci->md); 545 tio->io = ci->io; 546 tio->ti = ti; 547 memset(&tio->info, 0, sizeof(tio->info)); 548 549 if (ci->sector_count <= max) { 550 /* 551 * Optimise for the simple case where we can do all of 552 * the remaining io with a single clone. 553 */ 554 clone = clone_bio(bio, ci->sector, ci->idx, 555 bio->bi_vcnt - ci->idx, ci->sector_count); 556 __map_bio(ti, clone, tio); 557 ci->sector_count = 0; 558 559 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 560 /* 561 * There are some bvecs that don't span targets. 562 * Do as many of these as possible. 563 */ 564 int i; 565 sector_t remaining = max; 566 sector_t bv_len; 567 568 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 569 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 570 571 if (bv_len > remaining) 572 break; 573 574 remaining -= bv_len; 575 len += bv_len; 576 } 577 578 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len); 579 __map_bio(ti, clone, tio); 580 581 ci->sector += len; 582 ci->sector_count -= len; 583 ci->idx = i; 584 585 } else { 586 /* 587 * Handle a bvec that must be split between two or more targets. 588 */ 589 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 590 sector_t remaining = to_sector(bv->bv_len); 591 unsigned int offset = 0; 592 593 do { 594 if (offset) { 595 ti = dm_table_find_target(ci->map, ci->sector); 596 max = max_io_len(ci->md, ci->sector, ti); 597 598 tio = alloc_tio(ci->md); 599 tio->io = ci->io; 600 tio->ti = ti; 601 memset(&tio->info, 0, sizeof(tio->info)); 602 } 603 604 len = min(remaining, max); 605 606 clone = split_bvec(bio, ci->sector, ci->idx, 607 bv->bv_offset + offset, len); 608 609 __map_bio(ti, clone, tio); 610 611 ci->sector += len; 612 ci->sector_count -= len; 613 offset += to_bytes(len); 614 } while (remaining -= len); 615 616 ci->idx++; 617 } 618 } 619 620 /* 621 * Split the bio into several clones. 622 */ 623 static void __split_bio(struct mapped_device *md, struct bio *bio) 624 { 625 struct clone_info ci; 626 627 ci.map = dm_get_table(md); 628 if (!ci.map) { 629 bio_io_error(bio, bio->bi_size); 630 return; 631 } 632 633 ci.md = md; 634 ci.bio = bio; 635 ci.io = alloc_io(md); 636 ci.io->error = 0; 637 atomic_set(&ci.io->io_count, 1); 638 ci.io->bio = bio; 639 ci.io->md = md; 640 ci.sector = bio->bi_sector; 641 ci.sector_count = bio_sectors(bio); 642 ci.idx = bio->bi_idx; 643 644 start_io_acct(ci.io); 645 while (ci.sector_count) 646 __clone_and_map(&ci); 647 648 /* drop the extra reference count */ 649 dec_pending(ci.io, 0); 650 dm_table_put(ci.map); 651 } 652 /*----------------------------------------------------------------- 653 * CRUD END 654 *---------------------------------------------------------------*/ 655 656 /* 657 * The request function that just remaps the bio built up by 658 * dm_merge_bvec. 659 */ 660 static int dm_request(request_queue_t *q, struct bio *bio) 661 { 662 int r; 663 int rw = bio_data_dir(bio); 664 struct mapped_device *md = q->queuedata; 665 666 down_read(&md->io_lock); 667 668 disk_stat_inc(dm_disk(md), ios[rw]); 669 disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio)); 670 671 /* 672 * If we're suspended we have to queue 673 * this io for later. 674 */ 675 while (test_bit(DMF_BLOCK_IO, &md->flags)) { 676 up_read(&md->io_lock); 677 678 if (bio_rw(bio) == READA) { 679 bio_io_error(bio, bio->bi_size); 680 return 0; 681 } 682 683 r = queue_io(md, bio); 684 if (r < 0) { 685 bio_io_error(bio, bio->bi_size); 686 return 0; 687 688 } else if (r == 0) 689 return 0; /* deferred successfully */ 690 691 /* 692 * We're in a while loop, because someone could suspend 693 * before we get to the following read lock. 694 */ 695 down_read(&md->io_lock); 696 } 697 698 __split_bio(md, bio); 699 up_read(&md->io_lock); 700 return 0; 701 } 702 703 static int dm_flush_all(request_queue_t *q, struct gendisk *disk, 704 sector_t *error_sector) 705 { 706 struct mapped_device *md = q->queuedata; 707 struct dm_table *map = dm_get_table(md); 708 int ret = -ENXIO; 709 710 if (map) { 711 ret = dm_table_flush_all(map); 712 dm_table_put(map); 713 } 714 715 return ret; 716 } 717 718 static void dm_unplug_all(request_queue_t *q) 719 { 720 struct mapped_device *md = q->queuedata; 721 struct dm_table *map = dm_get_table(md); 722 723 if (map) { 724 dm_table_unplug_all(map); 725 dm_table_put(map); 726 } 727 } 728 729 static int dm_any_congested(void *congested_data, int bdi_bits) 730 { 731 int r; 732 struct mapped_device *md = (struct mapped_device *) congested_data; 733 struct dm_table *map = dm_get_table(md); 734 735 if (!map || test_bit(DMF_BLOCK_IO, &md->flags)) 736 r = bdi_bits; 737 else 738 r = dm_table_any_congested(map, bdi_bits); 739 740 dm_table_put(map); 741 return r; 742 } 743 744 /*----------------------------------------------------------------- 745 * An IDR is used to keep track of allocated minor numbers. 746 *---------------------------------------------------------------*/ 747 static DEFINE_MUTEX(_minor_lock); 748 static DEFINE_IDR(_minor_idr); 749 750 static void free_minor(unsigned int minor) 751 { 752 mutex_lock(&_minor_lock); 753 idr_remove(&_minor_idr, minor); 754 mutex_unlock(&_minor_lock); 755 } 756 757 /* 758 * See if the device with a specific minor # is free. 759 */ 760 static int specific_minor(struct mapped_device *md, unsigned int minor) 761 { 762 int r, m; 763 764 if (minor >= (1 << MINORBITS)) 765 return -EINVAL; 766 767 mutex_lock(&_minor_lock); 768 769 if (idr_find(&_minor_idr, minor)) { 770 r = -EBUSY; 771 goto out; 772 } 773 774 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 775 if (!r) { 776 r = -ENOMEM; 777 goto out; 778 } 779 780 r = idr_get_new_above(&_minor_idr, md, minor, &m); 781 if (r) { 782 goto out; 783 } 784 785 if (m != minor) { 786 idr_remove(&_minor_idr, m); 787 r = -EBUSY; 788 goto out; 789 } 790 791 out: 792 mutex_unlock(&_minor_lock); 793 return r; 794 } 795 796 static int next_free_minor(struct mapped_device *md, unsigned int *minor) 797 { 798 int r; 799 unsigned int m; 800 801 mutex_lock(&_minor_lock); 802 803 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 804 if (!r) { 805 r = -ENOMEM; 806 goto out; 807 } 808 809 r = idr_get_new(&_minor_idr, md, &m); 810 if (r) { 811 goto out; 812 } 813 814 if (m >= (1 << MINORBITS)) { 815 idr_remove(&_minor_idr, m); 816 r = -ENOSPC; 817 goto out; 818 } 819 820 *minor = m; 821 822 out: 823 mutex_unlock(&_minor_lock); 824 return r; 825 } 826 827 static struct block_device_operations dm_blk_dops; 828 829 /* 830 * Allocate and initialise a blank device with a given minor. 831 */ 832 static struct mapped_device *alloc_dev(unsigned int minor, int persistent) 833 { 834 int r; 835 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); 836 837 if (!md) { 838 DMWARN("unable to allocate device, out of memory."); 839 return NULL; 840 } 841 842 /* get a minor number for the dev */ 843 r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor); 844 if (r < 0) 845 goto bad1; 846 847 memset(md, 0, sizeof(*md)); 848 init_rwsem(&md->io_lock); 849 init_MUTEX(&md->suspend_lock); 850 rwlock_init(&md->map_lock); 851 atomic_set(&md->holders, 1); 852 atomic_set(&md->event_nr, 0); 853 854 md->queue = blk_alloc_queue(GFP_KERNEL); 855 if (!md->queue) 856 goto bad1; 857 858 md->queue->queuedata = md; 859 md->queue->backing_dev_info.congested_fn = dm_any_congested; 860 md->queue->backing_dev_info.congested_data = md; 861 blk_queue_make_request(md->queue, dm_request); 862 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 863 md->queue->unplug_fn = dm_unplug_all; 864 md->queue->issue_flush_fn = dm_flush_all; 865 866 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); 867 if (!md->io_pool) 868 goto bad2; 869 870 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); 871 if (!md->tio_pool) 872 goto bad3; 873 874 md->disk = alloc_disk(1); 875 if (!md->disk) 876 goto bad4; 877 878 md->disk->major = _major; 879 md->disk->first_minor = minor; 880 md->disk->fops = &dm_blk_dops; 881 md->disk->queue = md->queue; 882 md->disk->private_data = md; 883 sprintf(md->disk->disk_name, "dm-%d", minor); 884 add_disk(md->disk); 885 format_dev_t(md->name, MKDEV(_major, minor)); 886 887 atomic_set(&md->pending, 0); 888 init_waitqueue_head(&md->wait); 889 init_waitqueue_head(&md->eventq); 890 891 return md; 892 893 bad4: 894 mempool_destroy(md->tio_pool); 895 bad3: 896 mempool_destroy(md->io_pool); 897 bad2: 898 blk_cleanup_queue(md->queue); 899 free_minor(minor); 900 bad1: 901 kfree(md); 902 return NULL; 903 } 904 905 static void free_dev(struct mapped_device *md) 906 { 907 unsigned int minor = md->disk->first_minor; 908 909 if (md->suspended_bdev) { 910 thaw_bdev(md->suspended_bdev, NULL); 911 bdput(md->suspended_bdev); 912 } 913 mempool_destroy(md->tio_pool); 914 mempool_destroy(md->io_pool); 915 del_gendisk(md->disk); 916 free_minor(minor); 917 put_disk(md->disk); 918 blk_cleanup_queue(md->queue); 919 kfree(md); 920 } 921 922 /* 923 * Bind a table to the device. 924 */ 925 static void event_callback(void *context) 926 { 927 struct mapped_device *md = (struct mapped_device *) context; 928 929 atomic_inc(&md->event_nr); 930 wake_up(&md->eventq); 931 } 932 933 static void __set_size(struct mapped_device *md, sector_t size) 934 { 935 set_capacity(md->disk, size); 936 937 mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); 938 i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 939 mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); 940 } 941 942 static int __bind(struct mapped_device *md, struct dm_table *t) 943 { 944 request_queue_t *q = md->queue; 945 sector_t size; 946 947 size = dm_table_get_size(t); 948 949 /* 950 * Wipe any geometry if the size of the table changed. 951 */ 952 if (size != get_capacity(md->disk)) 953 memset(&md->geometry, 0, sizeof(md->geometry)); 954 955 __set_size(md, size); 956 if (size == 0) 957 return 0; 958 959 dm_table_get(t); 960 dm_table_event_callback(t, event_callback, md); 961 962 write_lock(&md->map_lock); 963 md->map = t; 964 dm_table_set_restrictions(t, q); 965 write_unlock(&md->map_lock); 966 967 return 0; 968 } 969 970 static void __unbind(struct mapped_device *md) 971 { 972 struct dm_table *map = md->map; 973 974 if (!map) 975 return; 976 977 dm_table_event_callback(map, NULL, NULL); 978 write_lock(&md->map_lock); 979 md->map = NULL; 980 write_unlock(&md->map_lock); 981 dm_table_put(map); 982 } 983 984 /* 985 * Constructor for a new device. 986 */ 987 static int create_aux(unsigned int minor, int persistent, 988 struct mapped_device **result) 989 { 990 struct mapped_device *md; 991 992 md = alloc_dev(minor, persistent); 993 if (!md) 994 return -ENXIO; 995 996 *result = md; 997 return 0; 998 } 999 1000 int dm_create(struct mapped_device **result) 1001 { 1002 return create_aux(0, 0, result); 1003 } 1004 1005 int dm_create_with_minor(unsigned int minor, struct mapped_device **result) 1006 { 1007 return create_aux(minor, 1, result); 1008 } 1009 1010 static struct mapped_device *dm_find_md(dev_t dev) 1011 { 1012 struct mapped_device *md; 1013 unsigned minor = MINOR(dev); 1014 1015 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 1016 return NULL; 1017 1018 mutex_lock(&_minor_lock); 1019 1020 md = idr_find(&_minor_idr, minor); 1021 if (!md || (dm_disk(md)->first_minor != minor)) 1022 md = NULL; 1023 1024 mutex_unlock(&_minor_lock); 1025 1026 return md; 1027 } 1028 1029 struct mapped_device *dm_get_md(dev_t dev) 1030 { 1031 struct mapped_device *md = dm_find_md(dev); 1032 1033 if (md) 1034 dm_get(md); 1035 1036 return md; 1037 } 1038 1039 void *dm_get_mdptr(struct mapped_device *md) 1040 { 1041 return md->interface_ptr; 1042 } 1043 1044 void dm_set_mdptr(struct mapped_device *md, void *ptr) 1045 { 1046 md->interface_ptr = ptr; 1047 } 1048 1049 void dm_get(struct mapped_device *md) 1050 { 1051 atomic_inc(&md->holders); 1052 } 1053 1054 void dm_put(struct mapped_device *md) 1055 { 1056 struct dm_table *map; 1057 1058 if (atomic_dec_and_test(&md->holders)) { 1059 map = dm_get_table(md); 1060 if (!dm_suspended(md)) { 1061 dm_table_presuspend_targets(map); 1062 dm_table_postsuspend_targets(map); 1063 } 1064 __unbind(md); 1065 dm_table_put(map); 1066 free_dev(md); 1067 } 1068 } 1069 1070 /* 1071 * Process the deferred bios 1072 */ 1073 static void __flush_deferred_io(struct mapped_device *md, struct bio *c) 1074 { 1075 struct bio *n; 1076 1077 while (c) { 1078 n = c->bi_next; 1079 c->bi_next = NULL; 1080 __split_bio(md, c); 1081 c = n; 1082 } 1083 } 1084 1085 /* 1086 * Swap in a new table (destroying old one). 1087 */ 1088 int dm_swap_table(struct mapped_device *md, struct dm_table *table) 1089 { 1090 int r = -EINVAL; 1091 1092 down(&md->suspend_lock); 1093 1094 /* device must be suspended */ 1095 if (!dm_suspended(md)) 1096 goto out; 1097 1098 __unbind(md); 1099 r = __bind(md, table); 1100 1101 out: 1102 up(&md->suspend_lock); 1103 return r; 1104 } 1105 1106 /* 1107 * Functions to lock and unlock any filesystem running on the 1108 * device. 1109 */ 1110 static int lock_fs(struct mapped_device *md) 1111 { 1112 int r; 1113 1114 WARN_ON(md->frozen_sb); 1115 1116 md->frozen_sb = freeze_bdev(md->suspended_bdev); 1117 if (IS_ERR(md->frozen_sb)) { 1118 r = PTR_ERR(md->frozen_sb); 1119 md->frozen_sb = NULL; 1120 return r; 1121 } 1122 1123 set_bit(DMF_FROZEN, &md->flags); 1124 1125 /* don't bdput right now, we don't want the bdev 1126 * to go away while it is locked. 1127 */ 1128 return 0; 1129 } 1130 1131 static void unlock_fs(struct mapped_device *md) 1132 { 1133 if (!test_bit(DMF_FROZEN, &md->flags)) 1134 return; 1135 1136 thaw_bdev(md->suspended_bdev, md->frozen_sb); 1137 md->frozen_sb = NULL; 1138 clear_bit(DMF_FROZEN, &md->flags); 1139 } 1140 1141 /* 1142 * We need to be able to change a mapping table under a mounted 1143 * filesystem. For example we might want to move some data in 1144 * the background. Before the table can be swapped with 1145 * dm_bind_table, dm_suspend must be called to flush any in 1146 * flight bios and ensure that any further io gets deferred. 1147 */ 1148 int dm_suspend(struct mapped_device *md, int do_lockfs) 1149 { 1150 struct dm_table *map = NULL; 1151 DECLARE_WAITQUEUE(wait, current); 1152 struct bio *def; 1153 int r = -EINVAL; 1154 1155 down(&md->suspend_lock); 1156 1157 if (dm_suspended(md)) 1158 goto out; 1159 1160 map = dm_get_table(md); 1161 1162 /* This does not get reverted if there's an error later. */ 1163 dm_table_presuspend_targets(map); 1164 1165 md->suspended_bdev = bdget_disk(md->disk, 0); 1166 if (!md->suspended_bdev) { 1167 DMWARN("bdget failed in dm_suspend"); 1168 r = -ENOMEM; 1169 goto out; 1170 } 1171 1172 /* Flush I/O to the device. */ 1173 if (do_lockfs) { 1174 r = lock_fs(md); 1175 if (r) 1176 goto out; 1177 } 1178 1179 /* 1180 * First we set the BLOCK_IO flag so no more ios will be mapped. 1181 */ 1182 down_write(&md->io_lock); 1183 set_bit(DMF_BLOCK_IO, &md->flags); 1184 1185 add_wait_queue(&md->wait, &wait); 1186 up_write(&md->io_lock); 1187 1188 /* unplug */ 1189 if (map) 1190 dm_table_unplug_all(map); 1191 1192 /* 1193 * Then we wait for the already mapped ios to 1194 * complete. 1195 */ 1196 while (1) { 1197 set_current_state(TASK_INTERRUPTIBLE); 1198 1199 if (!atomic_read(&md->pending) || signal_pending(current)) 1200 break; 1201 1202 io_schedule(); 1203 } 1204 set_current_state(TASK_RUNNING); 1205 1206 down_write(&md->io_lock); 1207 remove_wait_queue(&md->wait, &wait); 1208 1209 /* were we interrupted ? */ 1210 r = -EINTR; 1211 if (atomic_read(&md->pending)) { 1212 clear_bit(DMF_BLOCK_IO, &md->flags); 1213 def = bio_list_get(&md->deferred); 1214 __flush_deferred_io(md, def); 1215 up_write(&md->io_lock); 1216 unlock_fs(md); 1217 goto out; 1218 } 1219 up_write(&md->io_lock); 1220 1221 dm_table_postsuspend_targets(map); 1222 1223 set_bit(DMF_SUSPENDED, &md->flags); 1224 1225 r = 0; 1226 1227 out: 1228 if (r && md->suspended_bdev) { 1229 bdput(md->suspended_bdev); 1230 md->suspended_bdev = NULL; 1231 } 1232 1233 dm_table_put(map); 1234 up(&md->suspend_lock); 1235 return r; 1236 } 1237 1238 int dm_resume(struct mapped_device *md) 1239 { 1240 int r = -EINVAL; 1241 struct bio *def; 1242 struct dm_table *map = NULL; 1243 1244 down(&md->suspend_lock); 1245 if (!dm_suspended(md)) 1246 goto out; 1247 1248 map = dm_get_table(md); 1249 if (!map || !dm_table_get_size(map)) 1250 goto out; 1251 1252 dm_table_resume_targets(map); 1253 1254 down_write(&md->io_lock); 1255 clear_bit(DMF_BLOCK_IO, &md->flags); 1256 1257 def = bio_list_get(&md->deferred); 1258 __flush_deferred_io(md, def); 1259 up_write(&md->io_lock); 1260 1261 unlock_fs(md); 1262 1263 bdput(md->suspended_bdev); 1264 md->suspended_bdev = NULL; 1265 1266 clear_bit(DMF_SUSPENDED, &md->flags); 1267 1268 dm_table_unplug_all(map); 1269 1270 r = 0; 1271 1272 out: 1273 dm_table_put(map); 1274 up(&md->suspend_lock); 1275 1276 return r; 1277 } 1278 1279 /*----------------------------------------------------------------- 1280 * Event notification. 1281 *---------------------------------------------------------------*/ 1282 uint32_t dm_get_event_nr(struct mapped_device *md) 1283 { 1284 return atomic_read(&md->event_nr); 1285 } 1286 1287 int dm_wait_event(struct mapped_device *md, int event_nr) 1288 { 1289 return wait_event_interruptible(md->eventq, 1290 (event_nr != atomic_read(&md->event_nr))); 1291 } 1292 1293 /* 1294 * The gendisk is only valid as long as you have a reference 1295 * count on 'md'. 1296 */ 1297 struct gendisk *dm_disk(struct mapped_device *md) 1298 { 1299 return md->disk; 1300 } 1301 1302 int dm_suspended(struct mapped_device *md) 1303 { 1304 return test_bit(DMF_SUSPENDED, &md->flags); 1305 } 1306 1307 static struct block_device_operations dm_blk_dops = { 1308 .open = dm_blk_open, 1309 .release = dm_blk_close, 1310 .getgeo = dm_blk_getgeo, 1311 .owner = THIS_MODULE 1312 }; 1313 1314 EXPORT_SYMBOL(dm_get_mapinfo); 1315 1316 /* 1317 * module hooks 1318 */ 1319 module_init(dm_init); 1320 module_exit(dm_exit); 1321 1322 module_param(major, uint, 0); 1323 MODULE_PARM_DESC(major, "The major number of the device mapper"); 1324 MODULE_DESCRIPTION(DM_NAME " driver"); 1325 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 1326 MODULE_LICENSE("GPL"); 1327