/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/blktrace_api.h>

#define DM_MSG_PREFIX "core"

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);

/*
 * One of these is allocated per bio.
 */
struct dm_io {
        struct mapped_device *md;
        int error;
        struct bio *bio;
        atomic_t io_count;
        unsigned long start_time;
};

/*
 * One of these is allocated per target within a bio.  Hopefully
 * this will be simplified out one day.
 */
struct target_io {
        struct dm_io *io;
        struct dm_target *ti;
        union map_info info;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
        if (bio && bio->bi_private)
                return &((struct target_io *)bio->bi_private)->info;
        return NULL;
}

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4

struct mapped_device {
        struct rw_semaphore io_lock;
        struct semaphore suspend_lock;
        rwlock_t map_lock;
        atomic_t holders;
        atomic_t open_count;

        unsigned long flags;

        request_queue_t *queue;
        struct gendisk *disk;
        char name[16];

        void *interface_ptr;

        /*
         * A list of ios that arrived while we were suspended.
         */
        atomic_t pending;
        wait_queue_head_t wait;
        struct bio_list deferred;

        /*
         * The current mapping.
         */
        struct dm_table *map;

        /*
         * io objects are allocated from here.
         */
        mempool_t *io_pool;
        mempool_t *tio_pool;

        /*
         * Event handling.
         */
        atomic_t event_nr;
        wait_queue_head_t eventq;

        /*
         * freeze/thaw support requires holding onto a super block
         */
        struct super_block *frozen_sb;
        struct block_device *suspended_bdev;

        /* forced geometry settings */
        struct hd_geometry geometry;
};

#define MIN_IOS 256
static kmem_cache_t *_io_cache;
static kmem_cache_t *_tio_cache;

static struct bio_set *dm_set;

static int __init local_init(void)
{
        int r;

        dm_set = bioset_create(16, 16, 4);
        if (!dm_set)
                return -ENOMEM;

        /* allocate a slab for the dm_ios */
        _io_cache = kmem_cache_create("dm_io",
                                      sizeof(struct dm_io), 0, 0, NULL, NULL);
        if (!_io_cache)
                return -ENOMEM;

        /* allocate a slab for the target ios */
        _tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io),
                                       0, 0, NULL, NULL);
        if (!_tio_cache) {
                kmem_cache_destroy(_io_cache);
                return -ENOMEM;
        }

        _major = major;
        r = register_blkdev(_major, _name);
        if (r < 0) {
                kmem_cache_destroy(_tio_cache);
                kmem_cache_destroy(_io_cache);
                return r;
        }

        if (!_major)
                _major = r;

        return 0;
}

static void local_exit(void)
{
        kmem_cache_destroy(_tio_cache);
        kmem_cache_destroy(_io_cache);

        bioset_free(dm_set);

        if (unregister_blkdev(_major, _name) < 0)
                DMERR("unregister_blkdev failed");

        _major = 0;

        DMINFO("cleaned up");
}

int (*_inits[])(void) __initdata = {
        local_init,
        dm_target_init,
        dm_linear_init,
        dm_stripe_init,
        dm_interface_init,
};

void (*_exits[])(void) = {
        local_exit,
        dm_target_exit,
        dm_linear_exit,
        dm_stripe_exit,
        dm_interface_exit,
};

static int __init dm_init(void)
{
        const int count = ARRAY_SIZE(_inits);

        int r, i;

        for (i = 0; i < count; i++) {
                r = _inits[i]();
                if (r)
                        goto bad;
        }

        return 0;

bad:
        while (i--)
                _exits[i]();

        return r;
}

static void __exit dm_exit(void)
{
        int i = ARRAY_SIZE(_exits);

        while (i--)
                _exits[i]();
}

/*
 * Block device functions
 */
static int dm_blk_open(struct inode *inode, struct file *file)
{
        struct mapped_device *md;

        spin_lock(&_minor_lock);

        md = inode->i_bdev->bd_disk->private_data;
        if (!md)
                goto out;

        if (test_bit(DMF_FREEING, &md->flags) ||
            test_bit(DMF_DELETING, &md->flags)) {
                md = NULL;
                goto out;
        }

        dm_get(md);
        atomic_inc(&md->open_count);

out:
        spin_unlock(&_minor_lock);

        return md ? 0 : -ENXIO;
}

static int dm_blk_close(struct inode *inode, struct file *file)
{
        struct mapped_device *md;

        md = inode->i_bdev->bd_disk->private_data;
        atomic_dec(&md->open_count);
        dm_put(md);
        return 0;
}

int dm_open_count(struct mapped_device *md)
{
        return atomic_read(&md->open_count);
}
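
/*
 * Note (editor addition, not in the original source): dm_blk_open() above
 * takes both a dm_get() reference and an open_count reference under
 * _minor_lock, and refuses devices flagged DMF_FREEING or DMF_DELETING, so
 * an open cannot race with teardown.  dm_open_count() is what
 * dm_lock_for_deletion() below consults before marking a device
 * DMF_DELETING.
 */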

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
        int r = 0;

        spin_lock(&_minor_lock);

        if (dm_open_count(md))
                r = -EBUSY;
        else
                set_bit(DMF_DELETING, &md->flags);

        spin_unlock(&_minor_lock);

        return r;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
        struct mapped_device *md = bdev->bd_disk->private_data;

        return dm_get_geometry(md, geo);
}

static inline struct dm_io *alloc_io(struct mapped_device *md)
{
        return mempool_alloc(md->io_pool, GFP_NOIO);
}

static inline void free_io(struct mapped_device *md, struct dm_io *io)
{
        mempool_free(io, md->io_pool);
}

static inline struct target_io *alloc_tio(struct mapped_device *md)
{
        return mempool_alloc(md->tio_pool, GFP_NOIO);
}

static inline void free_tio(struct mapped_device *md, struct target_io *tio)
{
        mempool_free(tio, md->tio_pool);
}

static void start_io_acct(struct dm_io *io)
{
        struct mapped_device *md = io->md;

        io->start_time = jiffies;

        preempt_disable();
        disk_round_stats(dm_disk(md));
        preempt_enable();
        dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
}

static int end_io_acct(struct dm_io *io)
{
        struct mapped_device *md = io->md;
        struct bio *bio = io->bio;
        unsigned long duration = jiffies - io->start_time;
        int pending;
        int rw = bio_data_dir(bio);

        preempt_disable();
        disk_round_stats(dm_disk(md));
        preempt_enable();
        dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);

        disk_stat_add(dm_disk(md), ticks[rw], duration);

        return !pending;
}

/*
 * Add the bio to the list of deferred io.
 */
static int queue_io(struct mapped_device *md, struct bio *bio)
{
        down_write(&md->io_lock);

        if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
                up_write(&md->io_lock);
                return 1;
        }

        bio_list_add(&md->deferred, bio);

        up_write(&md->io_lock);
        return 0;               /* deferred successfully */
}

/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
struct dm_table *dm_get_table(struct mapped_device *md)
{
        struct dm_table *t;

        read_lock(&md->map_lock);
        t = md->map;
        if (t)
                dm_table_get(t);
        read_unlock(&md->map_lock);

        return t;
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
        *geo = md->geometry;

        return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

        if (geo->start > sz) {
                DMWARN("Start sector is beyond the geometry limits.");
                return -EINVAL;
        }

        md->geometry = *geo;

        return 0;
}
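
/*
 * Illustrative sketch (editor addition, not part of the original file):
 * the dm_get_table()/dm_table_put() contract documented above
 * dm_get_table().  The caller below is hypothetical; the pattern matches
 * how dm_unplug_all() and dm_any_congested() in this file use the map.
 *
 *      struct dm_table *t = dm_get_table(md);
 *
 *      if (t) {
 *              do_something_with(t);   (hypothetical helper)
 *              dm_table_put(t);
 *      }
 */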

/*-----------------------------------------------------------------
 * CRUD START:
 * A more elegant soln is in the works that uses the queue
 * merge fn, unfortunately there are a couple of changes to
 * the block layer that I want to make for this.  So in the
 * interests of getting something for people to use I give
 * you this clearly demarcated crap.
 *---------------------------------------------------------------*/

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
        if (error)
                io->error = error;

        if (atomic_dec_and_test(&io->io_count)) {
                if (end_io_acct(io))
                        /* nudge anyone waiting on suspend queue */
                        wake_up(&io->md->wait);

                blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);

                bio_endio(io->bio, io->bio->bi_size, io->error);
                free_io(io->md, io);
        }
}

static int clone_endio(struct bio *bio, unsigned int done, int error)
{
        int r = 0;
        struct target_io *tio = bio->bi_private;
        struct dm_io *io = tio->io;
        dm_endio_fn endio = tio->ti->type->end_io;

        if (bio->bi_size)
                return 1;

        if (!bio_flagged(bio, BIO_UPTODATE) && !error)
                error = -EIO;

        if (endio) {
                r = endio(tio->ti, bio, error, &tio->info);
                if (r < 0)
                        error = r;
                else if (r > 0)
                        /* the target wants another shot at the io */
                        return 1;
        }

        free_tio(io->md, tio);
        dec_pending(io, error);
        bio_put(bio);
        return r;
}

static sector_t max_io_len(struct mapped_device *md,
                           sector_t sector, struct dm_target *ti)
{
        sector_t offset = sector - ti->begin;
        sector_t len = ti->len - offset;

        /*
         * Does the target need to split even further?
         */
        if (ti->split_io) {
                sector_t boundary;
                boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
                           - offset;
                if (len > boundary)
                        len = boundary;
        }

        return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
                      struct target_io *tio)
{
        int r;
        sector_t sector;

        /*
         * Sanity checks.
         */
        BUG_ON(!clone->bi_size);

        clone->bi_end_io = clone_endio;
        clone->bi_private = tio;

        /*
         * Map the clone.  If r == 0 we don't need to do
         * anything, the target has assumed ownership of
         * this io.
         */
        atomic_inc(&tio->io->io_count);
        sector = clone->bi_sector;
        r = ti->type->map(ti, clone, &tio->info);
        if (r > 0) {
                /* the bio has been remapped so dispatch it */

                blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
                                    tio->io->bio->bi_bdev->bd_dev, sector,
                                    clone->bi_sector);

                generic_make_request(clone);
        } else if (r < 0) {
                /* error the io and bail out */
                struct dm_io *io = tio->io;
                free_tio(tio->io->md, tio);
                dec_pending(io, r);
                bio_put(clone);
        }
}

struct clone_info {
        struct mapped_device *md;
        struct dm_table *map;
        struct bio *bio;
        struct dm_io *io;
        sector_t sector;
        sector_t sector_count;
        unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
        bio_free(bio, dm_set);
}
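
/*
 * Note (editor addition): __map_bio() above relies on the target map
 * function's return convention: r == 0 means the target has taken
 * ownership of the clone, r > 0 means the clone was remapped and should be
 * dispatched with generic_make_request(), and r < 0 errors the io via
 * dec_pending().  The helpers below build the clones that get fed to it.
 */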

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
                              unsigned short idx, unsigned int offset,
                              unsigned int len)
{
        struct bio *clone;
        struct bio_vec *bv = bio->bi_io_vec + idx;

        clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
        clone->bi_destructor = dm_bio_destructor;
        *clone->bi_io_vec = *bv;

        clone->bi_sector = sector;
        clone->bi_bdev = bio->bi_bdev;
        clone->bi_rw = bio->bi_rw;
        clone->bi_vcnt = 1;
        clone->bi_size = to_bytes(len);
        clone->bi_io_vec->bv_offset = offset;
        clone->bi_io_vec->bv_len = clone->bi_size;

        return clone;
}

/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
                             unsigned short idx, unsigned short bv_count,
                             unsigned int len)
{
        struct bio *clone;

        clone = bio_clone(bio, GFP_NOIO);
        clone->bi_sector = sector;
        clone->bi_idx = idx;
        clone->bi_vcnt = idx + bv_count;
        clone->bi_size = to_bytes(len);
        clone->bi_flags &= ~(1 << BIO_SEG_VALID);

        return clone;
}

static void __clone_and_map(struct clone_info *ci)
{
        struct bio *clone, *bio = ci->bio;
        struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
        sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
        struct target_io *tio;

        /*
         * Allocate a target io object.
         */
        tio = alloc_tio(ci->md);
        tio->io = ci->io;
        tio->ti = ti;
        memset(&tio->info, 0, sizeof(tio->info));

        if (ci->sector_count <= max) {
                /*
                 * Optimise for the simple case where we can do all of
                 * the remaining io with a single clone.
                 */
                clone = clone_bio(bio, ci->sector, ci->idx,
                                  bio->bi_vcnt - ci->idx, ci->sector_count);
                __map_bio(ti, clone, tio);
                ci->sector_count = 0;

        } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
                /*
                 * There are some bvecs that don't span targets.
                 * Do as many of these as possible.
                 */
                int i;
                sector_t remaining = max;
                sector_t bv_len;

                for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
                        bv_len = to_sector(bio->bi_io_vec[i].bv_len);

                        if (bv_len > remaining)
                                break;

                        remaining -= bv_len;
                        len += bv_len;
                }

                clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
                __map_bio(ti, clone, tio);

                ci->sector += len;
                ci->sector_count -= len;
                ci->idx = i;

        } else {
                /*
                 * Handle a bvec that must be split between two or more targets.
                 */
                struct bio_vec *bv = bio->bi_io_vec + ci->idx;
                sector_t remaining = to_sector(bv->bv_len);
                unsigned int offset = 0;

                do {
                        if (offset) {
                                ti = dm_table_find_target(ci->map, ci->sector);
                                max = max_io_len(ci->md, ci->sector, ti);

                                tio = alloc_tio(ci->md);
                                tio->io = ci->io;
                                tio->ti = ti;
                                memset(&tio->info, 0, sizeof(tio->info));
                        }

                        len = min(remaining, max);

                        clone = split_bvec(bio, ci->sector, ci->idx,
                                           bv->bv_offset + offset, len);

                        __map_bio(ti, clone, tio);

                        ci->sector += len;
                        ci->sector_count -= len;
                        offset += to_bytes(len);
                } while (remaining -= len);

                ci->idx++;
        }
}
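
/*
 * Note (editor addition): __clone_and_map() above handles three cases per
 * iteration: the whole remainder of the bio fits in the current target
 * (one clone_bio() and we are done), a run of complete bvecs fits
 * (clone_bio() over that run), or a single bvec straddles a target
 * boundary and has to be carved up with split_bvec().
 */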

/*
 * Split the bio into several clones.
 */
static void __split_bio(struct mapped_device *md, struct bio *bio)
{
        struct clone_info ci;

        ci.map = dm_get_table(md);
        if (!ci.map) {
                bio_io_error(bio, bio->bi_size);
                return;
        }

        ci.md = md;
        ci.bio = bio;
        ci.io = alloc_io(md);
        ci.io->error = 0;
        atomic_set(&ci.io->io_count, 1);
        ci.io->bio = bio;
        ci.io->md = md;
        ci.sector = bio->bi_sector;
        ci.sector_count = bio_sectors(bio);
        ci.idx = bio->bi_idx;

        start_io_acct(ci.io);
        while (ci.sector_count)
                __clone_and_map(&ci);

        /* drop the extra reference count */
        dec_pending(ci.io, 0);
        dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int dm_request(request_queue_t *q, struct bio *bio)
{
        int r;
        int rw = bio_data_dir(bio);
        struct mapped_device *md = q->queuedata;

        down_read(&md->io_lock);

        disk_stat_inc(dm_disk(md), ios[rw]);
        disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));

        /*
         * If we're suspended we have to queue
         * this io for later.
         */
        while (test_bit(DMF_BLOCK_IO, &md->flags)) {
                up_read(&md->io_lock);

                if (bio_rw(bio) == READA) {
                        bio_io_error(bio, bio->bi_size);
                        return 0;
                }

                r = queue_io(md, bio);
                if (r < 0) {
                        bio_io_error(bio, bio->bi_size);
                        return 0;
                } else if (r == 0)
                        return 0;       /* deferred successfully */

                /*
                 * We're in a while loop, because someone could suspend
                 * before we get to the following read lock.
                 */
                down_read(&md->io_lock);
        }

        __split_bio(md, bio);
        up_read(&md->io_lock);
        return 0;
}

static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
                        sector_t *error_sector)
{
        struct mapped_device *md = q->queuedata;
        struct dm_table *map = dm_get_table(md);
        int ret = -ENXIO;

        if (map) {
                ret = dm_table_flush_all(map);
                dm_table_put(map);
        }

        return ret;
}

static void dm_unplug_all(request_queue_t *q)
{
        struct mapped_device *md = q->queuedata;
        struct dm_table *map = dm_get_table(md);

        if (map) {
                dm_table_unplug_all(map);
                dm_table_put(map);
        }
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
        int r;
        struct mapped_device *md = (struct mapped_device *) congested_data;
        struct dm_table *map = dm_get_table(md);

        if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
                r = bdi_bits;
        else
                r = dm_table_any_congested(map, bdi_bits);

        dm_table_put(map);
        return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

static void free_minor(int minor)
{
        spin_lock(&_minor_lock);
        idr_remove(&_minor_idr, minor);
        spin_unlock(&_minor_lock);
}
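
/*
 * Note (editor addition): the allocators below follow the usual two-phase
 * idr pattern of this kernel: idr_pre_get() preallocates outside the
 * spinlock, then idr_get_new()/idr_get_new_above() runs under _minor_lock.
 * The slot is initially filled with the MINOR_ALLOCED placeholder and only
 * later replaced with the real mapped_device by alloc_dev().
 */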

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(struct mapped_device *md, int minor)
{
        int r, m;

        if (minor >= (1 << MINORBITS))
                return -EINVAL;

        r = idr_pre_get(&_minor_idr, GFP_KERNEL);
        if (!r)
                return -ENOMEM;

        spin_lock(&_minor_lock);

        if (idr_find(&_minor_idr, minor)) {
                r = -EBUSY;
                goto out;
        }

        r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
        if (r)
                goto out;

        if (m != minor) {
                idr_remove(&_minor_idr, m);
                r = -EBUSY;
                goto out;
        }

out:
        spin_unlock(&_minor_lock);
        return r;
}

static int next_free_minor(struct mapped_device *md, int *minor)
{
        int r, m;

        r = idr_pre_get(&_minor_idr, GFP_KERNEL);
        if (!r)
                return -ENOMEM;

        spin_lock(&_minor_lock);

        r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
        if (r)
                goto out;

        if (m >= (1 << MINORBITS)) {
                idr_remove(&_minor_idr, m);
                r = -ENOSPC;
                goto out;
        }

        *minor = m;

out:
        spin_unlock(&_minor_lock);
        return r;
}

static struct block_device_operations dm_blk_dops;

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
        int r;
        struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
        void *old_md;

        if (!md) {
                DMWARN("unable to allocate device, out of memory.");
                return NULL;
        }

        if (!try_module_get(THIS_MODULE))
                goto bad0;

        /* get a minor number for the dev */
        if (minor == DM_ANY_MINOR)
                r = next_free_minor(md, &minor);
        else
                r = specific_minor(md, minor);
        if (r < 0)
                goto bad1;

        memset(md, 0, sizeof(*md));
        init_rwsem(&md->io_lock);
        init_MUTEX(&md->suspend_lock);
        rwlock_init(&md->map_lock);
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
        atomic_set(&md->event_nr, 0);

        md->queue = blk_alloc_queue(GFP_KERNEL);
        if (!md->queue)
                goto bad1;

        md->queue->queuedata = md;
        md->queue->backing_dev_info.congested_fn = dm_any_congested;
        md->queue->backing_dev_info.congested_data = md;
        blk_queue_make_request(md->queue, dm_request);
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
        md->queue->unplug_fn = dm_unplug_all;
        md->queue->issue_flush_fn = dm_flush_all;

        md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
        if (!md->io_pool)
                goto bad2;

        md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
        if (!md->tio_pool)
                goto bad3;

        md->disk = alloc_disk(1);
        if (!md->disk)
                goto bad4;

        atomic_set(&md->pending, 0);
        init_waitqueue_head(&md->wait);
        init_waitqueue_head(&md->eventq);

        md->disk->major = _major;
        md->disk->first_minor = minor;
        md->disk->fops = &dm_blk_dops;
        md->disk->queue = md->queue;
        md->disk->private_data = md;
        sprintf(md->disk->disk_name, "dm-%d", minor);
        add_disk(md->disk);
        format_dev_t(md->name, MKDEV(_major, minor));

        /* Populate the mapping, nobody knows we exist yet */
        spin_lock(&_minor_lock);
        old_md = idr_replace(&_minor_idr, md, minor);
        spin_unlock(&_minor_lock);

        BUG_ON(old_md != MINOR_ALLOCED);

        return md;

bad4:
        mempool_destroy(md->tio_pool);
bad3:
        mempool_destroy(md->io_pool);
bad2:
        blk_cleanup_queue(md->queue);
        free_minor(minor);
bad1:
        module_put(THIS_MODULE);
bad0:
        kfree(md);
        return NULL;
}
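
/*
 * Note (editor addition): alloc_dev() above publishes the device in two
 * steps.  The minor allocators park MINOR_ALLOCED in the idr slot, and only
 * once the gendisk is fully set up does idr_replace() swap in the real md,
 * so lookups via dm_find_md() never see a half-constructed device.  The
 * bad0..bad4 labels unwind the construction in reverse order.
 */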

static void free_dev(struct mapped_device *md)
{
        int minor = md->disk->first_minor;

        if (md->suspended_bdev) {
                thaw_bdev(md->suspended_bdev, NULL);
                bdput(md->suspended_bdev);
        }
        mempool_destroy(md->tio_pool);
        mempool_destroy(md->io_pool);
        del_gendisk(md->disk);
        free_minor(minor);

        spin_lock(&_minor_lock);
        md->disk->private_data = NULL;
        spin_unlock(&_minor_lock);

        put_disk(md->disk);
        blk_cleanup_queue(md->queue);
        module_put(THIS_MODULE);
        kfree(md);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
        struct mapped_device *md = (struct mapped_device *) context;

        atomic_inc(&md->event_nr);
        wake_up(&md->eventq);
}

static void __set_size(struct mapped_device *md, sector_t size)
{
        set_capacity(md->disk, size);

        mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
        i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
        mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
}

static int __bind(struct mapped_device *md, struct dm_table *t)
{
        request_queue_t *q = md->queue;
        sector_t size;

        size = dm_table_get_size(t);

        /*
         * Wipe any geometry if the size of the table changed.
         */
        if (size != get_capacity(md->disk))
                memset(&md->geometry, 0, sizeof(md->geometry));

        __set_size(md, size);
        if (size == 0)
                return 0;

        dm_table_get(t);
        dm_table_event_callback(t, event_callback, md);

        write_lock(&md->map_lock);
        md->map = t;
        dm_table_set_restrictions(t, q);
        write_unlock(&md->map_lock);

        return 0;
}

static void __unbind(struct mapped_device *md)
{
        struct dm_table *map = md->map;

        if (!map)
                return;

        dm_table_event_callback(map, NULL, NULL);
        write_lock(&md->map_lock);
        md->map = NULL;
        write_unlock(&md->map_lock);
        dm_table_put(map);
}
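
/*
 * Note (editor addition): __bind()/__unbind() above publish and retire
 * md->map under the map_lock write lock; readers only ever go through
 * dm_get_table(), which takes a table reference under the read lock, so a
 * table swap cannot free a map that is still in use for io.
 */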

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
        struct mapped_device *md;

        md = alloc_dev(minor);
        if (!md)
                return -ENXIO;

        *result = md;
        return 0;
}

static struct mapped_device *dm_find_md(dev_t dev)
{
        struct mapped_device *md;
        unsigned minor = MINOR(dev);

        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
                return NULL;

        spin_lock(&_minor_lock);

        md = idr_find(&_minor_idr, minor);
        if (md && (md == MINOR_ALLOCED ||
                   (dm_disk(md)->first_minor != minor) ||
                   test_bit(DMF_FREEING, &md->flags))) {
                md = NULL;
                goto out;
        }

out:
        spin_unlock(&_minor_lock);

        return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
        struct mapped_device *md = dm_find_md(dev);

        if (md)
                dm_get(md);

        return md;
}

void *dm_get_mdptr(struct mapped_device *md)
{
        return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
        md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
        atomic_inc(&md->holders);
}

const char *dm_device_name(struct mapped_device *md)
{
        return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

void dm_put(struct mapped_device *md)
{
        struct dm_table *map;

        BUG_ON(test_bit(DMF_FREEING, &md->flags));

        if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
                map = dm_get_table(md);
                idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
                set_bit(DMF_FREEING, &md->flags);
                spin_unlock(&_minor_lock);
                if (!dm_suspended(md)) {
                        dm_table_presuspend_targets(map);
                        dm_table_postsuspend_targets(map);
                }
                __unbind(md);
                dm_table_put(map);
                free_dev(md);
        }
}

/*
 * Process the deferred bios
 */
static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
{
        struct bio *n;

        while (c) {
                n = c->bi_next;
                c->bi_next = NULL;
                __split_bio(md, c);
                c = n;
        }
}

/*
 * Swap in a new table (destroying old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
        int r = -EINVAL;

        down(&md->suspend_lock);

        /* device must be suspended */
        if (!dm_suspended(md))
                goto out;

        __unbind(md);
        r = __bind(md, table);

out:
        up(&md->suspend_lock);
        return r;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
        int r;

        WARN_ON(md->frozen_sb);

        md->frozen_sb = freeze_bdev(md->suspended_bdev);
        if (IS_ERR(md->frozen_sb)) {
                r = PTR_ERR(md->frozen_sb);
                md->frozen_sb = NULL;
                return r;
        }

        set_bit(DMF_FROZEN, &md->flags);

        /* don't bdput right now, we don't want the bdev
         * to go away while it is locked.
         */
        return 0;
}

static void unlock_fs(struct mapped_device *md)
{
        if (!test_bit(DMF_FROZEN, &md->flags))
                return;

        thaw_bdev(md->suspended_bdev, md->frozen_sb);
        md->frozen_sb = NULL;
        clear_bit(DMF_FROZEN, &md->flags);
}
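
/*
 * Note (editor addition): lock_fs() freezes any filesystem mounted on
 * suspended_bdev via freeze_bdev() and records the super block in
 * frozen_sb; DMF_FROZEN marks that this actually happened, so unlock_fs()
 * is a no-op when dm_suspend() was asked not to lock the filesystem.
 */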

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
int dm_suspend(struct mapped_device *md, int do_lockfs)
{
        struct dm_table *map = NULL;
        DECLARE_WAITQUEUE(wait, current);
        struct bio *def;
        int r = -EINVAL;

        down(&md->suspend_lock);

        if (dm_suspended(md))
                goto out;

        map = dm_get_table(md);

        /* This does not get reverted if there's an error later. */
        dm_table_presuspend_targets(map);

        md->suspended_bdev = bdget_disk(md->disk, 0);
        if (!md->suspended_bdev) {
                DMWARN("bdget failed in dm_suspend");
                r = -ENOMEM;
                goto out;
        }

        /* Flush I/O to the device. */
        if (do_lockfs) {
                r = lock_fs(md);
                if (r)
                        goto out;
        }

        /*
         * First we set the BLOCK_IO flag so no more ios will be mapped.
         */
        down_write(&md->io_lock);
        set_bit(DMF_BLOCK_IO, &md->flags);

        add_wait_queue(&md->wait, &wait);
        up_write(&md->io_lock);

        /* unplug */
        if (map)
                dm_table_unplug_all(map);

        /*
         * Then we wait for the already mapped ios to
         * complete.
         */
        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);

                if (!atomic_read(&md->pending) || signal_pending(current))
                        break;

                io_schedule();
        }
        set_current_state(TASK_RUNNING);

        down_write(&md->io_lock);
        remove_wait_queue(&md->wait, &wait);

        /* were we interrupted? */
        r = -EINTR;
        if (atomic_read(&md->pending)) {
                clear_bit(DMF_BLOCK_IO, &md->flags);
                def = bio_list_get(&md->deferred);
                __flush_deferred_io(md, def);
                up_write(&md->io_lock);
                unlock_fs(md);
                goto out;
        }
        up_write(&md->io_lock);

        dm_table_postsuspend_targets(map);

        set_bit(DMF_SUSPENDED, &md->flags);

        r = 0;

out:
        if (r && md->suspended_bdev) {
                bdput(md->suspended_bdev);
                md->suspended_bdev = NULL;
        }

        dm_table_put(map);
        up(&md->suspend_lock);
        return r;
}

int dm_resume(struct mapped_device *md)
{
        int r = -EINVAL;
        struct bio *def;
        struct dm_table *map = NULL;

        down(&md->suspend_lock);
        if (!dm_suspended(md))
                goto out;

        map = dm_get_table(md);
        if (!map || !dm_table_get_size(map))
                goto out;

        dm_table_resume_targets(map);

        down_write(&md->io_lock);
        clear_bit(DMF_BLOCK_IO, &md->flags);

        def = bio_list_get(&md->deferred);
        __flush_deferred_io(md, def);
        up_write(&md->io_lock);

        unlock_fs(md);

        bdput(md->suspended_bdev);
        md->suspended_bdev = NULL;

        clear_bit(DMF_SUSPENDED, &md->flags);

        dm_table_unplug_all(map);

        r = 0;

out:
        dm_table_put(map);
        up(&md->suspend_lock);

        return r;
}
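
/*
 * Illustrative sketch (editor addition, not part of the original file):
 * the table-swap sequence described in the comment above dm_suspend(), as
 * a caller such as the ioctl interface would typically drive it.  Error
 * handling is abbreviated and "new_table" is a hypothetical,
 * already-constructed dm_table.
 *
 *      r = dm_suspend(md, 1);          with lockfs, flush in-flight io
 *      if (!r) {
 *              r = dm_swap_table(md, new_table);
 *              if (!r)
 *                      r = dm_resume(md);
 *      }
 */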

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
uint32_t dm_get_event_nr(struct mapped_device *md)
{
        return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
        return wait_event_interruptible(md->eventq,
                        (event_nr != atomic_read(&md->event_nr)));
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
        return md->disk;
}

int dm_suspended(struct mapped_device *md)
{
        return test_bit(DMF_SUSPENDED, &md->flags);
}

static struct block_device_operations dm_blk_dops = {
        .open = dm_blk_open,
        .release = dm_blk_close,
        .getgeo = dm_blk_getgeo,
        .owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");