/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/blktrace_api.h>
#include <linux/smp_lock.h>

#define DM_MSG_PREFIX "core"

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);

/*
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	struct bio *bio;
	atomic_t io_count;
	unsigned long start_time;
};

/*
 * One of these is allocated per target within a bio.  Hopefully
 * this will be simplified out one day.
 */
struct target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct target_io *)bio->bi_private)->info;
	return NULL;
}

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4

struct mapped_device {
	struct rw_semaphore io_lock;
	struct semaphore suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	request_queue_t *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;
	wait_queue_head_t wait;
	struct bio_list deferred;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;

	/*
	 * freeze/thaw support requires holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *suspended_bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;
};

#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;

static int __init local_init(void)
{
	int r;

	/* allocate a slab for the dm_ios */
	_io_cache = kmem_cache_create("dm_io",
				      sizeof(struct dm_io), 0, 0, NULL, NULL);
	if (!_io_cache)
		return -ENOMEM;

	/* allocate a slab for the target ios */
	_tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io),
				       0, 0, NULL, NULL);
	if (!_tio_cache) {
		kmem_cache_destroy(_io_cache);
		return -ENOMEM;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0) {
		kmem_cache_destroy(_tio_cache);
		kmem_cache_destroy(_io_cache);
		return r;
	}

	if (!_major)
		_major = r;

	return 0;
}

static void local_exit(void)
{
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);

	if (unregister_blkdev(_major, _name) < 0)
		DMERR("unregister_blkdev failed");

	_major = 0;

	DMINFO("cleaned up");
}

int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_interface_init,
};

void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}

/*
 * Block device functions
 */
static int dm_blk_open(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = inode->i_bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static int dm_blk_close(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	md = inode->i_bdev->bd_disk->private_data;
	atomic_dec(&md->open_count);
	dm_put(md);
	return 0;
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
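/*
 * Illustrative usage sketch (not part of the original file): a management
 * interface that wants to remove a device would typically take the deletion
 * lock before dropping its reference, roughly:
 *
 *	if (dm_lock_for_deletion(md))
 *		return -EBUSY;			(still open somewhere)
 *	...remove the device from the caller's own bookkeeping...
 *	dm_put(md);
 *
 * The bookkeeping step is an assumption about the caller, not code from
 * this file.
 */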
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_blk_ioctl(struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md;
	struct dm_table *map;
	struct dm_target *tgt;
	int r = -ENOTTY;

	/* We don't really need this lock, but we do need 'inode'. */
	unlock_kernel();

	md = inode->i_bdev->bd_disk->private_data;

	map = dm_get_table(md);

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, inode, file, cmd, arg);

out:
	dm_table_put(map);

	lock_kernel();
	return r;
}

static inline struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static inline void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static inline struct target_io *alloc_tio(struct mapped_device *md)
{
	return mempool_alloc(md->tio_pool, GFP_NOIO);
}

static inline void free_tio(struct mapped_device *md, struct target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;

	io->start_time = jiffies;

	preempt_disable();
	disk_round_stats(dm_disk(md));
	preempt_enable();
	dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
}

static int end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending;
	int rw = bio_data_dir(bio);

	preempt_disable();
	disk_round_stats(dm_disk(md));
	preempt_enable();
	dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);

	disk_stat_add(dm_disk(md), ticks[rw], duration);

	return !pending;
}

/*
 * Add the bio to the list of deferred io.
 */
static int queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_write(&md->io_lock);
		return 1;
	}

	bio_list_add(&md->deferred, bio);

	up_write(&md->io_lock);
	return 0;		/* deferred successfully */
}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
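/*
 * Usage sketch (illustrative, not from the original source): every reader
 * of md->map balances dm_get_table() with dm_table_put(), e.g.
 *
 *	struct dm_table *t = dm_get_table(md);
 *	if (t) {
 *		...inspect or use the table...
 *		dm_table_put(t);
 *	}
 */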
struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;

	read_lock(&md->map_lock);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock(&md->map_lock);

	return t;
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant soln is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 */
static void dec_pending(struct dm_io *io, int error)
{
	if (error)
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
		if (end_io_acct(io))
			/* nudge anyone waiting on suspend queue */
			wake_up(&io->md->wait);

		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);

		bio_endio(io->bio, io->bio->bi_size, io->error);
		free_io(io->md, io);
	}
}

static int clone_endio(struct bio *bio, unsigned int done, int error)
{
	int r = 0;
	struct target_io *tio = bio->bi_private;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (bio->bi_size)
		return 1;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0)
			error = r;
		else if (r > 0)
			/* the target wants another shot at the io */
			return 1;
	}

	dec_pending(tio->io, error);

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	bio_put(bio);
	free_tio(md, tio);
	return r;
}

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	/*
	 * Sanity checks.
	 */
	BUG_ON(!clone->bi_size);

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r > 0) {
		/* the bio has been remapped so dispatch it */

		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
				    tio->io->bio->bi_bdev->bd_dev, sector,
				    clone->bi_sector);

		generic_make_request(clone);
	} else if (r < 0) {
		/* error the io and bail out */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len, struct bio_set *bs)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
	clone->bi_destructor = dm_bio_destructor;
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw;
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;

	return clone;
}

/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	clone->bi_destructor = dm_bio_destructor;
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	return clone;
}

static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}
}

/*
 * Split the bio into several clones.
 */
static void __split_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;

	ci.map = dm_get_table(md);
	if (!ci.map) {
		bio_io_error(bio, bio->bi_size);
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	while (ci.sector_count)
		__clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, 0);
	dm_table_put(ci.map);
}

/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int dm_request(request_queue_t *q, struct bio *bio)
{
	int r;
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;

	down_read(&md->io_lock);

	disk_stat_inc(dm_disk(md), ios[rw]);
	disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));

	/*
	 * If we're suspended we have to queue
	 * this io for later.
	 */
	while (test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_read(&md->io_lock);

		if (bio_rw(bio) == READA) {
			bio_io_error(bio, bio->bi_size);
			return 0;
		}

		r = queue_io(md, bio);
		if (r < 0) {
			bio_io_error(bio, bio->bi_size);
			return 0;
		} else if (r == 0)
			return 0;	/* deferred successfully */

		/*
		 * We're in a while loop, because someone could suspend
		 * before we get to the following read lock.
		 */
		down_read(&md->io_lock);
	}

	__split_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}

static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
			sector_t *error_sector)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	int ret = -ENXIO;

	if (map) {
		ret = dm_table_flush_all(map);
		dm_table_put(map);
	}

	return ret;
}

static void dm_unplug_all(request_queue_t *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r;
	struct mapped_device *md = (struct mapped_device *) congested_data;
	struct dm_table *map = dm_get_table(md);

	if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
		r = bdi_bits;
	else
		r = dm_table_any_congested(map, bdi_bits);

	dm_table_put(map);
	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(struct mapped_device *md, int minor)
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
	if (r)
		goto out;

	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);
	return r;
}

static int next_free_minor(struct mapped_device *md, int *minor)
{
	int r, m;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
	if (r)
		goto out;

	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
	spin_unlock(&_minor_lock);
	return r;
}

static struct block_device_operations dm_blk_dops;

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r;
	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
	void *old_md;

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad0;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(md, &minor);
	else
		r = specific_minor(md, minor);
	if (r < 0)
		goto bad1;

	memset(md, 0, sizeof(*md));
	init_rwsem(&md->io_lock);
	init_MUTEX(&md->suspend_lock);
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue)
		goto bad1_free_minor;

	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	md->queue->unplug_fn = dm_unplug_all;
	md->queue->issue_flush_fn = dm_flush_all;

	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
	if (!md->io_pool)
		goto bad2;

	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
	if (!md->tio_pool)
		goto bad3;

	md->bs = bioset_create(16, 16, 4);
	if (!md->bs)
		goto bad_no_bioset;

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad4;

	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	init_waitqueue_head(&md->eventq);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad4:
	bioset_free(md->bs);
bad_no_bioset:
	mempool_destroy(md->tio_pool);
bad3:
	mempool_destroy(md->io_pool);
bad2:
	blk_cleanup_queue(md->queue);
bad1_free_minor:
	free_minor(minor);
bad1:
	module_put(THIS_MODULE);
bad0:
	kfree(md);
	return NULL;
}

static void free_dev(struct mapped_device *md)
{
	int minor = md->disk->first_minor;

	if (md->suspended_bdev) {
		thaw_bdev(md->suspended_bdev, NULL);
		bdput(md->suspended_bdev);
	}
	mempool_destroy(md->tio_pool);
	mempool_destroy(md->io_pool);
	bioset_free(md->bs);
	del_gendisk(md->disk);
	free_minor(minor);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	module_put(THIS_MODULE);
	kfree(md);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	struct mapped_device *md = (struct mapped_device *) context;

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
}

static int __bind(struct mapped_device *md, struct dm_table *t)
{
	request_queue_t *q = md->queue;
	sector_t size;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);
	if (size == 0)
		return 0;

	dm_table_get(t);
	dm_table_event_callback(t, event_callback, md);

	write_lock(&md->map_lock);
	md->map = t;
	dm_table_set_restrictions(t, q);
	write_unlock(&md->map_lock);

	return 0;
}

static void __unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;

	if (!map)
		return;

	dm_table_event_callback(map, NULL, NULL);
	write_lock(&md->map_lock);
	md->map = NULL;
	write_unlock(&md->map_lock);
	dm_table_put(map);
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	*result = md;
	return 0;
}

static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md && (md == MINOR_ALLOCED ||
		   (dm_disk(md)->first_minor != minor) ||
		   test_bit(DMF_FREEING, &md->flags))) {
		md = NULL;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

void dm_put(struct mapped_device *md)
{
	struct dm_table *map;

	BUG_ON(test_bit(DMF_FREEING, &md->flags));

	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
		map = dm_get_table(md);
		idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
		set_bit(DMF_FREEING, &md->flags);
		spin_unlock(&_minor_lock);
		if (!dm_suspended(md)) {
			dm_table_presuspend_targets(map);
			dm_table_postsuspend_targets(map);
		}
		__unbind(md);
		dm_table_put(map);
		free_dev(md);
	}
}
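
/*
 * Descriptive note on the deferred-io path implemented below (summary of
 * the code in this file, not new behaviour): while DMF_BLOCK_IO is set,
 * dm_request() hands new bios to queue_io(), which parks them on
 * md->deferred.  dm_resume() later clears the flag, pulls the list with
 * bio_list_get() and replays it through __flush_deferred_io(), which
 * re-submits each bio via __split_bio().
 */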

/*
 * Process the deferred bios
 */
static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
{
	struct bio *n;

	while (c) {
		n = c->bi_next;
		c->bi_next = NULL;
		__split_bio(md, c);
		c = n;
	}
}

/*
 * Swap in a new table (destroying old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	int r = -EINVAL;

	down(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended(md))
		goto out;

	__unbind(md);
	r = __bind(md, table);

out:
	up(&md->suspend_lock);
	return r;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->suspended_bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	/* don't bdput right now, we don't want the bdev
	 * to go away while it is locked.
	 */
	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->suspended_bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
int dm_suspend(struct mapped_device *md, int do_lockfs)
{
	struct dm_table *map = NULL;
	DECLARE_WAITQUEUE(wait, current);
	struct bio *def;
	int r = -EINVAL;

	down(&md->suspend_lock);

	if (dm_suspended(md))
		goto out_unlock;

	map = dm_get_table(md);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	md->suspended_bdev = bdget_disk(md->disk, 0);
	if (!md->suspended_bdev) {
		DMWARN("bdget failed in dm_suspend");
		r = -ENOMEM;
		goto out;
	}

	/* Flush I/O to the device. */
	if (do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * First we set the BLOCK_IO flag so no more ios will be mapped.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO, &md->flags);

	add_wait_queue(&md->wait, &wait);
	up_write(&md->io_lock);

	/* unplug */
	if (map)
		dm_table_unplug_all(map);

	/*
	 * Then we wait for the already mapped ios to
	 * complete.
	 */
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (!atomic_read(&md->pending) || signal_pending(current))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	down_write(&md->io_lock);
	remove_wait_queue(&md->wait, &wait);

	/* were we interrupted ? */
	r = -EINTR;
	if (atomic_read(&md->pending)) {
		clear_bit(DMF_BLOCK_IO, &md->flags);
		def = bio_list_get(&md->deferred);
		__flush_deferred_io(md, def);
		up_write(&md->io_lock);
		unlock_fs(md);
		goto out;
	}
	up_write(&md->io_lock);

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

	r = 0;

out:
	if (r && md->suspended_bdev) {
		bdput(md->suspended_bdev);
		md->suspended_bdev = NULL;
	}

	dm_table_put(map);

out_unlock:
	up(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct bio *def;
	struct dm_table *map = NULL;

	down(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	map = dm_get_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	down_write(&md->io_lock);
	clear_bit(DMF_BLOCK_IO, &md->flags);

	def = bio_list_get(&md->deferred);
	__flush_deferred_io(md, def);
	up_write(&md->io_lock);

	unlock_fs(md);

	bdput(md->suspended_bdev);
	md->suspended_bdev = NULL;

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);

	kobject_uevent(&md->disk->kobj, KOBJ_CHANGE);

	r = 0;

out:
	dm_table_put(map);
	up(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

static struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");