/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

#ifdef CONFIG_PRINTK
/*
 * ratelimit state to be used in DMXXX_LIMIT().
 */
DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
		       DEFAULT_RATELIMIT_INTERVAL,
		       DEFAULT_RATELIMIT_BURST);
EXPORT_SYMBOL(dm_ratelimit_state);
#endif

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
};

/*
 * For bio-based dm.
 * One of these is allocated per target within a bio. Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per bio.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct dm_target_io *)bio->bi_private)->info;
	return NULL;
}

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
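/*
 * Illustrative sketch (not part of dm core): a hypothetical request-based
 * target can recover the per-request map_info it stashed during mapping
 * from the clone request via dm_get_rq_mapinfo() above.  "struct my_rq_ctx"
 * and my_clone_is_busy() are made-up names used only for illustration, and
 * the .ptr member is assumed from union map_info in device-mapper.h.
 */
#if 0	/* example only, not compiled */
struct my_rq_ctx {			/* hypothetical per-request context */
	bool path_busy;
};

static bool my_clone_is_busy(struct request *clone)
{
	union map_info *info = dm_get_rq_mapinfo(clone);
	struct my_rq_ctx *ctx = info ? info->ptr : NULL;

	/* consult whatever state the target attached when mapping */
	return ctx && ctx->path_busy;
}
#endif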
#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6

/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct rw_semaphore io_lock;
	struct mutex suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	struct request_queue *queue;
	unsigned type;
	/* Protect queue and type against concurrent access. */
	struct mutex type_lock;

	struct target_type *immutable_target_type;

	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending[2];
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * Processing queue (flush)
	 */
	struct workqueue_struct *wq;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support require holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* sysfs handle */
	struct kobject kobj;

	/* zero-length flush that will be cloned and submitted to targets */
	struct bio flush_bio;
};

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	mempool_t *io_pool;
	mempool_t *tio_pool;
	struct bio_set *bs;
};

#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_bio_info_cache;

static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	/* allocate a slab for the target ios */
	_tio_cache = KMEM_CACHE(dm_target_io, 0);
	if (!_tio_cache)
		goto out_free_io_cache;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_tio_cache;

	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
	if (!_rq_bio_info_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_bio_info_cache;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_uevent_exit;

	if (!_major)
		_major = r;

	return 0;

out_uevent_exit:
	dm_uevent_exit();
out_free_rq_bio_info_cache:
	kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_tio_cache:
	kmem_cache_destroy(_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	kmem_cache_destroy(_rq_bio_info_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
	 */
	idr_remove_all(&_minor_idr);
	idr_destroy(&_minor_idr);
}

/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md = disk->private_data;

	spin_lock(&_minor_lock);

	atomic_dec(&md->open_count);
	dm_put(md);

	spin_unlock(&_minor_lock);

	return 0;
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}
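/*
 * Illustrative sketch (not part of dm core): the calling pattern expected
 * around dm_lock_for_deletion().  A hypothetical removal path takes the
 * deletion lock first, so that further opens fail with -ENXIO
 * (dm_blk_open() checks DMF_DELETING), and only then tears the device
 * down; dm_destroy() later waits for the remaining holders to go away.
 * remove_my_device() is a made-up wrapper shown purely for illustration.
 */
#if 0	/* example only, not compiled */
static int remove_my_device(struct mapped_device *md)
{
	int r = dm_lock_for_deletion(md);	/* -EBUSY if still open */

	if (r)
		return r;

	dm_destroy(md);				/* waits for remaining holders */
	return 0;
}
#endif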
static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *map = dm_get_live_table(md);
	struct dm_target *tgt;
	int r = -ENOTTY;

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended_md(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, cmd, arg);

out:
	dm_table_put(map);

	return r;
}

static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}

static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
					    gfp_t gfp_mask)
{
	return mempool_alloc(md->tio_pool, gfp_mask);
}

static void free_rq_tio(struct dm_rq_target_io *tio)
{
	mempool_free(tio, tio->md->tio_pool);
}

static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_ATOMIC);
}

static void free_bio_info(struct dm_rq_clone_bio_info *info)
{
	mempool_free(info, info->tio->md->io_pool);
}

static int md_in_flight(struct mapped_device *md)
{
	return atomic_read(&md->pending[READ]) +
	       atomic_read(&md->pending[WRITE]);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	int cpu;
	int rw = bio_data_dir(io->bio);

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	atomic_set(&dm_disk(md)->part0.in_flight[rw],
		   atomic_inc_return(&md->pending[rw]));
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending, cpu;
	int rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a flush.
	 */
	pending = atomic_dec_return(&md->pending[rw]);
	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
	pending += atomic_read(&md->pending[rw^0x1]);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}

/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md)
{
	struct dm_table *t;
	unsigned long flags;

	read_lock_irqsave(&md->map_lock, flags);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock_irqrestore(&md->map_lock, flags);

	return t;
}
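/*
 * Illustrative sketch (not part of dm core): every reader of the live table
 * is expected to bracket its use with dm_get_live_table()/dm_table_put(),
 * as the comment above requires.  my_inspect_table() is a made-up helper
 * shown only to make the reference-counting pattern explicit.
 */
#if 0	/* example only, not compiled */
static sector_t my_inspect_table(struct mapped_device *md)
{
	struct dm_table *map = dm_get_live_table(md);	/* takes a reference */
	sector_t size = 0;

	if (map) {
		size = dm_table_get_size(map);
		dm_table_put(map);			/* drop the reference */
	}

	return size;
}
#endif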
/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant solution is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->error > 0 && __noflush_suspending(md)))
			io->error = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md))
				bio_list_add_head(&md->deferred, io->bio);
			else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;
		end_io_acct(io);
		free_io(md, io);

		if (io_error == DM_ENDIO_REQUEUE)
			return;

		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_FLUSH.
			 */
			bio->bi_rw &= ~REQ_FLUSH;
			queue_io(md, bio);
		} else {
			/* done with normal IO or empty flush */
			trace_block_bio_complete(md->queue, bio, io_error);
			bio_endio(bio, io_error);
		}
	}
}
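/*
 * Illustrative sketch (not part of dm core): a hypothetical bio-based
 * target's end_io hook, showing the return values that clone_endio()
 * below understands.  The signature is inferred from the endio() call
 * in clone_endio(); my_target_end_io() and the retry condition are
 * made up for illustration only.
 */
#if 0	/* example only, not compiled */
static int my_target_end_io(struct dm_target *ti, struct bio *bio,
			    int error, union map_info *map_context)
{
	if (error == -EBUSY)
		return DM_ENDIO_REQUEUE;	/* push the bio back for retry */

	return error;		/* 0 or -Exxx: let dec_pending() complete it */
}
#endif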
static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info = clone->bi_private;
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io() handle
		 * the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Report the data completion to the upper layer.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something wrong is happening.
	 */
	if (tio->orig->bio != bio)
		DMERR("bio completion is going in the middle of the request");

	/*
	 * Update the original request.
738 * Do not use blk_end_request() here, because it may complete 739 * the original request before the clone, and break the ordering. 740 */ 741 blk_update_request(tio->orig, 0, nr_bytes); 742 } 743 744 /* 745 * Don't touch any member of the md after calling this function because 746 * the md may be freed in dm_put() at the end of this function. 747 * Or do dm_get() before calling this function and dm_put() later. 748 */ 749 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 750 { 751 atomic_dec(&md->pending[rw]); 752 753 /* nudge anyone waiting on suspend queue */ 754 if (!md_in_flight(md)) 755 wake_up(&md->wait); 756 757 if (run_queue) 758 blk_run_queue(md->queue); 759 760 /* 761 * dm_put() must be at the end of this function. See the comment above 762 */ 763 dm_put(md); 764 } 765 766 static void free_rq_clone(struct request *clone) 767 { 768 struct dm_rq_target_io *tio = clone->end_io_data; 769 770 blk_rq_unprep_clone(clone); 771 free_rq_tio(tio); 772 } 773 774 /* 775 * Complete the clone and the original request. 776 * Must be called without queue lock. 777 */ 778 static void dm_end_request(struct request *clone, int error) 779 { 780 int rw = rq_data_dir(clone); 781 struct dm_rq_target_io *tio = clone->end_io_data; 782 struct mapped_device *md = tio->md; 783 struct request *rq = tio->orig; 784 785 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 786 rq->errors = clone->errors; 787 rq->resid_len = clone->resid_len; 788 789 if (rq->sense) 790 /* 791 * We are using the sense buffer of the original 792 * request. 793 * So setting the length of the sense data is enough. 794 */ 795 rq->sense_len = clone->sense_len; 796 } 797 798 free_rq_clone(clone); 799 blk_end_request_all(rq, error); 800 rq_completed(md, rw, true); 801 } 802 803 static void dm_unprep_request(struct request *rq) 804 { 805 struct request *clone = rq->special; 806 807 rq->special = NULL; 808 rq->cmd_flags &= ~REQ_DONTPREP; 809 810 free_rq_clone(clone); 811 } 812 813 /* 814 * Requeue the original request of a clone. 
815 */ 816 void dm_requeue_unmapped_request(struct request *clone) 817 { 818 int rw = rq_data_dir(clone); 819 struct dm_rq_target_io *tio = clone->end_io_data; 820 struct mapped_device *md = tio->md; 821 struct request *rq = tio->orig; 822 struct request_queue *q = rq->q; 823 unsigned long flags; 824 825 dm_unprep_request(rq); 826 827 spin_lock_irqsave(q->queue_lock, flags); 828 blk_requeue_request(q, rq); 829 spin_unlock_irqrestore(q->queue_lock, flags); 830 831 rq_completed(md, rw, 0); 832 } 833 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 834 835 static void __stop_queue(struct request_queue *q) 836 { 837 blk_stop_queue(q); 838 } 839 840 static void stop_queue(struct request_queue *q) 841 { 842 unsigned long flags; 843 844 spin_lock_irqsave(q->queue_lock, flags); 845 __stop_queue(q); 846 spin_unlock_irqrestore(q->queue_lock, flags); 847 } 848 849 static void __start_queue(struct request_queue *q) 850 { 851 if (blk_queue_stopped(q)) 852 blk_start_queue(q); 853 } 854 855 static void start_queue(struct request_queue *q) 856 { 857 unsigned long flags; 858 859 spin_lock_irqsave(q->queue_lock, flags); 860 __start_queue(q); 861 spin_unlock_irqrestore(q->queue_lock, flags); 862 } 863 864 static void dm_done(struct request *clone, int error, bool mapped) 865 { 866 int r = error; 867 struct dm_rq_target_io *tio = clone->end_io_data; 868 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 869 870 if (mapped && rq_end_io) 871 r = rq_end_io(tio->ti, clone, error, &tio->info); 872 873 if (r <= 0) 874 /* The target wants to complete the I/O */ 875 dm_end_request(clone, r); 876 else if (r == DM_ENDIO_INCOMPLETE) 877 /* The target will handle the I/O */ 878 return; 879 else if (r == DM_ENDIO_REQUEUE) 880 /* The target wants to requeue the I/O */ 881 dm_requeue_unmapped_request(clone); 882 else { 883 DMWARN("unimplemented target endio return value: %d", r); 884 BUG(); 885 } 886 } 887 888 /* 889 * Request completion handler for request-based dm 890 */ 891 static void dm_softirq_done(struct request *rq) 892 { 893 bool mapped = true; 894 struct request *clone = rq->completion_data; 895 struct dm_rq_target_io *tio = clone->end_io_data; 896 897 if (rq->cmd_flags & REQ_FAILED) 898 mapped = false; 899 900 dm_done(clone, tio->error, mapped); 901 } 902 903 /* 904 * Complete the clone and the original request with the error status 905 * through softirq context. 906 */ 907 static void dm_complete_request(struct request *clone, int error) 908 { 909 struct dm_rq_target_io *tio = clone->end_io_data; 910 struct request *rq = tio->orig; 911 912 tio->error = error; 913 rq->completion_data = clone; 914 blk_complete_request(rq); 915 } 916 917 /* 918 * Complete the not-mapped clone and the original request with the error status 919 * through softirq context. 920 * Target's rq_end_io() function isn't called. 921 * This may be used when the target's map_rq() function fails. 922 */ 923 void dm_kill_unmapped_request(struct request *clone, int error) 924 { 925 struct dm_rq_target_io *tio = clone->end_io_data; 926 struct request *rq = tio->orig; 927 928 rq->cmd_flags |= REQ_FAILED; 929 dm_complete_request(clone, error); 930 } 931 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 932 933 /* 934 * Called with the queue lock held 935 */ 936 static void end_clone_request(struct request *clone, int error) 937 { 938 /* 939 * For just cleaning up the information of the queue in which 940 * the clone was dispatched. 
941 * The clone is *NOT* freed actually here because it is alloced from 942 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 943 */ 944 __blk_put_request(clone->q, clone); 945 946 /* 947 * Actual request completion is done in a softirq context which doesn't 948 * hold the queue lock. Otherwise, deadlock could occur because: 949 * - another request may be submitted by the upper level driver 950 * of the stacking during the completion 951 * - the submission which requires queue lock may be done 952 * against this queue 953 */ 954 dm_complete_request(clone, error); 955 } 956 957 /* 958 * Return maximum size of I/O possible at the supplied sector up to the current 959 * target boundary. 960 */ 961 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 962 { 963 sector_t target_offset = dm_target_offset(ti, sector); 964 965 return ti->len - target_offset; 966 } 967 968 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 969 { 970 sector_t len = max_io_len_target_boundary(sector, ti); 971 sector_t offset, max_len; 972 973 /* 974 * Does the target need to split even further? 975 */ 976 if (ti->max_io_len) { 977 offset = dm_target_offset(ti, sector); 978 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 979 max_len = sector_div(offset, ti->max_io_len); 980 else 981 max_len = offset & (ti->max_io_len - 1); 982 max_len = ti->max_io_len - max_len; 983 984 if (len > max_len) 985 len = max_len; 986 } 987 988 return len; 989 } 990 991 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 992 { 993 if (len > UINT_MAX) { 994 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 995 (unsigned long long)len, UINT_MAX); 996 ti->error = "Maximum size of target IO is too large"; 997 return -EINVAL; 998 } 999 1000 ti->max_io_len = (uint32_t) len; 1001 1002 return 0; 1003 } 1004 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1005 1006 static void __map_bio(struct dm_target *ti, struct bio *clone, 1007 struct dm_target_io *tio) 1008 { 1009 int r; 1010 sector_t sector; 1011 struct mapped_device *md; 1012 1013 clone->bi_end_io = clone_endio; 1014 clone->bi_private = tio; 1015 1016 /* 1017 * Map the clone. If r == 0 we don't need to do 1018 * anything, the target has assumed ownership of 1019 * this io. 1020 */ 1021 atomic_inc(&tio->io->io_count); 1022 sector = clone->bi_sector; 1023 r = ti->type->map(ti, clone, &tio->info); 1024 if (r == DM_MAPIO_REMAPPED) { 1025 /* the bio has been remapped so dispatch it */ 1026 1027 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1028 tio->io->bio->bi_bdev->bd_dev, sector); 1029 1030 generic_make_request(clone); 1031 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1032 /* error the io and bail out, or requeue it if needed */ 1033 md = tio->io->md; 1034 dec_pending(tio->io, r); 1035 /* 1036 * Store bio_set for cleanup. 1037 */ 1038 clone->bi_end_io = NULL; 1039 clone->bi_private = md->bs; 1040 bio_put(clone); 1041 free_tio(md, tio); 1042 } else if (r) { 1043 DMWARN("unimplemented target map return value: %d", r); 1044 BUG(); 1045 } 1046 } 1047 1048 struct clone_info { 1049 struct mapped_device *md; 1050 struct dm_table *map; 1051 struct bio *bio; 1052 struct dm_io *io; 1053 sector_t sector; 1054 sector_t sector_count; 1055 unsigned short idx; 1056 }; 1057 1058 static void dm_bio_destructor(struct bio *bio) 1059 { 1060 struct bio_set *bs = bio->bi_private; 1061 1062 bio_free(bio, bs); 1063 } 1064 1065 /* 1066 * Creates a little bio that just does part of a bvec. 
1067 */ 1068 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1069 unsigned short idx, unsigned int offset, 1070 unsigned int len, struct bio_set *bs) 1071 { 1072 struct bio *clone; 1073 struct bio_vec *bv = bio->bi_io_vec + idx; 1074 1075 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1076 clone->bi_destructor = dm_bio_destructor; 1077 *clone->bi_io_vec = *bv; 1078 1079 clone->bi_sector = sector; 1080 clone->bi_bdev = bio->bi_bdev; 1081 clone->bi_rw = bio->bi_rw; 1082 clone->bi_vcnt = 1; 1083 clone->bi_size = to_bytes(len); 1084 clone->bi_io_vec->bv_offset = offset; 1085 clone->bi_io_vec->bv_len = clone->bi_size; 1086 clone->bi_flags |= 1 << BIO_CLONED; 1087 1088 if (bio_integrity(bio)) { 1089 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1090 bio_integrity_trim(clone, 1091 bio_sector_offset(bio, idx, offset), len); 1092 } 1093 1094 return clone; 1095 } 1096 1097 /* 1098 * Creates a bio that consists of range of complete bvecs. 1099 */ 1100 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1101 unsigned short idx, unsigned short bv_count, 1102 unsigned int len, struct bio_set *bs) 1103 { 1104 struct bio *clone; 1105 1106 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1107 __bio_clone(clone, bio); 1108 clone->bi_destructor = dm_bio_destructor; 1109 clone->bi_sector = sector; 1110 clone->bi_idx = idx; 1111 clone->bi_vcnt = idx + bv_count; 1112 clone->bi_size = to_bytes(len); 1113 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1114 1115 if (bio_integrity(bio)) { 1116 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1117 1118 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1119 bio_integrity_trim(clone, 1120 bio_sector_offset(bio, idx, 0), len); 1121 } 1122 1123 return clone; 1124 } 1125 1126 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1127 struct dm_target *ti) 1128 { 1129 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1130 1131 tio->io = ci->io; 1132 tio->ti = ti; 1133 memset(&tio->info, 0, sizeof(tio->info)); 1134 1135 return tio; 1136 } 1137 1138 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1139 unsigned request_nr, sector_t len) 1140 { 1141 struct dm_target_io *tio = alloc_tio(ci, ti); 1142 struct bio *clone; 1143 1144 tio->info.target_request_nr = request_nr; 1145 1146 /* 1147 * Discard requests require the bio's inline iovecs be initialized. 1148 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1149 * and discard, so no need for concern about wasted bvec allocations. 1150 */ 1151 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); 1152 __bio_clone(clone, ci->bio); 1153 clone->bi_destructor = dm_bio_destructor; 1154 if (len) { 1155 clone->bi_sector = ci->sector; 1156 clone->bi_size = to_bytes(len); 1157 } 1158 1159 __map_bio(ti, clone, tio); 1160 } 1161 1162 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1163 unsigned num_requests, sector_t len) 1164 { 1165 unsigned request_nr; 1166 1167 for (request_nr = 0; request_nr < num_requests; request_nr++) 1168 __issue_target_request(ci, ti, request_nr, len); 1169 } 1170 1171 static int __clone_and_map_empty_flush(struct clone_info *ci) 1172 { 1173 unsigned target_nr = 0; 1174 struct dm_target *ti; 1175 1176 BUG_ON(bio_has_data(ci->bio)); 1177 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1178 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1179 1180 return 0; 1181 } 1182 1183 /* 1184 * Perform all io with a single clone. 
1185 */ 1186 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1187 { 1188 struct bio *clone, *bio = ci->bio; 1189 struct dm_target_io *tio; 1190 1191 tio = alloc_tio(ci, ti); 1192 clone = clone_bio(bio, ci->sector, ci->idx, 1193 bio->bi_vcnt - ci->idx, ci->sector_count, 1194 ci->md->bs); 1195 __map_bio(ti, clone, tio); 1196 ci->sector_count = 0; 1197 } 1198 1199 static int __clone_and_map_discard(struct clone_info *ci) 1200 { 1201 struct dm_target *ti; 1202 sector_t len; 1203 1204 do { 1205 ti = dm_table_find_target(ci->map, ci->sector); 1206 if (!dm_target_is_valid(ti)) 1207 return -EIO; 1208 1209 /* 1210 * Even though the device advertised discard support, 1211 * that does not mean every target supports it, and 1212 * reconfiguration might also have changed that since the 1213 * check was performed. 1214 */ 1215 if (!ti->num_discard_requests) 1216 return -EOPNOTSUPP; 1217 1218 if (!ti->split_discard_requests) 1219 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1220 else 1221 len = min(ci->sector_count, max_io_len(ci->sector, ti)); 1222 1223 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1224 1225 ci->sector += len; 1226 } while (ci->sector_count -= len); 1227 1228 return 0; 1229 } 1230 1231 static int __clone_and_map(struct clone_info *ci) 1232 { 1233 struct bio *clone, *bio = ci->bio; 1234 struct dm_target *ti; 1235 sector_t len = 0, max; 1236 struct dm_target_io *tio; 1237 1238 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1239 return __clone_and_map_discard(ci); 1240 1241 ti = dm_table_find_target(ci->map, ci->sector); 1242 if (!dm_target_is_valid(ti)) 1243 return -EIO; 1244 1245 max = max_io_len(ci->sector, ti); 1246 1247 if (ci->sector_count <= max) { 1248 /* 1249 * Optimise for the simple case where we can do all of 1250 * the remaining io with a single clone. 1251 */ 1252 __clone_and_map_simple(ci, ti); 1253 1254 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1255 /* 1256 * There are some bvecs that don't span targets. 1257 * Do as many of these as possible. 1258 */ 1259 int i; 1260 sector_t remaining = max; 1261 sector_t bv_len; 1262 1263 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1264 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1265 1266 if (bv_len > remaining) 1267 break; 1268 1269 remaining -= bv_len; 1270 len += bv_len; 1271 } 1272 1273 tio = alloc_tio(ci, ti); 1274 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1275 ci->md->bs); 1276 __map_bio(ti, clone, tio); 1277 1278 ci->sector += len; 1279 ci->sector_count -= len; 1280 ci->idx = i; 1281 1282 } else { 1283 /* 1284 * Handle a bvec that must be split between two or more targets. 1285 */ 1286 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1287 sector_t remaining = to_sector(bv->bv_len); 1288 unsigned int offset = 0; 1289 1290 do { 1291 if (offset) { 1292 ti = dm_table_find_target(ci->map, ci->sector); 1293 if (!dm_target_is_valid(ti)) 1294 return -EIO; 1295 1296 max = max_io_len(ci->sector, ti); 1297 } 1298 1299 len = min(remaining, max); 1300 1301 tio = alloc_tio(ci, ti); 1302 clone = split_bvec(bio, ci->sector, ci->idx, 1303 bv->bv_offset + offset, len, 1304 ci->md->bs); 1305 1306 __map_bio(ti, clone, tio); 1307 1308 ci->sector += len; 1309 ci->sector_count -= len; 1310 offset += to_bytes(len); 1311 } while (remaining -= len); 1312 1313 ci->idx++; 1314 } 1315 1316 return 0; 1317 } 1318 1319 /* 1320 * Split the bio into several clones and submit it to targets. 
1321 */ 1322 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1323 { 1324 struct clone_info ci; 1325 int error = 0; 1326 1327 ci.map = dm_get_live_table(md); 1328 if (unlikely(!ci.map)) { 1329 bio_io_error(bio); 1330 return; 1331 } 1332 1333 ci.md = md; 1334 ci.io = alloc_io(md); 1335 ci.io->error = 0; 1336 atomic_set(&ci.io->io_count, 1); 1337 ci.io->bio = bio; 1338 ci.io->md = md; 1339 spin_lock_init(&ci.io->endio_lock); 1340 ci.sector = bio->bi_sector; 1341 ci.idx = bio->bi_idx; 1342 1343 start_io_acct(ci.io); 1344 if (bio->bi_rw & REQ_FLUSH) { 1345 ci.bio = &ci.md->flush_bio; 1346 ci.sector_count = 0; 1347 error = __clone_and_map_empty_flush(&ci); 1348 /* dec_pending submits any data associated with flush */ 1349 } else { 1350 ci.bio = bio; 1351 ci.sector_count = bio_sectors(bio); 1352 while (ci.sector_count && !error) 1353 error = __clone_and_map(&ci); 1354 } 1355 1356 /* drop the extra reference count */ 1357 dec_pending(ci.io, error); 1358 dm_table_put(ci.map); 1359 } 1360 /*----------------------------------------------------------------- 1361 * CRUD END 1362 *---------------------------------------------------------------*/ 1363 1364 static int dm_merge_bvec(struct request_queue *q, 1365 struct bvec_merge_data *bvm, 1366 struct bio_vec *biovec) 1367 { 1368 struct mapped_device *md = q->queuedata; 1369 struct dm_table *map = dm_get_live_table(md); 1370 struct dm_target *ti; 1371 sector_t max_sectors; 1372 int max_size = 0; 1373 1374 if (unlikely(!map)) 1375 goto out; 1376 1377 ti = dm_table_find_target(map, bvm->bi_sector); 1378 if (!dm_target_is_valid(ti)) 1379 goto out_table; 1380 1381 /* 1382 * Find maximum amount of I/O that won't need splitting 1383 */ 1384 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1385 (sector_t) BIO_MAX_SECTORS); 1386 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1387 if (max_size < 0) 1388 max_size = 0; 1389 1390 /* 1391 * merge_bvec_fn() returns number of bytes 1392 * it can accept at this offset 1393 * max is precomputed maximal io size 1394 */ 1395 if (max_size && ti->type->merge) 1396 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1397 /* 1398 * If the target doesn't support merge method and some of the devices 1399 * provided their merge_bvec method (we know this by looking at 1400 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1401 * entries. So always set max_size to 0, and the code below allows 1402 * just one page. 1403 */ 1404 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1405 1406 max_size = 0; 1407 1408 out_table: 1409 dm_table_put(map); 1410 1411 out: 1412 /* 1413 * Always allow an entire first page 1414 */ 1415 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1416 max_size = biovec->bv_len; 1417 1418 return max_size; 1419 } 1420 1421 /* 1422 * The request function that just remaps the bio built up by 1423 * dm_merge_bvec. 
1424 */ 1425 static void _dm_request(struct request_queue *q, struct bio *bio) 1426 { 1427 int rw = bio_data_dir(bio); 1428 struct mapped_device *md = q->queuedata; 1429 int cpu; 1430 1431 down_read(&md->io_lock); 1432 1433 cpu = part_stat_lock(); 1434 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1435 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1436 part_stat_unlock(); 1437 1438 /* if we're suspended, we have to queue this io for later */ 1439 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1440 up_read(&md->io_lock); 1441 1442 if (bio_rw(bio) != READA) 1443 queue_io(md, bio); 1444 else 1445 bio_io_error(bio); 1446 return; 1447 } 1448 1449 __split_and_process_bio(md, bio); 1450 up_read(&md->io_lock); 1451 return; 1452 } 1453 1454 static int dm_request_based(struct mapped_device *md) 1455 { 1456 return blk_queue_stackable(md->queue); 1457 } 1458 1459 static void dm_request(struct request_queue *q, struct bio *bio) 1460 { 1461 struct mapped_device *md = q->queuedata; 1462 1463 if (dm_request_based(md)) 1464 blk_queue_bio(q, bio); 1465 else 1466 _dm_request(q, bio); 1467 } 1468 1469 void dm_dispatch_request(struct request *rq) 1470 { 1471 int r; 1472 1473 if (blk_queue_io_stat(rq->q)) 1474 rq->cmd_flags |= REQ_IO_STAT; 1475 1476 rq->start_time = jiffies; 1477 r = blk_insert_cloned_request(rq->q, rq); 1478 if (r) 1479 dm_complete_request(rq, r); 1480 } 1481 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1482 1483 static void dm_rq_bio_destructor(struct bio *bio) 1484 { 1485 struct dm_rq_clone_bio_info *info = bio->bi_private; 1486 struct mapped_device *md = info->tio->md; 1487 1488 free_bio_info(info); 1489 bio_free(bio, md->bs); 1490 } 1491 1492 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1493 void *data) 1494 { 1495 struct dm_rq_target_io *tio = data; 1496 struct mapped_device *md = tio->md; 1497 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1498 1499 if (!info) 1500 return -ENOMEM; 1501 1502 info->orig = bio_orig; 1503 info->tio = tio; 1504 bio->bi_end_io = end_clone_bio; 1505 bio->bi_private = info; 1506 bio->bi_destructor = dm_rq_bio_destructor; 1507 1508 return 0; 1509 } 1510 1511 static int setup_clone(struct request *clone, struct request *rq, 1512 struct dm_rq_target_io *tio) 1513 { 1514 int r; 1515 1516 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1517 dm_rq_bio_constructor, tio); 1518 if (r) 1519 return r; 1520 1521 clone->cmd = rq->cmd; 1522 clone->cmd_len = rq->cmd_len; 1523 clone->sense = rq->sense; 1524 clone->buffer = rq->buffer; 1525 clone->end_io = end_clone_request; 1526 clone->end_io_data = tio; 1527 1528 return 0; 1529 } 1530 1531 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1532 gfp_t gfp_mask) 1533 { 1534 struct request *clone; 1535 struct dm_rq_target_io *tio; 1536 1537 tio = alloc_rq_tio(md, gfp_mask); 1538 if (!tio) 1539 return NULL; 1540 1541 tio->md = md; 1542 tio->ti = NULL; 1543 tio->orig = rq; 1544 tio->error = 0; 1545 memset(&tio->info, 0, sizeof(tio->info)); 1546 1547 clone = &tio->clone; 1548 if (setup_clone(clone, rq, tio)) { 1549 /* -ENOMEM */ 1550 free_rq_tio(tio); 1551 return NULL; 1552 } 1553 1554 return clone; 1555 } 1556 1557 /* 1558 * Called with the queue lock held. 
1559 */ 1560 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1561 { 1562 struct mapped_device *md = q->queuedata; 1563 struct request *clone; 1564 1565 if (unlikely(rq->special)) { 1566 DMWARN("Already has something in rq->special."); 1567 return BLKPREP_KILL; 1568 } 1569 1570 clone = clone_rq(rq, md, GFP_ATOMIC); 1571 if (!clone) 1572 return BLKPREP_DEFER; 1573 1574 rq->special = clone; 1575 rq->cmd_flags |= REQ_DONTPREP; 1576 1577 return BLKPREP_OK; 1578 } 1579 1580 /* 1581 * Returns: 1582 * 0 : the request has been processed (not requeued) 1583 * !0 : the request has been requeued 1584 */ 1585 static int map_request(struct dm_target *ti, struct request *clone, 1586 struct mapped_device *md) 1587 { 1588 int r, requeued = 0; 1589 struct dm_rq_target_io *tio = clone->end_io_data; 1590 1591 /* 1592 * Hold the md reference here for the in-flight I/O. 1593 * We can't rely on the reference count by device opener, 1594 * because the device may be closed during the request completion 1595 * when all bios are completed. 1596 * See the comment in rq_completed() too. 1597 */ 1598 dm_get(md); 1599 1600 tio->ti = ti; 1601 r = ti->type->map_rq(ti, clone, &tio->info); 1602 switch (r) { 1603 case DM_MAPIO_SUBMITTED: 1604 /* The target has taken the I/O to submit by itself later */ 1605 break; 1606 case DM_MAPIO_REMAPPED: 1607 /* The target has remapped the I/O so dispatch it */ 1608 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1609 blk_rq_pos(tio->orig)); 1610 dm_dispatch_request(clone); 1611 break; 1612 case DM_MAPIO_REQUEUE: 1613 /* The target wants to requeue the I/O */ 1614 dm_requeue_unmapped_request(clone); 1615 requeued = 1; 1616 break; 1617 default: 1618 if (r > 0) { 1619 DMWARN("unimplemented target map return value: %d", r); 1620 BUG(); 1621 } 1622 1623 /* The target wants to complete the I/O */ 1624 dm_kill_unmapped_request(clone, r); 1625 break; 1626 } 1627 1628 return requeued; 1629 } 1630 1631 /* 1632 * q->request_fn for request-based dm. 1633 * Called with the queue lock held. 1634 */ 1635 static void dm_request_fn(struct request_queue *q) 1636 { 1637 struct mapped_device *md = q->queuedata; 1638 struct dm_table *map = dm_get_live_table(md); 1639 struct dm_target *ti; 1640 struct request *rq, *clone; 1641 sector_t pos; 1642 1643 /* 1644 * For suspend, check blk_queue_stopped() and increment 1645 * ->pending within a single queue_lock not to increment the 1646 * number of in-flight I/Os after the queue is stopped in 1647 * dm_suspend(). 
1648 */ 1649 while (!blk_queue_stopped(q)) { 1650 rq = blk_peek_request(q); 1651 if (!rq) 1652 goto delay_and_out; 1653 1654 /* always use block 0 to find the target for flushes for now */ 1655 pos = 0; 1656 if (!(rq->cmd_flags & REQ_FLUSH)) 1657 pos = blk_rq_pos(rq); 1658 1659 ti = dm_table_find_target(map, pos); 1660 BUG_ON(!dm_target_is_valid(ti)); 1661 1662 if (ti->type->busy && ti->type->busy(ti)) 1663 goto delay_and_out; 1664 1665 blk_start_request(rq); 1666 clone = rq->special; 1667 atomic_inc(&md->pending[rq_data_dir(clone)]); 1668 1669 spin_unlock(q->queue_lock); 1670 if (map_request(ti, clone, md)) 1671 goto requeued; 1672 1673 BUG_ON(!irqs_disabled()); 1674 spin_lock(q->queue_lock); 1675 } 1676 1677 goto out; 1678 1679 requeued: 1680 BUG_ON(!irqs_disabled()); 1681 spin_lock(q->queue_lock); 1682 1683 delay_and_out: 1684 blk_delay_queue(q, HZ / 10); 1685 out: 1686 dm_table_put(map); 1687 1688 return; 1689 } 1690 1691 int dm_underlying_device_busy(struct request_queue *q) 1692 { 1693 return blk_lld_busy(q); 1694 } 1695 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1696 1697 static int dm_lld_busy(struct request_queue *q) 1698 { 1699 int r; 1700 struct mapped_device *md = q->queuedata; 1701 struct dm_table *map = dm_get_live_table(md); 1702 1703 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1704 r = 1; 1705 else 1706 r = dm_table_any_busy_target(map); 1707 1708 dm_table_put(map); 1709 1710 return r; 1711 } 1712 1713 static int dm_any_congested(void *congested_data, int bdi_bits) 1714 { 1715 int r = bdi_bits; 1716 struct mapped_device *md = congested_data; 1717 struct dm_table *map; 1718 1719 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1720 map = dm_get_live_table(md); 1721 if (map) { 1722 /* 1723 * Request-based dm cares about only own queue for 1724 * the query about congestion status of request_queue 1725 */ 1726 if (dm_request_based(md)) 1727 r = md->queue->backing_dev_info.state & 1728 bdi_bits; 1729 else 1730 r = dm_table_any_congested(map, bdi_bits); 1731 1732 dm_table_put(map); 1733 } 1734 } 1735 1736 return r; 1737 } 1738 1739 /*----------------------------------------------------------------- 1740 * An IDR is used to keep track of allocated minor numbers. 1741 *---------------------------------------------------------------*/ 1742 static void free_minor(int minor) 1743 { 1744 spin_lock(&_minor_lock); 1745 idr_remove(&_minor_idr, minor); 1746 spin_unlock(&_minor_lock); 1747 } 1748 1749 /* 1750 * See if the device with a specific minor # is free. 
1751 */ 1752 static int specific_minor(int minor) 1753 { 1754 int r, m; 1755 1756 if (minor >= (1 << MINORBITS)) 1757 return -EINVAL; 1758 1759 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1760 if (!r) 1761 return -ENOMEM; 1762 1763 spin_lock(&_minor_lock); 1764 1765 if (idr_find(&_minor_idr, minor)) { 1766 r = -EBUSY; 1767 goto out; 1768 } 1769 1770 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1771 if (r) 1772 goto out; 1773 1774 if (m != minor) { 1775 idr_remove(&_minor_idr, m); 1776 r = -EBUSY; 1777 goto out; 1778 } 1779 1780 out: 1781 spin_unlock(&_minor_lock); 1782 return r; 1783 } 1784 1785 static int next_free_minor(int *minor) 1786 { 1787 int r, m; 1788 1789 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1790 if (!r) 1791 return -ENOMEM; 1792 1793 spin_lock(&_minor_lock); 1794 1795 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1796 if (r) 1797 goto out; 1798 1799 if (m >= (1 << MINORBITS)) { 1800 idr_remove(&_minor_idr, m); 1801 r = -ENOSPC; 1802 goto out; 1803 } 1804 1805 *minor = m; 1806 1807 out: 1808 spin_unlock(&_minor_lock); 1809 return r; 1810 } 1811 1812 static const struct block_device_operations dm_blk_dops; 1813 1814 static void dm_wq_work(struct work_struct *work); 1815 1816 static void dm_init_md_queue(struct mapped_device *md) 1817 { 1818 /* 1819 * Request-based dm devices cannot be stacked on top of bio-based dm 1820 * devices. The type of this dm device has not been decided yet. 1821 * The type is decided at the first table loading time. 1822 * To prevent problematic device stacking, clear the queue flag 1823 * for request stacking support until then. 1824 * 1825 * This queue is new, so no concurrency on the queue_flags. 1826 */ 1827 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1828 1829 md->queue->queuedata = md; 1830 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1831 md->queue->backing_dev_info.congested_data = md; 1832 blk_queue_make_request(md->queue, dm_request); 1833 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1834 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1835 } 1836 1837 /* 1838 * Allocate and initialise a blank device with a given minor. 
1839 */ 1840 static struct mapped_device *alloc_dev(int minor) 1841 { 1842 int r; 1843 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1844 void *old_md; 1845 1846 if (!md) { 1847 DMWARN("unable to allocate device, out of memory."); 1848 return NULL; 1849 } 1850 1851 if (!try_module_get(THIS_MODULE)) 1852 goto bad_module_get; 1853 1854 /* get a minor number for the dev */ 1855 if (minor == DM_ANY_MINOR) 1856 r = next_free_minor(&minor); 1857 else 1858 r = specific_minor(minor); 1859 if (r < 0) 1860 goto bad_minor; 1861 1862 md->type = DM_TYPE_NONE; 1863 init_rwsem(&md->io_lock); 1864 mutex_init(&md->suspend_lock); 1865 mutex_init(&md->type_lock); 1866 spin_lock_init(&md->deferred_lock); 1867 rwlock_init(&md->map_lock); 1868 atomic_set(&md->holders, 1); 1869 atomic_set(&md->open_count, 0); 1870 atomic_set(&md->event_nr, 0); 1871 atomic_set(&md->uevent_seq, 0); 1872 INIT_LIST_HEAD(&md->uevent_list); 1873 spin_lock_init(&md->uevent_lock); 1874 1875 md->queue = blk_alloc_queue(GFP_KERNEL); 1876 if (!md->queue) 1877 goto bad_queue; 1878 1879 dm_init_md_queue(md); 1880 1881 md->disk = alloc_disk(1); 1882 if (!md->disk) 1883 goto bad_disk; 1884 1885 atomic_set(&md->pending[0], 0); 1886 atomic_set(&md->pending[1], 0); 1887 init_waitqueue_head(&md->wait); 1888 INIT_WORK(&md->work, dm_wq_work); 1889 init_waitqueue_head(&md->eventq); 1890 1891 md->disk->major = _major; 1892 md->disk->first_minor = minor; 1893 md->disk->fops = &dm_blk_dops; 1894 md->disk->queue = md->queue; 1895 md->disk->private_data = md; 1896 sprintf(md->disk->disk_name, "dm-%d", minor); 1897 add_disk(md->disk); 1898 format_dev_t(md->name, MKDEV(_major, minor)); 1899 1900 md->wq = alloc_workqueue("kdmflush", 1901 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1902 if (!md->wq) 1903 goto bad_thread; 1904 1905 md->bdev = bdget_disk(md->disk, 0); 1906 if (!md->bdev) 1907 goto bad_bdev; 1908 1909 bio_init(&md->flush_bio); 1910 md->flush_bio.bi_bdev = md->bdev; 1911 md->flush_bio.bi_rw = WRITE_FLUSH; 1912 1913 /* Populate the mapping, nobody knows we exist yet */ 1914 spin_lock(&_minor_lock); 1915 old_md = idr_replace(&_minor_idr, md, minor); 1916 spin_unlock(&_minor_lock); 1917 1918 BUG_ON(old_md != MINOR_ALLOCED); 1919 1920 return md; 1921 1922 bad_bdev: 1923 destroy_workqueue(md->wq); 1924 bad_thread: 1925 del_gendisk(md->disk); 1926 put_disk(md->disk); 1927 bad_disk: 1928 blk_cleanup_queue(md->queue); 1929 bad_queue: 1930 free_minor(minor); 1931 bad_minor: 1932 module_put(THIS_MODULE); 1933 bad_module_get: 1934 kfree(md); 1935 return NULL; 1936 } 1937 1938 static void unlock_fs(struct mapped_device *md); 1939 1940 static void free_dev(struct mapped_device *md) 1941 { 1942 int minor = MINOR(disk_devt(md->disk)); 1943 1944 unlock_fs(md); 1945 bdput(md->bdev); 1946 destroy_workqueue(md->wq); 1947 if (md->tio_pool) 1948 mempool_destroy(md->tio_pool); 1949 if (md->io_pool) 1950 mempool_destroy(md->io_pool); 1951 if (md->bs) 1952 bioset_free(md->bs); 1953 blk_integrity_unregister(md->disk); 1954 del_gendisk(md->disk); 1955 free_minor(minor); 1956 1957 spin_lock(&_minor_lock); 1958 md->disk->private_data = NULL; 1959 spin_unlock(&_minor_lock); 1960 1961 put_disk(md->disk); 1962 blk_cleanup_queue(md->queue); 1963 module_put(THIS_MODULE); 1964 kfree(md); 1965 } 1966 1967 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1968 { 1969 struct dm_md_mempools *p; 1970 1971 if (md->io_pool && md->tio_pool && md->bs) 1972 /* the md already has necessary mempools */ 1973 goto out; 1974 1975 p = 
dm_table_get_md_mempools(t); 1976 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1977 1978 md->io_pool = p->io_pool; 1979 p->io_pool = NULL; 1980 md->tio_pool = p->tio_pool; 1981 p->tio_pool = NULL; 1982 md->bs = p->bs; 1983 p->bs = NULL; 1984 1985 out: 1986 /* mempool bind completed, now no need any mempools in the table */ 1987 dm_table_free_md_mempools(t); 1988 } 1989 1990 /* 1991 * Bind a table to the device. 1992 */ 1993 static void event_callback(void *context) 1994 { 1995 unsigned long flags; 1996 LIST_HEAD(uevents); 1997 struct mapped_device *md = (struct mapped_device *) context; 1998 1999 spin_lock_irqsave(&md->uevent_lock, flags); 2000 list_splice_init(&md->uevent_list, &uevents); 2001 spin_unlock_irqrestore(&md->uevent_lock, flags); 2002 2003 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2004 2005 atomic_inc(&md->event_nr); 2006 wake_up(&md->eventq); 2007 } 2008 2009 /* 2010 * Protected by md->suspend_lock obtained by dm_swap_table(). 2011 */ 2012 static void __set_size(struct mapped_device *md, sector_t size) 2013 { 2014 set_capacity(md->disk, size); 2015 2016 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2017 } 2018 2019 /* 2020 * Return 1 if the queue has a compulsory merge_bvec_fn function. 2021 * 2022 * If this function returns 0, then the device is either a non-dm 2023 * device without a merge_bvec_fn, or it is a dm device that is 2024 * able to split any bios it receives that are too big. 2025 */ 2026 int dm_queue_merge_is_compulsory(struct request_queue *q) 2027 { 2028 struct mapped_device *dev_md; 2029 2030 if (!q->merge_bvec_fn) 2031 return 0; 2032 2033 if (q->make_request_fn == dm_request) { 2034 dev_md = q->queuedata; 2035 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2036 return 0; 2037 } 2038 2039 return 1; 2040 } 2041 2042 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2043 struct dm_dev *dev, sector_t start, 2044 sector_t len, void *data) 2045 { 2046 struct block_device *bdev = dev->bdev; 2047 struct request_queue *q = bdev_get_queue(bdev); 2048 2049 return dm_queue_merge_is_compulsory(q); 2050 } 2051 2052 /* 2053 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2054 * on the properties of the underlying devices. 2055 */ 2056 static int dm_table_merge_is_optional(struct dm_table *table) 2057 { 2058 unsigned i = 0; 2059 struct dm_target *ti; 2060 2061 while (i < dm_table_get_num_targets(table)) { 2062 ti = dm_table_get_target(table, i++); 2063 2064 if (ti->type->iterate_devices && 2065 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2066 return 0; 2067 } 2068 2069 return 1; 2070 } 2071 2072 /* 2073 * Returns old map, which caller must destroy. 2074 */ 2075 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2076 struct queue_limits *limits) 2077 { 2078 struct dm_table *old_map; 2079 struct request_queue *q = md->queue; 2080 sector_t size; 2081 unsigned long flags; 2082 int merge_is_optional; 2083 2084 size = dm_table_get_size(t); 2085 2086 /* 2087 * Wipe any geometry if the size of the table changed. 2088 */ 2089 if (size != get_capacity(md->disk)) 2090 memset(&md->geometry, 0, sizeof(md->geometry)); 2091 2092 __set_size(md, size); 2093 2094 dm_table_event_callback(t, event_callback, md); 2095 2096 /* 2097 * The queue hasn't been stopped yet, if the old table type wasn't 2098 * for request-based during suspension. So stop it to prevent 2099 * I/O mapping before resume. 
2100 * This must be done before setting the queue restrictions, 2101 * because request-based dm may be run just after the setting. 2102 */ 2103 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2104 stop_queue(q); 2105 2106 __bind_mempools(md, t); 2107 2108 merge_is_optional = dm_table_merge_is_optional(t); 2109 2110 write_lock_irqsave(&md->map_lock, flags); 2111 old_map = md->map; 2112 md->map = t; 2113 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2114 2115 dm_table_set_restrictions(t, q, limits); 2116 if (merge_is_optional) 2117 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2118 else 2119 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2120 write_unlock_irqrestore(&md->map_lock, flags); 2121 2122 return old_map; 2123 } 2124 2125 /* 2126 * Returns unbound table for the caller to free. 2127 */ 2128 static struct dm_table *__unbind(struct mapped_device *md) 2129 { 2130 struct dm_table *map = md->map; 2131 unsigned long flags; 2132 2133 if (!map) 2134 return NULL; 2135 2136 dm_table_event_callback(map, NULL, NULL); 2137 write_lock_irqsave(&md->map_lock, flags); 2138 md->map = NULL; 2139 write_unlock_irqrestore(&md->map_lock, flags); 2140 2141 return map; 2142 } 2143 2144 /* 2145 * Constructor for a new device. 2146 */ 2147 int dm_create(int minor, struct mapped_device **result) 2148 { 2149 struct mapped_device *md; 2150 2151 md = alloc_dev(minor); 2152 if (!md) 2153 return -ENXIO; 2154 2155 dm_sysfs_init(md); 2156 2157 *result = md; 2158 return 0; 2159 } 2160 2161 /* 2162 * Functions to manage md->type. 2163 * All are required to hold md->type_lock. 2164 */ 2165 void dm_lock_md_type(struct mapped_device *md) 2166 { 2167 mutex_lock(&md->type_lock); 2168 } 2169 2170 void dm_unlock_md_type(struct mapped_device *md) 2171 { 2172 mutex_unlock(&md->type_lock); 2173 } 2174 2175 void dm_set_md_type(struct mapped_device *md, unsigned type) 2176 { 2177 md->type = type; 2178 } 2179 2180 unsigned dm_get_md_type(struct mapped_device *md) 2181 { 2182 return md->type; 2183 } 2184 2185 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2186 { 2187 return md->immutable_target_type; 2188 } 2189 2190 /* 2191 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
2192 */ 2193 static int dm_init_request_based_queue(struct mapped_device *md) 2194 { 2195 struct request_queue *q = NULL; 2196 2197 if (md->queue->elevator) 2198 return 1; 2199 2200 /* Fully initialize the queue */ 2201 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2202 if (!q) 2203 return 0; 2204 2205 md->queue = q; 2206 dm_init_md_queue(md); 2207 blk_queue_softirq_done(md->queue, dm_softirq_done); 2208 blk_queue_prep_rq(md->queue, dm_prep_fn); 2209 blk_queue_lld_busy(md->queue, dm_lld_busy); 2210 2211 elv_register_queue(md->queue); 2212 2213 return 1; 2214 } 2215 2216 /* 2217 * Setup the DM device's queue based on md's type 2218 */ 2219 int dm_setup_md_queue(struct mapped_device *md) 2220 { 2221 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2222 !dm_init_request_based_queue(md)) { 2223 DMWARN("Cannot initialize queue for request-based mapped device"); 2224 return -EINVAL; 2225 } 2226 2227 return 0; 2228 } 2229 2230 static struct mapped_device *dm_find_md(dev_t dev) 2231 { 2232 struct mapped_device *md; 2233 unsigned minor = MINOR(dev); 2234 2235 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2236 return NULL; 2237 2238 spin_lock(&_minor_lock); 2239 2240 md = idr_find(&_minor_idr, minor); 2241 if (md && (md == MINOR_ALLOCED || 2242 (MINOR(disk_devt(dm_disk(md))) != minor) || 2243 dm_deleting_md(md) || 2244 test_bit(DMF_FREEING, &md->flags))) { 2245 md = NULL; 2246 goto out; 2247 } 2248 2249 out: 2250 spin_unlock(&_minor_lock); 2251 2252 return md; 2253 } 2254 2255 struct mapped_device *dm_get_md(dev_t dev) 2256 { 2257 struct mapped_device *md = dm_find_md(dev); 2258 2259 if (md) 2260 dm_get(md); 2261 2262 return md; 2263 } 2264 EXPORT_SYMBOL_GPL(dm_get_md); 2265 2266 void *dm_get_mdptr(struct mapped_device *md) 2267 { 2268 return md->interface_ptr; 2269 } 2270 2271 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2272 { 2273 md->interface_ptr = ptr; 2274 } 2275 2276 void dm_get(struct mapped_device *md) 2277 { 2278 atomic_inc(&md->holders); 2279 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2280 } 2281 2282 const char *dm_device_name(struct mapped_device *md) 2283 { 2284 return md->name; 2285 } 2286 EXPORT_SYMBOL_GPL(dm_device_name); 2287 2288 static void __dm_destroy(struct mapped_device *md, bool wait) 2289 { 2290 struct dm_table *map; 2291 2292 might_sleep(); 2293 2294 spin_lock(&_minor_lock); 2295 map = dm_get_live_table(md); 2296 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2297 set_bit(DMF_FREEING, &md->flags); 2298 spin_unlock(&_minor_lock); 2299 2300 if (!dm_suspended_md(md)) { 2301 dm_table_presuspend_targets(map); 2302 dm_table_postsuspend_targets(map); 2303 } 2304 2305 /* 2306 * Rare, but there may be I/O requests still going to complete, 2307 * for example. Wait for all references to disappear. 2308 * No one should increment the reference count of the mapped_device, 2309 * after the mapped_device state becomes DMF_FREEING. 2310 */ 2311 if (wait) 2312 while (atomic_read(&md->holders)) 2313 msleep(1); 2314 else if (atomic_read(&md->holders)) 2315 DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", 2316 dm_device_name(md), atomic_read(&md->holders)); 2317 2318 dm_sysfs_exit(md); 2319 dm_table_put(map); 2320 dm_table_destroy(__unbind(md)); 2321 free_dev(md); 2322 } 2323 2324 void dm_destroy(struct mapped_device *md) 2325 { 2326 __dm_destroy(md, true); 2327 } 2328 2329 void dm_destroy_immediate(struct mapped_device *md) 2330 { 2331 __dm_destroy(md, false); 2332 } 2333 2334 void dm_put(struct mapped_device *md) 2335 { 2336 atomic_dec(&md->holders); 2337 } 2338 EXPORT_SYMBOL_GPL(dm_put); 2339 2340 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2341 { 2342 int r = 0; 2343 DECLARE_WAITQUEUE(wait, current); 2344 2345 add_wait_queue(&md->wait, &wait); 2346 2347 while (1) { 2348 set_current_state(interruptible); 2349 2350 if (!md_in_flight(md)) 2351 break; 2352 2353 if (interruptible == TASK_INTERRUPTIBLE && 2354 signal_pending(current)) { 2355 r = -EINTR; 2356 break; 2357 } 2358 2359 io_schedule(); 2360 } 2361 set_current_state(TASK_RUNNING); 2362 2363 remove_wait_queue(&md->wait, &wait); 2364 2365 return r; 2366 } 2367 2368 /* 2369 * Process the deferred bios 2370 */ 2371 static void dm_wq_work(struct work_struct *work) 2372 { 2373 struct mapped_device *md = container_of(work, struct mapped_device, 2374 work); 2375 struct bio *c; 2376 2377 down_read(&md->io_lock); 2378 2379 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2380 spin_lock_irq(&md->deferred_lock); 2381 c = bio_list_pop(&md->deferred); 2382 spin_unlock_irq(&md->deferred_lock); 2383 2384 if (!c) 2385 break; 2386 2387 up_read(&md->io_lock); 2388 2389 if (dm_request_based(md)) 2390 generic_make_request(c); 2391 else 2392 __split_and_process_bio(md, c); 2393 2394 down_read(&md->io_lock); 2395 } 2396 2397 up_read(&md->io_lock); 2398 } 2399 2400 static void dm_queue_flush(struct mapped_device *md) 2401 { 2402 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2403 smp_mb__after_clear_bit(); 2404 queue_work(md->wq, &md->work); 2405 } 2406 2407 /* 2408 * Swap in a new table, returning the old one for the caller to destroy. 2409 */ 2410 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2411 { 2412 struct dm_table *map = ERR_PTR(-EINVAL); 2413 struct queue_limits limits; 2414 int r; 2415 2416 mutex_lock(&md->suspend_lock); 2417 2418 /* device must be suspended */ 2419 if (!dm_suspended_md(md)) 2420 goto out; 2421 2422 r = dm_calculate_queue_limits(table, &limits); 2423 if (r) { 2424 map = ERR_PTR(r); 2425 goto out; 2426 } 2427 2428 map = __bind(md, table, &limits); 2429 2430 out: 2431 mutex_unlock(&md->suspend_lock); 2432 return map; 2433 } 2434 2435 /* 2436 * Functions to lock and unlock any filesystem running on the 2437 * device. 2438 */ 2439 static int lock_fs(struct mapped_device *md) 2440 { 2441 int r; 2442 2443 WARN_ON(md->frozen_sb); 2444 2445 md->frozen_sb = freeze_bdev(md->bdev); 2446 if (IS_ERR(md->frozen_sb)) { 2447 r = PTR_ERR(md->frozen_sb); 2448 md->frozen_sb = NULL; 2449 return r; 2450 } 2451 2452 set_bit(DMF_FROZEN, &md->flags); 2453 2454 return 0; 2455 } 2456 2457 static void unlock_fs(struct mapped_device *md) 2458 { 2459 if (!test_bit(DMF_FROZEN, &md->flags)) 2460 return; 2461 2462 thaw_bdev(md->bdev, md->frozen_sb); 2463 md->frozen_sb = NULL; 2464 clear_bit(DMF_FROZEN, &md->flags); 2465 } 2466 2467 /* 2468 * We need to be able to change a mapping table under a mounted 2469 * filesystem. For example we might want to move some data in 2470 * the background. 
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
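/*
 * Illustrative sketch (not part of the original source): a noflush suspend,
 * as used by targets that must not flush queued I/O (e.g. multipath while
 * all paths are down), would roughly be:
 *
 *	r = dm_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
 *	if (r)
 *		return r;
 *	... reload or reconfigure the table ...
 *	return dm_resume(md);
 */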
int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that the targets can map them correctly.
	 * Request-based dm queues the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
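/*
 * Illustrative sketch (not part of the original source): an event consumer
 * such as the DM_DEV_WAIT ioctl path is expected to sample the counter first
 * and then sleep on it; dm_wait_event() returns nonzero if the sleep is
 * interrupted by a signal.  Roughly:
 *
 *	uint32_t nr = dm_get_event_nr(md);
 *
 *	if (dm_wait_event(md, nr))
 *		return -ERESTARTSYS;
 *	... report the new event ...
 */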
/*
 * The gendisk is only valid as long as you hold a reference
 * on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c,
 * so use this check to verify that the kobj is part of an md structure.
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = bioset_create(pool_size, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto free_bioset_and_out;

	return pools;

free_bioset_and_out:
	bioset_free(pools->bs);

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
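/*
 * Illustrative sketch (not part of the original source): the table-load path
 * is expected to size the pools for the table's type before the table is
 * bound, roughly as below.  'wants_integrity' is a placeholder for whatever
 * integrity decision the caller has made.
 *
 *	struct dm_md_mempools *pools;
 *
 *	pools = dm_alloc_md_mempools(DM_TYPE_BIO_BASED, wants_integrity);
 *	if (!pools)
 *		return -ENOMEM;
 *	...
 *	dm_free_md_mempools(pools);
 */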