1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/buffer_head.h> 18 #include <linux/mempool.h> 19 #include <linux/slab.h> 20 #include <linux/idr.h> 21 #include <linux/hdreg.h> 22 #include <linux/delay.h> 23 24 #include <trace/events/block.h> 25 26 #define DM_MSG_PREFIX "core" 27 28 /* 29 * Cookies are numeric values sent with CHANGE and REMOVE 30 * uevents while resuming, removing or renaming the device. 31 */ 32 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 33 #define DM_COOKIE_LENGTH 24 34 35 static const char *_name = DM_NAME; 36 37 static unsigned int major = 0; 38 static unsigned int _major = 0; 39 40 static DEFINE_IDR(_minor_idr); 41 42 static DEFINE_SPINLOCK(_minor_lock); 43 /* 44 * For bio-based dm. 45 * One of these is allocated per bio. 46 */ 47 struct dm_io { 48 struct mapped_device *md; 49 int error; 50 atomic_t io_count; 51 struct bio *bio; 52 unsigned long start_time; 53 spinlock_t endio_lock; 54 }; 55 56 /* 57 * For bio-based dm. 58 * One of these is allocated per target within a bio. Hopefully 59 * this will be simplified out one day. 60 */ 61 struct dm_target_io { 62 struct dm_io *io; 63 struct dm_target *ti; 64 union map_info info; 65 }; 66 67 /* 68 * For request-based dm. 69 * One of these is allocated per request. 70 */ 71 struct dm_rq_target_io { 72 struct mapped_device *md; 73 struct dm_target *ti; 74 struct request *orig, clone; 75 int error; 76 union map_info info; 77 }; 78 79 /* 80 * For request-based dm. 81 * One of these is allocated per bio. 82 */ 83 struct dm_rq_clone_bio_info { 84 struct bio *orig; 85 struct dm_rq_target_io *tio; 86 }; 87 88 union map_info *dm_get_mapinfo(struct bio *bio) 89 { 90 if (bio && bio->bi_private) 91 return &((struct dm_target_io *)bio->bi_private)->info; 92 return NULL; 93 } 94 95 union map_info *dm_get_rq_mapinfo(struct request *rq) 96 { 97 if (rq && rq->end_io_data) 98 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 99 return NULL; 100 } 101 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 102 103 #define MINOR_ALLOCED ((void *)-1) 104 105 /* 106 * Bits for the md->flags field. 107 */ 108 #define DMF_BLOCK_IO_FOR_SUSPEND 0 109 #define DMF_SUSPENDED 1 110 #define DMF_FROZEN 2 111 #define DMF_FREEING 3 112 #define DMF_DELETING 4 113 #define DMF_NOFLUSH_SUSPENDING 5 114 #define DMF_MERGE_IS_OPTIONAL 6 115 116 /* 117 * Work processed by per-device workqueue. 118 */ 119 struct mapped_device { 120 struct rw_semaphore io_lock; 121 struct mutex suspend_lock; 122 rwlock_t map_lock; 123 atomic_t holders; 124 atomic_t open_count; 125 126 unsigned long flags; 127 128 struct request_queue *queue; 129 unsigned type; 130 /* Protect queue and type against concurrent access. */ 131 struct mutex type_lock; 132 133 struct gendisk *disk; 134 char name[16]; 135 136 void *interface_ptr; 137 138 /* 139 * A list of ios that arrived while we were suspended. 140 */ 141 atomic_t pending[2]; 142 wait_queue_head_t wait; 143 struct work_struct work; 144 struct bio_list deferred; 145 spinlock_t deferred_lock; 146 147 /* 148 * Processing queue (flush) 149 */ 150 struct workqueue_struct *wq; 151 152 /* 153 * The current mapping. 
154 */ 155 struct dm_table *map; 156 157 /* 158 * io objects are allocated from here. 159 */ 160 mempool_t *io_pool; 161 mempool_t *tio_pool; 162 163 struct bio_set *bs; 164 165 /* 166 * Event handling. 167 */ 168 atomic_t event_nr; 169 wait_queue_head_t eventq; 170 atomic_t uevent_seq; 171 struct list_head uevent_list; 172 spinlock_t uevent_lock; /* Protect access to uevent_list */ 173 174 /* 175 * freeze/thaw support require holding onto a super block 176 */ 177 struct super_block *frozen_sb; 178 struct block_device *bdev; 179 180 /* forced geometry settings */ 181 struct hd_geometry geometry; 182 183 /* For saving the address of __make_request for request based dm */ 184 make_request_fn *saved_make_request_fn; 185 186 /* sysfs handle */ 187 struct kobject kobj; 188 189 /* zero-length flush that will be cloned and submitted to targets */ 190 struct bio flush_bio; 191 }; 192 193 /* 194 * For mempools pre-allocation at the table loading time. 195 */ 196 struct dm_md_mempools { 197 mempool_t *io_pool; 198 mempool_t *tio_pool; 199 struct bio_set *bs; 200 }; 201 202 #define MIN_IOS 256 203 static struct kmem_cache *_io_cache; 204 static struct kmem_cache *_tio_cache; 205 static struct kmem_cache *_rq_tio_cache; 206 static struct kmem_cache *_rq_bio_info_cache; 207 208 static int __init local_init(void) 209 { 210 int r = -ENOMEM; 211 212 /* allocate a slab for the dm_ios */ 213 _io_cache = KMEM_CACHE(dm_io, 0); 214 if (!_io_cache) 215 return r; 216 217 /* allocate a slab for the target ios */ 218 _tio_cache = KMEM_CACHE(dm_target_io, 0); 219 if (!_tio_cache) 220 goto out_free_io_cache; 221 222 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 223 if (!_rq_tio_cache) 224 goto out_free_tio_cache; 225 226 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 227 if (!_rq_bio_info_cache) 228 goto out_free_rq_tio_cache; 229 230 r = dm_uevent_init(); 231 if (r) 232 goto out_free_rq_bio_info_cache; 233 234 _major = major; 235 r = register_blkdev(_major, _name); 236 if (r < 0) 237 goto out_uevent_exit; 238 239 if (!_major) 240 _major = r; 241 242 return 0; 243 244 out_uevent_exit: 245 dm_uevent_exit(); 246 out_free_rq_bio_info_cache: 247 kmem_cache_destroy(_rq_bio_info_cache); 248 out_free_rq_tio_cache: 249 kmem_cache_destroy(_rq_tio_cache); 250 out_free_tio_cache: 251 kmem_cache_destroy(_tio_cache); 252 out_free_io_cache: 253 kmem_cache_destroy(_io_cache); 254 255 return r; 256 } 257 258 static void local_exit(void) 259 { 260 kmem_cache_destroy(_rq_bio_info_cache); 261 kmem_cache_destroy(_rq_tio_cache); 262 kmem_cache_destroy(_tio_cache); 263 kmem_cache_destroy(_io_cache); 264 unregister_blkdev(_major, _name); 265 dm_uevent_exit(); 266 267 _major = 0; 268 269 DMINFO("cleaned up"); 270 } 271 272 static int (*_inits[])(void) __initdata = { 273 local_init, 274 dm_target_init, 275 dm_linear_init, 276 dm_stripe_init, 277 dm_io_init, 278 dm_kcopyd_init, 279 dm_interface_init, 280 }; 281 282 static void (*_exits[])(void) = { 283 local_exit, 284 dm_target_exit, 285 dm_linear_exit, 286 dm_stripe_exit, 287 dm_io_exit, 288 dm_kcopyd_exit, 289 dm_interface_exit, 290 }; 291 292 static int __init dm_init(void) 293 { 294 const int count = ARRAY_SIZE(_inits); 295 296 int r, i; 297 298 for (i = 0; i < count; i++) { 299 r = _inits[i](); 300 if (r) 301 goto bad; 302 } 303 304 return 0; 305 306 bad: 307 while (i--) 308 _exits[i](); 309 310 return r; 311 } 312 313 static void __exit dm_exit(void) 314 { 315 int i = ARRAY_SIZE(_exits); 316 317 while (i--) 318 _exits[i](); 319 320 /* 321 * Should be empty by this point. 
322 */ 323 idr_remove_all(&_minor_idr); 324 idr_destroy(&_minor_idr); 325 } 326 327 /* 328 * Block device functions 329 */ 330 int dm_deleting_md(struct mapped_device *md) 331 { 332 return test_bit(DMF_DELETING, &md->flags); 333 } 334 335 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 336 { 337 struct mapped_device *md; 338 339 spin_lock(&_minor_lock); 340 341 md = bdev->bd_disk->private_data; 342 if (!md) 343 goto out; 344 345 if (test_bit(DMF_FREEING, &md->flags) || 346 dm_deleting_md(md)) { 347 md = NULL; 348 goto out; 349 } 350 351 dm_get(md); 352 atomic_inc(&md->open_count); 353 354 out: 355 spin_unlock(&_minor_lock); 356 357 return md ? 0 : -ENXIO; 358 } 359 360 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 361 { 362 struct mapped_device *md = disk->private_data; 363 364 spin_lock(&_minor_lock); 365 366 atomic_dec(&md->open_count); 367 dm_put(md); 368 369 spin_unlock(&_minor_lock); 370 371 return 0; 372 } 373 374 int dm_open_count(struct mapped_device *md) 375 { 376 return atomic_read(&md->open_count); 377 } 378 379 /* 380 * Guarantees nothing is using the device before it's deleted. 381 */ 382 int dm_lock_for_deletion(struct mapped_device *md) 383 { 384 int r = 0; 385 386 spin_lock(&_minor_lock); 387 388 if (dm_open_count(md)) 389 r = -EBUSY; 390 else 391 set_bit(DMF_DELETING, &md->flags); 392 393 spin_unlock(&_minor_lock); 394 395 return r; 396 } 397 398 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 399 { 400 struct mapped_device *md = bdev->bd_disk->private_data; 401 402 return dm_get_geometry(md, geo); 403 } 404 405 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 406 unsigned int cmd, unsigned long arg) 407 { 408 struct mapped_device *md = bdev->bd_disk->private_data; 409 struct dm_table *map = dm_get_live_table(md); 410 struct dm_target *tgt; 411 int r = -ENOTTY; 412 413 if (!map || !dm_table_get_size(map)) 414 goto out; 415 416 /* We only support devices that have a single target */ 417 if (dm_table_get_num_targets(map) != 1) 418 goto out; 419 420 tgt = dm_table_get_target(map, 0); 421 422 if (dm_suspended_md(md)) { 423 r = -EAGAIN; 424 goto out; 425 } 426 427 if (tgt->type->ioctl) 428 r = tgt->type->ioctl(tgt, cmd, arg); 429 430 out: 431 dm_table_put(map); 432 433 return r; 434 } 435 436 static struct dm_io *alloc_io(struct mapped_device *md) 437 { 438 return mempool_alloc(md->io_pool, GFP_NOIO); 439 } 440 441 static void free_io(struct mapped_device *md, struct dm_io *io) 442 { 443 mempool_free(io, md->io_pool); 444 } 445 446 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 447 { 448 mempool_free(tio, md->tio_pool); 449 } 450 451 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 452 gfp_t gfp_mask) 453 { 454 return mempool_alloc(md->tio_pool, gfp_mask); 455 } 456 457 static void free_rq_tio(struct dm_rq_target_io *tio) 458 { 459 mempool_free(tio, tio->md->tio_pool); 460 } 461 462 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 463 { 464 return mempool_alloc(md->io_pool, GFP_ATOMIC); 465 } 466 467 static void free_bio_info(struct dm_rq_clone_bio_info *info) 468 { 469 mempool_free(info, info->tio->md->io_pool); 470 } 471 472 static int md_in_flight(struct mapped_device *md) 473 { 474 return atomic_read(&md->pending[READ]) + 475 atomic_read(&md->pending[WRITE]); 476 } 477 478 static void start_io_acct(struct dm_io *io) 479 { 480 struct mapped_device *md = io->md; 481 int cpu; 482 int rw = bio_data_dir(io->bio); 483 484 
io->start_time = jiffies; 485 486 cpu = part_stat_lock(); 487 part_round_stats(cpu, &dm_disk(md)->part0); 488 part_stat_unlock(); 489 atomic_set(&dm_disk(md)->part0.in_flight[rw], 490 atomic_inc_return(&md->pending[rw])); 491 } 492 493 static void end_io_acct(struct dm_io *io) 494 { 495 struct mapped_device *md = io->md; 496 struct bio *bio = io->bio; 497 unsigned long duration = jiffies - io->start_time; 498 int pending, cpu; 499 int rw = bio_data_dir(bio); 500 501 cpu = part_stat_lock(); 502 part_round_stats(cpu, &dm_disk(md)->part0); 503 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 504 part_stat_unlock(); 505 506 /* 507 * After this is decremented the bio must not be touched if it is 508 * a flush. 509 */ 510 pending = atomic_dec_return(&md->pending[rw]); 511 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 512 pending += atomic_read(&md->pending[rw^0x1]); 513 514 /* nudge anyone waiting on suspend queue */ 515 if (!pending) 516 wake_up(&md->wait); 517 } 518 519 /* 520 * Add the bio to the list of deferred io. 521 */ 522 static void queue_io(struct mapped_device *md, struct bio *bio) 523 { 524 unsigned long flags; 525 526 spin_lock_irqsave(&md->deferred_lock, flags); 527 bio_list_add(&md->deferred, bio); 528 spin_unlock_irqrestore(&md->deferred_lock, flags); 529 queue_work(md->wq, &md->work); 530 } 531 532 /* 533 * Everyone (including functions in this file), should use this 534 * function to access the md->map field, and make sure they call 535 * dm_table_put() when finished. 536 */ 537 struct dm_table *dm_get_live_table(struct mapped_device *md) 538 { 539 struct dm_table *t; 540 unsigned long flags; 541 542 read_lock_irqsave(&md->map_lock, flags); 543 t = md->map; 544 if (t) 545 dm_table_get(t); 546 read_unlock_irqrestore(&md->map_lock, flags); 547 548 return t; 549 } 550 551 /* 552 * Get the geometry associated with a dm device 553 */ 554 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 555 { 556 *geo = md->geometry; 557 558 return 0; 559 } 560 561 /* 562 * Set the geometry of a device. 563 */ 564 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 565 { 566 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 567 568 if (geo->start > sz) { 569 DMWARN("Start sector is beyond the geometry limits."); 570 return -EINVAL; 571 } 572 573 md->geometry = *geo; 574 575 return 0; 576 } 577 578 /*----------------------------------------------------------------- 579 * CRUD START: 580 * A more elegant soln is in the works that uses the queue 581 * merge fn, unfortunately there are a couple of changes to 582 * the block layer that I want to make for this. So in the 583 * interests of getting something for people to use I give 584 * you this clearly demarcated crap. 585 *---------------------------------------------------------------*/ 586 587 static int __noflush_suspending(struct mapped_device *md) 588 { 589 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 590 } 591 592 /* 593 * Decrements the number of outstanding ios that a bio has been 594 * cloned into, completing the original io if necc. 
595 */ 596 static void dec_pending(struct dm_io *io, int error) 597 { 598 unsigned long flags; 599 int io_error; 600 struct bio *bio; 601 struct mapped_device *md = io->md; 602 603 /* Push-back supersedes any I/O errors */ 604 if (unlikely(error)) { 605 spin_lock_irqsave(&io->endio_lock, flags); 606 if (!(io->error > 0 && __noflush_suspending(md))) 607 io->error = error; 608 spin_unlock_irqrestore(&io->endio_lock, flags); 609 } 610 611 if (atomic_dec_and_test(&io->io_count)) { 612 if (io->error == DM_ENDIO_REQUEUE) { 613 /* 614 * Target requested pushing back the I/O. 615 */ 616 spin_lock_irqsave(&md->deferred_lock, flags); 617 if (__noflush_suspending(md)) 618 bio_list_add_head(&md->deferred, io->bio); 619 else 620 /* noflush suspend was interrupted. */ 621 io->error = -EIO; 622 spin_unlock_irqrestore(&md->deferred_lock, flags); 623 } 624 625 io_error = io->error; 626 bio = io->bio; 627 end_io_acct(io); 628 free_io(md, io); 629 630 if (io_error == DM_ENDIO_REQUEUE) 631 return; 632 633 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { 634 /* 635 * Preflush done for flush with data, reissue 636 * without REQ_FLUSH. 637 */ 638 bio->bi_rw &= ~REQ_FLUSH; 639 queue_io(md, bio); 640 } else { 641 /* done with normal IO or empty flush */ 642 trace_block_bio_complete(md->queue, bio, io_error); 643 bio_endio(bio, io_error); 644 } 645 } 646 } 647 648 static void clone_endio(struct bio *bio, int error) 649 { 650 int r = 0; 651 struct dm_target_io *tio = bio->bi_private; 652 struct dm_io *io = tio->io; 653 struct mapped_device *md = tio->io->md; 654 dm_endio_fn endio = tio->ti->type->end_io; 655 656 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 657 error = -EIO; 658 659 if (endio) { 660 r = endio(tio->ti, bio, error, &tio->info); 661 if (r < 0 || r == DM_ENDIO_REQUEUE) 662 /* 663 * error and requeue request are handled 664 * in dec_pending(). 665 */ 666 error = r; 667 else if (r == DM_ENDIO_INCOMPLETE) 668 /* The target will handle the io */ 669 return; 670 else if (r) { 671 DMWARN("unimplemented target endio return value: %d", r); 672 BUG(); 673 } 674 } 675 676 /* 677 * Store md for cleanup instead of tio which is about to get freed. 678 */ 679 bio->bi_private = md->bs; 680 681 free_tio(md, tio); 682 bio_put(bio); 683 dec_pending(io, error); 684 } 685 686 /* 687 * Partial completion handling for request-based dm 688 */ 689 static void end_clone_bio(struct bio *clone, int error) 690 { 691 struct dm_rq_clone_bio_info *info = clone->bi_private; 692 struct dm_rq_target_io *tio = info->tio; 693 struct bio *bio = info->orig; 694 unsigned int nr_bytes = info->orig->bi_size; 695 696 bio_put(clone); 697 698 if (tio->error) 699 /* 700 * An error has already been detected on the request. 701 * Once error occurred, just let clone->end_io() handle 702 * the remainder. 703 */ 704 return; 705 else if (error) { 706 /* 707 * Don't notice the error to the upper layer yet. 708 * The error handling decision is made by the target driver, 709 * when the request is completed. 710 */ 711 tio->error = error; 712 return; 713 } 714 715 /* 716 * I/O for the bio successfully completed. 717 * Notice the data completion to the upper layer. 718 */ 719 720 /* 721 * bios are processed from the head of the list. 722 * So the completing bio should always be rq->bio. 723 * If it's not, something wrong is happening. 724 */ 725 if (tio->orig->bio != bio) 726 DMERR("bio completion is going in the middle of the request"); 727 728 /* 729 * Update the original request. 
730 * Do not use blk_end_request() here, because it may complete 731 * the original request before the clone, and break the ordering. 732 */ 733 blk_update_request(tio->orig, 0, nr_bytes); 734 } 735 736 /* 737 * Don't touch any member of the md after calling this function because 738 * the md may be freed in dm_put() at the end of this function. 739 * Or do dm_get() before calling this function and dm_put() later. 740 */ 741 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 742 { 743 atomic_dec(&md->pending[rw]); 744 745 /* nudge anyone waiting on suspend queue */ 746 if (!md_in_flight(md)) 747 wake_up(&md->wait); 748 749 if (run_queue) 750 blk_run_queue(md->queue); 751 752 /* 753 * dm_put() must be at the end of this function. See the comment above 754 */ 755 dm_put(md); 756 } 757 758 static void free_rq_clone(struct request *clone) 759 { 760 struct dm_rq_target_io *tio = clone->end_io_data; 761 762 blk_rq_unprep_clone(clone); 763 free_rq_tio(tio); 764 } 765 766 /* 767 * Complete the clone and the original request. 768 * Must be called without queue lock. 769 */ 770 static void dm_end_request(struct request *clone, int error) 771 { 772 int rw = rq_data_dir(clone); 773 struct dm_rq_target_io *tio = clone->end_io_data; 774 struct mapped_device *md = tio->md; 775 struct request *rq = tio->orig; 776 777 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 778 rq->errors = clone->errors; 779 rq->resid_len = clone->resid_len; 780 781 if (rq->sense) 782 /* 783 * We are using the sense buffer of the original 784 * request. 785 * So setting the length of the sense data is enough. 786 */ 787 rq->sense_len = clone->sense_len; 788 } 789 790 free_rq_clone(clone); 791 blk_end_request_all(rq, error); 792 rq_completed(md, rw, true); 793 } 794 795 static void dm_unprep_request(struct request *rq) 796 { 797 struct request *clone = rq->special; 798 799 rq->special = NULL; 800 rq->cmd_flags &= ~REQ_DONTPREP; 801 802 free_rq_clone(clone); 803 } 804 805 /* 806 * Requeue the original request of a clone. 
807 */ 808 void dm_requeue_unmapped_request(struct request *clone) 809 { 810 int rw = rq_data_dir(clone); 811 struct dm_rq_target_io *tio = clone->end_io_data; 812 struct mapped_device *md = tio->md; 813 struct request *rq = tio->orig; 814 struct request_queue *q = rq->q; 815 unsigned long flags; 816 817 dm_unprep_request(rq); 818 819 spin_lock_irqsave(q->queue_lock, flags); 820 blk_requeue_request(q, rq); 821 spin_unlock_irqrestore(q->queue_lock, flags); 822 823 rq_completed(md, rw, 0); 824 } 825 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 826 827 static void __stop_queue(struct request_queue *q) 828 { 829 blk_stop_queue(q); 830 } 831 832 static void stop_queue(struct request_queue *q) 833 { 834 unsigned long flags; 835 836 spin_lock_irqsave(q->queue_lock, flags); 837 __stop_queue(q); 838 spin_unlock_irqrestore(q->queue_lock, flags); 839 } 840 841 static void __start_queue(struct request_queue *q) 842 { 843 if (blk_queue_stopped(q)) 844 blk_start_queue(q); 845 } 846 847 static void start_queue(struct request_queue *q) 848 { 849 unsigned long flags; 850 851 spin_lock_irqsave(q->queue_lock, flags); 852 __start_queue(q); 853 spin_unlock_irqrestore(q->queue_lock, flags); 854 } 855 856 static void dm_done(struct request *clone, int error, bool mapped) 857 { 858 int r = error; 859 struct dm_rq_target_io *tio = clone->end_io_data; 860 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 861 862 if (mapped && rq_end_io) 863 r = rq_end_io(tio->ti, clone, error, &tio->info); 864 865 if (r <= 0) 866 /* The target wants to complete the I/O */ 867 dm_end_request(clone, r); 868 else if (r == DM_ENDIO_INCOMPLETE) 869 /* The target will handle the I/O */ 870 return; 871 else if (r == DM_ENDIO_REQUEUE) 872 /* The target wants to requeue the I/O */ 873 dm_requeue_unmapped_request(clone); 874 else { 875 DMWARN("unimplemented target endio return value: %d", r); 876 BUG(); 877 } 878 } 879 880 /* 881 * Request completion handler for request-based dm 882 */ 883 static void dm_softirq_done(struct request *rq) 884 { 885 bool mapped = true; 886 struct request *clone = rq->completion_data; 887 struct dm_rq_target_io *tio = clone->end_io_data; 888 889 if (rq->cmd_flags & REQ_FAILED) 890 mapped = false; 891 892 dm_done(clone, tio->error, mapped); 893 } 894 895 /* 896 * Complete the clone and the original request with the error status 897 * through softirq context. 898 */ 899 static void dm_complete_request(struct request *clone, int error) 900 { 901 struct dm_rq_target_io *tio = clone->end_io_data; 902 struct request *rq = tio->orig; 903 904 tio->error = error; 905 rq->completion_data = clone; 906 blk_complete_request(rq); 907 } 908 909 /* 910 * Complete the not-mapped clone and the original request with the error status 911 * through softirq context. 912 * Target's rq_end_io() function isn't called. 913 * This may be used when the target's map_rq() function fails. 914 */ 915 void dm_kill_unmapped_request(struct request *clone, int error) 916 { 917 struct dm_rq_target_io *tio = clone->end_io_data; 918 struct request *rq = tio->orig; 919 920 rq->cmd_flags |= REQ_FAILED; 921 dm_complete_request(clone, error); 922 } 923 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 924 925 /* 926 * Called with the queue lock held 927 */ 928 static void end_clone_request(struct request *clone, int error) 929 { 930 /* 931 * For just cleaning up the information of the queue in which 932 * the clone was dispatched. 
933 * The clone is *NOT* freed actually here because it is alloced from 934 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 935 */ 936 __blk_put_request(clone->q, clone); 937 938 /* 939 * Actual request completion is done in a softirq context which doesn't 940 * hold the queue lock. Otherwise, deadlock could occur because: 941 * - another request may be submitted by the upper level driver 942 * of the stacking during the completion 943 * - the submission which requires queue lock may be done 944 * against this queue 945 */ 946 dm_complete_request(clone, error); 947 } 948 949 /* 950 * Return maximum size of I/O possible at the supplied sector up to the current 951 * target boundary. 952 */ 953 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 954 { 955 sector_t target_offset = dm_target_offset(ti, sector); 956 957 return ti->len - target_offset; 958 } 959 960 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 961 { 962 sector_t len = max_io_len_target_boundary(sector, ti); 963 964 /* 965 * Does the target need to split even further ? 966 */ 967 if (ti->split_io) { 968 sector_t boundary; 969 sector_t offset = dm_target_offset(ti, sector); 970 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 971 - offset; 972 if (len > boundary) 973 len = boundary; 974 } 975 976 return len; 977 } 978 979 static void __map_bio(struct dm_target *ti, struct bio *clone, 980 struct dm_target_io *tio) 981 { 982 int r; 983 sector_t sector; 984 struct mapped_device *md; 985 986 clone->bi_end_io = clone_endio; 987 clone->bi_private = tio; 988 989 /* 990 * Map the clone. If r == 0 we don't need to do 991 * anything, the target has assumed ownership of 992 * this io. 993 */ 994 atomic_inc(&tio->io->io_count); 995 sector = clone->bi_sector; 996 r = ti->type->map(ti, clone, &tio->info); 997 if (r == DM_MAPIO_REMAPPED) { 998 /* the bio has been remapped so dispatch it */ 999 1000 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1001 tio->io->bio->bi_bdev->bd_dev, sector); 1002 1003 generic_make_request(clone); 1004 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1005 /* error the io and bail out, or requeue it if needed */ 1006 md = tio->io->md; 1007 dec_pending(tio->io, r); 1008 /* 1009 * Store bio_set for cleanup. 1010 */ 1011 clone->bi_private = md->bs; 1012 bio_put(clone); 1013 free_tio(md, tio); 1014 } else if (r) { 1015 DMWARN("unimplemented target map return value: %d", r); 1016 BUG(); 1017 } 1018 } 1019 1020 struct clone_info { 1021 struct mapped_device *md; 1022 struct dm_table *map; 1023 struct bio *bio; 1024 struct dm_io *io; 1025 sector_t sector; 1026 sector_t sector_count; 1027 unsigned short idx; 1028 }; 1029 1030 static void dm_bio_destructor(struct bio *bio) 1031 { 1032 struct bio_set *bs = bio->bi_private; 1033 1034 bio_free(bio, bs); 1035 } 1036 1037 /* 1038 * Creates a little bio that just does part of a bvec. 
1039 */ 1040 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1041 unsigned short idx, unsigned int offset, 1042 unsigned int len, struct bio_set *bs) 1043 { 1044 struct bio *clone; 1045 struct bio_vec *bv = bio->bi_io_vec + idx; 1046 1047 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1048 clone->bi_destructor = dm_bio_destructor; 1049 *clone->bi_io_vec = *bv; 1050 1051 clone->bi_sector = sector; 1052 clone->bi_bdev = bio->bi_bdev; 1053 clone->bi_rw = bio->bi_rw; 1054 clone->bi_vcnt = 1; 1055 clone->bi_size = to_bytes(len); 1056 clone->bi_io_vec->bv_offset = offset; 1057 clone->bi_io_vec->bv_len = clone->bi_size; 1058 clone->bi_flags |= 1 << BIO_CLONED; 1059 1060 if (bio_integrity(bio)) { 1061 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1062 bio_integrity_trim(clone, 1063 bio_sector_offset(bio, idx, offset), len); 1064 } 1065 1066 return clone; 1067 } 1068 1069 /* 1070 * Creates a bio that consists of range of complete bvecs. 1071 */ 1072 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1073 unsigned short idx, unsigned short bv_count, 1074 unsigned int len, struct bio_set *bs) 1075 { 1076 struct bio *clone; 1077 1078 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1079 __bio_clone(clone, bio); 1080 clone->bi_destructor = dm_bio_destructor; 1081 clone->bi_sector = sector; 1082 clone->bi_idx = idx; 1083 clone->bi_vcnt = idx + bv_count; 1084 clone->bi_size = to_bytes(len); 1085 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1086 1087 if (bio_integrity(bio)) { 1088 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1089 1090 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1091 bio_integrity_trim(clone, 1092 bio_sector_offset(bio, idx, 0), len); 1093 } 1094 1095 return clone; 1096 } 1097 1098 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1099 struct dm_target *ti) 1100 { 1101 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1102 1103 tio->io = ci->io; 1104 tio->ti = ti; 1105 memset(&tio->info, 0, sizeof(tio->info)); 1106 1107 return tio; 1108 } 1109 1110 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1111 unsigned request_nr, sector_t len) 1112 { 1113 struct dm_target_io *tio = alloc_tio(ci, ti); 1114 struct bio *clone; 1115 1116 tio->info.target_request_nr = request_nr; 1117 1118 /* 1119 * Discard requests require the bio's inline iovecs be initialized. 1120 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1121 * and discard, so no need for concern about wasted bvec allocations. 1122 */ 1123 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); 1124 __bio_clone(clone, ci->bio); 1125 clone->bi_destructor = dm_bio_destructor; 1126 if (len) { 1127 clone->bi_sector = ci->sector; 1128 clone->bi_size = to_bytes(len); 1129 } 1130 1131 __map_bio(ti, clone, tio); 1132 } 1133 1134 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1135 unsigned num_requests, sector_t len) 1136 { 1137 unsigned request_nr; 1138 1139 for (request_nr = 0; request_nr < num_requests; request_nr++) 1140 __issue_target_request(ci, ti, request_nr, len); 1141 } 1142 1143 static int __clone_and_map_empty_flush(struct clone_info *ci) 1144 { 1145 unsigned target_nr = 0; 1146 struct dm_target *ti; 1147 1148 BUG_ON(bio_has_data(ci->bio)); 1149 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1150 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1151 1152 return 0; 1153 } 1154 1155 /* 1156 * Perform all io with a single clone. 
1157 */ 1158 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1159 { 1160 struct bio *clone, *bio = ci->bio; 1161 struct dm_target_io *tio; 1162 1163 tio = alloc_tio(ci, ti); 1164 clone = clone_bio(bio, ci->sector, ci->idx, 1165 bio->bi_vcnt - ci->idx, ci->sector_count, 1166 ci->md->bs); 1167 __map_bio(ti, clone, tio); 1168 ci->sector_count = 0; 1169 } 1170 1171 static int __clone_and_map_discard(struct clone_info *ci) 1172 { 1173 struct dm_target *ti; 1174 sector_t len; 1175 1176 do { 1177 ti = dm_table_find_target(ci->map, ci->sector); 1178 if (!dm_target_is_valid(ti)) 1179 return -EIO; 1180 1181 /* 1182 * Even though the device advertised discard support, 1183 * that does not mean every target supports it, and 1184 * reconfiguration might also have changed that since the 1185 * check was performed. 1186 */ 1187 if (!ti->num_discard_requests) 1188 return -EOPNOTSUPP; 1189 1190 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1191 1192 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1193 1194 ci->sector += len; 1195 } while (ci->sector_count -= len); 1196 1197 return 0; 1198 } 1199 1200 static int __clone_and_map(struct clone_info *ci) 1201 { 1202 struct bio *clone, *bio = ci->bio; 1203 struct dm_target *ti; 1204 sector_t len = 0, max; 1205 struct dm_target_io *tio; 1206 1207 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1208 return __clone_and_map_discard(ci); 1209 1210 ti = dm_table_find_target(ci->map, ci->sector); 1211 if (!dm_target_is_valid(ti)) 1212 return -EIO; 1213 1214 max = max_io_len(ci->sector, ti); 1215 1216 if (ci->sector_count <= max) { 1217 /* 1218 * Optimise for the simple case where we can do all of 1219 * the remaining io with a single clone. 1220 */ 1221 __clone_and_map_simple(ci, ti); 1222 1223 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1224 /* 1225 * There are some bvecs that don't span targets. 1226 * Do as many of these as possible. 1227 */ 1228 int i; 1229 sector_t remaining = max; 1230 sector_t bv_len; 1231 1232 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1233 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1234 1235 if (bv_len > remaining) 1236 break; 1237 1238 remaining -= bv_len; 1239 len += bv_len; 1240 } 1241 1242 tio = alloc_tio(ci, ti); 1243 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1244 ci->md->bs); 1245 __map_bio(ti, clone, tio); 1246 1247 ci->sector += len; 1248 ci->sector_count -= len; 1249 ci->idx = i; 1250 1251 } else { 1252 /* 1253 * Handle a bvec that must be split between two or more targets. 1254 */ 1255 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1256 sector_t remaining = to_sector(bv->bv_len); 1257 unsigned int offset = 0; 1258 1259 do { 1260 if (offset) { 1261 ti = dm_table_find_target(ci->map, ci->sector); 1262 if (!dm_target_is_valid(ti)) 1263 return -EIO; 1264 1265 max = max_io_len(ci->sector, ti); 1266 } 1267 1268 len = min(remaining, max); 1269 1270 tio = alloc_tio(ci, ti); 1271 clone = split_bvec(bio, ci->sector, ci->idx, 1272 bv->bv_offset + offset, len, 1273 ci->md->bs); 1274 1275 __map_bio(ti, clone, tio); 1276 1277 ci->sector += len; 1278 ci->sector_count -= len; 1279 offset += to_bytes(len); 1280 } while (remaining -= len); 1281 1282 ci->idx++; 1283 } 1284 1285 return 0; 1286 } 1287 1288 /* 1289 * Split the bio into several clones and submit it to targets. 
1290 */ 1291 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1292 { 1293 struct clone_info ci; 1294 int error = 0; 1295 1296 ci.map = dm_get_live_table(md); 1297 if (unlikely(!ci.map)) { 1298 bio_io_error(bio); 1299 return; 1300 } 1301 1302 ci.md = md; 1303 ci.io = alloc_io(md); 1304 ci.io->error = 0; 1305 atomic_set(&ci.io->io_count, 1); 1306 ci.io->bio = bio; 1307 ci.io->md = md; 1308 spin_lock_init(&ci.io->endio_lock); 1309 ci.sector = bio->bi_sector; 1310 ci.idx = bio->bi_idx; 1311 1312 start_io_acct(ci.io); 1313 if (bio->bi_rw & REQ_FLUSH) { 1314 ci.bio = &ci.md->flush_bio; 1315 ci.sector_count = 0; 1316 error = __clone_and_map_empty_flush(&ci); 1317 /* dec_pending submits any data associated with flush */ 1318 } else { 1319 ci.bio = bio; 1320 ci.sector_count = bio_sectors(bio); 1321 while (ci.sector_count && !error) 1322 error = __clone_and_map(&ci); 1323 } 1324 1325 /* drop the extra reference count */ 1326 dec_pending(ci.io, error); 1327 dm_table_put(ci.map); 1328 } 1329 /*----------------------------------------------------------------- 1330 * CRUD END 1331 *---------------------------------------------------------------*/ 1332 1333 static int dm_merge_bvec(struct request_queue *q, 1334 struct bvec_merge_data *bvm, 1335 struct bio_vec *biovec) 1336 { 1337 struct mapped_device *md = q->queuedata; 1338 struct dm_table *map = dm_get_live_table(md); 1339 struct dm_target *ti; 1340 sector_t max_sectors; 1341 int max_size = 0; 1342 1343 if (unlikely(!map)) 1344 goto out; 1345 1346 ti = dm_table_find_target(map, bvm->bi_sector); 1347 if (!dm_target_is_valid(ti)) 1348 goto out_table; 1349 1350 /* 1351 * Find maximum amount of I/O that won't need splitting 1352 */ 1353 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1354 (sector_t) BIO_MAX_SECTORS); 1355 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1356 if (max_size < 0) 1357 max_size = 0; 1358 1359 /* 1360 * merge_bvec_fn() returns number of bytes 1361 * it can accept at this offset 1362 * max is precomputed maximal io size 1363 */ 1364 if (max_size && ti->type->merge) 1365 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1366 /* 1367 * If the target doesn't support merge method and some of the devices 1368 * provided their merge_bvec method (we know this by looking at 1369 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1370 * entries. So always set max_size to 0, and the code below allows 1371 * just one page. 1372 */ 1373 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1374 1375 max_size = 0; 1376 1377 out_table: 1378 dm_table_put(map); 1379 1380 out: 1381 /* 1382 * Always allow an entire first page 1383 */ 1384 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1385 max_size = biovec->bv_len; 1386 1387 return max_size; 1388 } 1389 1390 /* 1391 * The request function that just remaps the bio built up by 1392 * dm_merge_bvec. 
1393 */ 1394 static int _dm_request(struct request_queue *q, struct bio *bio) 1395 { 1396 int rw = bio_data_dir(bio); 1397 struct mapped_device *md = q->queuedata; 1398 int cpu; 1399 1400 down_read(&md->io_lock); 1401 1402 cpu = part_stat_lock(); 1403 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1404 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1405 part_stat_unlock(); 1406 1407 /* if we're suspended, we have to queue this io for later */ 1408 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1409 up_read(&md->io_lock); 1410 1411 if (bio_rw(bio) != READA) 1412 queue_io(md, bio); 1413 else 1414 bio_io_error(bio); 1415 return 0; 1416 } 1417 1418 __split_and_process_bio(md, bio); 1419 up_read(&md->io_lock); 1420 return 0; 1421 } 1422 1423 static int dm_make_request(struct request_queue *q, struct bio *bio) 1424 { 1425 struct mapped_device *md = q->queuedata; 1426 1427 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1428 } 1429 1430 static int dm_request_based(struct mapped_device *md) 1431 { 1432 return blk_queue_stackable(md->queue); 1433 } 1434 1435 static int dm_request(struct request_queue *q, struct bio *bio) 1436 { 1437 struct mapped_device *md = q->queuedata; 1438 1439 if (dm_request_based(md)) 1440 return dm_make_request(q, bio); 1441 1442 return _dm_request(q, bio); 1443 } 1444 1445 void dm_dispatch_request(struct request *rq) 1446 { 1447 int r; 1448 1449 if (blk_queue_io_stat(rq->q)) 1450 rq->cmd_flags |= REQ_IO_STAT; 1451 1452 rq->start_time = jiffies; 1453 r = blk_insert_cloned_request(rq->q, rq); 1454 if (r) 1455 dm_complete_request(rq, r); 1456 } 1457 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1458 1459 static void dm_rq_bio_destructor(struct bio *bio) 1460 { 1461 struct dm_rq_clone_bio_info *info = bio->bi_private; 1462 struct mapped_device *md = info->tio->md; 1463 1464 free_bio_info(info); 1465 bio_free(bio, md->bs); 1466 } 1467 1468 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1469 void *data) 1470 { 1471 struct dm_rq_target_io *tio = data; 1472 struct mapped_device *md = tio->md; 1473 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1474 1475 if (!info) 1476 return -ENOMEM; 1477 1478 info->orig = bio_orig; 1479 info->tio = tio; 1480 bio->bi_end_io = end_clone_bio; 1481 bio->bi_private = info; 1482 bio->bi_destructor = dm_rq_bio_destructor; 1483 1484 return 0; 1485 } 1486 1487 static int setup_clone(struct request *clone, struct request *rq, 1488 struct dm_rq_target_io *tio) 1489 { 1490 int r; 1491 1492 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1493 dm_rq_bio_constructor, tio); 1494 if (r) 1495 return r; 1496 1497 clone->cmd = rq->cmd; 1498 clone->cmd_len = rq->cmd_len; 1499 clone->sense = rq->sense; 1500 clone->buffer = rq->buffer; 1501 clone->end_io = end_clone_request; 1502 clone->end_io_data = tio; 1503 1504 return 0; 1505 } 1506 1507 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1508 gfp_t gfp_mask) 1509 { 1510 struct request *clone; 1511 struct dm_rq_target_io *tio; 1512 1513 tio = alloc_rq_tio(md, gfp_mask); 1514 if (!tio) 1515 return NULL; 1516 1517 tio->md = md; 1518 tio->ti = NULL; 1519 tio->orig = rq; 1520 tio->error = 0; 1521 memset(&tio->info, 0, sizeof(tio->info)); 1522 1523 clone = &tio->clone; 1524 if (setup_clone(clone, rq, tio)) { 1525 /* -ENOMEM */ 1526 free_rq_tio(tio); 1527 return NULL; 1528 } 1529 1530 return clone; 1531 } 1532 1533 /* 1534 * Called with the queue lock held. 
1535 */ 1536 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1537 { 1538 struct mapped_device *md = q->queuedata; 1539 struct request *clone; 1540 1541 if (unlikely(rq->special)) { 1542 DMWARN("Already has something in rq->special."); 1543 return BLKPREP_KILL; 1544 } 1545 1546 clone = clone_rq(rq, md, GFP_ATOMIC); 1547 if (!clone) 1548 return BLKPREP_DEFER; 1549 1550 rq->special = clone; 1551 rq->cmd_flags |= REQ_DONTPREP; 1552 1553 return BLKPREP_OK; 1554 } 1555 1556 /* 1557 * Returns: 1558 * 0 : the request has been processed (not requeued) 1559 * !0 : the request has been requeued 1560 */ 1561 static int map_request(struct dm_target *ti, struct request *clone, 1562 struct mapped_device *md) 1563 { 1564 int r, requeued = 0; 1565 struct dm_rq_target_io *tio = clone->end_io_data; 1566 1567 /* 1568 * Hold the md reference here for the in-flight I/O. 1569 * We can't rely on the reference count by device opener, 1570 * because the device may be closed during the request completion 1571 * when all bios are completed. 1572 * See the comment in rq_completed() too. 1573 */ 1574 dm_get(md); 1575 1576 tio->ti = ti; 1577 r = ti->type->map_rq(ti, clone, &tio->info); 1578 switch (r) { 1579 case DM_MAPIO_SUBMITTED: 1580 /* The target has taken the I/O to submit by itself later */ 1581 break; 1582 case DM_MAPIO_REMAPPED: 1583 /* The target has remapped the I/O so dispatch it */ 1584 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1585 blk_rq_pos(tio->orig)); 1586 dm_dispatch_request(clone); 1587 break; 1588 case DM_MAPIO_REQUEUE: 1589 /* The target wants to requeue the I/O */ 1590 dm_requeue_unmapped_request(clone); 1591 requeued = 1; 1592 break; 1593 default: 1594 if (r > 0) { 1595 DMWARN("unimplemented target map return value: %d", r); 1596 BUG(); 1597 } 1598 1599 /* The target wants to complete the I/O */ 1600 dm_kill_unmapped_request(clone, r); 1601 break; 1602 } 1603 1604 return requeued; 1605 } 1606 1607 /* 1608 * q->request_fn for request-based dm. 1609 * Called with the queue lock held. 1610 */ 1611 static void dm_request_fn(struct request_queue *q) 1612 { 1613 struct mapped_device *md = q->queuedata; 1614 struct dm_table *map = dm_get_live_table(md); 1615 struct dm_target *ti; 1616 struct request *rq, *clone; 1617 sector_t pos; 1618 1619 /* 1620 * For suspend, check blk_queue_stopped() and increment 1621 * ->pending within a single queue_lock not to increment the 1622 * number of in-flight I/Os after the queue is stopped in 1623 * dm_suspend(). 
1624 */ 1625 while (!blk_queue_stopped(q)) { 1626 rq = blk_peek_request(q); 1627 if (!rq) 1628 goto delay_and_out; 1629 1630 /* always use block 0 to find the target for flushes for now */ 1631 pos = 0; 1632 if (!(rq->cmd_flags & REQ_FLUSH)) 1633 pos = blk_rq_pos(rq); 1634 1635 ti = dm_table_find_target(map, pos); 1636 BUG_ON(!dm_target_is_valid(ti)); 1637 1638 if (ti->type->busy && ti->type->busy(ti)) 1639 goto delay_and_out; 1640 1641 blk_start_request(rq); 1642 clone = rq->special; 1643 atomic_inc(&md->pending[rq_data_dir(clone)]); 1644 1645 spin_unlock(q->queue_lock); 1646 if (map_request(ti, clone, md)) 1647 goto requeued; 1648 1649 BUG_ON(!irqs_disabled()); 1650 spin_lock(q->queue_lock); 1651 } 1652 1653 goto out; 1654 1655 requeued: 1656 BUG_ON(!irqs_disabled()); 1657 spin_lock(q->queue_lock); 1658 1659 delay_and_out: 1660 blk_delay_queue(q, HZ / 10); 1661 out: 1662 dm_table_put(map); 1663 1664 return; 1665 } 1666 1667 int dm_underlying_device_busy(struct request_queue *q) 1668 { 1669 return blk_lld_busy(q); 1670 } 1671 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1672 1673 static int dm_lld_busy(struct request_queue *q) 1674 { 1675 int r; 1676 struct mapped_device *md = q->queuedata; 1677 struct dm_table *map = dm_get_live_table(md); 1678 1679 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1680 r = 1; 1681 else 1682 r = dm_table_any_busy_target(map); 1683 1684 dm_table_put(map); 1685 1686 return r; 1687 } 1688 1689 static int dm_any_congested(void *congested_data, int bdi_bits) 1690 { 1691 int r = bdi_bits; 1692 struct mapped_device *md = congested_data; 1693 struct dm_table *map; 1694 1695 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1696 map = dm_get_live_table(md); 1697 if (map) { 1698 /* 1699 * Request-based dm cares about only own queue for 1700 * the query about congestion status of request_queue 1701 */ 1702 if (dm_request_based(md)) 1703 r = md->queue->backing_dev_info.state & 1704 bdi_bits; 1705 else 1706 r = dm_table_any_congested(map, bdi_bits); 1707 1708 dm_table_put(map); 1709 } 1710 } 1711 1712 return r; 1713 } 1714 1715 /*----------------------------------------------------------------- 1716 * An IDR is used to keep track of allocated minor numbers. 1717 *---------------------------------------------------------------*/ 1718 static void free_minor(int minor) 1719 { 1720 spin_lock(&_minor_lock); 1721 idr_remove(&_minor_idr, minor); 1722 spin_unlock(&_minor_lock); 1723 } 1724 1725 /* 1726 * See if the device with a specific minor # is free. 
1727 */ 1728 static int specific_minor(int minor) 1729 { 1730 int r, m; 1731 1732 if (minor >= (1 << MINORBITS)) 1733 return -EINVAL; 1734 1735 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1736 if (!r) 1737 return -ENOMEM; 1738 1739 spin_lock(&_minor_lock); 1740 1741 if (idr_find(&_minor_idr, minor)) { 1742 r = -EBUSY; 1743 goto out; 1744 } 1745 1746 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1747 if (r) 1748 goto out; 1749 1750 if (m != minor) { 1751 idr_remove(&_minor_idr, m); 1752 r = -EBUSY; 1753 goto out; 1754 } 1755 1756 out: 1757 spin_unlock(&_minor_lock); 1758 return r; 1759 } 1760 1761 static int next_free_minor(int *minor) 1762 { 1763 int r, m; 1764 1765 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1766 if (!r) 1767 return -ENOMEM; 1768 1769 spin_lock(&_minor_lock); 1770 1771 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1772 if (r) 1773 goto out; 1774 1775 if (m >= (1 << MINORBITS)) { 1776 idr_remove(&_minor_idr, m); 1777 r = -ENOSPC; 1778 goto out; 1779 } 1780 1781 *minor = m; 1782 1783 out: 1784 spin_unlock(&_minor_lock); 1785 return r; 1786 } 1787 1788 static const struct block_device_operations dm_blk_dops; 1789 1790 static void dm_wq_work(struct work_struct *work); 1791 1792 static void dm_init_md_queue(struct mapped_device *md) 1793 { 1794 /* 1795 * Request-based dm devices cannot be stacked on top of bio-based dm 1796 * devices. The type of this dm device has not been decided yet. 1797 * The type is decided at the first table loading time. 1798 * To prevent problematic device stacking, clear the queue flag 1799 * for request stacking support until then. 1800 * 1801 * This queue is new, so no concurrency on the queue_flags. 1802 */ 1803 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1804 1805 md->queue->queuedata = md; 1806 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1807 md->queue->backing_dev_info.congested_data = md; 1808 blk_queue_make_request(md->queue, dm_request); 1809 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1810 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1811 } 1812 1813 /* 1814 * Allocate and initialise a blank device with a given minor. 
1815 */ 1816 static struct mapped_device *alloc_dev(int minor) 1817 { 1818 int r; 1819 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1820 void *old_md; 1821 1822 if (!md) { 1823 DMWARN("unable to allocate device, out of memory."); 1824 return NULL; 1825 } 1826 1827 if (!try_module_get(THIS_MODULE)) 1828 goto bad_module_get; 1829 1830 /* get a minor number for the dev */ 1831 if (minor == DM_ANY_MINOR) 1832 r = next_free_minor(&minor); 1833 else 1834 r = specific_minor(minor); 1835 if (r < 0) 1836 goto bad_minor; 1837 1838 md->type = DM_TYPE_NONE; 1839 init_rwsem(&md->io_lock); 1840 mutex_init(&md->suspend_lock); 1841 mutex_init(&md->type_lock); 1842 spin_lock_init(&md->deferred_lock); 1843 rwlock_init(&md->map_lock); 1844 atomic_set(&md->holders, 1); 1845 atomic_set(&md->open_count, 0); 1846 atomic_set(&md->event_nr, 0); 1847 atomic_set(&md->uevent_seq, 0); 1848 INIT_LIST_HEAD(&md->uevent_list); 1849 spin_lock_init(&md->uevent_lock); 1850 1851 md->queue = blk_alloc_queue(GFP_KERNEL); 1852 if (!md->queue) 1853 goto bad_queue; 1854 1855 dm_init_md_queue(md); 1856 1857 md->disk = alloc_disk(1); 1858 if (!md->disk) 1859 goto bad_disk; 1860 1861 atomic_set(&md->pending[0], 0); 1862 atomic_set(&md->pending[1], 0); 1863 init_waitqueue_head(&md->wait); 1864 INIT_WORK(&md->work, dm_wq_work); 1865 init_waitqueue_head(&md->eventq); 1866 1867 md->disk->major = _major; 1868 md->disk->first_minor = minor; 1869 md->disk->fops = &dm_blk_dops; 1870 md->disk->queue = md->queue; 1871 md->disk->private_data = md; 1872 sprintf(md->disk->disk_name, "dm-%d", minor); 1873 add_disk(md->disk); 1874 format_dev_t(md->name, MKDEV(_major, minor)); 1875 1876 md->wq = alloc_workqueue("kdmflush", 1877 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1878 if (!md->wq) 1879 goto bad_thread; 1880 1881 md->bdev = bdget_disk(md->disk, 0); 1882 if (!md->bdev) 1883 goto bad_bdev; 1884 1885 bio_init(&md->flush_bio); 1886 md->flush_bio.bi_bdev = md->bdev; 1887 md->flush_bio.bi_rw = WRITE_FLUSH; 1888 1889 /* Populate the mapping, nobody knows we exist yet */ 1890 spin_lock(&_minor_lock); 1891 old_md = idr_replace(&_minor_idr, md, minor); 1892 spin_unlock(&_minor_lock); 1893 1894 BUG_ON(old_md != MINOR_ALLOCED); 1895 1896 return md; 1897 1898 bad_bdev: 1899 destroy_workqueue(md->wq); 1900 bad_thread: 1901 del_gendisk(md->disk); 1902 put_disk(md->disk); 1903 bad_disk: 1904 blk_cleanup_queue(md->queue); 1905 bad_queue: 1906 free_minor(minor); 1907 bad_minor: 1908 module_put(THIS_MODULE); 1909 bad_module_get: 1910 kfree(md); 1911 return NULL; 1912 } 1913 1914 static void unlock_fs(struct mapped_device *md); 1915 1916 static void free_dev(struct mapped_device *md) 1917 { 1918 int minor = MINOR(disk_devt(md->disk)); 1919 1920 unlock_fs(md); 1921 bdput(md->bdev); 1922 destroy_workqueue(md->wq); 1923 if (md->tio_pool) 1924 mempool_destroy(md->tio_pool); 1925 if (md->io_pool) 1926 mempool_destroy(md->io_pool); 1927 if (md->bs) 1928 bioset_free(md->bs); 1929 blk_integrity_unregister(md->disk); 1930 del_gendisk(md->disk); 1931 free_minor(minor); 1932 1933 spin_lock(&_minor_lock); 1934 md->disk->private_data = NULL; 1935 spin_unlock(&_minor_lock); 1936 1937 put_disk(md->disk); 1938 blk_cleanup_queue(md->queue); 1939 module_put(THIS_MODULE); 1940 kfree(md); 1941 } 1942 1943 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1944 { 1945 struct dm_md_mempools *p; 1946 1947 if (md->io_pool && md->tio_pool && md->bs) 1948 /* the md already has necessary mempools */ 1949 goto out; 1950 1951 p = 
dm_table_get_md_mempools(t); 1952 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1953 1954 md->io_pool = p->io_pool; 1955 p->io_pool = NULL; 1956 md->tio_pool = p->tio_pool; 1957 p->tio_pool = NULL; 1958 md->bs = p->bs; 1959 p->bs = NULL; 1960 1961 out: 1962 /* mempool bind completed, now no need any mempools in the table */ 1963 dm_table_free_md_mempools(t); 1964 } 1965 1966 /* 1967 * Bind a table to the device. 1968 */ 1969 static void event_callback(void *context) 1970 { 1971 unsigned long flags; 1972 LIST_HEAD(uevents); 1973 struct mapped_device *md = (struct mapped_device *) context; 1974 1975 spin_lock_irqsave(&md->uevent_lock, flags); 1976 list_splice_init(&md->uevent_list, &uevents); 1977 spin_unlock_irqrestore(&md->uevent_lock, flags); 1978 1979 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 1980 1981 atomic_inc(&md->event_nr); 1982 wake_up(&md->eventq); 1983 } 1984 1985 /* 1986 * Protected by md->suspend_lock obtained by dm_swap_table(). 1987 */ 1988 static void __set_size(struct mapped_device *md, sector_t size) 1989 { 1990 set_capacity(md->disk, size); 1991 1992 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1993 } 1994 1995 /* 1996 * Return 1 if the queue has a compulsory merge_bvec_fn function. 1997 * 1998 * If this function returns 0, then the device is either a non-dm 1999 * device without a merge_bvec_fn, or it is a dm device that is 2000 * able to split any bios it receives that are too big. 2001 */ 2002 int dm_queue_merge_is_compulsory(struct request_queue *q) 2003 { 2004 struct mapped_device *dev_md; 2005 2006 if (!q->merge_bvec_fn) 2007 return 0; 2008 2009 if (q->make_request_fn == dm_request) { 2010 dev_md = q->queuedata; 2011 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2012 return 0; 2013 } 2014 2015 return 1; 2016 } 2017 2018 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2019 struct dm_dev *dev, sector_t start, 2020 sector_t len, void *data) 2021 { 2022 struct block_device *bdev = dev->bdev; 2023 struct request_queue *q = bdev_get_queue(bdev); 2024 2025 return dm_queue_merge_is_compulsory(q); 2026 } 2027 2028 /* 2029 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2030 * on the properties of the underlying devices. 2031 */ 2032 static int dm_table_merge_is_optional(struct dm_table *table) 2033 { 2034 unsigned i = 0; 2035 struct dm_target *ti; 2036 2037 while (i < dm_table_get_num_targets(table)) { 2038 ti = dm_table_get_target(table, i++); 2039 2040 if (ti->type->iterate_devices && 2041 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2042 return 0; 2043 } 2044 2045 return 1; 2046 } 2047 2048 /* 2049 * Returns old map, which caller must destroy. 2050 */ 2051 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2052 struct queue_limits *limits) 2053 { 2054 struct dm_table *old_map; 2055 struct request_queue *q = md->queue; 2056 sector_t size; 2057 unsigned long flags; 2058 int merge_is_optional; 2059 2060 size = dm_table_get_size(t); 2061 2062 /* 2063 * Wipe any geometry if the size of the table changed. 2064 */ 2065 if (size != get_capacity(md->disk)) 2066 memset(&md->geometry, 0, sizeof(md->geometry)); 2067 2068 __set_size(md, size); 2069 2070 dm_table_event_callback(t, event_callback, md); 2071 2072 /* 2073 * The queue hasn't been stopped yet, if the old table type wasn't 2074 * for request-based during suspension. So stop it to prevent 2075 * I/O mapping before resume. 
2076 * This must be done before setting the queue restrictions, 2077 * because request-based dm may be run just after the setting. 2078 */ 2079 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2080 stop_queue(q); 2081 2082 __bind_mempools(md, t); 2083 2084 merge_is_optional = dm_table_merge_is_optional(t); 2085 2086 write_lock_irqsave(&md->map_lock, flags); 2087 old_map = md->map; 2088 md->map = t; 2089 dm_table_set_restrictions(t, q, limits); 2090 if (merge_is_optional) 2091 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2092 else 2093 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2094 write_unlock_irqrestore(&md->map_lock, flags); 2095 2096 return old_map; 2097 } 2098 2099 /* 2100 * Returns unbound table for the caller to free. 2101 */ 2102 static struct dm_table *__unbind(struct mapped_device *md) 2103 { 2104 struct dm_table *map = md->map; 2105 unsigned long flags; 2106 2107 if (!map) 2108 return NULL; 2109 2110 dm_table_event_callback(map, NULL, NULL); 2111 write_lock_irqsave(&md->map_lock, flags); 2112 md->map = NULL; 2113 write_unlock_irqrestore(&md->map_lock, flags); 2114 2115 return map; 2116 } 2117 2118 /* 2119 * Constructor for a new device. 2120 */ 2121 int dm_create(int minor, struct mapped_device **result) 2122 { 2123 struct mapped_device *md; 2124 2125 md = alloc_dev(minor); 2126 if (!md) 2127 return -ENXIO; 2128 2129 dm_sysfs_init(md); 2130 2131 *result = md; 2132 return 0; 2133 } 2134 2135 /* 2136 * Functions to manage md->type. 2137 * All are required to hold md->type_lock. 2138 */ 2139 void dm_lock_md_type(struct mapped_device *md) 2140 { 2141 mutex_lock(&md->type_lock); 2142 } 2143 2144 void dm_unlock_md_type(struct mapped_device *md) 2145 { 2146 mutex_unlock(&md->type_lock); 2147 } 2148 2149 void dm_set_md_type(struct mapped_device *md, unsigned type) 2150 { 2151 md->type = type; 2152 } 2153 2154 unsigned dm_get_md_type(struct mapped_device *md) 2155 { 2156 return md->type; 2157 } 2158 2159 /* 2160 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
2161 */ 2162 static int dm_init_request_based_queue(struct mapped_device *md) 2163 { 2164 struct request_queue *q = NULL; 2165 2166 if (md->queue->elevator) 2167 return 1; 2168 2169 /* Fully initialize the queue */ 2170 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2171 if (!q) 2172 return 0; 2173 2174 md->queue = q; 2175 md->saved_make_request_fn = md->queue->make_request_fn; 2176 dm_init_md_queue(md); 2177 blk_queue_softirq_done(md->queue, dm_softirq_done); 2178 blk_queue_prep_rq(md->queue, dm_prep_fn); 2179 blk_queue_lld_busy(md->queue, dm_lld_busy); 2180 2181 elv_register_queue(md->queue); 2182 2183 return 1; 2184 } 2185 2186 /* 2187 * Setup the DM device's queue based on md's type 2188 */ 2189 int dm_setup_md_queue(struct mapped_device *md) 2190 { 2191 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2192 !dm_init_request_based_queue(md)) { 2193 DMWARN("Cannot initialize queue for request-based mapped device"); 2194 return -EINVAL; 2195 } 2196 2197 return 0; 2198 } 2199 2200 static struct mapped_device *dm_find_md(dev_t dev) 2201 { 2202 struct mapped_device *md; 2203 unsigned minor = MINOR(dev); 2204 2205 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2206 return NULL; 2207 2208 spin_lock(&_minor_lock); 2209 2210 md = idr_find(&_minor_idr, minor); 2211 if (md && (md == MINOR_ALLOCED || 2212 (MINOR(disk_devt(dm_disk(md))) != minor) || 2213 dm_deleting_md(md) || 2214 test_bit(DMF_FREEING, &md->flags))) { 2215 md = NULL; 2216 goto out; 2217 } 2218 2219 out: 2220 spin_unlock(&_minor_lock); 2221 2222 return md; 2223 } 2224 2225 struct mapped_device *dm_get_md(dev_t dev) 2226 { 2227 struct mapped_device *md = dm_find_md(dev); 2228 2229 if (md) 2230 dm_get(md); 2231 2232 return md; 2233 } 2234 2235 void *dm_get_mdptr(struct mapped_device *md) 2236 { 2237 return md->interface_ptr; 2238 } 2239 2240 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2241 { 2242 md->interface_ptr = ptr; 2243 } 2244 2245 void dm_get(struct mapped_device *md) 2246 { 2247 atomic_inc(&md->holders); 2248 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2249 } 2250 2251 const char *dm_device_name(struct mapped_device *md) 2252 { 2253 return md->name; 2254 } 2255 EXPORT_SYMBOL_GPL(dm_device_name); 2256 2257 static void __dm_destroy(struct mapped_device *md, bool wait) 2258 { 2259 struct dm_table *map; 2260 2261 might_sleep(); 2262 2263 spin_lock(&_minor_lock); 2264 map = dm_get_live_table(md); 2265 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2266 set_bit(DMF_FREEING, &md->flags); 2267 spin_unlock(&_minor_lock); 2268 2269 if (!dm_suspended_md(md)) { 2270 dm_table_presuspend_targets(map); 2271 dm_table_postsuspend_targets(map); 2272 } 2273 2274 /* 2275 * Rare, but there may be I/O requests still going to complete, 2276 * for example. Wait for all references to disappear. 2277 * No one should increment the reference count of the mapped_device, 2278 * after the mapped_device state becomes DMF_FREEING. 2279 */ 2280 if (wait) 2281 while (atomic_read(&md->holders)) 2282 msleep(1); 2283 else if (atomic_read(&md->holders)) 2284 DMWARN("%s: Forcibly removing mapped_device still in use! 
static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;

	might_sleep();

	spin_lock(&_minor_lock);
	map = dm_get_live_table(md);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		dm_table_postsuspend_targets(map);
	}

	/*
	 * Rare, but there may be I/O requests still going to complete,
	 * for example.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device,
	 * after the mapped_device state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_put(map);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		smp_mb();
		if (!md_in_flight(md))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_read(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c)
			break;

		up_read(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else
			__split_and_process_bio(md, c);

		down_read(&md->io_lock);
	}

	up_read(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r) {
		map = ERR_PTR(r);
		goto out;
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}
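
/*
 * Illustrative sketch only (kept as a comment, not compiled): a table
 * reload as seen by a caller.  dm_swap_table() insists that the device is
 * already suspended, so the swap is bracketed by dm_suspend()/dm_resume()
 * (both defined below), and the table handed back by the swap is destroyed
 * once the device is running again.  Error handling of the new table is
 * left to the caller; example_reload() is a hypothetical name.
 *
 *	static int example_reload(struct mapped_device *md,
 *				  struct dm_table *new_table)
 *	{
 *		struct dm_table *old_table;
 *		int r;
 *
 *		r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *		if (r)
 *			return r;
 *
 *		old_table = dm_swap_table(md, new_table);
 *		if (IS_ERR(old_table)) {
 *			dm_resume(md);
 *			return PTR_ERR(old_table);
 *		}
 *
 *		r = dm_resume(md);
 *		if (old_table)
 *			dm_table_destroy(old_table);
 *
 *		return r;
 *	}
 */
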
/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
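
/*
 * Note on suspend_flags (illustrative only, kept as a comment, not
 * compiled): DM_SUSPEND_LOCKFS_FLAG asks for a filesystem freeze via
 * lock_fs() before quiescing; DM_SUSPEND_NOFLUSH_FLAG lets targets push
 * back in-flight I/O instead of flushing it and, as the code above notes,
 * takes precedence over the lockfs request.
 *
 *	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);	(flush and freeze fs)
 *	r = dm_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);	(noflush suspend)
 */
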
int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
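
/*
 * Illustrative sketch only (kept as a comment, not compiled): how the event
 * counter is meant to be consumed.  A caller samples the counter, re-checks
 * whatever state it is interested in, and only then sleeps, so no event can
 * be lost between the check and the wait.  example_wait_for_change() is a
 * hypothetical name.
 *
 *	static int example_wait_for_change(struct mapped_device *md)
 *	{
 *		uint32_t seen = dm_get_event_nr(md);
 *
 *		(re-check device state here)
 *
 *		return dm_wait_event(md, seen);
 *	}
 */
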
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = bioset_create(pool_size, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto free_bioset_and_out;

	return pools;

free_bioset_and_out:
	bioset_free(pools->bs);

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");