/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);
/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
};

/*
 * For bio-based dm.
 * One of these is allocated per target within a bio. Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per bio.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct dm_target_io *)bio->bi_private)->info;
	return NULL;
}

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_QUEUE_IO_TO_THREAD 6

/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct rw_semaphore io_lock;
	struct mutex suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	struct request_queue *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending[2];
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * An error from the barrier request currently being processed.
	 */
	int barrier_error;

	/*
	 * Protect barrier_error from concurrent endio processing
	 * in request-based dm.
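	 * Bio-based dm processes at most one barrier at a time (see the
	 * comment in dec_pending()), so barrier_error can be written
	 * there without locking; request-based dm may complete several
	 * barrier clones concurrently from irq context, hence the
	 * spinlock taken in store_barrier_error().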
148 */ 149 spinlock_t barrier_error_lock; 150 151 /* 152 * Processing queue (flush/barriers) 153 */ 154 struct workqueue_struct *wq; 155 struct work_struct barrier_work; 156 157 /* A pointer to the currently processing pre/post flush request */ 158 struct request *flush_request; 159 160 /* 161 * The current mapping. 162 */ 163 struct dm_table *map; 164 165 /* 166 * io objects are allocated from here. 167 */ 168 mempool_t *io_pool; 169 mempool_t *tio_pool; 170 171 struct bio_set *bs; 172 173 /* 174 * Event handling. 175 */ 176 atomic_t event_nr; 177 wait_queue_head_t eventq; 178 atomic_t uevent_seq; 179 struct list_head uevent_list; 180 spinlock_t uevent_lock; /* Protect access to uevent_list */ 181 182 /* 183 * freeze/thaw support require holding onto a super block 184 */ 185 struct super_block *frozen_sb; 186 struct block_device *bdev; 187 188 /* forced geometry settings */ 189 struct hd_geometry geometry; 190 191 /* For saving the address of __make_request for request based dm */ 192 make_request_fn *saved_make_request_fn; 193 194 /* sysfs handle */ 195 struct kobject kobj; 196 197 /* zero-length barrier that will be cloned and submitted to targets */ 198 struct bio barrier_bio; 199 }; 200 201 /* 202 * For mempools pre-allocation at the table loading time. 203 */ 204 struct dm_md_mempools { 205 mempool_t *io_pool; 206 mempool_t *tio_pool; 207 struct bio_set *bs; 208 }; 209 210 #define MIN_IOS 256 211 static struct kmem_cache *_io_cache; 212 static struct kmem_cache *_tio_cache; 213 static struct kmem_cache *_rq_tio_cache; 214 static struct kmem_cache *_rq_bio_info_cache; 215 216 static int __init local_init(void) 217 { 218 int r = -ENOMEM; 219 220 /* allocate a slab for the dm_ios */ 221 _io_cache = KMEM_CACHE(dm_io, 0); 222 if (!_io_cache) 223 return r; 224 225 /* allocate a slab for the target ios */ 226 _tio_cache = KMEM_CACHE(dm_target_io, 0); 227 if (!_tio_cache) 228 goto out_free_io_cache; 229 230 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 231 if (!_rq_tio_cache) 232 goto out_free_tio_cache; 233 234 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 235 if (!_rq_bio_info_cache) 236 goto out_free_rq_tio_cache; 237 238 r = dm_uevent_init(); 239 if (r) 240 goto out_free_rq_bio_info_cache; 241 242 _major = major; 243 r = register_blkdev(_major, _name); 244 if (r < 0) 245 goto out_uevent_exit; 246 247 if (!_major) 248 _major = r; 249 250 return 0; 251 252 out_uevent_exit: 253 dm_uevent_exit(); 254 out_free_rq_bio_info_cache: 255 kmem_cache_destroy(_rq_bio_info_cache); 256 out_free_rq_tio_cache: 257 kmem_cache_destroy(_rq_tio_cache); 258 out_free_tio_cache: 259 kmem_cache_destroy(_tio_cache); 260 out_free_io_cache: 261 kmem_cache_destroy(_io_cache); 262 263 return r; 264 } 265 266 static void local_exit(void) 267 { 268 kmem_cache_destroy(_rq_bio_info_cache); 269 kmem_cache_destroy(_rq_tio_cache); 270 kmem_cache_destroy(_tio_cache); 271 kmem_cache_destroy(_io_cache); 272 unregister_blkdev(_major, _name); 273 dm_uevent_exit(); 274 275 _major = 0; 276 277 DMINFO("cleaned up"); 278 } 279 280 static int (*_inits[])(void) __initdata = { 281 local_init, 282 dm_target_init, 283 dm_linear_init, 284 dm_stripe_init, 285 dm_io_init, 286 dm_kcopyd_init, 287 dm_interface_init, 288 }; 289 290 static void (*_exits[])(void) = { 291 local_exit, 292 dm_target_exit, 293 dm_linear_exit, 294 dm_stripe_exit, 295 dm_io_exit, 296 dm_kcopyd_exit, 297 dm_interface_exit, 298 }; 299 300 static int __init dm_init(void) 301 { 302 const int count = ARRAY_SIZE(_inits); 303 304 int r, i; 305 306 for (i = 0; i 
< count; i++) { 307 r = _inits[i](); 308 if (r) 309 goto bad; 310 } 311 312 return 0; 313 314 bad: 315 while (i--) 316 _exits[i](); 317 318 return r; 319 } 320 321 static void __exit dm_exit(void) 322 { 323 int i = ARRAY_SIZE(_exits); 324 325 while (i--) 326 _exits[i](); 327 } 328 329 /* 330 * Block device functions 331 */ 332 int dm_deleting_md(struct mapped_device *md) 333 { 334 return test_bit(DMF_DELETING, &md->flags); 335 } 336 337 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 338 { 339 struct mapped_device *md; 340 341 spin_lock(&_minor_lock); 342 343 md = bdev->bd_disk->private_data; 344 if (!md) 345 goto out; 346 347 if (test_bit(DMF_FREEING, &md->flags) || 348 dm_deleting_md(md)) { 349 md = NULL; 350 goto out; 351 } 352 353 dm_get(md); 354 atomic_inc(&md->open_count); 355 356 out: 357 spin_unlock(&_minor_lock); 358 359 return md ? 0 : -ENXIO; 360 } 361 362 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 363 { 364 struct mapped_device *md = disk->private_data; 365 atomic_dec(&md->open_count); 366 dm_put(md); 367 return 0; 368 } 369 370 int dm_open_count(struct mapped_device *md) 371 { 372 return atomic_read(&md->open_count); 373 } 374 375 /* 376 * Guarantees nothing is using the device before it's deleted. 377 */ 378 int dm_lock_for_deletion(struct mapped_device *md) 379 { 380 int r = 0; 381 382 spin_lock(&_minor_lock); 383 384 if (dm_open_count(md)) 385 r = -EBUSY; 386 else 387 set_bit(DMF_DELETING, &md->flags); 388 389 spin_unlock(&_minor_lock); 390 391 return r; 392 } 393 394 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 395 { 396 struct mapped_device *md = bdev->bd_disk->private_data; 397 398 return dm_get_geometry(md, geo); 399 } 400 401 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 402 unsigned int cmd, unsigned long arg) 403 { 404 struct mapped_device *md = bdev->bd_disk->private_data; 405 struct dm_table *map = dm_get_live_table(md); 406 struct dm_target *tgt; 407 int r = -ENOTTY; 408 409 if (!map || !dm_table_get_size(map)) 410 goto out; 411 412 /* We only support devices that have a single target */ 413 if (dm_table_get_num_targets(map) != 1) 414 goto out; 415 416 tgt = dm_table_get_target(map, 0); 417 418 if (dm_suspended_md(md)) { 419 r = -EAGAIN; 420 goto out; 421 } 422 423 if (tgt->type->ioctl) 424 r = tgt->type->ioctl(tgt, cmd, arg); 425 426 out: 427 dm_table_put(map); 428 429 return r; 430 } 431 432 static struct dm_io *alloc_io(struct mapped_device *md) 433 { 434 return mempool_alloc(md->io_pool, GFP_NOIO); 435 } 436 437 static void free_io(struct mapped_device *md, struct dm_io *io) 438 { 439 mempool_free(io, md->io_pool); 440 } 441 442 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 443 { 444 mempool_free(tio, md->tio_pool); 445 } 446 447 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 448 gfp_t gfp_mask) 449 { 450 return mempool_alloc(md->tio_pool, gfp_mask); 451 } 452 453 static void free_rq_tio(struct dm_rq_target_io *tio) 454 { 455 mempool_free(tio, tio->md->tio_pool); 456 } 457 458 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 459 { 460 return mempool_alloc(md->io_pool, GFP_ATOMIC); 461 } 462 463 static void free_bio_info(struct dm_rq_clone_bio_info *info) 464 { 465 mempool_free(info, info->tio->md->io_pool); 466 } 467 468 static int md_in_flight(struct mapped_device *md) 469 { 470 return atomic_read(&md->pending[READ]) + 471 atomic_read(&md->pending[WRITE]); 472 } 473 474 static void 
start_io_acct(struct dm_io *io) 475 { 476 struct mapped_device *md = io->md; 477 int cpu; 478 int rw = bio_data_dir(io->bio); 479 480 io->start_time = jiffies; 481 482 cpu = part_stat_lock(); 483 part_round_stats(cpu, &dm_disk(md)->part0); 484 part_stat_unlock(); 485 dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); 486 } 487 488 static void end_io_acct(struct dm_io *io) 489 { 490 struct mapped_device *md = io->md; 491 struct bio *bio = io->bio; 492 unsigned long duration = jiffies - io->start_time; 493 int pending, cpu; 494 int rw = bio_data_dir(bio); 495 496 cpu = part_stat_lock(); 497 part_round_stats(cpu, &dm_disk(md)->part0); 498 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 499 part_stat_unlock(); 500 501 /* 502 * After this is decremented the bio must not be touched if it is 503 * a barrier. 504 */ 505 dm_disk(md)->part0.in_flight[rw] = pending = 506 atomic_dec_return(&md->pending[rw]); 507 pending += atomic_read(&md->pending[rw^0x1]); 508 509 /* nudge anyone waiting on suspend queue */ 510 if (!pending) 511 wake_up(&md->wait); 512 } 513 514 /* 515 * Add the bio to the list of deferred io. 516 */ 517 static void queue_io(struct mapped_device *md, struct bio *bio) 518 { 519 down_write(&md->io_lock); 520 521 spin_lock_irq(&md->deferred_lock); 522 bio_list_add(&md->deferred, bio); 523 spin_unlock_irq(&md->deferred_lock); 524 525 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) 526 queue_work(md->wq, &md->work); 527 528 up_write(&md->io_lock); 529 } 530 531 /* 532 * Everyone (including functions in this file), should use this 533 * function to access the md->map field, and make sure they call 534 * dm_table_put() when finished. 535 */ 536 struct dm_table *dm_get_live_table(struct mapped_device *md) 537 { 538 struct dm_table *t; 539 unsigned long flags; 540 541 read_lock_irqsave(&md->map_lock, flags); 542 t = md->map; 543 if (t) 544 dm_table_get(t); 545 read_unlock_irqrestore(&md->map_lock, flags); 546 547 return t; 548 } 549 550 /* 551 * Get the geometry associated with a dm device 552 */ 553 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 554 { 555 *geo = md->geometry; 556 557 return 0; 558 } 559 560 /* 561 * Set the geometry of a device. 562 */ 563 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 564 { 565 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 566 567 if (geo->start > sz) { 568 DMWARN("Start sector is beyond the geometry limits."); 569 return -EINVAL; 570 } 571 572 md->geometry = *geo; 573 574 return 0; 575 } 576 577 /*----------------------------------------------------------------- 578 * CRUD START: 579 * A more elegant soln is in the works that uses the queue 580 * merge fn, unfortunately there are a couple of changes to 581 * the block layer that I want to make for this. So in the 582 * interests of getting something for people to use I give 583 * you this clearly demarcated crap. 584 *---------------------------------------------------------------*/ 585 586 static int __noflush_suspending(struct mapped_device *md) 587 { 588 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 589 } 590 591 /* 592 * Decrements the number of outstanding ios that a bio has been 593 * cloned into, completing the original io if necc. 
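 *
 * Illustrative example: a bio spanning two targets is cloned twice, so
 *
 *	__split_and_process_bio() sets           io_count = 1
 *	__map_bio() for clone 1:                 io_count = 2
 *	__map_bio() for clone 2:                 io_count = 3
 *	clone 1 completes (clone_endio):         io_count = 2
 *	clone 2 completes (clone_endio):         io_count = 1
 *	__split_and_process_bio() drops its ref: io_count = 0
 *
 * and only the final decrement completes the original bio.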
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->error > 0 && __noflush_suspending(md)))
			io->error = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md)) {
				if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
					bio_list_add_head(&md->deferred,
							  io->bio);
			} else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;

		if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
			/*
			 * There can be just one barrier request so we use
			 * a per-device variable for error reporting.
			 * Note that you can't touch the bio after end_io_acct
			 */
			if (!md->barrier_error && io_error != -EOPNOTSUPP)
				md->barrier_error = io_error;
			end_io_acct(io);
			free_io(md, io);
		} else {
			end_io_acct(io);
			free_io(md, io);

			if (io_error != DM_ENDIO_REQUEUE) {
				trace_block_bio_complete(md->queue, bio);

				bio_endio(bio, io_error);
			}
		}
	}
}

static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info = clone->bi_private;
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io()
		 * handle the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver,
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Report the data completion to the upper layer.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something is wrong.
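	 *
	 * Note that blk_update_request() below only advances the original
	 * request by nr_bytes; for a normal (non-barrier) request the
	 * original is finally completed by dm_end_request(), via
	 * blk_end_request_all(), once the whole clone has finished, so a
	 * partial completion can never overtake the clone itself.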
728 */ 729 if (tio->orig->bio != bio) 730 DMERR("bio completion is going in the middle of the request"); 731 732 /* 733 * Update the original request. 734 * Do not use blk_end_request() here, because it may complete 735 * the original request before the clone, and break the ordering. 736 */ 737 blk_update_request(tio->orig, 0, nr_bytes); 738 } 739 740 static void store_barrier_error(struct mapped_device *md, int error) 741 { 742 unsigned long flags; 743 744 spin_lock_irqsave(&md->barrier_error_lock, flags); 745 /* 746 * Basically, the first error is taken, but: 747 * -EOPNOTSUPP supersedes any I/O error. 748 * Requeue request supersedes any I/O error but -EOPNOTSUPP. 749 */ 750 if (!md->barrier_error || error == -EOPNOTSUPP || 751 (md->barrier_error != -EOPNOTSUPP && 752 error == DM_ENDIO_REQUEUE)) 753 md->barrier_error = error; 754 spin_unlock_irqrestore(&md->barrier_error_lock, flags); 755 } 756 757 /* 758 * Don't touch any member of the md after calling this function because 759 * the md may be freed in dm_put() at the end of this function. 760 * Or do dm_get() before calling this function and dm_put() later. 761 */ 762 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 763 { 764 atomic_dec(&md->pending[rw]); 765 766 /* nudge anyone waiting on suspend queue */ 767 if (!md_in_flight(md)) 768 wake_up(&md->wait); 769 770 if (run_queue) 771 blk_run_queue(md->queue); 772 773 /* 774 * dm_put() must be at the end of this function. See the comment above 775 */ 776 dm_put(md); 777 } 778 779 static void free_rq_clone(struct request *clone) 780 { 781 struct dm_rq_target_io *tio = clone->end_io_data; 782 783 blk_rq_unprep_clone(clone); 784 free_rq_tio(tio); 785 } 786 787 /* 788 * Complete the clone and the original request. 789 * Must be called without queue lock. 790 */ 791 static void dm_end_request(struct request *clone, int error) 792 { 793 int rw = rq_data_dir(clone); 794 int run_queue = 1; 795 bool is_barrier = blk_barrier_rq(clone); 796 struct dm_rq_target_io *tio = clone->end_io_data; 797 struct mapped_device *md = tio->md; 798 struct request *rq = tio->orig; 799 800 if (blk_pc_request(rq) && !is_barrier) { 801 rq->errors = clone->errors; 802 rq->resid_len = clone->resid_len; 803 804 if (rq->sense) 805 /* 806 * We are using the sense buffer of the original 807 * request. 808 * So setting the length of the sense data is enough. 809 */ 810 rq->sense_len = clone->sense_len; 811 } 812 813 free_rq_clone(clone); 814 815 if (unlikely(is_barrier)) { 816 if (unlikely(error)) 817 store_barrier_error(md, error); 818 run_queue = 0; 819 } else 820 blk_end_request_all(rq, error); 821 822 rq_completed(md, rw, run_queue); 823 } 824 825 static void dm_unprep_request(struct request *rq) 826 { 827 struct request *clone = rq->special; 828 829 rq->special = NULL; 830 rq->cmd_flags &= ~REQ_DONTPREP; 831 832 free_rq_clone(clone); 833 } 834 835 /* 836 * Requeue the original request of a clone. 837 */ 838 void dm_requeue_unmapped_request(struct request *clone) 839 { 840 int rw = rq_data_dir(clone); 841 struct dm_rq_target_io *tio = clone->end_io_data; 842 struct mapped_device *md = tio->md; 843 struct request *rq = tio->orig; 844 struct request_queue *q = rq->q; 845 unsigned long flags; 846 847 if (unlikely(blk_barrier_rq(clone))) { 848 /* 849 * Barrier clones share an original request. 850 * Leave it to dm_end_request(), which handles this special 851 * case. 
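		 *
		 * For a barrier clone, dm_end_request() records
		 * DM_ENDIO_REQUEUE through store_barrier_error() instead
		 * of ending the original request, and dm_rq_barrier_work()
		 * later requeues the original flush request itself.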
852 */ 853 dm_end_request(clone, DM_ENDIO_REQUEUE); 854 return; 855 } 856 857 dm_unprep_request(rq); 858 859 spin_lock_irqsave(q->queue_lock, flags); 860 if (elv_queue_empty(q)) 861 blk_plug_device(q); 862 blk_requeue_request(q, rq); 863 spin_unlock_irqrestore(q->queue_lock, flags); 864 865 rq_completed(md, rw, 0); 866 } 867 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 868 869 static void __stop_queue(struct request_queue *q) 870 { 871 blk_stop_queue(q); 872 } 873 874 static void stop_queue(struct request_queue *q) 875 { 876 unsigned long flags; 877 878 spin_lock_irqsave(q->queue_lock, flags); 879 __stop_queue(q); 880 spin_unlock_irqrestore(q->queue_lock, flags); 881 } 882 883 static void __start_queue(struct request_queue *q) 884 { 885 if (blk_queue_stopped(q)) 886 blk_start_queue(q); 887 } 888 889 static void start_queue(struct request_queue *q) 890 { 891 unsigned long flags; 892 893 spin_lock_irqsave(q->queue_lock, flags); 894 __start_queue(q); 895 spin_unlock_irqrestore(q->queue_lock, flags); 896 } 897 898 static void dm_done(struct request *clone, int error, bool mapped) 899 { 900 int r = error; 901 struct dm_rq_target_io *tio = clone->end_io_data; 902 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 903 904 if (mapped && rq_end_io) 905 r = rq_end_io(tio->ti, clone, error, &tio->info); 906 907 if (r <= 0) 908 /* The target wants to complete the I/O */ 909 dm_end_request(clone, r); 910 else if (r == DM_ENDIO_INCOMPLETE) 911 /* The target will handle the I/O */ 912 return; 913 else if (r == DM_ENDIO_REQUEUE) 914 /* The target wants to requeue the I/O */ 915 dm_requeue_unmapped_request(clone); 916 else { 917 DMWARN("unimplemented target endio return value: %d", r); 918 BUG(); 919 } 920 } 921 922 /* 923 * Request completion handler for request-based dm 924 */ 925 static void dm_softirq_done(struct request *rq) 926 { 927 bool mapped = true; 928 struct request *clone = rq->completion_data; 929 struct dm_rq_target_io *tio = clone->end_io_data; 930 931 if (rq->cmd_flags & REQ_FAILED) 932 mapped = false; 933 934 dm_done(clone, tio->error, mapped); 935 } 936 937 /* 938 * Complete the clone and the original request with the error status 939 * through softirq context. 940 */ 941 static void dm_complete_request(struct request *clone, int error) 942 { 943 struct dm_rq_target_io *tio = clone->end_io_data; 944 struct request *rq = tio->orig; 945 946 if (unlikely(blk_barrier_rq(clone))) { 947 /* 948 * Barrier clones share an original request. So can't use 949 * softirq_done with the original. 950 * Pass the clone to dm_done() directly in this special case. 951 * It is safe (even if clone->q->queue_lock is held here) 952 * because there is no I/O dispatching during the completion 953 * of barrier clone. 954 */ 955 dm_done(clone, error, true); 956 return; 957 } 958 959 tio->error = error; 960 rq->completion_data = clone; 961 blk_complete_request(rq); 962 } 963 964 /* 965 * Complete the not-mapped clone and the original request with the error status 966 * through softirq context. 967 * Target's rq_end_io() function isn't called. 968 * This may be used when the target's map_rq() function fails. 969 */ 970 void dm_kill_unmapped_request(struct request *clone, int error) 971 { 972 struct dm_rq_target_io *tio = clone->end_io_data; 973 struct request *rq = tio->orig; 974 975 if (unlikely(blk_barrier_rq(clone))) { 976 /* 977 * Barrier clones share an original request. 978 * Leave it to dm_end_request(), which handles this special 979 * case. 
		 */
		BUG_ON(error > 0);
		dm_end_request(clone, error);
		return;
	}

	rq->cmd_flags |= REQ_FAILED;
	dm_complete_request(clone, error);
}
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);

/*
 * Called with the queue lock held
 */
static void end_clone_request(struct request *clone, int error)
{
	/*
	 * This is just for cleaning up the bookkeeping of the queue in
	 * which the clone was dispatched.
	 * The clone is *NOT* actually freed here, because it was allocated
	 * from dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
	 */
	__blk_put_request(clone->q, clone);

	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the queue lock.  Otherwise, deadlock could occur because:
	 *     - another request may be submitted by the upper level driver
	 *       of the stacking during the completion
	 *     - the submission which requires queue lock may be done
	 *       against this queue
	 */
	dm_complete_request(clone, error);
}

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
				  tio->io->bio->bi_bdev->bd_dev, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}

/*
 * Creates a little bio that just does part of a bvec.
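 *
 * Illustrative example: if a bvec covers 8 sectors but only 3 of them
 * fit in the current target, __clone_and_map() calls split_bvec() once
 * for those 3 sectors and then again, with bv_offset advanced
 * accordingly, for the 5 sectors that belong to the next target.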
1095 */ 1096 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1097 unsigned short idx, unsigned int offset, 1098 unsigned int len, struct bio_set *bs) 1099 { 1100 struct bio *clone; 1101 struct bio_vec *bv = bio->bi_io_vec + idx; 1102 1103 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1104 clone->bi_destructor = dm_bio_destructor; 1105 *clone->bi_io_vec = *bv; 1106 1107 clone->bi_sector = sector; 1108 clone->bi_bdev = bio->bi_bdev; 1109 clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER); 1110 clone->bi_vcnt = 1; 1111 clone->bi_size = to_bytes(len); 1112 clone->bi_io_vec->bv_offset = offset; 1113 clone->bi_io_vec->bv_len = clone->bi_size; 1114 clone->bi_flags |= 1 << BIO_CLONED; 1115 1116 if (bio_integrity(bio)) { 1117 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1118 bio_integrity_trim(clone, 1119 bio_sector_offset(bio, idx, offset), len); 1120 } 1121 1122 return clone; 1123 } 1124 1125 /* 1126 * Creates a bio that consists of range of complete bvecs. 1127 */ 1128 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1129 unsigned short idx, unsigned short bv_count, 1130 unsigned int len, struct bio_set *bs) 1131 { 1132 struct bio *clone; 1133 1134 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1135 __bio_clone(clone, bio); 1136 clone->bi_rw &= ~(1 << BIO_RW_BARRIER); 1137 clone->bi_destructor = dm_bio_destructor; 1138 clone->bi_sector = sector; 1139 clone->bi_idx = idx; 1140 clone->bi_vcnt = idx + bv_count; 1141 clone->bi_size = to_bytes(len); 1142 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1143 1144 if (bio_integrity(bio)) { 1145 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1146 1147 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1148 bio_integrity_trim(clone, 1149 bio_sector_offset(bio, idx, 0), len); 1150 } 1151 1152 return clone; 1153 } 1154 1155 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1156 struct dm_target *ti) 1157 { 1158 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1159 1160 tio->io = ci->io; 1161 tio->ti = ti; 1162 memset(&tio->info, 0, sizeof(tio->info)); 1163 1164 return tio; 1165 } 1166 1167 static void __flush_target(struct clone_info *ci, struct dm_target *ti, 1168 unsigned flush_nr) 1169 { 1170 struct dm_target_io *tio = alloc_tio(ci, ti); 1171 struct bio *clone; 1172 1173 tio->info.flush_request = flush_nr; 1174 1175 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1176 __bio_clone(clone, ci->bio); 1177 clone->bi_destructor = dm_bio_destructor; 1178 1179 __map_bio(ti, clone, tio); 1180 } 1181 1182 static int __clone_and_map_empty_barrier(struct clone_info *ci) 1183 { 1184 unsigned target_nr = 0, flush_nr; 1185 struct dm_target *ti; 1186 1187 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1188 for (flush_nr = 0; flush_nr < ti->num_flush_requests; 1189 flush_nr++) 1190 __flush_target(ci, ti, flush_nr); 1191 1192 ci->sector_count = 0; 1193 1194 return 0; 1195 } 1196 1197 static int __clone_and_map(struct clone_info *ci) 1198 { 1199 struct bio *clone, *bio = ci->bio; 1200 struct dm_target *ti; 1201 sector_t len = 0, max; 1202 struct dm_target_io *tio; 1203 1204 if (unlikely(bio_empty_barrier(bio))) 1205 return __clone_and_map_empty_barrier(ci); 1206 1207 ti = dm_table_find_target(ci->map, ci->sector); 1208 if (!dm_target_is_valid(ti)) 1209 return -EIO; 1210 1211 max = max_io_len(ci->md, ci->sector, ti); 1212 1213 /* 1214 * Allocate a target io object. 
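	 *
	 * The three branches below handle, in order: the whole remainder
	 * of the bio fitting in this target (a single clone), a run of
	 * complete bvecs fitting (clone_bio() over that run), and a
	 * single bvec that straddles the target boundary and must be
	 * carved up with split_bvec().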
1215 */ 1216 tio = alloc_tio(ci, ti); 1217 1218 if (ci->sector_count <= max) { 1219 /* 1220 * Optimise for the simple case where we can do all of 1221 * the remaining io with a single clone. 1222 */ 1223 clone = clone_bio(bio, ci->sector, ci->idx, 1224 bio->bi_vcnt - ci->idx, ci->sector_count, 1225 ci->md->bs); 1226 __map_bio(ti, clone, tio); 1227 ci->sector_count = 0; 1228 1229 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1230 /* 1231 * There are some bvecs that don't span targets. 1232 * Do as many of these as possible. 1233 */ 1234 int i; 1235 sector_t remaining = max; 1236 sector_t bv_len; 1237 1238 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1239 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1240 1241 if (bv_len > remaining) 1242 break; 1243 1244 remaining -= bv_len; 1245 len += bv_len; 1246 } 1247 1248 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1249 ci->md->bs); 1250 __map_bio(ti, clone, tio); 1251 1252 ci->sector += len; 1253 ci->sector_count -= len; 1254 ci->idx = i; 1255 1256 } else { 1257 /* 1258 * Handle a bvec that must be split between two or more targets. 1259 */ 1260 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1261 sector_t remaining = to_sector(bv->bv_len); 1262 unsigned int offset = 0; 1263 1264 do { 1265 if (offset) { 1266 ti = dm_table_find_target(ci->map, ci->sector); 1267 if (!dm_target_is_valid(ti)) 1268 return -EIO; 1269 1270 max = max_io_len(ci->md, ci->sector, ti); 1271 1272 tio = alloc_tio(ci, ti); 1273 } 1274 1275 len = min(remaining, max); 1276 1277 clone = split_bvec(bio, ci->sector, ci->idx, 1278 bv->bv_offset + offset, len, 1279 ci->md->bs); 1280 1281 __map_bio(ti, clone, tio); 1282 1283 ci->sector += len; 1284 ci->sector_count -= len; 1285 offset += to_bytes(len); 1286 } while (remaining -= len); 1287 1288 ci->idx++; 1289 } 1290 1291 return 0; 1292 } 1293 1294 /* 1295 * Split the bio into several clones and submit it to targets. 
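 *
 * For reference, the bio-based submission path is roughly:
 *
 *	dm_request() -> _dm_request() -> __split_and_process_bio()
 *	  -> __clone_and_map() for each piece -> __map_bio()
 *	  -> ti->type->map() -> generic_make_request() on the clone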
1296 */ 1297 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1298 { 1299 struct clone_info ci; 1300 int error = 0; 1301 1302 ci.map = dm_get_live_table(md); 1303 if (unlikely(!ci.map)) { 1304 if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) 1305 bio_io_error(bio); 1306 else 1307 if (!md->barrier_error) 1308 md->barrier_error = -EIO; 1309 return; 1310 } 1311 1312 ci.md = md; 1313 ci.bio = bio; 1314 ci.io = alloc_io(md); 1315 ci.io->error = 0; 1316 atomic_set(&ci.io->io_count, 1); 1317 ci.io->bio = bio; 1318 ci.io->md = md; 1319 spin_lock_init(&ci.io->endio_lock); 1320 ci.sector = bio->bi_sector; 1321 ci.sector_count = bio_sectors(bio); 1322 if (unlikely(bio_empty_barrier(bio))) 1323 ci.sector_count = 1; 1324 ci.idx = bio->bi_idx; 1325 1326 start_io_acct(ci.io); 1327 while (ci.sector_count && !error) 1328 error = __clone_and_map(&ci); 1329 1330 /* drop the extra reference count */ 1331 dec_pending(ci.io, error); 1332 dm_table_put(ci.map); 1333 } 1334 /*----------------------------------------------------------------- 1335 * CRUD END 1336 *---------------------------------------------------------------*/ 1337 1338 static int dm_merge_bvec(struct request_queue *q, 1339 struct bvec_merge_data *bvm, 1340 struct bio_vec *biovec) 1341 { 1342 struct mapped_device *md = q->queuedata; 1343 struct dm_table *map = dm_get_live_table(md); 1344 struct dm_target *ti; 1345 sector_t max_sectors; 1346 int max_size = 0; 1347 1348 if (unlikely(!map)) 1349 goto out; 1350 1351 ti = dm_table_find_target(map, bvm->bi_sector); 1352 if (!dm_target_is_valid(ti)) 1353 goto out_table; 1354 1355 /* 1356 * Find maximum amount of I/O that won't need splitting 1357 */ 1358 max_sectors = min(max_io_len(md, bvm->bi_sector, ti), 1359 (sector_t) BIO_MAX_SECTORS); 1360 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1361 if (max_size < 0) 1362 max_size = 0; 1363 1364 /* 1365 * merge_bvec_fn() returns number of bytes 1366 * it can accept at this offset 1367 * max is precomputed maximal io size 1368 */ 1369 if (max_size && ti->type->merge) 1370 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1371 /* 1372 * If the target doesn't support merge method and some of the devices 1373 * provided their merge_bvec method (we know this by looking at 1374 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1375 * entries. So always set max_size to 0, and the code below allows 1376 * just one page. 1377 */ 1378 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1379 1380 max_size = 0; 1381 1382 out_table: 1383 dm_table_put(map); 1384 1385 out: 1386 /* 1387 * Always allow an entire first page 1388 */ 1389 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1390 max_size = biovec->bv_len; 1391 1392 return max_size; 1393 } 1394 1395 /* 1396 * The request function that just remaps the bio built up by 1397 * dm_merge_bvec. 1398 */ 1399 static int _dm_request(struct request_queue *q, struct bio *bio) 1400 { 1401 int rw = bio_data_dir(bio); 1402 struct mapped_device *md = q->queuedata; 1403 int cpu; 1404 1405 down_read(&md->io_lock); 1406 1407 cpu = part_stat_lock(); 1408 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1409 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1410 part_stat_unlock(); 1411 1412 /* 1413 * If we're suspended or the thread is processing barriers 1414 * we have to queue this io for later. 
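	 *
	 * (While blocked for suspend, readahead bios are simply failed
	 * below rather than deferred; everything queued here is replayed
	 * later by dm_wq_work().)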
1415 */ 1416 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || 1417 unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 1418 up_read(&md->io_lock); 1419 1420 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1421 bio_rw(bio) == READA) { 1422 bio_io_error(bio); 1423 return 0; 1424 } 1425 1426 queue_io(md, bio); 1427 1428 return 0; 1429 } 1430 1431 __split_and_process_bio(md, bio); 1432 up_read(&md->io_lock); 1433 return 0; 1434 } 1435 1436 static int dm_make_request(struct request_queue *q, struct bio *bio) 1437 { 1438 struct mapped_device *md = q->queuedata; 1439 1440 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1441 } 1442 1443 static int dm_request_based(struct mapped_device *md) 1444 { 1445 return blk_queue_stackable(md->queue); 1446 } 1447 1448 static int dm_request(struct request_queue *q, struct bio *bio) 1449 { 1450 struct mapped_device *md = q->queuedata; 1451 1452 if (dm_request_based(md)) 1453 return dm_make_request(q, bio); 1454 1455 return _dm_request(q, bio); 1456 } 1457 1458 /* 1459 * Mark this request as flush request, so that dm_request_fn() can 1460 * recognize. 1461 */ 1462 static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq) 1463 { 1464 rq->cmd_type = REQ_TYPE_LINUX_BLOCK; 1465 rq->cmd[0] = REQ_LB_OP_FLUSH; 1466 } 1467 1468 static bool dm_rq_is_flush_request(struct request *rq) 1469 { 1470 if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK && 1471 rq->cmd[0] == REQ_LB_OP_FLUSH) 1472 return true; 1473 else 1474 return false; 1475 } 1476 1477 void dm_dispatch_request(struct request *rq) 1478 { 1479 int r; 1480 1481 if (blk_queue_io_stat(rq->q)) 1482 rq->cmd_flags |= REQ_IO_STAT; 1483 1484 rq->start_time = jiffies; 1485 r = blk_insert_cloned_request(rq->q, rq); 1486 if (r) 1487 dm_complete_request(rq, r); 1488 } 1489 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1490 1491 static void dm_rq_bio_destructor(struct bio *bio) 1492 { 1493 struct dm_rq_clone_bio_info *info = bio->bi_private; 1494 struct mapped_device *md = info->tio->md; 1495 1496 free_bio_info(info); 1497 bio_free(bio, md->bs); 1498 } 1499 1500 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1501 void *data) 1502 { 1503 struct dm_rq_target_io *tio = data; 1504 struct mapped_device *md = tio->md; 1505 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1506 1507 if (!info) 1508 return -ENOMEM; 1509 1510 info->orig = bio_orig; 1511 info->tio = tio; 1512 bio->bi_end_io = end_clone_bio; 1513 bio->bi_private = info; 1514 bio->bi_destructor = dm_rq_bio_destructor; 1515 1516 return 0; 1517 } 1518 1519 static int setup_clone(struct request *clone, struct request *rq, 1520 struct dm_rq_target_io *tio) 1521 { 1522 int r; 1523 1524 if (dm_rq_is_flush_request(rq)) { 1525 blk_rq_init(NULL, clone); 1526 clone->cmd_type = REQ_TYPE_FS; 1527 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); 1528 } else { 1529 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1530 dm_rq_bio_constructor, tio); 1531 if (r) 1532 return r; 1533 1534 clone->cmd = rq->cmd; 1535 clone->cmd_len = rq->cmd_len; 1536 clone->sense = rq->sense; 1537 clone->buffer = rq->buffer; 1538 } 1539 1540 clone->end_io = end_clone_request; 1541 clone->end_io_data = tio; 1542 1543 return 0; 1544 } 1545 1546 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1547 gfp_t gfp_mask) 1548 { 1549 struct request *clone; 1550 struct dm_rq_target_io *tio; 1551 1552 tio = alloc_rq_tio(md, gfp_mask); 1553 if (!tio) 1554 return NULL; 1555 1556 tio->md = md; 1557 tio->ti = 
NULL; 1558 tio->orig = rq; 1559 tio->error = 0; 1560 memset(&tio->info, 0, sizeof(tio->info)); 1561 1562 clone = &tio->clone; 1563 if (setup_clone(clone, rq, tio)) { 1564 /* -ENOMEM */ 1565 free_rq_tio(tio); 1566 return NULL; 1567 } 1568 1569 return clone; 1570 } 1571 1572 /* 1573 * Called with the queue lock held. 1574 */ 1575 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1576 { 1577 struct mapped_device *md = q->queuedata; 1578 struct request *clone; 1579 1580 if (unlikely(dm_rq_is_flush_request(rq))) 1581 return BLKPREP_OK; 1582 1583 if (unlikely(rq->special)) { 1584 DMWARN("Already has something in rq->special."); 1585 return BLKPREP_KILL; 1586 } 1587 1588 clone = clone_rq(rq, md, GFP_ATOMIC); 1589 if (!clone) 1590 return BLKPREP_DEFER; 1591 1592 rq->special = clone; 1593 rq->cmd_flags |= REQ_DONTPREP; 1594 1595 return BLKPREP_OK; 1596 } 1597 1598 /* 1599 * Returns: 1600 * 0 : the request has been processed (not requeued) 1601 * !0 : the request has been requeued 1602 */ 1603 static int map_request(struct dm_target *ti, struct request *clone, 1604 struct mapped_device *md) 1605 { 1606 int r, requeued = 0; 1607 struct dm_rq_target_io *tio = clone->end_io_data; 1608 1609 /* 1610 * Hold the md reference here for the in-flight I/O. 1611 * We can't rely on the reference count by device opener, 1612 * because the device may be closed during the request completion 1613 * when all bios are completed. 1614 * See the comment in rq_completed() too. 1615 */ 1616 dm_get(md); 1617 1618 tio->ti = ti; 1619 r = ti->type->map_rq(ti, clone, &tio->info); 1620 switch (r) { 1621 case DM_MAPIO_SUBMITTED: 1622 /* The target has taken the I/O to submit by itself later */ 1623 break; 1624 case DM_MAPIO_REMAPPED: 1625 /* The target has remapped the I/O so dispatch it */ 1626 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1627 blk_rq_pos(tio->orig)); 1628 dm_dispatch_request(clone); 1629 break; 1630 case DM_MAPIO_REQUEUE: 1631 /* The target wants to requeue the I/O */ 1632 dm_requeue_unmapped_request(clone); 1633 requeued = 1; 1634 break; 1635 default: 1636 if (r > 0) { 1637 DMWARN("unimplemented target map return value: %d", r); 1638 BUG(); 1639 } 1640 1641 /* The target wants to complete the I/O */ 1642 dm_kill_unmapped_request(clone, r); 1643 break; 1644 } 1645 1646 return requeued; 1647 } 1648 1649 /* 1650 * q->request_fn for request-based dm. 1651 * Called with the queue lock held. 1652 */ 1653 static void dm_request_fn(struct request_queue *q) 1654 { 1655 struct mapped_device *md = q->queuedata; 1656 struct dm_table *map = dm_get_live_table(md); 1657 struct dm_target *ti; 1658 struct request *rq, *clone; 1659 1660 /* 1661 * For suspend, check blk_queue_stopped() and increment 1662 * ->pending within a single queue_lock not to increment the 1663 * number of in-flight I/Os after the queue is stopped in 1664 * dm_suspend(). 
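	 *
	 * The loop below peeks a request, diverts flushes to the barrier
	 * workqueue, checks ti->type->busy(), and only then, still under
	 * the queue lock, starts the request and bumps md->pending[]
	 * before dropping the lock to call map_request().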
1665 */ 1666 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1667 rq = blk_peek_request(q); 1668 if (!rq) 1669 goto plug_and_out; 1670 1671 if (unlikely(dm_rq_is_flush_request(rq))) { 1672 BUG_ON(md->flush_request); 1673 md->flush_request = rq; 1674 blk_start_request(rq); 1675 queue_work(md->wq, &md->barrier_work); 1676 goto out; 1677 } 1678 1679 ti = dm_table_find_target(map, blk_rq_pos(rq)); 1680 if (ti->type->busy && ti->type->busy(ti)) 1681 goto plug_and_out; 1682 1683 blk_start_request(rq); 1684 clone = rq->special; 1685 atomic_inc(&md->pending[rq_data_dir(clone)]); 1686 1687 spin_unlock(q->queue_lock); 1688 if (map_request(ti, clone, md)) 1689 goto requeued; 1690 1691 spin_lock_irq(q->queue_lock); 1692 } 1693 1694 goto out; 1695 1696 requeued: 1697 spin_lock_irq(q->queue_lock); 1698 1699 plug_and_out: 1700 if (!elv_queue_empty(q)) 1701 /* Some requests still remain, retry later */ 1702 blk_plug_device(q); 1703 1704 out: 1705 dm_table_put(map); 1706 1707 return; 1708 } 1709 1710 int dm_underlying_device_busy(struct request_queue *q) 1711 { 1712 return blk_lld_busy(q); 1713 } 1714 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1715 1716 static int dm_lld_busy(struct request_queue *q) 1717 { 1718 int r; 1719 struct mapped_device *md = q->queuedata; 1720 struct dm_table *map = dm_get_live_table(md); 1721 1722 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1723 r = 1; 1724 else 1725 r = dm_table_any_busy_target(map); 1726 1727 dm_table_put(map); 1728 1729 return r; 1730 } 1731 1732 static void dm_unplug_all(struct request_queue *q) 1733 { 1734 struct mapped_device *md = q->queuedata; 1735 struct dm_table *map = dm_get_live_table(md); 1736 1737 if (map) { 1738 if (dm_request_based(md)) 1739 generic_unplug_device(q); 1740 1741 dm_table_unplug_all(map); 1742 dm_table_put(map); 1743 } 1744 } 1745 1746 static int dm_any_congested(void *congested_data, int bdi_bits) 1747 { 1748 int r = bdi_bits; 1749 struct mapped_device *md = congested_data; 1750 struct dm_table *map; 1751 1752 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1753 map = dm_get_live_table(md); 1754 if (map) { 1755 /* 1756 * Request-based dm cares about only own queue for 1757 * the query about congestion status of request_queue 1758 */ 1759 if (dm_request_based(md)) 1760 r = md->queue->backing_dev_info.state & 1761 bdi_bits; 1762 else 1763 r = dm_table_any_congested(map, bdi_bits); 1764 1765 dm_table_put(map); 1766 } 1767 } 1768 1769 return r; 1770 } 1771 1772 /*----------------------------------------------------------------- 1773 * An IDR is used to keep track of allocated minor numbers. 1774 *---------------------------------------------------------------*/ 1775 static DEFINE_IDR(_minor_idr); 1776 1777 static void free_minor(int minor) 1778 { 1779 spin_lock(&_minor_lock); 1780 idr_remove(&_minor_idr, minor); 1781 spin_unlock(&_minor_lock); 1782 } 1783 1784 /* 1785 * See if the device with a specific minor # is free. 
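 *
 * Both this and next_free_minor() follow the usual two-step idr
 * pattern: idr_pre_get() preallocates outside the spinlock, then the
 * slot is claimed under _minor_lock with the MINOR_ALLOCED placeholder,
 * which alloc_dev() later swaps for the real mapped_device pointer via
 * idr_replace().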
1786 */ 1787 static int specific_minor(int minor) 1788 { 1789 int r, m; 1790 1791 if (minor >= (1 << MINORBITS)) 1792 return -EINVAL; 1793 1794 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1795 if (!r) 1796 return -ENOMEM; 1797 1798 spin_lock(&_minor_lock); 1799 1800 if (idr_find(&_minor_idr, minor)) { 1801 r = -EBUSY; 1802 goto out; 1803 } 1804 1805 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1806 if (r) 1807 goto out; 1808 1809 if (m != minor) { 1810 idr_remove(&_minor_idr, m); 1811 r = -EBUSY; 1812 goto out; 1813 } 1814 1815 out: 1816 spin_unlock(&_minor_lock); 1817 return r; 1818 } 1819 1820 static int next_free_minor(int *minor) 1821 { 1822 int r, m; 1823 1824 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1825 if (!r) 1826 return -ENOMEM; 1827 1828 spin_lock(&_minor_lock); 1829 1830 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1831 if (r) 1832 goto out; 1833 1834 if (m >= (1 << MINORBITS)) { 1835 idr_remove(&_minor_idr, m); 1836 r = -ENOSPC; 1837 goto out; 1838 } 1839 1840 *minor = m; 1841 1842 out: 1843 spin_unlock(&_minor_lock); 1844 return r; 1845 } 1846 1847 static const struct block_device_operations dm_blk_dops; 1848 1849 static void dm_wq_work(struct work_struct *work); 1850 static void dm_rq_barrier_work(struct work_struct *work); 1851 1852 /* 1853 * Allocate and initialise a blank device with a given minor. 1854 */ 1855 static struct mapped_device *alloc_dev(int minor) 1856 { 1857 int r; 1858 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1859 void *old_md; 1860 1861 if (!md) { 1862 DMWARN("unable to allocate device, out of memory."); 1863 return NULL; 1864 } 1865 1866 if (!try_module_get(THIS_MODULE)) 1867 goto bad_module_get; 1868 1869 /* get a minor number for the dev */ 1870 if (minor == DM_ANY_MINOR) 1871 r = next_free_minor(&minor); 1872 else 1873 r = specific_minor(minor); 1874 if (r < 0) 1875 goto bad_minor; 1876 1877 init_rwsem(&md->io_lock); 1878 mutex_init(&md->suspend_lock); 1879 spin_lock_init(&md->deferred_lock); 1880 spin_lock_init(&md->barrier_error_lock); 1881 rwlock_init(&md->map_lock); 1882 atomic_set(&md->holders, 1); 1883 atomic_set(&md->open_count, 0); 1884 atomic_set(&md->event_nr, 0); 1885 atomic_set(&md->uevent_seq, 0); 1886 INIT_LIST_HEAD(&md->uevent_list); 1887 spin_lock_init(&md->uevent_lock); 1888 1889 md->queue = blk_init_queue(dm_request_fn, NULL); 1890 if (!md->queue) 1891 goto bad_queue; 1892 1893 /* 1894 * Request-based dm devices cannot be stacked on top of bio-based dm 1895 * devices. The type of this dm device has not been decided yet, 1896 * although we initialized the queue using blk_init_queue(). 1897 * The type is decided at the first table loading time. 1898 * To prevent problematic device stacking, clear the queue flag 1899 * for request stacking support until then. 1900 * 1901 * This queue is new, so no concurrency on the queue_flags. 
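	 *
	 * (dm_request_based() tests exactly this flag through
	 * blk_queue_stackable(), so clearing it here also keeps the
	 * device on the bio-based _dm_request() path until a table type
	 * has been decided.)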
1902 */ 1903 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1904 md->saved_make_request_fn = md->queue->make_request_fn; 1905 md->queue->queuedata = md; 1906 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1907 md->queue->backing_dev_info.congested_data = md; 1908 blk_queue_make_request(md->queue, dm_request); 1909 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1910 md->queue->unplug_fn = dm_unplug_all; 1911 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1912 blk_queue_softirq_done(md->queue, dm_softirq_done); 1913 blk_queue_prep_rq(md->queue, dm_prep_fn); 1914 blk_queue_lld_busy(md->queue, dm_lld_busy); 1915 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, 1916 dm_rq_prepare_flush); 1917 1918 md->disk = alloc_disk(1); 1919 if (!md->disk) 1920 goto bad_disk; 1921 1922 atomic_set(&md->pending[0], 0); 1923 atomic_set(&md->pending[1], 0); 1924 init_waitqueue_head(&md->wait); 1925 INIT_WORK(&md->work, dm_wq_work); 1926 INIT_WORK(&md->barrier_work, dm_rq_barrier_work); 1927 init_waitqueue_head(&md->eventq); 1928 1929 md->disk->major = _major; 1930 md->disk->first_minor = minor; 1931 md->disk->fops = &dm_blk_dops; 1932 md->disk->queue = md->queue; 1933 md->disk->private_data = md; 1934 sprintf(md->disk->disk_name, "dm-%d", minor); 1935 add_disk(md->disk); 1936 format_dev_t(md->name, MKDEV(_major, minor)); 1937 1938 md->wq = create_singlethread_workqueue("kdmflush"); 1939 if (!md->wq) 1940 goto bad_thread; 1941 1942 md->bdev = bdget_disk(md->disk, 0); 1943 if (!md->bdev) 1944 goto bad_bdev; 1945 1946 /* Populate the mapping, nobody knows we exist yet */ 1947 spin_lock(&_minor_lock); 1948 old_md = idr_replace(&_minor_idr, md, minor); 1949 spin_unlock(&_minor_lock); 1950 1951 BUG_ON(old_md != MINOR_ALLOCED); 1952 1953 return md; 1954 1955 bad_bdev: 1956 destroy_workqueue(md->wq); 1957 bad_thread: 1958 del_gendisk(md->disk); 1959 put_disk(md->disk); 1960 bad_disk: 1961 blk_cleanup_queue(md->queue); 1962 bad_queue: 1963 free_minor(minor); 1964 bad_minor: 1965 module_put(THIS_MODULE); 1966 bad_module_get: 1967 kfree(md); 1968 return NULL; 1969 } 1970 1971 static void unlock_fs(struct mapped_device *md); 1972 1973 static void free_dev(struct mapped_device *md) 1974 { 1975 int minor = MINOR(disk_devt(md->disk)); 1976 1977 unlock_fs(md); 1978 bdput(md->bdev); 1979 destroy_workqueue(md->wq); 1980 if (md->tio_pool) 1981 mempool_destroy(md->tio_pool); 1982 if (md->io_pool) 1983 mempool_destroy(md->io_pool); 1984 if (md->bs) 1985 bioset_free(md->bs); 1986 blk_integrity_unregister(md->disk); 1987 del_gendisk(md->disk); 1988 free_minor(minor); 1989 1990 spin_lock(&_minor_lock); 1991 md->disk->private_data = NULL; 1992 spin_unlock(&_minor_lock); 1993 1994 put_disk(md->disk); 1995 blk_cleanup_queue(md->queue); 1996 module_put(THIS_MODULE); 1997 kfree(md); 1998 } 1999 2000 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2001 { 2002 struct dm_md_mempools *p; 2003 2004 if (md->io_pool && md->tio_pool && md->bs) 2005 /* the md already has necessary mempools */ 2006 goto out; 2007 2008 p = dm_table_get_md_mempools(t); 2009 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 2010 2011 md->io_pool = p->io_pool; 2012 p->io_pool = NULL; 2013 md->tio_pool = p->tio_pool; 2014 p->tio_pool = NULL; 2015 md->bs = p->bs; 2016 p->bs = NULL; 2017 2018 out: 2019 /* mempool bind completed, now no need any mempools in the table */ 2020 dm_table_free_md_mempools(t); 2021 } 2022 2023 /* 2024 * Bind a table to the device. 
2025 */ 2026 static void event_callback(void *context) 2027 { 2028 unsigned long flags; 2029 LIST_HEAD(uevents); 2030 struct mapped_device *md = (struct mapped_device *) context; 2031 2032 spin_lock_irqsave(&md->uevent_lock, flags); 2033 list_splice_init(&md->uevent_list, &uevents); 2034 spin_unlock_irqrestore(&md->uevent_lock, flags); 2035 2036 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2037 2038 atomic_inc(&md->event_nr); 2039 wake_up(&md->eventq); 2040 } 2041 2042 static void __set_size(struct mapped_device *md, sector_t size) 2043 { 2044 set_capacity(md->disk, size); 2045 2046 mutex_lock(&md->bdev->bd_inode->i_mutex); 2047 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2048 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2049 } 2050 2051 /* 2052 * Returns old map, which caller must destroy. 2053 */ 2054 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2055 struct queue_limits *limits) 2056 { 2057 struct dm_table *old_map; 2058 struct request_queue *q = md->queue; 2059 sector_t size; 2060 unsigned long flags; 2061 2062 size = dm_table_get_size(t); 2063 2064 /* 2065 * Wipe any geometry if the size of the table changed. 2066 */ 2067 if (size != get_capacity(md->disk)) 2068 memset(&md->geometry, 0, sizeof(md->geometry)); 2069 2070 __set_size(md, size); 2071 2072 dm_table_event_callback(t, event_callback, md); 2073 2074 /* 2075 * The queue hasn't been stopped yet, if the old table type wasn't 2076 * for request-based during suspension. So stop it to prevent 2077 * I/O mapping before resume. 2078 * This must be done before setting the queue restrictions, 2079 * because request-based dm may be run just after the setting. 2080 */ 2081 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2082 stop_queue(q); 2083 2084 __bind_mempools(md, t); 2085 2086 write_lock_irqsave(&md->map_lock, flags); 2087 old_map = md->map; 2088 md->map = t; 2089 dm_table_set_restrictions(t, q, limits); 2090 write_unlock_irqrestore(&md->map_lock, flags); 2091 2092 return old_map; 2093 } 2094 2095 /* 2096 * Returns unbound table for the caller to free. 2097 */ 2098 static struct dm_table *__unbind(struct mapped_device *md) 2099 { 2100 struct dm_table *map = md->map; 2101 unsigned long flags; 2102 2103 if (!map) 2104 return NULL; 2105 2106 dm_table_event_callback(map, NULL, NULL); 2107 write_lock_irqsave(&md->map_lock, flags); 2108 md->map = NULL; 2109 write_unlock_irqrestore(&md->map_lock, flags); 2110 2111 return map; 2112 } 2113 2114 /* 2115 * Constructor for a new device. 
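 *
 * Illustrative use, roughly what the dm-ioctl layer does (simplified):
 *
 *	struct mapped_device *md;
 *	int r = dm_create(DM_ANY_MINOR, &md);
 *
 *	if (r)
 *		return r;
 *	... bind a table with dm_swap_table() while suspended ...
 *	dm_put(md);	(drops the initial holder reference)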
2116 */ 2117 int dm_create(int minor, struct mapped_device **result) 2118 { 2119 struct mapped_device *md; 2120 2121 md = alloc_dev(minor); 2122 if (!md) 2123 return -ENXIO; 2124 2125 dm_sysfs_init(md); 2126 2127 *result = md; 2128 return 0; 2129 } 2130 2131 static struct mapped_device *dm_find_md(dev_t dev) 2132 { 2133 struct mapped_device *md; 2134 unsigned minor = MINOR(dev); 2135 2136 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2137 return NULL; 2138 2139 spin_lock(&_minor_lock); 2140 2141 md = idr_find(&_minor_idr, minor); 2142 if (md && (md == MINOR_ALLOCED || 2143 (MINOR(disk_devt(dm_disk(md))) != minor) || 2144 test_bit(DMF_FREEING, &md->flags))) { 2145 md = NULL; 2146 goto out; 2147 } 2148 2149 out: 2150 spin_unlock(&_minor_lock); 2151 2152 return md; 2153 } 2154 2155 struct mapped_device *dm_get_md(dev_t dev) 2156 { 2157 struct mapped_device *md = dm_find_md(dev); 2158 2159 if (md) 2160 dm_get(md); 2161 2162 return md; 2163 } 2164 2165 void *dm_get_mdptr(struct mapped_device *md) 2166 { 2167 return md->interface_ptr; 2168 } 2169 2170 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2171 { 2172 md->interface_ptr = ptr; 2173 } 2174 2175 void dm_get(struct mapped_device *md) 2176 { 2177 atomic_inc(&md->holders); 2178 } 2179 2180 const char *dm_device_name(struct mapped_device *md) 2181 { 2182 return md->name; 2183 } 2184 EXPORT_SYMBOL_GPL(dm_device_name); 2185 2186 void dm_put(struct mapped_device *md) 2187 { 2188 struct dm_table *map; 2189 2190 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2191 2192 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { 2193 map = dm_get_live_table(md); 2194 idr_replace(&_minor_idr, MINOR_ALLOCED, 2195 MINOR(disk_devt(dm_disk(md)))); 2196 set_bit(DMF_FREEING, &md->flags); 2197 spin_unlock(&_minor_lock); 2198 if (!dm_suspended_md(md)) { 2199 dm_table_presuspend_targets(map); 2200 dm_table_postsuspend_targets(map); 2201 } 2202 dm_sysfs_exit(md); 2203 dm_table_put(map); 2204 dm_table_destroy(__unbind(md)); 2205 free_dev(md); 2206 } 2207 } 2208 EXPORT_SYMBOL_GPL(dm_put); 2209 2210 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2211 { 2212 int r = 0; 2213 DECLARE_WAITQUEUE(wait, current); 2214 2215 dm_unplug_all(md->queue); 2216 2217 add_wait_queue(&md->wait, &wait); 2218 2219 while (1) { 2220 set_current_state(interruptible); 2221 2222 smp_mb(); 2223 if (!md_in_flight(md)) 2224 break; 2225 2226 if (interruptible == TASK_INTERRUPTIBLE && 2227 signal_pending(current)) { 2228 r = -EINTR; 2229 break; 2230 } 2231 2232 io_schedule(); 2233 } 2234 set_current_state(TASK_RUNNING); 2235 2236 remove_wait_queue(&md->wait, &wait); 2237 2238 return r; 2239 } 2240 2241 static void dm_flush(struct mapped_device *md) 2242 { 2243 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2244 2245 bio_init(&md->barrier_bio); 2246 md->barrier_bio.bi_bdev = md->bdev; 2247 md->barrier_bio.bi_rw = WRITE_BARRIER; 2248 __split_and_process_bio(md, &md->barrier_bio); 2249 2250 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2251 } 2252 2253 static void process_barrier(struct mapped_device *md, struct bio *bio) 2254 { 2255 md->barrier_error = 0; 2256 2257 dm_flush(md); 2258 2259 if (!bio_empty_barrier(bio)) { 2260 __split_and_process_bio(md, bio); 2261 dm_flush(md); 2262 } 2263 2264 if (md->barrier_error != DM_ENDIO_REQUEUE) 2265 bio_endio(bio, md->barrier_error); 2266 else { 2267 spin_lock_irq(&md->deferred_lock); 2268 bio_list_add_head(&md->deferred, bio); 2269 spin_unlock_irq(&md->deferred_lock); 2270 } 2271 } 2272 2273 /* 2274 * Process 

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	dm_unplug_all(md->queue);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		smp_mb();
		if (!md_in_flight(md))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

static void dm_flush(struct mapped_device *md)
{
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);

	bio_init(&md->barrier_bio);
	md->barrier_bio.bi_bdev = md->bdev;
	md->barrier_bio.bi_rw = WRITE_BARRIER;
	__split_and_process_bio(md, &md->barrier_bio);

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}

static void process_barrier(struct mapped_device *md, struct bio *bio)
{
	md->barrier_error = 0;

	dm_flush(md);

	if (!bio_empty_barrier(bio)) {
		__split_and_process_bio(md, bio);
		dm_flush(md);
	}

	if (md->barrier_error != DM_ENDIO_REQUEUE)
		bio_endio(bio, md->barrier_error);
	else {
		spin_lock_irq(&md->deferred_lock);
		bio_list_add_head(&md->deferred, bio);
		spin_unlock_irq(&md->deferred_lock);
	}
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		up_write(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else {
			if (bio_rw_flagged(c, BIO_RW_BARRIER))
				process_barrier(md, c);
			else
				__split_and_process_bio(md, c);
		}

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	tio->info.flush_request = flush_nr;
}

/* Issue barrier requests to targets and wait for their completion. */
static int dm_rq_barrier(struct mapped_device *md)
{
	int i, j;
	struct dm_table *map = dm_get_live_table(md);
	unsigned num_targets = dm_table_get_num_targets(map);
	struct dm_target *ti;
	struct request *clone;

	md->barrier_error = 0;

	for (i = 0; i < num_targets; i++) {
		ti = dm_table_get_target(map, i);
		for (j = 0; j < ti->num_flush_requests; j++) {
			clone = clone_rq(md->flush_request, md, GFP_NOIO);
			dm_rq_set_flush_nr(clone, j);
			atomic_inc(&md->pending[rq_data_dir(clone)]);
			map_request(ti, clone, md);
		}
	}

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
	dm_table_put(map);

	return md->barrier_error;
}

static void dm_rq_barrier_work(struct work_struct *work)
{
	int error;
	struct mapped_device *md = container_of(work, struct mapped_device,
						barrier_work);
	struct request_queue *q = md->queue;
	struct request *rq;
	unsigned long flags;

	/*
	 * Hold the md reference here and release it only at the end, so
	 * that a device opener can't delete the md while the barrier
	 * request is being completed.
	 */
	dm_get(md);

	error = dm_rq_barrier(md);

	rq = md->flush_request;
	md->flush_request = NULL;

	if (error == DM_ENDIO_REQUEUE) {
		spin_lock_irqsave(q->queue_lock, flags);
		blk_requeue_request(q, rq);
		spin_unlock_irqrestore(q->queue_lock, flags);
	} else
		blk_end_request_all(rq, error);

	blk_run_queue(q);

	dm_put(md);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r) {
		map = ERR_PTR(r);
		goto out;
	}

	/* cannot change the device type, once a table is bound */
	if (md->map &&
	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
		DMWARN("can't change the device type after a table is bound");
		goto out;
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}
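
/*
 * Illustrative sketch only, not part of this driver: dm_swap_table() may
 * only be called on a suspended device, so a caller typically brackets it
 * with dm_suspend()/dm_resume() and destroys the table that is returned.
 * The flag choice and error handling below are assumptions for the
 * example, not a copy of the real ioctl path.
 *
 *	struct dm_table *old_map;
 *	int r;
 *
 *	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);	// quiesce I/O
 *	if (r)
 *		return r;
 *
 *	old_map = dm_swap_table(md, new_table);		// bind the new table
 *	if (IS_ERR(old_map)) {
 *		dm_resume(md);
 *		return PTR_ERR(old_map);
 *	}
 *
 *	r = dm_resume(md);				// restart I/O
 *	if (old_map)
 *		dm_table_destroy(old_map);		// caller destroys old map
 *	return r;
 */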

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem. For example we might want to move some data in
 * the background. Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers, i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
	 * can be kicked until md->queue is stopped. So stop md->queue before
	 * flushing md->wq.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);
	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
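
/*
 * Illustrative sketch only, not part of this driver: dm_suspended() and
 * dm_noflush_suspending() are exported for target drivers.  A target's
 * end_io hook might, for instance, ask the core to requeue a failed bio
 * instead of failing it while a noflush suspend is in progress (the core
 * pushes DM_ENDIO_REQUEUE bios back onto md->deferred).  The target name
 * below is made up for the example.
 *
 *	static int example_tgt_end_io(struct dm_target *ti, struct bio *bio,
 *				      int error, union map_info *map_context)
 *	{
 *		if (error && dm_noflush_suspending(ti))
 *			return DM_ENDIO_REQUEUE;
 *
 *		return error;
 *	}
 */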

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = (type == DM_TYPE_BIO_BASED) ?
		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	return pools;

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
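
/*
 * Illustrative note only: the "major" parameter above can be set at module
 * load time (for example "modprobe dm_mod major=240", where 240 is just an
 * arbitrary number); with the default of 0, register_blkdev() assigns an
 * unused major dynamically.
 */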