/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);
/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
};

/*
 * For bio-based dm.
 * One of these is allocated per target within a bio. Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per bio.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct dm_target_io *)bio->bi_private)->info;
	return NULL;
}

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_QUEUE_IO_TO_THREAD 6

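/*
 * A rough guide to how the DMF_* bits above are used in this file:
 * DMF_QUEUE_IO_TO_THREAD means incoming bios are added to md->deferred
 * and handed to the per-device workqueue (dm_wq_work) instead of being
 * mapped in the caller's context; DMF_BLOCK_IO_FOR_SUSPEND keeps that
 * workqueue from draining the deferred list while a suspend is in
 * progress; DMF_NOFLUSH_SUSPENDING makes DM_ENDIO_REQUEUE push bios
 * back onto the deferred list; DMF_FREEING and DMF_DELETING cause new
 * opens to be refused.
 */
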
/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct rw_semaphore io_lock;
	struct mutex suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	struct request_queue *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending[2];
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * An error from the barrier request currently being processed.
	 */
	int barrier_error;

	/*
	 * Processing queue (flush/barriers)
	 */
	struct workqueue_struct *wq;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support requires holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* marker of flush suspend for request-based dm */
	struct request suspend_rq;

	/* For saving the address of __make_request for request based dm */
	make_request_fn *saved_make_request_fn;

	/* sysfs handle */
	struct kobject kobj;

	/* zero-length barrier that will be cloned and submitted to targets */
	struct bio barrier_bio;
};

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	mempool_t *io_pool;
	mempool_t *tio_pool;
	struct bio_set *bs;
};

#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_bio_info_cache;

static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	/* allocate a slab for the target ios */
	_tio_cache = KMEM_CACHE(dm_target_io, 0);
	if (!_tio_cache)
		goto out_free_io_cache;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_tio_cache;

	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
	if (!_rq_bio_info_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_bio_info_cache;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_uevent_exit;

	if (!_major)
		_major = r;

	return 0;

out_uevent_exit:
	dm_uevent_exit();
out_free_rq_bio_info_cache:
	kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_tio_cache:
	kmem_cache_destroy(_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	kmem_cache_destroy(_rq_bio_info_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_kcopyd_init,
	dm_interface_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}

/*
 * Block device functions
 */
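/*
 * Opening the block device takes a reference on the mapped_device
 * (dm_get) and bumps md->open_count; dm_blk_close drops both.  Opens
 * are refused while the device is being freed or deleted, and
 * dm_lock_for_deletion() below uses open_count to refuse deletion of
 * a device that is still open.
 */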
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md = disk->private_data;
	atomic_dec(&md->open_count);
	dm_put(md);
	return 0;
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *tgt;
	int r = -ENOTTY;

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, cmd, arg);

out:
	dm_table_put(map);

	return r;
}

static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}

static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
{
	return mempool_alloc(md->tio_pool, GFP_ATOMIC);
}

static void free_rq_tio(struct dm_rq_target_io *tio)
{
	mempool_free(tio, tio->md->tio_pool);
}

static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_ATOMIC);
}

static void free_bio_info(struct dm_rq_clone_bio_info *info)
{
	mempool_free(info, info->tio->md->io_pool);
}

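/*
 * start_io_acct()/end_io_acct() below keep the generic disk statistics
 * and the per-device md->pending[rw] counters in step for bio-based I/O.
 * end_io_acct() wakes md->wait once both pending counters reach zero,
 * which is what dm_wait_for_completion() sleeps on during suspend.
 */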
static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	int cpu;
	int rw = bio_data_dir(io->bio);

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending, cpu;
	int rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a barrier.
	 */
	dm_disk(md)->part0.in_flight[rw] = pending =
		atomic_dec_return(&md->pending[rw]);
	pending += atomic_read(&md->pending[rw^0x1]);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	spin_lock_irq(&md->deferred_lock);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irq(&md->deferred_lock);

	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
		queue_work(md->wq, &md->work);

	up_write(&md->io_lock);
}

/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;
	unsigned long flags;

	read_lock_irqsave(&md->map_lock, flags);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock_irqrestore(&md->map_lock, flags);

	return t;
}

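/*
 * Illustrative only: the usual pattern for callers of dm_get_table(),
 * as used for example by dm_blk_ioctl() above:
 *
 *	struct dm_table *map = dm_get_table(md);
 *
 *	if (map) {
 *		... look up targets, query the table ...
 *		dm_table_put(map);
 *	}
 */
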
/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant solution is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->error > 0 && __noflush_suspending(md)))
			io->error = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md)) {
				if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
					bio_list_add_head(&md->deferred,
							  io->bio);
			} else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;

		if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
			/*
			 * There can be just one barrier request so we use
			 * a per-device variable for error reporting.
			 * Note that you can't touch the bio after end_io_acct
			 */
			if (!md->barrier_error && io_error != -EOPNOTSUPP)
				md->barrier_error = io_error;
			end_io_acct(io);
		} else {
			end_io_acct(io);

			if (io_error != DM_ENDIO_REQUEUE) {
				trace_block_bio_complete(md->queue, bio);

				bio_endio(bio, io_error);
			}
		}

		free_io(md, io);
	}
}

static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info = clone->bi_private;
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io() handle
		 * the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't notify the upper layer of the error yet.
		 * The error handling decision is made by the target driver
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Notify the upper layer of the data completion.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something is wrong.
	 */
	if (tio->orig->bio != bio)
		DMERR("bio completion is going in the middle of the request");

	/*
	 * Update the original request.
	 * Do not use blk_end_request() here, because it may complete
	 * the original request before the clone, and break the ordering.
	 */
	blk_update_request(tio->orig, 0, nr_bytes);
}

/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Or do dm_get() before calling this function and dm_put() later.
 */
static void rq_completed(struct mapped_device *md, int run_queue)
{
	int wakeup_waiters = 0;
	struct request_queue *q = md->queue;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (!queue_in_flight(q))
		wakeup_waiters = 1;
	spin_unlock_irqrestore(q->queue_lock, flags);

	/* nudge anyone waiting on suspend queue */
	if (wakeup_waiters)
		wake_up(&md->wait);

	if (run_queue)
		blk_run_queue(q);

	/*
	 * dm_put() must be at the end of this function. See the comment above
	 */
	dm_put(md);
}

static void free_rq_clone(struct request *clone)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	blk_rq_unprep_clone(clone);
	free_rq_tio(tio);
}

static void dm_unprep_request(struct request *rq)
{
	struct request *clone = rq->special;

	rq->special = NULL;
	rq->cmd_flags &= ~REQ_DONTPREP;

	free_rq_clone(clone);
}

/*
 * Requeue the original request of a clone.
 */
void dm_requeue_unmapped_request(struct request *clone)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;
	struct request_queue *q = rq->q;
	unsigned long flags;

	dm_unprep_request(rq);

	spin_lock_irqsave(q->queue_lock, flags);
	if (elv_queue_empty(q))
		blk_plug_device(q);
	blk_requeue_request(q, rq);
	spin_unlock_irqrestore(q->queue_lock, flags);

	rq_completed(md, 0);
}
EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);

static void __stop_queue(struct request_queue *q)
{
	blk_stop_queue(q);
}

static void stop_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__stop_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void __start_queue(struct request_queue *q)
{
	if (blk_queue_stopped(q))
		blk_start_queue(q);
}

static void start_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__start_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

/*
 * Complete the clone and the original request.
 * Must be called without queue lock.
 */
static void dm_end_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;

	if (blk_pc_request(rq)) {
		rq->errors = clone->errors;
		rq->resid_len = clone->resid_len;

		if (rq->sense)
			/*
			 * We are using the sense buffer of the original
			 * request.
			 * So setting the length of the sense data is enough.
			 */
			rq->sense_len = clone->sense_len;
	}

	free_rq_clone(clone);

	blk_end_request_all(rq, error);

	rq_completed(md, 1);
}

/*
 * Request completion handler for request-based dm
 */
static void dm_softirq_done(struct request *rq)
{
	struct request *clone = rq->completion_data;
	struct dm_rq_target_io *tio = clone->end_io_data;
	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
	int error = tio->error;

	if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
		error = rq_end_io(tio->ti, clone, error, &tio->info);

	if (error <= 0)
		/* The target wants to complete the I/O */
		dm_end_request(clone, error);
	else if (error == DM_ENDIO_INCOMPLETE)
		/* The target will handle the I/O */
		return;
	else if (error == DM_ENDIO_REQUEUE)
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
	else {
		DMWARN("unimplemented target endio return value: %d", error);
		BUG();
	}
}

/*
 * Complete the clone and the original request with the error status
 * through softirq context.
 */
static void dm_complete_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	tio->error = error;
	rq->completion_data = clone;
	blk_complete_request(rq);
}

/*
 * Complete the not-mapped clone and the original request with the error status
 * through softirq context.
 * Target's rq_end_io() function isn't called.
 * This may be used when the target's map_rq() function fails.
 */
void dm_kill_unmapped_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	rq->cmd_flags |= REQ_FAILED;
	dm_complete_request(clone, error);
}
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);

/*
 * Called with the queue lock held
 */
static void end_clone_request(struct request *clone, int error)
{
	/*
	 * This just cleans up the bookkeeping of the queue in which
	 * the clone was dispatched.
	 * The clone is *NOT* actually freed here because it is allocated from
	 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
	 */
	__blk_put_request(clone->q, clone);

	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the queue lock. Otherwise, deadlock could occur because:
	 *     - another request may be submitted by the upper level driver
	 *       of the stacking during the completion
	 *     - the submission which requires queue lock may be done
	 *       against this queue
	 */
	dm_complete_request(clone, error);
}

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
				  tio->io->bio->bi_bdev->bd_dev, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len, struct bio_set *bs)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
	clone->bi_destructor = dm_bio_destructor;
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;
	clone->bi_flags |= 1 << BIO_CLONED;

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
		bio_integrity_trim(clone,
				   bio_sector_offset(bio, idx, offset), len);
	}

	return clone;
}

/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
	clone->bi_destructor = dm_bio_destructor;
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO, bs);

		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
			bio_integrity_trim(clone,
					   bio_sector_offset(bio, idx, 0), len);
	}

	return clone;
}

static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti)
{
	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);

	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	return tio;
}

static void __flush_target(struct clone_info *ci, struct dm_target *ti,
			   unsigned flush_nr)
{
	struct dm_target_io *tio = alloc_tio(ci, ti);
	struct bio *clone;

	tio->info.flush_request = flush_nr;

	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
	__bio_clone(clone, ci->bio);
	clone->bi_destructor = dm_bio_destructor;

	__map_bio(ti, clone, tio);
}

static int __clone_and_map_empty_barrier(struct clone_info *ci)
{
	unsigned target_nr = 0, flush_nr;
	struct dm_target *ti;

	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		for (flush_nr = 0; flush_nr < ti->num_flush_requests;
		     flush_nr++)
			__flush_target(ci, ti, flush_nr);

	ci->sector_count = 0;

	return 0;
}

static int __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti;
	sector_t len = 0, max;
	struct dm_target_io *tio;

	if (unlikely(bio_empty_barrier(bio)))
		return __clone_and_map_empty_barrier(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	max = max_io_len(ci->md, ci->sector, ti);

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci, ti);

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				if (!dm_target_is_valid(ti))
					return -EIO;

				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci, ti);
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}

	return 0;
}

/*
 * Split the bio into several clones and submit it to targets.
 */
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	ci.map = dm_get_table(md);
	if (unlikely(!ci.map)) {
		if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
			bio_io_error(bio);
		else
			if (!md->barrier_error)
				md->barrier_error = -EIO;
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	spin_lock_init(&ci.io->endio_lock);
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	if (unlikely(bio_empty_barrier(bio)))
		ci.sector_count = 1;
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	while (ci.sector_count && !error)
		error = __clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, error);
	dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *ti;
	sector_t max_sectors;
	int max_size = 0;

	if (unlikely(!map))
		goto out;

	ti = dm_table_find_target(map, bvm->bi_sector);
	if (!dm_target_is_valid(ti))
		goto out_table;

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
			  (sector_t) BIO_MAX_SECTORS);
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	if (max_size < 0)
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);
	/*
	 * If the target doesn't support the merge method and some of the
	 * underlying devices provide their own merge_bvec method (we know
	 * this by looking at queue_max_hw_sectors), then we can't allow
	 * bios with multiple vector entries.  So always set max_size to 0,
	 * and the code below allows just one page.
	 */
	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
		max_size = 0;

out_table:
	dm_table_put(map);

out:
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int _dm_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int cpu;

	down_read(&md->io_lock);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	/*
	 * If we're suspended or the thread is processing barriers
	 * we have to queue this io for later.
	 */
	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
	    unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
		up_read(&md->io_lock);

		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
		    bio_rw(bio) == READA) {
			bio_io_error(bio);
			return 0;
		}

		queue_io(md, bio);

		return 0;
	}

	__split_and_process_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}

static int dm_make_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;

	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
		bio_endio(bio, -EOPNOTSUPP);
		return 0;
	}

	return md->saved_make_request_fn(q, bio); /* call __make_request() */
}

static int dm_request_based(struct mapped_device *md)
{
	return blk_queue_stackable(md->queue);
}

static int dm_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;

	if (dm_request_based(md))
		return dm_make_request(q, bio);

	return _dm_request(q, bio);
}

void dm_dispatch_request(struct request *rq)
{
	int r;

	if (blk_queue_io_stat(rq->q))
		rq->cmd_flags |= REQ_IO_STAT;

	rq->start_time = jiffies;
	r = blk_insert_cloned_request(rq->q, rq);
	if (r)
		dm_complete_request(rq, r);
}
EXPORT_SYMBOL_GPL(dm_dispatch_request);

static void dm_rq_bio_destructor(struct bio *bio)
{
	struct dm_rq_clone_bio_info *info = bio->bi_private;
	struct mapped_device *md = info->tio->md;

	free_bio_info(info);
	bio_free(bio, md->bs);
}

static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
				 void *data)
{
	struct dm_rq_target_io *tio = data;
	struct mapped_device *md = tio->md;
	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);

	if (!info)
		return -ENOMEM;

	info->orig = bio_orig;
	info->tio = tio;
	bio->bi_end_io = end_clone_bio;
	bio->bi_private = info;
	bio->bi_destructor = dm_rq_bio_destructor;

	return 0;
}

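/*
 * setup_clone() below clones the original request's bios via
 * blk_rq_prep_clone() (using dm_rq_bio_constructor so each cloned bio
 * completes through end_clone_bio), copies the command/sense/buffer
 * fields over, and points clone->end_io at end_clone_request so the
 * clone's completion is routed back through this file.
 */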
static int setup_clone(struct request *clone, struct request *rq,
		       struct dm_rq_target_io *tio)
{
	int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
				  dm_rq_bio_constructor, tio);

	if (r)
		return r;

	clone->cmd = rq->cmd;
	clone->cmd_len = rq->cmd_len;
	clone->sense = rq->sense;
	clone->buffer = rq->buffer;
	clone->end_io = end_clone_request;
	clone->end_io_data = tio;

	return 0;
}

static int dm_rq_flush_suspending(struct mapped_device *md)
{
	return !md->suspend_rq.special;
}

/*
 * Called with the queue lock held.
 */
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
	struct mapped_device *md = q->queuedata;
	struct dm_rq_target_io *tio;
	struct request *clone;

	if (unlikely(rq == &md->suspend_rq)) {
		if (dm_rq_flush_suspending(md))
			return BLKPREP_OK;
		else
			/* The flush suspend was interrupted */
			return BLKPREP_KILL;
	}

	if (unlikely(rq->special)) {
		DMWARN("Already has something in rq->special.");
		return BLKPREP_KILL;
	}

	tio = alloc_rq_tio(md); /* Only one for each original request */
	if (!tio)
		/* -ENOMEM */
		return BLKPREP_DEFER;

	tio->md = md;
	tio->ti = NULL;
	tio->orig = rq;
	tio->error = 0;
	memset(&tio->info, 0, sizeof(tio->info));

	clone = &tio->clone;
	if (setup_clone(clone, rq, tio)) {
		/* -ENOMEM */
		free_rq_tio(tio);
		return BLKPREP_DEFER;
	}

	rq->special = clone;
	rq->cmd_flags |= REQ_DONTPREP;

	return BLKPREP_OK;
}

static void map_request(struct dm_target *ti, struct request *rq,
			struct mapped_device *md)
{
	int r;
	struct request *clone = rq->special;
	struct dm_rq_target_io *tio = clone->end_io_data;

	/*
	 * Hold the md reference here for the in-flight I/O.
	 * We can't rely on the reference count by device opener,
	 * because the device may be closed during the request completion
	 * when all bios are completed.
	 * See the comment in rq_completed() too.
	 */
	dm_get(md);

	tio->ti = ti;
	r = ti->type->map_rq(ti, clone, &tio->info);
	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/* The target has taken the I/O to submit by itself later */
		break;
	case DM_MAPIO_REMAPPED:
		/* The target has remapped the I/O so dispatch it */
		dm_dispatch_request(clone);
		break;
	case DM_MAPIO_REQUEUE:
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
		break;
	default:
		if (r > 0) {
			DMWARN("unimplemented target map return value: %d", r);
			BUG();
		}

		/* The target wants to complete the I/O */
		dm_kill_unmapped_request(clone, r);
		break;
	}
}

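/*
 * A short summary of the request_fn below: requests are peeked off the
 * queue while it is neither plugged nor stopped; the flush-suspend
 * marker request stops the queue once no I/O is in flight; a target
 * reporting busy causes the queue to be replugged; everything else is
 * started and handed to map_request() with the queue lock temporarily
 * dropped.
 */
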
/*
 * q->request_fn for request-based dm.
 * Called with the queue lock held.
 */
static void dm_request_fn(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *ti;
	struct request *rq;

	/*
	 * For noflush suspend, check blk_queue_stopped() to immediately
	 * quit I/O dispatching.
	 */
	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
		rq = blk_peek_request(q);
		if (!rq)
			goto plug_and_out;

		if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
			if (queue_in_flight(q))
				/* Not quiet yet.  Wait more */
				goto plug_and_out;

			/* This device should be quiet now */
			__stop_queue(q);
			blk_start_request(rq);
			__blk_end_request_all(rq, 0);
			wake_up(&md->wait);
			goto out;
		}

		ti = dm_table_find_target(map, blk_rq_pos(rq));
		if (ti->type->busy && ti->type->busy(ti))
			goto plug_and_out;

		blk_start_request(rq);
		spin_unlock(q->queue_lock);
		map_request(ti, rq, md);
		spin_lock_irq(q->queue_lock);
	}

	goto out;

plug_and_out:
	if (!elv_queue_empty(q))
		/* Some requests still remain, retry later */
		blk_plug_device(q);

out:
	dm_table_put(map);

	return;
}

int dm_underlying_device_busy(struct request_queue *q)
{
	return blk_lld_busy(q);
}
EXPORT_SYMBOL_GPL(dm_underlying_device_busy);

static int dm_lld_busy(struct request_queue *q)
{
	int r;
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
		r = 1;
	else
		r = dm_table_any_busy_target(map);

	dm_table_put(map);

	return r;
}

static void dm_unplug_all(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		if (dm_request_based(md))
			generic_unplug_device(q);

		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;

	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		map = dm_get_table(md);
		if (map) {
			/*
			 * Request-based dm cares only about its own queue for
			 * the query about congestion status of request_queue
			 */
			if (dm_request_based(md))
				r = md->queue->backing_dev_info.state &
				    bdi_bits;
			else
				r = dm_table_any_congested(map, bdi_bits);

			dm_table_put(map);
		}
	}

	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
	if (r)
		goto out;

	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);
	return r;
}

static int next_free_minor(int *minor)
{
	int r, m;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
	if (r)
		goto out;

	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
	spin_unlock(&_minor_lock);
	return r;
}

static const struct block_device_operations dm_blk_dops;

static void dm_wq_work(struct work_struct *work);

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r;
	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
	void *old_md;

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	init_rwsem(&md->io_lock);
	mutex_init(&md->suspend_lock);
	spin_lock_init(&md->deferred_lock);
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_init_queue(dm_request_fn, NULL);
	if (!md->queue)
		goto bad_queue;

	/*
	 * Request-based dm devices cannot be stacked on top of bio-based dm
	 * devices.  The type of this dm device has not been decided yet,
	 * although we initialized the queue using blk_init_queue().
	 * The type is decided at the first table loading time.
	 * To prevent problematic device stacking, clear the queue flag
	 * for request stacking support until then.
	 *
	 * This queue is new, so no concurrency on the queue_flags.
	 */
	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
	md->saved_make_request_fn = md->queue->make_request_fn;
	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	md->queue->unplug_fn = dm_unplug_all;
	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
	blk_queue_softirq_done(md->queue, dm_softirq_done);
	blk_queue_prep_rq(md->queue, dm_prep_fn);
	blk_queue_lld_busy(md->queue, dm_lld_busy);

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad_disk;

	atomic_set(&md->pending[0], 0);
	atomic_set(&md->pending[1], 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = create_singlethread_workqueue("kdmflush");
	if (!md->wq)
		goto bad_thread;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad_bdev;

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad_bdev:
	destroy_workqueue(md->wq);
bad_thread:
	del_gendisk(md->disk);
	put_disk(md->disk);
bad_disk:
	blk_cleanup_queue(md->queue);
bad_queue:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);
	bdput(md->bdev);
	destroy_workqueue(md->wq);
	if (md->tio_pool)
		mempool_destroy(md->tio_pool);
	if (md->io_pool)
		mempool_destroy(md->io_pool);
	if (md->bs)
		bioset_free(md->bs);
	blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	free_minor(minor);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	module_put(THIS_MODULE);
	kfree(md);
}

static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p;

	if (md->io_pool && md->tio_pool && md->bs)
		/* the md already has necessary mempools */
		goto out;

	p = dm_table_get_md_mempools(t);
	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);

	md->io_pool = p->io_pool;
	p->io_pool = NULL;
	md->tio_pool = p->tio_pool;
	p->tio_pool = NULL;
	md->bs = p->bs;
	p->bs = NULL;

out:
	/* mempool bind completed, now there is no need for any mempools in the table */
	dm_table_free_md_mempools(t);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	mutex_lock(&md->bdev->bd_inode->i_mutex);
	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
	mutex_unlock(&md->bdev->bd_inode->i_mutex);
}

static int __bind(struct mapped_device *md, struct dm_table *t,
		  struct queue_limits *limits)
{
	struct request_queue *q = md->queue;
	sector_t size;
	unsigned long flags;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	if (!size) {
		dm_table_destroy(t);
		return 0;
	}

	dm_table_event_callback(t, event_callback, md);

	/*
	 * The queue hasn't been stopped yet if the old table type wasn't
	 * request-based during suspension.  So stop it to prevent
	 * I/O mapping before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may be run just after the setting.
	 */
	if (dm_table_request_based(t) && !blk_queue_stopped(q))
		stop_queue(q);

	__bind_mempools(md, t);

	write_lock_irqsave(&md->map_lock, flags);
	md->map = t;
	dm_table_set_restrictions(t, q, limits);
	write_unlock_irqrestore(&md->map_lock, flags);

	return 0;
}

static void __unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;
	unsigned long flags;

	if (!map)
		return;

	dm_table_event_callback(map, NULL, NULL);
	write_lock_irqsave(&md->map_lock, flags);
	md->map = NULL;
	write_unlock_irqrestore(&md->map_lock, flags);
	dm_table_destroy(map);
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_sysfs_init(md);

	*result = md;
	return 0;
}

static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md && (md == MINOR_ALLOCED ||
		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
		   test_bit(DMF_FREEING, &md->flags))) {
		md = NULL;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

void dm_put(struct mapped_device *md)
{
	struct dm_table *map;

	BUG_ON(test_bit(DMF_FREEING, &md->flags));

	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
		map = dm_get_table(md);
		idr_replace(&_minor_idr, MINOR_ALLOCED,
			    MINOR(disk_devt(dm_disk(md))));
		set_bit(DMF_FREEING, &md->flags);
		spin_unlock(&_minor_lock);
		if (!dm_suspended(md)) {
			dm_table_presuspend_targets(map);
			dm_table_postsuspend_targets(map);
		}
		dm_sysfs_exit(md);
		dm_table_put(map);
		__unbind(md);
		free_dev(md);
	}
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);
	struct request_queue *q = md->queue;
	unsigned long flags;

	dm_unplug_all(md->queue);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		smp_mb();
		if (dm_request_based(md)) {
			spin_lock_irqsave(q->queue_lock, flags);
			if (!queue_in_flight(q) && blk_queue_stopped(q)) {
				spin_unlock_irqrestore(q->queue_lock, flags);
				break;
			}
			spin_unlock_irqrestore(q->queue_lock, flags);
		} else if (!atomic_read(&md->pending[0]) &&
			   !atomic_read(&md->pending[1]))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

static void dm_flush(struct mapped_device *md)
{
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);

	bio_init(&md->barrier_bio);
	md->barrier_bio.bi_bdev = md->bdev;
	md->barrier_bio.bi_rw = WRITE_BARRIER;
	__split_and_process_bio(md, &md->barrier_bio);

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}

static void process_barrier(struct mapped_device *md, struct bio *bio)
{
	md->barrier_error = 0;

	dm_flush(md);

	if (!bio_empty_barrier(bio)) {
		__split_and_process_bio(md, bio);
		dm_flush(md);
	}

	if (md->barrier_error != DM_ENDIO_REQUEUE)
		bio_endio(bio, md->barrier_error);
	else {
		spin_lock_irq(&md->deferred_lock);
		bio_list_add_head(&md->deferred, bio);
		spin_unlock_irq(&md->deferred_lock);
	}
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		up_write(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else {
			if (bio_rw_flagged(c, BIO_RW_BARRIER))
				process_barrier(md, c);
			else
				__split_and_process_bio(md, c);
		}

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table (destroying old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct queue_limits limits;
	int r = -EINVAL;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r)
		goto out;

	/* cannot change the device type, once a table is bound */
	if (md->map &&
	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
		DMWARN("can't change the device type after a table is bound");
		goto out;
	}

	__unbind(md);
	r = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return r;
}

static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
{
	md->suspend_rq.special = (void *)0x1;
}

static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
{
	struct request_queue *q = md->queue;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (!noflush)
		dm_rq_invalidate_suspend_marker(md);
	__start_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
{
	struct request *rq = &md->suspend_rq;
	struct request_queue *q = md->queue;

	if (noflush)
		stop_queue(q);
	else {
		blk_rq_init(q, rq);
		blk_insert_request(q, rq, 0, NULL);
	}
}

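/*
 * dm_rq_suspend_available() below returns 0 (i.e. "try again later")
 * only when the marker request left over from a previously interrupted
 * flush suspend is still sitting in the queue; there is a single
 * suspend_rq per mapped_device, so a new flush suspend cannot start
 * until that marker has been reaped.
 */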
static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
{
	md->suspend_rq.special = (void *)0x1;
}

static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
{
	struct request_queue *q = md->queue;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (!noflush)
		dm_rq_invalidate_suspend_marker(md);
	__start_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
{
	struct request *rq = &md->suspend_rq;
	struct request_queue *q = md->queue;

	if (noflush)
		stop_queue(q);
	else {
		blk_rq_init(q, rq);
		blk_insert_request(q, rq, 0, NULL);
	}
}

static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
{
	int r = 1;
	struct request *rq = &md->suspend_rq;
	struct request_queue *q = md->queue;
	unsigned long flags;

	if (noflush)
		return r;

	/* The marker must be protected by queue lock if it is in use */
	spin_lock_irqsave(q->queue_lock, flags);
	if (unlikely(rq->ref_count)) {
		/*
		 * This can happen when the previous flush suspend was
		 * interrupted, the marker is still in the queue and
		 * this flush suspend has been invoked, because we don't
		 * remove the marker at the time of suspend interruption.
		 * We have only one marker per mapped_device, so we can't
		 * start another flush suspend while it is in use.
		 */
		BUG_ON(!rq->special); /* The marker should be invalidated */
		DMWARN("Invalidation of the previous flush suspend is still in"
		       " progress. Please retry later.");
		r = 0;
	}
	spin_unlock_irqrestore(q->queue_lock, flags);

	return r;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * After the suspend starts, further incoming requests are kept in
 * the request_queue and deferred.
 * Remaining requests in the request_queue at the start of suspend are flushed
 * if it is a flush suspend.
 * The suspend completes when the following conditions have been satisfied,
 * so wait for it:
 *    1. q->in_flight is 0 (which means no in_flight request)
 *    2. queue has been stopped (which means no request dispatching)
 *
 *
 * Noflush suspend
 * ---------------
 * Noflush suspend doesn't need to dispatch remaining requests.
 * So stop the queue immediately.  Then, wait for all in_flight requests
 * to be completed or requeued.
 *
 * To abort noflush suspend, start the queue.
 *
 *
 * Flush suspend
 * -------------
 * Flush suspend needs to dispatch remaining requests.  So stop the queue
 * after the remaining requests are completed.  (Requeued requests must also
 * be re-dispatched and completed.  Until then, we can't stop the queue.)
 *
 * During flushing the remaining requests, further incoming requests are also
 * inserted into the same queue.  To distinguish which requests are to be
 * flushed, we insert a marker request into the queue at the time of starting
 * flush suspend, like a barrier.
 * Dispatching is blocked when the marker is found at the top of the queue.
 * And the queue is stopped when all in_flight requests are completed, since
 * that means the remaining requests are completely flushed.
 * Then, the marker is removed from the queue.
 *
 * To abort flush suspend, we also need to take care of the marker, not only
 * starting the queue.
 * We don't remove the marker forcibly from the queue since that would go
 * against block-layer conventions.  Instead, we put an invalidated mark on
 * the marker.  When the invalidated marker is found at the top of the queue,
 * it is immediately removed from the queue, so it doesn't block dispatching.
 * Because we have only one marker per mapped_device, we can't start another
 * flush suspend until the invalidated marker is removed from the queue.
 * So fail and return with -EBUSY in such a case.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
		r = -EBUSY;
		goto out_unlock;
	}

	map = dm_get_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device. noflush supersedes do_lockfs,
	 * because lock_fs() needs to flush I/Os.
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	flush_workqueue(md->wq);

	if (dm_request_based(md))
		dm_rq_start_suspend(md, noflush);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_rq_abort_suspend(md, noflush);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
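/*
 * Example (editor's illustration, not part of this driver): the flush and
 * noflush variants described in the comment above dm_suspend() are selected
 * by the caller through suspend_flags.  A hypothetical caller that wants
 * in-flight I/O requeued rather than flushed would pass:
 *
 *	r = dm_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
 *
 * whereas a flush suspend that also freezes a mounted filesystem via
 * lock_fs() would pass:
 *
 *	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 */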
int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	map = dm_get_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);
	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		       unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
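/*
 * Example (editor's illustration, not part of this driver): dm_get_event_nr()
 * and dm_wait_event() above are meant to be used as a pair.  A hypothetical
 * caller snapshots the event counter first and then sleeps until a device
 * event (for example one raised by a target through dm_table_event()) bumps
 * it:
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *
 *	if (dm_wait_event(md, ev))
 *		return -ERESTARTSYS;	// interrupted by a signal
 */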
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_noflush_suspending(struct dm_target *ti)
{
	struct mapped_device *md = dm_table_get_md(ti->table);
	int r = __noflush_suspending(md);

	dm_put(md);

	return r;
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = (type == DM_TYPE_BIO_BASED) ?
		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	return pools;

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
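/*
 * Example (editor's illustration, not part of this driver): the point of
 * dm_noflush_suspending() above is to let target drivers push I/O back while
 * a noflush suspend is in progress instead of completing it with an error.
 * Inside a hypothetical target's end_io hook this typically looks like:
 *
 *	if (error && dm_noflush_suspending(ti))
 *		return DM_ENDIO_REQUEUE;	// dm core requeues the bio
 *	return error;
 *
 * The pushed-back bio is held on md->deferred and reissued on resume.
 */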