/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);
/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
};

/*
 * For bio-based dm.
 * One of these is allocated per target within a bio. Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per bio.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct dm_target_io *)bio->bi_private)->info;
	return NULL;
}

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_QUEUE_IO_TO_THREAD 6

/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct rw_semaphore io_lock;
	struct mutex suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	struct request_queue *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * An error from the barrier request currently being processed.
	 */
	int barrier_error;

	/*
	 * Processing queue (flush/barriers)
	 */
	struct workqueue_struct *wq;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support require holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* marker of flush suspend for request-based dm */
	struct request suspend_rq;

	/* For saving the address of __make_request for request based dm */
	make_request_fn *saved_make_request_fn;

	/* sysfs handle */
	struct kobject kobj;

	/* zero-length barrier that will be cloned and submitted to targets */
	struct bio barrier_bio;
};

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	mempool_t *io_pool;
	mempool_t *tio_pool;
	struct bio_set *bs;
};

#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_bio_info_cache;

static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	/* allocate a slab for the target ios */
	_tio_cache = KMEM_CACHE(dm_target_io, 0);
	if (!_tio_cache)
		goto out_free_io_cache;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_tio_cache;

	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
	if (!_rq_bio_info_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_bio_info_cache;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_uevent_exit;

	if (!_major)
		_major = r;

	return 0;

out_uevent_exit:
	dm_uevent_exit();
out_free_rq_bio_info_cache:
	kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_tio_cache:
	kmem_cache_destroy(_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	kmem_cache_destroy(_rq_bio_info_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_kcopyd_init,
	dm_interface_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}

/*
 * Block device functions
 */
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md = disk->private_data;
	atomic_dec(&md->open_count);
	dm_put(md);
	return 0;
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *tgt;
	int r = -ENOTTY;

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, cmd, arg);

out:
	dm_table_put(map);

	return r;
}

static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}

static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
{
	return mempool_alloc(md->tio_pool, GFP_ATOMIC);
}

static void free_rq_tio(struct dm_rq_target_io *tio)
{
	mempool_free(tio, tio->md->tio_pool);
}

static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_ATOMIC);
}

static void free_bio_info(struct dm_rq_clone_bio_info *info)
{
	mempool_free(info, info->tio->md->io_pool);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	int cpu;

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending, cpu;
	int rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a barrier.
	 */
	dm_disk(md)->part0.in_flight = pending =
		atomic_dec_return(&md->pending);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	spin_lock_irq(&md->deferred_lock);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irq(&md->deferred_lock);

	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
		queue_work(md->wq, &md->work);

	up_write(&md->io_lock);
}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;
	unsigned long flags;

	read_lock_irqsave(&md->map_lock, flags);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock_irqrestore(&md->map_lock, flags);

	return t;
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant soln is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (error && !(io->error > 0 && __noflush_suspending(md)))
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md)) {
				if (!bio_barrier(io->bio))
					bio_list_add_head(&md->deferred,
							  io->bio);
			} else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;

		if (bio_barrier(bio)) {
			/*
			 * There can be just one barrier request so we use
			 * a per-device variable for error reporting.
			 * Note that you can't touch the bio after end_io_acct
			 */
			if (!md->barrier_error && io_error != -EOPNOTSUPP)
				md->barrier_error = io_error;
			end_io_acct(io);
		} else {
			end_io_acct(io);

			if (io_error != DM_ENDIO_REQUEUE) {
				trace_block_bio_complete(md->queue, bio);

				bio_endio(bio, io_error);
			}
		}

		free_io(md, io);
	}
}

static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info = clone->bi_private;
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io()
		 * handle the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver,
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Report the data completion to the upper layer.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something wrong is happening.
	 */
	if (tio->orig->bio != bio)
		DMERR("bio completion is going in the middle of the request");

	/*
	 * Update the original request.
	 * Do not use blk_end_request() here, because it may complete
	 * the original request before the clone, and break the ordering.
	 */
	blk_update_request(tio->orig, 0, nr_bytes);
}

/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Or do dm_get() before calling this function and dm_put() later.
 */
static void rq_completed(struct mapped_device *md, int run_queue)
{
	int wakeup_waiters = 0;
	struct request_queue *q = md->queue;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (!queue_in_flight(q))
		wakeup_waiters = 1;
	spin_unlock_irqrestore(q->queue_lock, flags);

	/* nudge anyone waiting on suspend queue */
	if (wakeup_waiters)
		wake_up(&md->wait);

	if (run_queue)
		blk_run_queue(q);

	/*
	 * dm_put() must be at the end of this function. See the comment above.
	 */
	dm_put(md);
}

static void dm_unprep_request(struct request *rq)
{
	struct request *clone = rq->special;
	struct dm_rq_target_io *tio = clone->end_io_data;

	rq->special = NULL;
	rq->cmd_flags &= ~REQ_DONTPREP;

	blk_rq_unprep_clone(clone);
	free_rq_tio(tio);
}

/*
 * Requeue the original request of a clone.
 */
void dm_requeue_unmapped_request(struct request *clone)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;
	struct request_queue *q = rq->q;
	unsigned long flags;

	dm_unprep_request(rq);

	spin_lock_irqsave(q->queue_lock, flags);
	if (elv_queue_empty(q))
		blk_plug_device(q);
	blk_requeue_request(q, rq);
	spin_unlock_irqrestore(q->queue_lock, flags);

	rq_completed(md, 0);
}
EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);

static void __stop_queue(struct request_queue *q)
{
	blk_stop_queue(q);
}

static void stop_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__stop_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void __start_queue(struct request_queue *q)
{
	if (blk_queue_stopped(q))
		blk_start_queue(q);
}

static void start_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__start_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

/*
 * Complete the clone and the original request.
 * Must be called without queue lock.
 */
static void dm_end_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;

	if (blk_pc_request(rq)) {
		rq->errors = clone->errors;
		rq->resid_len = clone->resid_len;

		if (rq->sense)
			/*
			 * We are using the sense buffer of the original
			 * request.
			 * So setting the length of the sense data is enough.
			 */
			rq->sense_len = clone->sense_len;
	}

	BUG_ON(clone->bio);
	free_rq_tio(tio);

	blk_end_request_all(rq, error);

	rq_completed(md, 1);
}

/*
 * Request completion handler for request-based dm
 */
static void dm_softirq_done(struct request *rq)
{
	struct request *clone = rq->completion_data;
	struct dm_rq_target_io *tio = clone->end_io_data;
	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
	int error = tio->error;

	if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
		error = rq_end_io(tio->ti, clone, error, &tio->info);

	if (error <= 0)
		/* The target wants to complete the I/O */
		dm_end_request(clone, error);
	else if (error == DM_ENDIO_INCOMPLETE)
		/* The target will handle the I/O */
		return;
	else if (error == DM_ENDIO_REQUEUE)
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
	else {
		DMWARN("unimplemented target endio return value: %d", error);
		BUG();
	}
}
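/*
 * Summary of the request-based completion path above: end_clone_bio()
 * completes the clone's bios one by one and advances the original request
 * with blk_update_request(); end_clone_request() runs once the whole clone
 * has finished and defers the real completion to softirq context through
 * dm_complete_request(); dm_softirq_done() then consults the target's
 * rq_end_io() and completes (dm_end_request), requeues
 * (dm_requeue_unmapped_request) or hands the original request back to the
 * target.
 */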
/*
 * Complete the clone and the original request with the error status
 * through softirq context.
 */
static void dm_complete_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	tio->error = error;
	rq->completion_data = clone;
	blk_complete_request(rq);
}

/*
 * Complete the not-mapped clone and the original request with the error status
 * through softirq context.
 * Target's rq_end_io() function isn't called.
 * This may be used when the target's map_rq() function fails.
 */
void dm_kill_unmapped_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	rq->cmd_flags |= REQ_FAILED;
	dm_complete_request(clone, error);
}
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);

/*
 * Called with the queue lock held
 */
static void end_clone_request(struct request *clone, int error)
{
	/*
	 * For just cleaning up the information of the queue in which
	 * the clone was dispatched.
	 * The clone is *NOT* freed actually here because it is alloced from
	 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
	 */
	__blk_put_request(clone->q, clone);

	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the queue lock. Otherwise, deadlock could occur because:
	 *     - another request may be submitted by the upper level driver
	 *       of the stacking during the completion
	 *     - the submission which requires queue lock may be done
	 *       against this queue
	 */
	dm_complete_request(clone, error);
}

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
				  tio->io->bio->bi_bdev->bd_dev, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len, struct bio_set *bs)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
	clone->bi_destructor = dm_bio_destructor;
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;
	clone->bi_flags |= 1 << BIO_CLONED;

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
		bio_integrity_trim(clone,
				   bio_sector_offset(bio, idx, offset), len);
	}

	return clone;
}

/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
	clone->bi_destructor = dm_bio_destructor;
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO, bs);

		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
			bio_integrity_trim(clone,
					   bio_sector_offset(bio, idx, 0), len);
	}

	return clone;
}

static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti)
{
	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);

	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	return tio;
}

static void __flush_target(struct clone_info *ci, struct dm_target *ti,
			   unsigned flush_nr)
{
	struct dm_target_io *tio = alloc_tio(ci, ti);
	struct bio *clone;

	tio->info.flush_request = flush_nr;

	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
	__bio_clone(clone, ci->bio);
	clone->bi_destructor = dm_bio_destructor;

	__map_bio(ti, clone, tio);
}

static int __clone_and_map_empty_barrier(struct clone_info *ci)
{
	unsigned target_nr = 0, flush_nr;
	struct dm_target *ti;

	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		for (flush_nr = 0; flush_nr < ti->num_flush_requests;
		     flush_nr++)
			__flush_target(ci, ti, flush_nr);

	ci->sector_count = 0;

	return 0;
}
static int __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti;
	sector_t len = 0, max;
	struct dm_target_io *tio;

	if (unlikely(bio_empty_barrier(bio)))
		return __clone_and_map_empty_barrier(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	max = max_io_len(ci->md, ci->sector, ti);

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci, ti);

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				if (!dm_target_is_valid(ti))
					return -EIO;

				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci, ti);
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}

	return 0;
}
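/*
 * __clone_and_map() above covers three cases: the remainder of the bio fits
 * within the current target and is issued as a single clone; several whole
 * bvecs fit and are cloned together with clone_bio(); or a single bvec spans
 * target boundaries and is carved up with split_bvec(), looking up a new
 * target for each piece.
 */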
/*
 * Split the bio into several clones and submit it to targets.
 */
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	ci.map = dm_get_table(md);
	if (unlikely(!ci.map)) {
		if (!bio_barrier(bio))
			bio_io_error(bio);
		else
			if (!md->barrier_error)
				md->barrier_error = -EIO;
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	if (unlikely(bio_empty_barrier(bio)))
		ci.sector_count = 1;
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	while (ci.sector_count && !error)
		error = __clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, error);
	dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *ti;
	sector_t max_sectors;
	int max_size = 0;

	if (unlikely(!map))
		goto out;

	ti = dm_table_find_target(map, bvm->bi_sector);
	if (!dm_target_is_valid(ti))
		goto out_table;

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
			  (sector_t) BIO_MAX_SECTORS);
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	if (max_size < 0)
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);
	/*
	 * If the target doesn't support merge method and some of the devices
	 * provided their merge_bvec method (we know this by looking at
	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
	 * entries.  So always set max_size to 0, and the code below allows
	 * just one page.
	 */
	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
		max_size = 0;

out_table:
	dm_table_put(map);

out:
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int _dm_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int cpu;

	down_read(&md->io_lock);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	/*
	 * If we're suspended or the thread is processing barriers
	 * we have to queue this io for later.
	 */
	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
	    unlikely(bio_barrier(bio))) {
		up_read(&md->io_lock);

		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
		    bio_rw(bio) == READA) {
			bio_io_error(bio);
			return 0;
		}

		queue_io(md, bio);

		return 0;
	}

	__split_and_process_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}

static int dm_make_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;

	if (unlikely(bio_barrier(bio))) {
		bio_endio(bio, -EOPNOTSUPP);
		return 0;
	}

	return md->saved_make_request_fn(q, bio); /* call __make_request() */
}

static int dm_request_based(struct mapped_device *md)
{
	return blk_queue_stackable(md->queue);
}

static int dm_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;

	if (dm_request_based(md))
		return dm_make_request(q, bio);

	return _dm_request(q, bio);
}

void dm_dispatch_request(struct request *rq)
{
	int r;

	if (blk_queue_io_stat(rq->q))
		rq->cmd_flags |= REQ_IO_STAT;

	rq->start_time = jiffies;
	r = blk_insert_cloned_request(rq->q, rq);
	if (r)
		dm_complete_request(rq, r);
}
EXPORT_SYMBOL_GPL(dm_dispatch_request);

static void dm_rq_bio_destructor(struct bio *bio)
{
	struct dm_rq_clone_bio_info *info = bio->bi_private;
	struct mapped_device *md = info->tio->md;

	free_bio_info(info);
	bio_free(bio, md->bs);
}

static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
				 void *data)
{
	struct dm_rq_target_io *tio = data;
	struct mapped_device *md = tio->md;
	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);

	if (!info)
		return -ENOMEM;

	info->orig = bio_orig;
	info->tio = tio;
	bio->bi_end_io = end_clone_bio;
	bio->bi_private = info;
	bio->bi_destructor = dm_rq_bio_destructor;

	return 0;
}

static int setup_clone(struct request *clone, struct request *rq,
		       struct dm_rq_target_io *tio)
{
	int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
				  dm_rq_bio_constructor, tio);

	if (r)
		return r;

	clone->cmd = rq->cmd;
	clone->cmd_len = rq->cmd_len;
	clone->sense = rq->sense;
	clone->buffer = rq->buffer;
	clone->end_io = end_clone_request;
	clone->end_io_data = tio;

	return 0;
}

static int dm_rq_flush_suspending(struct mapped_device *md)
{
	return !md->suspend_rq.special;
}
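/*
 * Request-based dispatch in brief: dm_prep_fn() allocates a dm_rq_target_io
 * and a clone of the original request and stashes the clone in rq->special;
 * dm_request_fn() later pulls requests off the queue and map_request() hands
 * the clone to the target's map_rq(), dispatching, requeueing or killing it
 * according to the return value.
 */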
/*
 * Called with the queue lock held.
 */
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
	struct mapped_device *md = q->queuedata;
	struct dm_rq_target_io *tio;
	struct request *clone;

	if (unlikely(rq == &md->suspend_rq)) {
		if (dm_rq_flush_suspending(md))
			return BLKPREP_OK;
		else
			/* The flush suspend was interrupted */
			return BLKPREP_KILL;
	}

	if (unlikely(rq->special)) {
		DMWARN("Already has something in rq->special.");
		return BLKPREP_KILL;
	}

	tio = alloc_rq_tio(md); /* Only one for each original request */
	if (!tio)
		/* -ENOMEM */
		return BLKPREP_DEFER;

	tio->md = md;
	tio->ti = NULL;
	tio->orig = rq;
	tio->error = 0;
	memset(&tio->info, 0, sizeof(tio->info));

	clone = &tio->clone;
	if (setup_clone(clone, rq, tio)) {
		/* -ENOMEM */
		free_rq_tio(tio);
		return BLKPREP_DEFER;
	}

	rq->special = clone;
	rq->cmd_flags |= REQ_DONTPREP;

	return BLKPREP_OK;
}

static void map_request(struct dm_target *ti, struct request *rq,
			struct mapped_device *md)
{
	int r;
	struct request *clone = rq->special;
	struct dm_rq_target_io *tio = clone->end_io_data;

	/*
	 * Hold the md reference here for the in-flight I/O.
	 * We can't rely on the reference count by device opener,
	 * because the device may be closed during the request completion
	 * when all bios are completed.
	 * See the comment in rq_completed() too.
	 */
	dm_get(md);

	tio->ti = ti;
	r = ti->type->map_rq(ti, clone, &tio->info);
	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/* The target has taken the I/O to submit by itself later */
		break;
	case DM_MAPIO_REMAPPED:
		/* The target has remapped the I/O so dispatch it */
		dm_dispatch_request(clone);
		break;
	case DM_MAPIO_REQUEUE:
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
		break;
	default:
		if (r > 0) {
			DMWARN("unimplemented target map return value: %d", r);
			BUG();
		}

		/* The target wants to complete the I/O */
		dm_kill_unmapped_request(clone, r);
		break;
	}
}

/*
 * q->request_fn for request-based dm.
 * Called with the queue lock held.
 */
static void dm_request_fn(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *ti;
	struct request *rq;

	/*
	 * For noflush suspend, check blk_queue_stopped() to immediately
	 * quit I/O dispatching.
	 */
	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
		rq = blk_peek_request(q);
		if (!rq)
			goto plug_and_out;

		if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
			if (queue_in_flight(q))
				/* Not quiet yet.  Wait more */
				goto plug_and_out;

			/* This device should be quiet now */
			__stop_queue(q);
			blk_start_request(rq);
			__blk_end_request_all(rq, 0);
			wake_up(&md->wait);
			goto out;
		}

		ti = dm_table_find_target(map, blk_rq_pos(rq));
		if (ti->type->busy && ti->type->busy(ti))
			goto plug_and_out;

		blk_start_request(rq);
		spin_unlock(q->queue_lock);
		map_request(ti, rq, md);
		spin_lock_irq(q->queue_lock);
	}

	goto out;

plug_and_out:
	if (!elv_queue_empty(q))
		/* Some requests still remain, retry later */
		blk_plug_device(q);

out:
	dm_table_put(map);

	return;
}

int dm_underlying_device_busy(struct request_queue *q)
{
	return blk_lld_busy(q);
}
EXPORT_SYMBOL_GPL(dm_underlying_device_busy);

static int dm_lld_busy(struct request_queue *q)
{
	int r;
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
		r = 1;
	else
		r = dm_table_any_busy_target(map);

	dm_table_put(map);

	return r;
}

static void dm_unplug_all(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		if (dm_request_based(md))
			generic_unplug_device(q);

		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;

	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		map = dm_get_table(md);
		if (map) {
			/*
			 * Request-based dm cares about only own queue for
			 * the query about congestion status of request_queue
			 */
			if (dm_request_based(md))
				r = md->queue->backing_dev_info.state &
				    bdi_bits;
			else
				r = dm_table_any_congested(map, bdi_bits);

			dm_table_put(map);
		}
	}

	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
	if (r)
		goto out;

	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);
	return r;
}

static int next_free_minor(int *minor)
{
	int r, m;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
	if (r)
		goto out;

	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
	spin_unlock(&_minor_lock);
	return r;
}

static struct block_device_operations dm_blk_dops;

static void dm_wq_work(struct work_struct *work);

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r;
	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
	void *old_md;

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	init_rwsem(&md->io_lock);
	mutex_init(&md->suspend_lock);
	spin_lock_init(&md->deferred_lock);
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_init_queue(dm_request_fn, NULL);
	if (!md->queue)
		goto bad_queue;

	/*
	 * Request-based dm devices cannot be stacked on top of bio-based dm
	 * devices.  The type of this dm device has not been decided yet,
	 * although we initialized the queue using blk_init_queue().
	 * The type is decided at the first table loading time.
	 * To prevent problematic device stacking, clear the queue flag
	 * for request stacking support until then.
	 *
	 * This queue is new, so no concurrency on the queue_flags.
	 */
	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
	md->saved_make_request_fn = md->queue->make_request_fn;
	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	md->queue->unplug_fn = dm_unplug_all;
	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
	blk_queue_softirq_done(md->queue, dm_softirq_done);
	blk_queue_prep_rq(md->queue, dm_prep_fn);
	blk_queue_lld_busy(md->queue, dm_lld_busy);

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad_disk;

	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = create_singlethread_workqueue("kdmflush");
	if (!md->wq)
		goto bad_thread;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad_bdev;

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad_bdev:
	destroy_workqueue(md->wq);
bad_thread:
	put_disk(md->disk);
bad_disk:
	blk_cleanup_queue(md->queue);
bad_queue:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);
	bdput(md->bdev);
	destroy_workqueue(md->wq);
	if (md->tio_pool)
		mempool_destroy(md->tio_pool);
	if (md->io_pool)
		mempool_destroy(md->io_pool);
	if (md->bs)
		bioset_free(md->bs);
	blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	free_minor(minor);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	module_put(THIS_MODULE);
	kfree(md);
}

static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p;

	if (md->io_pool && md->tio_pool && md->bs)
		/* the md already has necessary mempools */
		goto out;

	p = dm_table_get_md_mempools(t);
	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);

	md->io_pool = p->io_pool;
	p->io_pool = NULL;
	md->tio_pool = p->tio_pool;
	p->tio_pool = NULL;
	md->bs = p->bs;
	p->bs = NULL;

out:
	/* mempool bind completed, now no need any mempools in the table */
	dm_table_free_md_mempools(t);
}
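/*
 * Table binding in brief: __bind() below applies the new size, stops the
 * queue first if the table is request-based, takes over the table's
 * mempools via __bind_mempools() and finally publishes the table in
 * md->map under map_lock; __unbind() clears the event callback and
 * md->map and destroys the old table.
 */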
/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	mutex_lock(&md->bdev->bd_inode->i_mutex);
	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
	mutex_unlock(&md->bdev->bd_inode->i_mutex);
}

static int __bind(struct mapped_device *md, struct dm_table *t,
		  struct queue_limits *limits)
{
	struct request_queue *q = md->queue;
	sector_t size;
	unsigned long flags;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	if (!size) {
		dm_table_destroy(t);
		return 0;
	}

	dm_table_event_callback(t, event_callback, md);

	/*
	 * The queue hasn't been stopped yet, if the old table type wasn't
	 * for request-based during suspension.  So stop it to prevent
	 * I/O mapping before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may be run just after the setting.
	 */
	if (dm_table_request_based(t) && !blk_queue_stopped(q))
		stop_queue(q);

	__bind_mempools(md, t);

	write_lock_irqsave(&md->map_lock, flags);
	md->map = t;
	dm_table_set_restrictions(t, q, limits);
	write_unlock_irqrestore(&md->map_lock, flags);

	return 0;
}

static void __unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;
	unsigned long flags;

	if (!map)
		return;

	dm_table_event_callback(map, NULL, NULL);
	write_lock_irqsave(&md->map_lock, flags);
	md->map = NULL;
	write_unlock_irqrestore(&md->map_lock, flags);
	dm_table_destroy(map);
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_sysfs_init(md);

	*result = md;
	return 0;
}

static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md && (md == MINOR_ALLOCED ||
		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
		   test_bit(DMF_FREEING, &md->flags))) {
		md = NULL;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

void dm_put(struct mapped_device *md)
{
	struct dm_table *map;

	BUG_ON(test_bit(DMF_FREEING, &md->flags));

	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
		map = dm_get_table(md);
		idr_replace(&_minor_idr, MINOR_ALLOCED,
			    MINOR(disk_devt(dm_disk(md))));
		set_bit(DMF_FREEING, &md->flags);
		spin_unlock(&_minor_lock);
		if (!dm_suspended(md)) {
			dm_table_presuspend_targets(map);
			dm_table_postsuspend_targets(map);
		}
		dm_sysfs_exit(md);
		dm_table_put(map);
		__unbind(md);
		free_dev(md);
	}
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);
	struct request_queue *q = md->queue;
	unsigned long flags;

	dm_unplug_all(md->queue);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		smp_mb();
		if (dm_request_based(md)) {
			spin_lock_irqsave(q->queue_lock, flags);
			if (!queue_in_flight(q) && blk_queue_stopped(q)) {
				spin_unlock_irqrestore(q->queue_lock, flags);
				break;
			}
			spin_unlock_irqrestore(q->queue_lock, flags);
		} else if (!atomic_read(&md->pending))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

static void dm_flush(struct mapped_device *md)
{
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);

	bio_init(&md->barrier_bio);
	md->barrier_bio.bi_bdev = md->bdev;
	md->barrier_bio.bi_rw = WRITE_BARRIER;
	__split_and_process_bio(md, &md->barrier_bio);

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}

static void process_barrier(struct mapped_device *md, struct bio *bio)
{
	md->barrier_error = 0;

	dm_flush(md);

	if (!bio_empty_barrier(bio)) {
		__split_and_process_bio(md, bio);
		dm_flush(md);
	}

	if (md->barrier_error != DM_ENDIO_REQUEUE)
		bio_endio(bio, md->barrier_error);
	else {
		spin_lock_irq(&md->deferred_lock);
		bio_list_add_head(&md->deferred, bio);
		spin_unlock_irq(&md->deferred_lock);
	}
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		up_write(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else {
			if (bio_barrier(c))
				process_barrier(md, c);
			else
				__split_and_process_bio(md, c);
		}

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table (destroying old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct queue_limits limits;
	int r = -EINVAL;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r)
		goto out;

	/* cannot change the device type, once a table is bound */
	if (md->map &&
	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
		DMWARN("can't change the device type after a table is bound");
		goto out;
	}

	/*
	 * It is enough that blk_queue_ordered() is called only once when
	 * the first bio-based table is bound.
	 *
	 * This setting should be moved to alloc_dev() when request-based dm
	 * supports barrier.
	 */
	if (!md->map && dm_table_bio_based(table))
		blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);

	__unbind(md);
	r = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return r;
}

static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
{
	md->suspend_rq.special = (void *)0x1;
}

static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
{
	struct request_queue *q = md->queue;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (!noflush)
		dm_rq_invalidate_suspend_marker(md);
	__start_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
{
	struct request *rq = &md->suspend_rq;
	struct request_queue *q = md->queue;

	if (noflush)
		stop_queue(q);
	else {
		blk_rq_init(q, rq);
		blk_insert_request(q, rq, 0, NULL);
	}
}

static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
{
	int r = 1;
	struct request *rq = &md->suspend_rq;
	struct request_queue *q = md->queue;
	unsigned long flags;

	if (noflush)
		return r;

	/* The marker must be protected by queue lock if it is in use */
	spin_lock_irqsave(q->queue_lock, flags);
	if (unlikely(rq->ref_count)) {
		/*
		 * This can happen, when the previous flush suspend was
		 * interrupted, the marker is still in the queue and
		 * this flush suspend has been invoked, because we don't
		 * remove the marker at the time of suspend interruption.
		 * We have only one marker per mapped_device, so we can't
		 * start another flush suspend while it is in use.
		 */
		BUG_ON(!rq->special); /* The marker should be invalidated */
		DMWARN("Invalidating the previous flush suspend is still in"
		       " progress.  Please retry later.");
		r = 0;
	}
	spin_unlock_irqrestore(q->queue_lock, flags);

	return r;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * After the suspend starts, further incoming requests are kept in
 * the request_queue and deferred.
 * Remaining requests in the request_queue at the start of suspend are flushed
 * if it is flush suspend.
 * The suspend completes when the following conditions have been satisfied,
 * so wait for it:
 *  1. q->in_flight is 0 (which means no in_flight request)

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * After the suspend starts, further incoming requests are kept in
 * the request_queue and deferred.
 * Requests remaining in the request_queue at the start of suspend are flushed
 * if this is a flush suspend.
 * The suspend completes when the following conditions have been satisfied,
 * so wait for both of them:
 *   1. q->in_flight is 0 (which means no request is in flight)
 *   2. the queue has been stopped (which means no request dispatching)
 *
 *
 * Noflush suspend
 * ---------------
 * Noflush suspend doesn't need to dispatch the remaining requests.
 * So stop the queue immediately.  Then, wait for all in-flight requests
 * to be completed or requeued.
 *
 * To abort a noflush suspend, start the queue.
 *
 *
 * Flush suspend
 * -------------
 * Flush suspend needs to dispatch the remaining requests.  So stop the queue
 * only after the remaining requests are completed.  (Requeued requests must
 * also be re-dispatched and completed.  Until then, we can't stop the queue.)
 *
 * While flushing the remaining requests, further incoming requests are also
 * inserted into the same queue.  To distinguish which requests are to be
 * flushed, we insert a marker request into the queue at the time of starting
 * the flush suspend, like a barrier.
 * Dispatching is blocked when the marker is found at the head of the queue,
 * and the queue is stopped when all in-flight requests are completed, since
 * that means the remaining requests are completely flushed.
 * Then, the marker is removed from the queue.
 *
 * To abort a flush suspend, we also need to take care of the marker, not only
 * starting the queue.
 * We don't remove the marker forcibly from the queue, since that would
 * violate block-layer conventions.  Instead, we put an "invalidated" mark on
 * the marker.  When the invalidated marker is found at the head of the queue,
 * it is immediately removed from the queue, so it doesn't block dispatching.
 * Because we have only one marker per mapped_device, we can't start another
 * flush suspend until the invalidated marker is removed from the queue.
 * So we fail and return -EBUSY in that case.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
		r = -EBUSY;
		goto out_unlock;
	}

	map = dm_get_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.  noflush supersedes do_lockfs,
	 * because lock_fs() needs to flush I/Os.
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers, i.e. no one may be executing
	 * __split_and_process_bio.  This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock.  To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq).  flush_workqueue will wait until
	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	flush_workqueue(md->wq);

	if (dm_request_based(md))
		dm_rq_start_suspend(md, noflush);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_rq_abort_suspend(md, noflush);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now.  There is no request-processing activity.  All new
	 * requests are being added to the md->deferred list.
	 */

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
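
/*
 * Illustrative sketch only: the example_suspend() helper below is
 * hypothetical and not part of the driver.  It shows how a caller might
 * choose between the noflush and flush suspend variants described above.
 * dm_suspend() returns -EINVAL if the device is already suspended, -EBUSY
 * if a previous request-based flush suspend is still being invalidated,
 * and another negative value if the wait for outstanding I/O was
 * interrupted.
 */
static int __maybe_unused example_suspend(struct mapped_device *md, int noflush)
{
	unsigned suspend_flags;

	if (noflush)
		/* Don't flush outstanding I/O; it is requeued instead. */
		suspend_flags = DM_SUSPEND_NOFLUSH_FLAG;
	else
		/* Flush I/O and freeze any filesystem on the device first. */
		suspend_flags = DM_SUSPEND_LOCKFS_FLAG;

	return dm_suspend(md, suspend_flags);
}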

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	map = dm_get_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);
	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		       unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}
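
/*
 * Illustrative sketch only: the example_wait_for_next_event() helper below
 * is hypothetical and not part of the driver.  It shows the intended pattern
 * for using the event counter: sample the current event number, then sleep
 * until the counter moves on, which signals that the device has generated a
 * new event since the sample was taken.
 */
static int __maybe_unused example_wait_for_next_event(struct mapped_device *md)
{
	int event_nr = dm_get_event_nr(md);

	/* Returns 0 once a new event arrives, -ERESTARTSYS if interrupted. */
	return dm_wait_event(md, event_nr);
}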

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_noflush_suspending(struct dm_target *ti)
{
	struct mapped_device *md = dm_table_get_md(ti->table);
	int r = __noflush_suspending(md);

	dm_put(md);

	return r;
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
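
/*
 * Illustrative sketch only: the example_show_disk_name() helper below is
 * hypothetical and not part of the driver.  It shows how the reference
 * counting contract documented above is meant to be used: the md returned
 * by dm_get_from_kobject() comes with a reference held, so the gendisk
 * obtained through dm_disk() stays valid until dm_put() drops it.
 */
static void __maybe_unused example_show_disk_name(struct kobject *kobj)
{
	struct mapped_device *md = dm_get_from_kobject(kobj);

	if (!md)
		return;	/* not one of ours, or being freed/deleted */

	DMINFO("backing gendisk is %s", dm_disk(md)->disk_name);

	dm_put(md);	/* the gendisk must not be used after this */
}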

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = (type == DM_TYPE_BIO_BASED) ?
		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	return pools;

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");