1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/buffer_head.h> 18 #include <linux/smp_lock.h> 19 #include <linux/mempool.h> 20 #include <linux/slab.h> 21 #include <linux/idr.h> 22 #include <linux/hdreg.h> 23 #include <linux/delay.h> 24 25 #include <trace/events/block.h> 26 27 #define DM_MSG_PREFIX "core" 28 29 /* 30 * Cookies are numeric values sent with CHANGE and REMOVE 31 * uevents while resuming, removing or renaming the device. 32 */ 33 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 34 #define DM_COOKIE_LENGTH 24 35 36 static const char *_name = DM_NAME; 37 38 static unsigned int major = 0; 39 static unsigned int _major = 0; 40 41 static DEFINE_SPINLOCK(_minor_lock); 42 /* 43 * For bio-based dm. 44 * One of these is allocated per bio. 45 */ 46 struct dm_io { 47 struct mapped_device *md; 48 int error; 49 atomic_t io_count; 50 struct bio *bio; 51 unsigned long start_time; 52 spinlock_t endio_lock; 53 }; 54 55 /* 56 * For bio-based dm. 57 * One of these is allocated per target within a bio. Hopefully 58 * this will be simplified out one day. 59 */ 60 struct dm_target_io { 61 struct dm_io *io; 62 struct dm_target *ti; 63 union map_info info; 64 }; 65 66 /* 67 * For request-based dm. 68 * One of these is allocated per request. 69 */ 70 struct dm_rq_target_io { 71 struct mapped_device *md; 72 struct dm_target *ti; 73 struct request *orig, clone; 74 int error; 75 union map_info info; 76 }; 77 78 /* 79 * For request-based dm. 80 * One of these is allocated per bio. 81 */ 82 struct dm_rq_clone_bio_info { 83 struct bio *orig; 84 struct dm_rq_target_io *tio; 85 }; 86 87 union map_info *dm_get_mapinfo(struct bio *bio) 88 { 89 if (bio && bio->bi_private) 90 return &((struct dm_target_io *)bio->bi_private)->info; 91 return NULL; 92 } 93 94 union map_info *dm_get_rq_mapinfo(struct request *rq) 95 { 96 if (rq && rq->end_io_data) 97 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 98 return NULL; 99 } 100 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 101 102 #define MINOR_ALLOCED ((void *)-1) 103 104 /* 105 * Bits for the md->flags field. 106 */ 107 #define DMF_BLOCK_IO_FOR_SUSPEND 0 108 #define DMF_SUSPENDED 1 109 #define DMF_FROZEN 2 110 #define DMF_FREEING 3 111 #define DMF_DELETING 4 112 #define DMF_NOFLUSH_SUSPENDING 5 113 #define DMF_QUEUE_IO_TO_THREAD 6 114 115 /* 116 * Work processed by per-device workqueue. 117 */ 118 struct mapped_device { 119 struct rw_semaphore io_lock; 120 struct mutex suspend_lock; 121 rwlock_t map_lock; 122 atomic_t holders; 123 atomic_t open_count; 124 125 unsigned long flags; 126 127 struct request_queue *queue; 128 unsigned type; 129 /* Protect queue and type against concurrent access. */ 130 struct mutex type_lock; 131 132 struct gendisk *disk; 133 char name[16]; 134 135 void *interface_ptr; 136 137 /* 138 * A list of ios that arrived while we were suspended. 139 */ 140 atomic_t pending[2]; 141 wait_queue_head_t wait; 142 struct work_struct work; 143 struct bio_list deferred; 144 spinlock_t deferred_lock; 145 146 /* 147 * An error from the barrier request currently being processed. 
148 */ 149 int barrier_error; 150 151 /* 152 * Protect barrier_error from concurrent endio processing 153 * in request-based dm. 154 */ 155 spinlock_t barrier_error_lock; 156 157 /* 158 * Processing queue (flush/barriers) 159 */ 160 struct workqueue_struct *wq; 161 struct work_struct barrier_work; 162 163 /* A pointer to the currently processing pre/post flush request */ 164 struct request *flush_request; 165 166 /* 167 * The current mapping. 168 */ 169 struct dm_table *map; 170 171 /* 172 * io objects are allocated from here. 173 */ 174 mempool_t *io_pool; 175 mempool_t *tio_pool; 176 177 struct bio_set *bs; 178 179 /* 180 * Event handling. 181 */ 182 atomic_t event_nr; 183 wait_queue_head_t eventq; 184 atomic_t uevent_seq; 185 struct list_head uevent_list; 186 spinlock_t uevent_lock; /* Protect access to uevent_list */ 187 188 /* 189 * freeze/thaw support require holding onto a super block 190 */ 191 struct super_block *frozen_sb; 192 struct block_device *bdev; 193 194 /* forced geometry settings */ 195 struct hd_geometry geometry; 196 197 /* For saving the address of __make_request for request based dm */ 198 make_request_fn *saved_make_request_fn; 199 200 /* sysfs handle */ 201 struct kobject kobj; 202 203 /* zero-length barrier that will be cloned and submitted to targets */ 204 struct bio barrier_bio; 205 }; 206 207 /* 208 * For mempools pre-allocation at the table loading time. 209 */ 210 struct dm_md_mempools { 211 mempool_t *io_pool; 212 mempool_t *tio_pool; 213 struct bio_set *bs; 214 }; 215 216 #define MIN_IOS 256 217 static struct kmem_cache *_io_cache; 218 static struct kmem_cache *_tio_cache; 219 static struct kmem_cache *_rq_tio_cache; 220 static struct kmem_cache *_rq_bio_info_cache; 221 222 static int __init local_init(void) 223 { 224 int r = -ENOMEM; 225 226 /* allocate a slab for the dm_ios */ 227 _io_cache = KMEM_CACHE(dm_io, 0); 228 if (!_io_cache) 229 return r; 230 231 /* allocate a slab for the target ios */ 232 _tio_cache = KMEM_CACHE(dm_target_io, 0); 233 if (!_tio_cache) 234 goto out_free_io_cache; 235 236 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 237 if (!_rq_tio_cache) 238 goto out_free_tio_cache; 239 240 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 241 if (!_rq_bio_info_cache) 242 goto out_free_rq_tio_cache; 243 244 r = dm_uevent_init(); 245 if (r) 246 goto out_free_rq_bio_info_cache; 247 248 _major = major; 249 r = register_blkdev(_major, _name); 250 if (r < 0) 251 goto out_uevent_exit; 252 253 if (!_major) 254 _major = r; 255 256 return 0; 257 258 out_uevent_exit: 259 dm_uevent_exit(); 260 out_free_rq_bio_info_cache: 261 kmem_cache_destroy(_rq_bio_info_cache); 262 out_free_rq_tio_cache: 263 kmem_cache_destroy(_rq_tio_cache); 264 out_free_tio_cache: 265 kmem_cache_destroy(_tio_cache); 266 out_free_io_cache: 267 kmem_cache_destroy(_io_cache); 268 269 return r; 270 } 271 272 static void local_exit(void) 273 { 274 kmem_cache_destroy(_rq_bio_info_cache); 275 kmem_cache_destroy(_rq_tio_cache); 276 kmem_cache_destroy(_tio_cache); 277 kmem_cache_destroy(_io_cache); 278 unregister_blkdev(_major, _name); 279 dm_uevent_exit(); 280 281 _major = 0; 282 283 DMINFO("cleaned up"); 284 } 285 286 static int (*_inits[])(void) __initdata = { 287 local_init, 288 dm_target_init, 289 dm_linear_init, 290 dm_stripe_init, 291 dm_io_init, 292 dm_kcopyd_init, 293 dm_interface_init, 294 }; 295 296 static void (*_exits[])(void) = { 297 local_exit, 298 dm_target_exit, 299 dm_linear_exit, 300 dm_stripe_exit, 301 dm_io_exit, 302 dm_kcopyd_exit, 303 dm_interface_exit, 304 
}; 305 306 static int __init dm_init(void) 307 { 308 const int count = ARRAY_SIZE(_inits); 309 310 int r, i; 311 312 for (i = 0; i < count; i++) { 313 r = _inits[i](); 314 if (r) 315 goto bad; 316 } 317 318 return 0; 319 320 bad: 321 while (i--) 322 _exits[i](); 323 324 return r; 325 } 326 327 static void __exit dm_exit(void) 328 { 329 int i = ARRAY_SIZE(_exits); 330 331 while (i--) 332 _exits[i](); 333 } 334 335 /* 336 * Block device functions 337 */ 338 int dm_deleting_md(struct mapped_device *md) 339 { 340 return test_bit(DMF_DELETING, &md->flags); 341 } 342 343 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 344 { 345 struct mapped_device *md; 346 347 lock_kernel(); 348 spin_lock(&_minor_lock); 349 350 md = bdev->bd_disk->private_data; 351 if (!md) 352 goto out; 353 354 if (test_bit(DMF_FREEING, &md->flags) || 355 dm_deleting_md(md)) { 356 md = NULL; 357 goto out; 358 } 359 360 dm_get(md); 361 atomic_inc(&md->open_count); 362 363 out: 364 spin_unlock(&_minor_lock); 365 unlock_kernel(); 366 367 return md ? 0 : -ENXIO; 368 } 369 370 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 371 { 372 struct mapped_device *md = disk->private_data; 373 374 lock_kernel(); 375 atomic_dec(&md->open_count); 376 dm_put(md); 377 unlock_kernel(); 378 379 return 0; 380 } 381 382 int dm_open_count(struct mapped_device *md) 383 { 384 return atomic_read(&md->open_count); 385 } 386 387 /* 388 * Guarantees nothing is using the device before it's deleted. 389 */ 390 int dm_lock_for_deletion(struct mapped_device *md) 391 { 392 int r = 0; 393 394 spin_lock(&_minor_lock); 395 396 if (dm_open_count(md)) 397 r = -EBUSY; 398 else 399 set_bit(DMF_DELETING, &md->flags); 400 401 spin_unlock(&_minor_lock); 402 403 return r; 404 } 405 406 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 407 { 408 struct mapped_device *md = bdev->bd_disk->private_data; 409 410 return dm_get_geometry(md, geo); 411 } 412 413 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 414 unsigned int cmd, unsigned long arg) 415 { 416 struct mapped_device *md = bdev->bd_disk->private_data; 417 struct dm_table *map = dm_get_live_table(md); 418 struct dm_target *tgt; 419 int r = -ENOTTY; 420 421 if (!map || !dm_table_get_size(map)) 422 goto out; 423 424 /* We only support devices that have a single target */ 425 if (dm_table_get_num_targets(map) != 1) 426 goto out; 427 428 tgt = dm_table_get_target(map, 0); 429 430 if (dm_suspended_md(md)) { 431 r = -EAGAIN; 432 goto out; 433 } 434 435 if (tgt->type->ioctl) 436 r = tgt->type->ioctl(tgt, cmd, arg); 437 438 out: 439 dm_table_put(map); 440 441 return r; 442 } 443 444 static struct dm_io *alloc_io(struct mapped_device *md) 445 { 446 return mempool_alloc(md->io_pool, GFP_NOIO); 447 } 448 449 static void free_io(struct mapped_device *md, struct dm_io *io) 450 { 451 mempool_free(io, md->io_pool); 452 } 453 454 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 455 { 456 mempool_free(tio, md->tio_pool); 457 } 458 459 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 460 gfp_t gfp_mask) 461 { 462 return mempool_alloc(md->tio_pool, gfp_mask); 463 } 464 465 static void free_rq_tio(struct dm_rq_target_io *tio) 466 { 467 mempool_free(tio, tio->md->tio_pool); 468 } 469 470 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 471 { 472 return mempool_alloc(md->io_pool, GFP_ATOMIC); 473 } 474 475 static void free_bio_info(struct dm_rq_clone_bio_info *info) 476 { 477 mempool_free(info, 
info->tio->md->io_pool); 478 } 479 480 static int md_in_flight(struct mapped_device *md) 481 { 482 return atomic_read(&md->pending[READ]) + 483 atomic_read(&md->pending[WRITE]); 484 } 485 486 static void start_io_acct(struct dm_io *io) 487 { 488 struct mapped_device *md = io->md; 489 int cpu; 490 int rw = bio_data_dir(io->bio); 491 492 io->start_time = jiffies; 493 494 cpu = part_stat_lock(); 495 part_round_stats(cpu, &dm_disk(md)->part0); 496 part_stat_unlock(); 497 dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); 498 } 499 500 static void end_io_acct(struct dm_io *io) 501 { 502 struct mapped_device *md = io->md; 503 struct bio *bio = io->bio; 504 unsigned long duration = jiffies - io->start_time; 505 int pending, cpu; 506 int rw = bio_data_dir(bio); 507 508 cpu = part_stat_lock(); 509 part_round_stats(cpu, &dm_disk(md)->part0); 510 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 511 part_stat_unlock(); 512 513 /* 514 * After this is decremented the bio must not be touched if it is 515 * a barrier. 516 */ 517 dm_disk(md)->part0.in_flight[rw] = pending = 518 atomic_dec_return(&md->pending[rw]); 519 pending += atomic_read(&md->pending[rw^0x1]); 520 521 /* nudge anyone waiting on suspend queue */ 522 if (!pending) 523 wake_up(&md->wait); 524 } 525 526 /* 527 * Add the bio to the list of deferred io. 528 */ 529 static void queue_io(struct mapped_device *md, struct bio *bio) 530 { 531 down_write(&md->io_lock); 532 533 spin_lock_irq(&md->deferred_lock); 534 bio_list_add(&md->deferred, bio); 535 spin_unlock_irq(&md->deferred_lock); 536 537 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) 538 queue_work(md->wq, &md->work); 539 540 up_write(&md->io_lock); 541 } 542 543 /* 544 * Everyone (including functions in this file), should use this 545 * function to access the md->map field, and make sure they call 546 * dm_table_put() when finished. 547 */ 548 struct dm_table *dm_get_live_table(struct mapped_device *md) 549 { 550 struct dm_table *t; 551 unsigned long flags; 552 553 read_lock_irqsave(&md->map_lock, flags); 554 t = md->map; 555 if (t) 556 dm_table_get(t); 557 read_unlock_irqrestore(&md->map_lock, flags); 558 559 return t; 560 } 561 562 /* 563 * Get the geometry associated with a dm device 564 */ 565 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 566 { 567 *geo = md->geometry; 568 569 return 0; 570 } 571 572 /* 573 * Set the geometry of a device. 574 */ 575 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 576 { 577 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 578 579 if (geo->start > sz) { 580 DMWARN("Start sector is beyond the geometry limits."); 581 return -EINVAL; 582 } 583 584 md->geometry = *geo; 585 586 return 0; 587 } 588 589 /*----------------------------------------------------------------- 590 * CRUD START: 591 * A more elegant soln is in the works that uses the queue 592 * merge fn, unfortunately there are a couple of changes to 593 * the block layer that I want to make for this. So in the 594 * interests of getting something for people to use I give 595 * you this clearly demarcated crap. 596 *---------------------------------------------------------------*/ 597 598 static int __noflush_suspending(struct mapped_device *md) 599 { 600 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 601 } 602 603 /* 604 * Decrements the number of outstanding ios that a bio has been 605 * cloned into, completing the original io if necc. 
606 */ 607 static void dec_pending(struct dm_io *io, int error) 608 { 609 unsigned long flags; 610 int io_error; 611 struct bio *bio; 612 struct mapped_device *md = io->md; 613 614 /* Push-back supersedes any I/O errors */ 615 if (unlikely(error)) { 616 spin_lock_irqsave(&io->endio_lock, flags); 617 if (!(io->error > 0 && __noflush_suspending(md))) 618 io->error = error; 619 spin_unlock_irqrestore(&io->endio_lock, flags); 620 } 621 622 if (atomic_dec_and_test(&io->io_count)) { 623 if (io->error == DM_ENDIO_REQUEUE) { 624 /* 625 * Target requested pushing back the I/O. 626 */ 627 spin_lock_irqsave(&md->deferred_lock, flags); 628 if (__noflush_suspending(md)) { 629 if (!(io->bio->bi_rw & REQ_HARDBARRIER)) 630 bio_list_add_head(&md->deferred, 631 io->bio); 632 } else 633 /* noflush suspend was interrupted. */ 634 io->error = -EIO; 635 spin_unlock_irqrestore(&md->deferred_lock, flags); 636 } 637 638 io_error = io->error; 639 bio = io->bio; 640 641 if (bio->bi_rw & REQ_HARDBARRIER) { 642 /* 643 * There can be just one barrier request so we use 644 * a per-device variable for error reporting. 645 * Note that you can't touch the bio after end_io_acct 646 * 647 * We ignore -EOPNOTSUPP for empty flush reported by 648 * underlying devices. We assume that if the device 649 * doesn't support empty barriers, it doesn't need 650 * cache flushing commands. 651 */ 652 if (!md->barrier_error && 653 !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) 654 md->barrier_error = io_error; 655 end_io_acct(io); 656 free_io(md, io); 657 } else { 658 end_io_acct(io); 659 free_io(md, io); 660 661 if (io_error != DM_ENDIO_REQUEUE) { 662 trace_block_bio_complete(md->queue, bio); 663 664 bio_endio(bio, io_error); 665 } 666 } 667 } 668 } 669 670 static void clone_endio(struct bio *bio, int error) 671 { 672 int r = 0; 673 struct dm_target_io *tio = bio->bi_private; 674 struct dm_io *io = tio->io; 675 struct mapped_device *md = tio->io->md; 676 dm_endio_fn endio = tio->ti->type->end_io; 677 678 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 679 error = -EIO; 680 681 if (endio) { 682 r = endio(tio->ti, bio, error, &tio->info); 683 if (r < 0 || r == DM_ENDIO_REQUEUE) 684 /* 685 * error and requeue request are handled 686 * in dec_pending(). 687 */ 688 error = r; 689 else if (r == DM_ENDIO_INCOMPLETE) 690 /* The target will handle the io */ 691 return; 692 else if (r) { 693 DMWARN("unimplemented target endio return value: %d", r); 694 BUG(); 695 } 696 } 697 698 /* 699 * Store md for cleanup instead of tio which is about to get freed. 700 */ 701 bio->bi_private = md->bs; 702 703 free_tio(md, tio); 704 bio_put(bio); 705 dec_pending(io, error); 706 } 707 708 /* 709 * Partial completion handling for request-based dm 710 */ 711 static void end_clone_bio(struct bio *clone, int error) 712 { 713 struct dm_rq_clone_bio_info *info = clone->bi_private; 714 struct dm_rq_target_io *tio = info->tio; 715 struct bio *bio = info->orig; 716 unsigned int nr_bytes = info->orig->bi_size; 717 718 bio_put(clone); 719 720 if (tio->error) 721 /* 722 * An error has already been detected on the request. 723 * Once error occurred, just let clone->end_io() handle 724 * the remainder. 725 */ 726 return; 727 else if (error) { 728 /* 729 * Don't notice the error to the upper layer yet. 730 * The error handling decision is made by the target driver, 731 * when the request is completed. 732 */ 733 tio->error = error; 734 return; 735 } 736 737 /* 738 * I/O for the bio successfully completed. 739 * Notice the data completion to the upper layer. 
740 */ 741 742 /* 743 * bios are processed from the head of the list. 744 * So the completing bio should always be rq->bio. 745 * If it's not, something wrong is happening. 746 */ 747 if (tio->orig->bio != bio) 748 DMERR("bio completion is going in the middle of the request"); 749 750 /* 751 * Update the original request. 752 * Do not use blk_end_request() here, because it may complete 753 * the original request before the clone, and break the ordering. 754 */ 755 blk_update_request(tio->orig, 0, nr_bytes); 756 } 757 758 static void store_barrier_error(struct mapped_device *md, int error) 759 { 760 unsigned long flags; 761 762 spin_lock_irqsave(&md->barrier_error_lock, flags); 763 /* 764 * Basically, the first error is taken, but: 765 * -EOPNOTSUPP supersedes any I/O error. 766 * Requeue request supersedes any I/O error but -EOPNOTSUPP. 767 */ 768 if (!md->barrier_error || error == -EOPNOTSUPP || 769 (md->barrier_error != -EOPNOTSUPP && 770 error == DM_ENDIO_REQUEUE)) 771 md->barrier_error = error; 772 spin_unlock_irqrestore(&md->barrier_error_lock, flags); 773 } 774 775 /* 776 * Don't touch any member of the md after calling this function because 777 * the md may be freed in dm_put() at the end of this function. 778 * Or do dm_get() before calling this function and dm_put() later. 779 */ 780 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 781 { 782 atomic_dec(&md->pending[rw]); 783 784 /* nudge anyone waiting on suspend queue */ 785 if (!md_in_flight(md)) 786 wake_up(&md->wait); 787 788 if (run_queue) 789 blk_run_queue(md->queue); 790 791 /* 792 * dm_put() must be at the end of this function. See the comment above 793 */ 794 dm_put(md); 795 } 796 797 static void free_rq_clone(struct request *clone) 798 { 799 struct dm_rq_target_io *tio = clone->end_io_data; 800 801 blk_rq_unprep_clone(clone); 802 free_rq_tio(tio); 803 } 804 805 /* 806 * Complete the clone and the original request. 807 * Must be called without queue lock. 808 */ 809 static void dm_end_request(struct request *clone, int error) 810 { 811 int rw = rq_data_dir(clone); 812 int run_queue = 1; 813 bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER; 814 struct dm_rq_target_io *tio = clone->end_io_data; 815 struct mapped_device *md = tio->md; 816 struct request *rq = tio->orig; 817 818 if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { 819 rq->errors = clone->errors; 820 rq->resid_len = clone->resid_len; 821 822 if (rq->sense) 823 /* 824 * We are using the sense buffer of the original 825 * request. 826 * So setting the length of the sense data is enough. 827 */ 828 rq->sense_len = clone->sense_len; 829 } 830 831 free_rq_clone(clone); 832 833 if (unlikely(is_barrier)) { 834 if (unlikely(error)) 835 store_barrier_error(md, error); 836 run_queue = 0; 837 } else 838 blk_end_request_all(rq, error); 839 840 rq_completed(md, rw, run_queue); 841 } 842 843 static void dm_unprep_request(struct request *rq) 844 { 845 struct request *clone = rq->special; 846 847 rq->special = NULL; 848 rq->cmd_flags &= ~REQ_DONTPREP; 849 850 free_rq_clone(clone); 851 } 852 853 /* 854 * Requeue the original request of a clone. 
855 */ 856 void dm_requeue_unmapped_request(struct request *clone) 857 { 858 int rw = rq_data_dir(clone); 859 struct dm_rq_target_io *tio = clone->end_io_data; 860 struct mapped_device *md = tio->md; 861 struct request *rq = tio->orig; 862 struct request_queue *q = rq->q; 863 unsigned long flags; 864 865 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 866 /* 867 * Barrier clones share an original request. 868 * Leave it to dm_end_request(), which handles this special 869 * case. 870 */ 871 dm_end_request(clone, DM_ENDIO_REQUEUE); 872 return; 873 } 874 875 dm_unprep_request(rq); 876 877 spin_lock_irqsave(q->queue_lock, flags); 878 if (elv_queue_empty(q)) 879 blk_plug_device(q); 880 blk_requeue_request(q, rq); 881 spin_unlock_irqrestore(q->queue_lock, flags); 882 883 rq_completed(md, rw, 0); 884 } 885 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 886 887 static void __stop_queue(struct request_queue *q) 888 { 889 blk_stop_queue(q); 890 } 891 892 static void stop_queue(struct request_queue *q) 893 { 894 unsigned long flags; 895 896 spin_lock_irqsave(q->queue_lock, flags); 897 __stop_queue(q); 898 spin_unlock_irqrestore(q->queue_lock, flags); 899 } 900 901 static void __start_queue(struct request_queue *q) 902 { 903 if (blk_queue_stopped(q)) 904 blk_start_queue(q); 905 } 906 907 static void start_queue(struct request_queue *q) 908 { 909 unsigned long flags; 910 911 spin_lock_irqsave(q->queue_lock, flags); 912 __start_queue(q); 913 spin_unlock_irqrestore(q->queue_lock, flags); 914 } 915 916 static void dm_done(struct request *clone, int error, bool mapped) 917 { 918 int r = error; 919 struct dm_rq_target_io *tio = clone->end_io_data; 920 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 921 922 if (mapped && rq_end_io) 923 r = rq_end_io(tio->ti, clone, error, &tio->info); 924 925 if (r <= 0) 926 /* The target wants to complete the I/O */ 927 dm_end_request(clone, r); 928 else if (r == DM_ENDIO_INCOMPLETE) 929 /* The target will handle the I/O */ 930 return; 931 else if (r == DM_ENDIO_REQUEUE) 932 /* The target wants to requeue the I/O */ 933 dm_requeue_unmapped_request(clone); 934 else { 935 DMWARN("unimplemented target endio return value: %d", r); 936 BUG(); 937 } 938 } 939 940 /* 941 * Request completion handler for request-based dm 942 */ 943 static void dm_softirq_done(struct request *rq) 944 { 945 bool mapped = true; 946 struct request *clone = rq->completion_data; 947 struct dm_rq_target_io *tio = clone->end_io_data; 948 949 if (rq->cmd_flags & REQ_FAILED) 950 mapped = false; 951 952 dm_done(clone, tio->error, mapped); 953 } 954 955 /* 956 * Complete the clone and the original request with the error status 957 * through softirq context. 958 */ 959 static void dm_complete_request(struct request *clone, int error) 960 { 961 struct dm_rq_target_io *tio = clone->end_io_data; 962 struct request *rq = tio->orig; 963 964 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 965 /* 966 * Barrier clones share an original request. So can't use 967 * softirq_done with the original. 968 * Pass the clone to dm_done() directly in this special case. 969 * It is safe (even if clone->q->queue_lock is held here) 970 * because there is no I/O dispatching during the completion 971 * of barrier clone. 972 */ 973 dm_done(clone, error, true); 974 return; 975 } 976 977 tio->error = error; 978 rq->completion_data = clone; 979 blk_complete_request(rq); 980 } 981 982 /* 983 * Complete the not-mapped clone and the original request with the error status 984 * through softirq context. 
985 * Target's rq_end_io() function isn't called. 986 * This may be used when the target's map_rq() function fails. 987 */ 988 void dm_kill_unmapped_request(struct request *clone, int error) 989 { 990 struct dm_rq_target_io *tio = clone->end_io_data; 991 struct request *rq = tio->orig; 992 993 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 994 /* 995 * Barrier clones share an original request. 996 * Leave it to dm_end_request(), which handles this special 997 * case. 998 */ 999 BUG_ON(error > 0); 1000 dm_end_request(clone, error); 1001 return; 1002 } 1003 1004 rq->cmd_flags |= REQ_FAILED; 1005 dm_complete_request(clone, error); 1006 } 1007 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 1008 1009 /* 1010 * Called with the queue lock held 1011 */ 1012 static void end_clone_request(struct request *clone, int error) 1013 { 1014 /* 1015 * For just cleaning up the information of the queue in which 1016 * the clone was dispatched. 1017 * The clone is *NOT* freed actually here because it is alloced from 1018 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 1019 */ 1020 __blk_put_request(clone->q, clone); 1021 1022 /* 1023 * Actual request completion is done in a softirq context which doesn't 1024 * hold the queue lock. Otherwise, deadlock could occur because: 1025 * - another request may be submitted by the upper level driver 1026 * of the stacking during the completion 1027 * - the submission which requires queue lock may be done 1028 * against this queue 1029 */ 1030 dm_complete_request(clone, error); 1031 } 1032 1033 /* 1034 * Return maximum size of I/O possible at the supplied sector up to the current 1035 * target boundary. 1036 */ 1037 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 1038 { 1039 sector_t target_offset = dm_target_offset(ti, sector); 1040 1041 return ti->len - target_offset; 1042 } 1043 1044 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 1045 { 1046 sector_t len = max_io_len_target_boundary(sector, ti); 1047 1048 /* 1049 * Does the target need to split even further ? 1050 */ 1051 if (ti->split_io) { 1052 sector_t boundary; 1053 sector_t offset = dm_target_offset(ti, sector); 1054 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 1055 - offset; 1056 if (len > boundary) 1057 len = boundary; 1058 } 1059 1060 return len; 1061 } 1062 1063 static void __map_bio(struct dm_target *ti, struct bio *clone, 1064 struct dm_target_io *tio) 1065 { 1066 int r; 1067 sector_t sector; 1068 struct mapped_device *md; 1069 1070 clone->bi_end_io = clone_endio; 1071 clone->bi_private = tio; 1072 1073 /* 1074 * Map the clone. If r == 0 we don't need to do 1075 * anything, the target has assumed ownership of 1076 * this io. 1077 */ 1078 atomic_inc(&tio->io->io_count); 1079 sector = clone->bi_sector; 1080 r = ti->type->map(ti, clone, &tio->info); 1081 if (r == DM_MAPIO_REMAPPED) { 1082 /* the bio has been remapped so dispatch it */ 1083 1084 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, 1085 tio->io->bio->bi_bdev->bd_dev, sector); 1086 1087 generic_make_request(clone); 1088 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1089 /* error the io and bail out, or requeue it if needed */ 1090 md = tio->io->md; 1091 dec_pending(tio->io, r); 1092 /* 1093 * Store bio_set for cleanup. 
1094 */ 1095 clone->bi_private = md->bs; 1096 bio_put(clone); 1097 free_tio(md, tio); 1098 } else if (r) { 1099 DMWARN("unimplemented target map return value: %d", r); 1100 BUG(); 1101 } 1102 } 1103 1104 struct clone_info { 1105 struct mapped_device *md; 1106 struct dm_table *map; 1107 struct bio *bio; 1108 struct dm_io *io; 1109 sector_t sector; 1110 sector_t sector_count; 1111 unsigned short idx; 1112 }; 1113 1114 static void dm_bio_destructor(struct bio *bio) 1115 { 1116 struct bio_set *bs = bio->bi_private; 1117 1118 bio_free(bio, bs); 1119 } 1120 1121 /* 1122 * Creates a little bio that is just does part of a bvec. 1123 */ 1124 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1125 unsigned short idx, unsigned int offset, 1126 unsigned int len, struct bio_set *bs) 1127 { 1128 struct bio *clone; 1129 struct bio_vec *bv = bio->bi_io_vec + idx; 1130 1131 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1132 clone->bi_destructor = dm_bio_destructor; 1133 *clone->bi_io_vec = *bv; 1134 1135 clone->bi_sector = sector; 1136 clone->bi_bdev = bio->bi_bdev; 1137 clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; 1138 clone->bi_vcnt = 1; 1139 clone->bi_size = to_bytes(len); 1140 clone->bi_io_vec->bv_offset = offset; 1141 clone->bi_io_vec->bv_len = clone->bi_size; 1142 clone->bi_flags |= 1 << BIO_CLONED; 1143 1144 if (bio_integrity(bio)) { 1145 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1146 bio_integrity_trim(clone, 1147 bio_sector_offset(bio, idx, offset), len); 1148 } 1149 1150 return clone; 1151 } 1152 1153 /* 1154 * Creates a bio that consists of range of complete bvecs. 1155 */ 1156 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1157 unsigned short idx, unsigned short bv_count, 1158 unsigned int len, struct bio_set *bs) 1159 { 1160 struct bio *clone; 1161 1162 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1163 __bio_clone(clone, bio); 1164 clone->bi_rw &= ~REQ_HARDBARRIER; 1165 clone->bi_destructor = dm_bio_destructor; 1166 clone->bi_sector = sector; 1167 clone->bi_idx = idx; 1168 clone->bi_vcnt = idx + bv_count; 1169 clone->bi_size = to_bytes(len); 1170 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1171 1172 if (bio_integrity(bio)) { 1173 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1174 1175 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1176 bio_integrity_trim(clone, 1177 bio_sector_offset(bio, idx, 0), len); 1178 } 1179 1180 return clone; 1181 } 1182 1183 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1184 struct dm_target *ti) 1185 { 1186 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1187 1188 tio->io = ci->io; 1189 tio->ti = ti; 1190 memset(&tio->info, 0, sizeof(tio->info)); 1191 1192 return tio; 1193 } 1194 1195 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1196 unsigned request_nr, sector_t len) 1197 { 1198 struct dm_target_io *tio = alloc_tio(ci, ti); 1199 struct bio *clone; 1200 1201 tio->info.target_request_nr = request_nr; 1202 1203 /* 1204 * Discard requests require the bio's inline iovecs be initialized. 1205 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1206 * and discard, so no need for concern about wasted bvec allocations. 
1207 */ 1208 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); 1209 __bio_clone(clone, ci->bio); 1210 clone->bi_destructor = dm_bio_destructor; 1211 if (len) { 1212 clone->bi_sector = ci->sector; 1213 clone->bi_size = to_bytes(len); 1214 } 1215 1216 __map_bio(ti, clone, tio); 1217 } 1218 1219 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1220 unsigned num_requests, sector_t len) 1221 { 1222 unsigned request_nr; 1223 1224 for (request_nr = 0; request_nr < num_requests; request_nr++) 1225 __issue_target_request(ci, ti, request_nr, len); 1226 } 1227 1228 static int __clone_and_map_empty_barrier(struct clone_info *ci) 1229 { 1230 unsigned target_nr = 0; 1231 struct dm_target *ti; 1232 1233 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1234 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1235 1236 ci->sector_count = 0; 1237 1238 return 0; 1239 } 1240 1241 /* 1242 * Perform all io with a single clone. 1243 */ 1244 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1245 { 1246 struct bio *clone, *bio = ci->bio; 1247 struct dm_target_io *tio; 1248 1249 tio = alloc_tio(ci, ti); 1250 clone = clone_bio(bio, ci->sector, ci->idx, 1251 bio->bi_vcnt - ci->idx, ci->sector_count, 1252 ci->md->bs); 1253 __map_bio(ti, clone, tio); 1254 ci->sector_count = 0; 1255 } 1256 1257 static int __clone_and_map_discard(struct clone_info *ci) 1258 { 1259 struct dm_target *ti; 1260 sector_t len; 1261 1262 do { 1263 ti = dm_table_find_target(ci->map, ci->sector); 1264 if (!dm_target_is_valid(ti)) 1265 return -EIO; 1266 1267 /* 1268 * Even though the device advertised discard support, 1269 * reconfiguration might have changed that since the 1270 * check was performed. 1271 */ 1272 if (!ti->num_discard_requests) 1273 return -EOPNOTSUPP; 1274 1275 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1276 1277 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1278 1279 ci->sector += len; 1280 } while (ci->sector_count -= len); 1281 1282 return 0; 1283 } 1284 1285 static int __clone_and_map(struct clone_info *ci) 1286 { 1287 struct bio *clone, *bio = ci->bio; 1288 struct dm_target *ti; 1289 sector_t len = 0, max; 1290 struct dm_target_io *tio; 1291 1292 if (unlikely(bio_empty_barrier(bio))) 1293 return __clone_and_map_empty_barrier(ci); 1294 1295 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1296 return __clone_and_map_discard(ci); 1297 1298 ti = dm_table_find_target(ci->map, ci->sector); 1299 if (!dm_target_is_valid(ti)) 1300 return -EIO; 1301 1302 max = max_io_len(ci->sector, ti); 1303 1304 if (ci->sector_count <= max) { 1305 /* 1306 * Optimise for the simple case where we can do all of 1307 * the remaining io with a single clone. 1308 */ 1309 __clone_and_map_simple(ci, ti); 1310 1311 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1312 /* 1313 * There are some bvecs that don't span targets. 1314 * Do as many of these as possible. 
1315 */ 1316 int i; 1317 sector_t remaining = max; 1318 sector_t bv_len; 1319 1320 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1321 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1322 1323 if (bv_len > remaining) 1324 break; 1325 1326 remaining -= bv_len; 1327 len += bv_len; 1328 } 1329 1330 tio = alloc_tio(ci, ti); 1331 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1332 ci->md->bs); 1333 __map_bio(ti, clone, tio); 1334 1335 ci->sector += len; 1336 ci->sector_count -= len; 1337 ci->idx = i; 1338 1339 } else { 1340 /* 1341 * Handle a bvec that must be split between two or more targets. 1342 */ 1343 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1344 sector_t remaining = to_sector(bv->bv_len); 1345 unsigned int offset = 0; 1346 1347 do { 1348 if (offset) { 1349 ti = dm_table_find_target(ci->map, ci->sector); 1350 if (!dm_target_is_valid(ti)) 1351 return -EIO; 1352 1353 max = max_io_len(ci->sector, ti); 1354 } 1355 1356 len = min(remaining, max); 1357 1358 tio = alloc_tio(ci, ti); 1359 clone = split_bvec(bio, ci->sector, ci->idx, 1360 bv->bv_offset + offset, len, 1361 ci->md->bs); 1362 1363 __map_bio(ti, clone, tio); 1364 1365 ci->sector += len; 1366 ci->sector_count -= len; 1367 offset += to_bytes(len); 1368 } while (remaining -= len); 1369 1370 ci->idx++; 1371 } 1372 1373 return 0; 1374 } 1375 1376 /* 1377 * Split the bio into several clones and submit it to targets. 1378 */ 1379 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1380 { 1381 struct clone_info ci; 1382 int error = 0; 1383 1384 ci.map = dm_get_live_table(md); 1385 if (unlikely(!ci.map)) { 1386 if (!(bio->bi_rw & REQ_HARDBARRIER)) 1387 bio_io_error(bio); 1388 else 1389 if (!md->barrier_error) 1390 md->barrier_error = -EIO; 1391 return; 1392 } 1393 1394 ci.md = md; 1395 ci.bio = bio; 1396 ci.io = alloc_io(md); 1397 ci.io->error = 0; 1398 atomic_set(&ci.io->io_count, 1); 1399 ci.io->bio = bio; 1400 ci.io->md = md; 1401 spin_lock_init(&ci.io->endio_lock); 1402 ci.sector = bio->bi_sector; 1403 ci.sector_count = bio_sectors(bio); 1404 if (unlikely(bio_empty_barrier(bio))) 1405 ci.sector_count = 1; 1406 ci.idx = bio->bi_idx; 1407 1408 start_io_acct(ci.io); 1409 while (ci.sector_count && !error) 1410 error = __clone_and_map(&ci); 1411 1412 /* drop the extra reference count */ 1413 dec_pending(ci.io, error); 1414 dm_table_put(ci.map); 1415 } 1416 /*----------------------------------------------------------------- 1417 * CRUD END 1418 *---------------------------------------------------------------*/ 1419 1420 static int dm_merge_bvec(struct request_queue *q, 1421 struct bvec_merge_data *bvm, 1422 struct bio_vec *biovec) 1423 { 1424 struct mapped_device *md = q->queuedata; 1425 struct dm_table *map = dm_get_live_table(md); 1426 struct dm_target *ti; 1427 sector_t max_sectors; 1428 int max_size = 0; 1429 1430 if (unlikely(!map)) 1431 goto out; 1432 1433 ti = dm_table_find_target(map, bvm->bi_sector); 1434 if (!dm_target_is_valid(ti)) 1435 goto out_table; 1436 1437 /* 1438 * Find maximum amount of I/O that won't need splitting 1439 */ 1440 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1441 (sector_t) BIO_MAX_SECTORS); 1442 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1443 if (max_size < 0) 1444 max_size = 0; 1445 1446 /* 1447 * merge_bvec_fn() returns number of bytes 1448 * it can accept at this offset 1449 * max is precomputed maximal io size 1450 */ 1451 if (max_size && ti->type->merge) 1452 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1453 /* 1454 * If 
the target doesn't support merge method and some of the devices 1455 * provided their merge_bvec method (we know this by looking at 1456 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1457 * entries. So always set max_size to 0, and the code below allows 1458 * just one page. 1459 */ 1460 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1461 1462 max_size = 0; 1463 1464 out_table: 1465 dm_table_put(map); 1466 1467 out: 1468 /* 1469 * Always allow an entire first page 1470 */ 1471 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1472 max_size = biovec->bv_len; 1473 1474 return max_size; 1475 } 1476 1477 /* 1478 * The request function that just remaps the bio built up by 1479 * dm_merge_bvec. 1480 */ 1481 static int _dm_request(struct request_queue *q, struct bio *bio) 1482 { 1483 int rw = bio_data_dir(bio); 1484 struct mapped_device *md = q->queuedata; 1485 int cpu; 1486 1487 down_read(&md->io_lock); 1488 1489 cpu = part_stat_lock(); 1490 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1491 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1492 part_stat_unlock(); 1493 1494 /* 1495 * If we're suspended or the thread is processing barriers 1496 * we have to queue this io for later. 1497 */ 1498 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || 1499 unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 1500 up_read(&md->io_lock); 1501 1502 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1503 bio_rw(bio) == READA) { 1504 bio_io_error(bio); 1505 return 0; 1506 } 1507 1508 queue_io(md, bio); 1509 1510 return 0; 1511 } 1512 1513 __split_and_process_bio(md, bio); 1514 up_read(&md->io_lock); 1515 return 0; 1516 } 1517 1518 static int dm_make_request(struct request_queue *q, struct bio *bio) 1519 { 1520 struct mapped_device *md = q->queuedata; 1521 1522 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1523 } 1524 1525 static int dm_request_based(struct mapped_device *md) 1526 { 1527 return blk_queue_stackable(md->queue); 1528 } 1529 1530 static int dm_request(struct request_queue *q, struct bio *bio) 1531 { 1532 struct mapped_device *md = q->queuedata; 1533 1534 if (dm_request_based(md)) 1535 return dm_make_request(q, bio); 1536 1537 return _dm_request(q, bio); 1538 } 1539 1540 static bool dm_rq_is_flush_request(struct request *rq) 1541 { 1542 if (rq->cmd_flags & REQ_FLUSH) 1543 return true; 1544 else 1545 return false; 1546 } 1547 1548 void dm_dispatch_request(struct request *rq) 1549 { 1550 int r; 1551 1552 if (blk_queue_io_stat(rq->q)) 1553 rq->cmd_flags |= REQ_IO_STAT; 1554 1555 rq->start_time = jiffies; 1556 r = blk_insert_cloned_request(rq->q, rq); 1557 if (r) 1558 dm_complete_request(rq, r); 1559 } 1560 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1561 1562 static void dm_rq_bio_destructor(struct bio *bio) 1563 { 1564 struct dm_rq_clone_bio_info *info = bio->bi_private; 1565 struct mapped_device *md = info->tio->md; 1566 1567 free_bio_info(info); 1568 bio_free(bio, md->bs); 1569 } 1570 1571 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1572 void *data) 1573 { 1574 struct dm_rq_target_io *tio = data; 1575 struct mapped_device *md = tio->md; 1576 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1577 1578 if (!info) 1579 return -ENOMEM; 1580 1581 info->orig = bio_orig; 1582 info->tio = tio; 1583 bio->bi_end_io = end_clone_bio; 1584 bio->bi_private = info; 1585 bio->bi_destructor = dm_rq_bio_destructor; 1586 1587 return 0; 1588 } 1589 1590 static int 
setup_clone(struct request *clone, struct request *rq, 1591 struct dm_rq_target_io *tio) 1592 { 1593 int r; 1594 1595 if (dm_rq_is_flush_request(rq)) { 1596 blk_rq_init(NULL, clone); 1597 clone->cmd_type = REQ_TYPE_FS; 1598 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); 1599 } else { 1600 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1601 dm_rq_bio_constructor, tio); 1602 if (r) 1603 return r; 1604 1605 clone->cmd = rq->cmd; 1606 clone->cmd_len = rq->cmd_len; 1607 clone->sense = rq->sense; 1608 clone->buffer = rq->buffer; 1609 } 1610 1611 clone->end_io = end_clone_request; 1612 clone->end_io_data = tio; 1613 1614 return 0; 1615 } 1616 1617 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1618 gfp_t gfp_mask) 1619 { 1620 struct request *clone; 1621 struct dm_rq_target_io *tio; 1622 1623 tio = alloc_rq_tio(md, gfp_mask); 1624 if (!tio) 1625 return NULL; 1626 1627 tio->md = md; 1628 tio->ti = NULL; 1629 tio->orig = rq; 1630 tio->error = 0; 1631 memset(&tio->info, 0, sizeof(tio->info)); 1632 1633 clone = &tio->clone; 1634 if (setup_clone(clone, rq, tio)) { 1635 /* -ENOMEM */ 1636 free_rq_tio(tio); 1637 return NULL; 1638 } 1639 1640 return clone; 1641 } 1642 1643 /* 1644 * Called with the queue lock held. 1645 */ 1646 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1647 { 1648 struct mapped_device *md = q->queuedata; 1649 struct request *clone; 1650 1651 if (unlikely(dm_rq_is_flush_request(rq))) 1652 return BLKPREP_OK; 1653 1654 if (unlikely(rq->special)) { 1655 DMWARN("Already has something in rq->special."); 1656 return BLKPREP_KILL; 1657 } 1658 1659 clone = clone_rq(rq, md, GFP_ATOMIC); 1660 if (!clone) 1661 return BLKPREP_DEFER; 1662 1663 rq->special = clone; 1664 rq->cmd_flags |= REQ_DONTPREP; 1665 1666 return BLKPREP_OK; 1667 } 1668 1669 /* 1670 * Returns: 1671 * 0 : the request has been processed (not requeued) 1672 * !0 : the request has been requeued 1673 */ 1674 static int map_request(struct dm_target *ti, struct request *clone, 1675 struct mapped_device *md) 1676 { 1677 int r, requeued = 0; 1678 struct dm_rq_target_io *tio = clone->end_io_data; 1679 1680 /* 1681 * Hold the md reference here for the in-flight I/O. 1682 * We can't rely on the reference count by device opener, 1683 * because the device may be closed during the request completion 1684 * when all bios are completed. 1685 * See the comment in rq_completed() too. 1686 */ 1687 dm_get(md); 1688 1689 tio->ti = ti; 1690 r = ti->type->map_rq(ti, clone, &tio->info); 1691 switch (r) { 1692 case DM_MAPIO_SUBMITTED: 1693 /* The target has taken the I/O to submit by itself later */ 1694 break; 1695 case DM_MAPIO_REMAPPED: 1696 /* The target has remapped the I/O so dispatch it */ 1697 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1698 blk_rq_pos(tio->orig)); 1699 dm_dispatch_request(clone); 1700 break; 1701 case DM_MAPIO_REQUEUE: 1702 /* The target wants to requeue the I/O */ 1703 dm_requeue_unmapped_request(clone); 1704 requeued = 1; 1705 break; 1706 default: 1707 if (r > 0) { 1708 DMWARN("unimplemented target map return value: %d", r); 1709 BUG(); 1710 } 1711 1712 /* The target wants to complete the I/O */ 1713 dm_kill_unmapped_request(clone, r); 1714 break; 1715 } 1716 1717 return requeued; 1718 } 1719 1720 /* 1721 * q->request_fn for request-based dm. 1722 * Called with the queue lock held. 
1723 */ 1724 static void dm_request_fn(struct request_queue *q) 1725 { 1726 struct mapped_device *md = q->queuedata; 1727 struct dm_table *map = dm_get_live_table(md); 1728 struct dm_target *ti; 1729 struct request *rq, *clone; 1730 1731 /* 1732 * For suspend, check blk_queue_stopped() and increment 1733 * ->pending within a single queue_lock not to increment the 1734 * number of in-flight I/Os after the queue is stopped in 1735 * dm_suspend(). 1736 */ 1737 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1738 rq = blk_peek_request(q); 1739 if (!rq) 1740 goto plug_and_out; 1741 1742 if (unlikely(dm_rq_is_flush_request(rq))) { 1743 BUG_ON(md->flush_request); 1744 md->flush_request = rq; 1745 blk_start_request(rq); 1746 queue_work(md->wq, &md->barrier_work); 1747 goto out; 1748 } 1749 1750 ti = dm_table_find_target(map, blk_rq_pos(rq)); 1751 if (ti->type->busy && ti->type->busy(ti)) 1752 goto plug_and_out; 1753 1754 blk_start_request(rq); 1755 clone = rq->special; 1756 atomic_inc(&md->pending[rq_data_dir(clone)]); 1757 1758 spin_unlock(q->queue_lock); 1759 if (map_request(ti, clone, md)) 1760 goto requeued; 1761 1762 spin_lock_irq(q->queue_lock); 1763 } 1764 1765 goto out; 1766 1767 requeued: 1768 spin_lock_irq(q->queue_lock); 1769 1770 plug_and_out: 1771 if (!elv_queue_empty(q)) 1772 /* Some requests still remain, retry later */ 1773 blk_plug_device(q); 1774 1775 out: 1776 dm_table_put(map); 1777 1778 return; 1779 } 1780 1781 int dm_underlying_device_busy(struct request_queue *q) 1782 { 1783 return blk_lld_busy(q); 1784 } 1785 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1786 1787 static int dm_lld_busy(struct request_queue *q) 1788 { 1789 int r; 1790 struct mapped_device *md = q->queuedata; 1791 struct dm_table *map = dm_get_live_table(md); 1792 1793 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1794 r = 1; 1795 else 1796 r = dm_table_any_busy_target(map); 1797 1798 dm_table_put(map); 1799 1800 return r; 1801 } 1802 1803 static void dm_unplug_all(struct request_queue *q) 1804 { 1805 struct mapped_device *md = q->queuedata; 1806 struct dm_table *map = dm_get_live_table(md); 1807 1808 if (map) { 1809 if (dm_request_based(md)) 1810 generic_unplug_device(q); 1811 1812 dm_table_unplug_all(map); 1813 dm_table_put(map); 1814 } 1815 } 1816 1817 static int dm_any_congested(void *congested_data, int bdi_bits) 1818 { 1819 int r = bdi_bits; 1820 struct mapped_device *md = congested_data; 1821 struct dm_table *map; 1822 1823 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1824 map = dm_get_live_table(md); 1825 if (map) { 1826 /* 1827 * Request-based dm cares about only own queue for 1828 * the query about congestion status of request_queue 1829 */ 1830 if (dm_request_based(md)) 1831 r = md->queue->backing_dev_info.state & 1832 bdi_bits; 1833 else 1834 r = dm_table_any_congested(map, bdi_bits); 1835 1836 dm_table_put(map); 1837 } 1838 } 1839 1840 return r; 1841 } 1842 1843 /*----------------------------------------------------------------- 1844 * An IDR is used to keep track of allocated minor numbers. 1845 *---------------------------------------------------------------*/ 1846 static DEFINE_IDR(_minor_idr); 1847 1848 static void free_minor(int minor) 1849 { 1850 spin_lock(&_minor_lock); 1851 idr_remove(&_minor_idr, minor); 1852 spin_unlock(&_minor_lock); 1853 } 1854 1855 /* 1856 * See if the device with a specific minor # is free. 
1857 */ 1858 static int specific_minor(int minor) 1859 { 1860 int r, m; 1861 1862 if (minor >= (1 << MINORBITS)) 1863 return -EINVAL; 1864 1865 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1866 if (!r) 1867 return -ENOMEM; 1868 1869 spin_lock(&_minor_lock); 1870 1871 if (idr_find(&_minor_idr, minor)) { 1872 r = -EBUSY; 1873 goto out; 1874 } 1875 1876 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1877 if (r) 1878 goto out; 1879 1880 if (m != minor) { 1881 idr_remove(&_minor_idr, m); 1882 r = -EBUSY; 1883 goto out; 1884 } 1885 1886 out: 1887 spin_unlock(&_minor_lock); 1888 return r; 1889 } 1890 1891 static int next_free_minor(int *minor) 1892 { 1893 int r, m; 1894 1895 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1896 if (!r) 1897 return -ENOMEM; 1898 1899 spin_lock(&_minor_lock); 1900 1901 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1902 if (r) 1903 goto out; 1904 1905 if (m >= (1 << MINORBITS)) { 1906 idr_remove(&_minor_idr, m); 1907 r = -ENOSPC; 1908 goto out; 1909 } 1910 1911 *minor = m; 1912 1913 out: 1914 spin_unlock(&_minor_lock); 1915 return r; 1916 } 1917 1918 static const struct block_device_operations dm_blk_dops; 1919 1920 static void dm_wq_work(struct work_struct *work); 1921 static void dm_rq_barrier_work(struct work_struct *work); 1922 1923 static void dm_init_md_queue(struct mapped_device *md) 1924 { 1925 /* 1926 * Request-based dm devices cannot be stacked on top of bio-based dm 1927 * devices. The type of this dm device has not been decided yet. 1928 * The type is decided at the first table loading time. 1929 * To prevent problematic device stacking, clear the queue flag 1930 * for request stacking support until then. 1931 * 1932 * This queue is new, so no concurrency on the queue_flags. 1933 */ 1934 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1935 1936 md->queue->queuedata = md; 1937 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1938 md->queue->backing_dev_info.congested_data = md; 1939 blk_queue_make_request(md->queue, dm_request); 1940 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1941 md->queue->unplug_fn = dm_unplug_all; 1942 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1943 } 1944 1945 /* 1946 * Allocate and initialise a blank device with a given minor. 
1947 */ 1948 static struct mapped_device *alloc_dev(int minor) 1949 { 1950 int r; 1951 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1952 void *old_md; 1953 1954 if (!md) { 1955 DMWARN("unable to allocate device, out of memory."); 1956 return NULL; 1957 } 1958 1959 if (!try_module_get(THIS_MODULE)) 1960 goto bad_module_get; 1961 1962 /* get a minor number for the dev */ 1963 if (minor == DM_ANY_MINOR) 1964 r = next_free_minor(&minor); 1965 else 1966 r = specific_minor(minor); 1967 if (r < 0) 1968 goto bad_minor; 1969 1970 md->type = DM_TYPE_NONE; 1971 init_rwsem(&md->io_lock); 1972 mutex_init(&md->suspend_lock); 1973 mutex_init(&md->type_lock); 1974 spin_lock_init(&md->deferred_lock); 1975 spin_lock_init(&md->barrier_error_lock); 1976 rwlock_init(&md->map_lock); 1977 atomic_set(&md->holders, 1); 1978 atomic_set(&md->open_count, 0); 1979 atomic_set(&md->event_nr, 0); 1980 atomic_set(&md->uevent_seq, 0); 1981 INIT_LIST_HEAD(&md->uevent_list); 1982 spin_lock_init(&md->uevent_lock); 1983 1984 md->queue = blk_alloc_queue(GFP_KERNEL); 1985 if (!md->queue) 1986 goto bad_queue; 1987 1988 dm_init_md_queue(md); 1989 1990 md->disk = alloc_disk(1); 1991 if (!md->disk) 1992 goto bad_disk; 1993 1994 atomic_set(&md->pending[0], 0); 1995 atomic_set(&md->pending[1], 0); 1996 init_waitqueue_head(&md->wait); 1997 INIT_WORK(&md->work, dm_wq_work); 1998 INIT_WORK(&md->barrier_work, dm_rq_barrier_work); 1999 init_waitqueue_head(&md->eventq); 2000 2001 md->disk->major = _major; 2002 md->disk->first_minor = minor; 2003 md->disk->fops = &dm_blk_dops; 2004 md->disk->queue = md->queue; 2005 md->disk->private_data = md; 2006 sprintf(md->disk->disk_name, "dm-%d", minor); 2007 add_disk(md->disk); 2008 format_dev_t(md->name, MKDEV(_major, minor)); 2009 2010 md->wq = create_singlethread_workqueue("kdmflush"); 2011 if (!md->wq) 2012 goto bad_thread; 2013 2014 md->bdev = bdget_disk(md->disk, 0); 2015 if (!md->bdev) 2016 goto bad_bdev; 2017 2018 /* Populate the mapping, nobody knows we exist yet */ 2019 spin_lock(&_minor_lock); 2020 old_md = idr_replace(&_minor_idr, md, minor); 2021 spin_unlock(&_minor_lock); 2022 2023 BUG_ON(old_md != MINOR_ALLOCED); 2024 2025 return md; 2026 2027 bad_bdev: 2028 destroy_workqueue(md->wq); 2029 bad_thread: 2030 del_gendisk(md->disk); 2031 put_disk(md->disk); 2032 bad_disk: 2033 blk_cleanup_queue(md->queue); 2034 bad_queue: 2035 free_minor(minor); 2036 bad_minor: 2037 module_put(THIS_MODULE); 2038 bad_module_get: 2039 kfree(md); 2040 return NULL; 2041 } 2042 2043 static void unlock_fs(struct mapped_device *md); 2044 2045 static void free_dev(struct mapped_device *md) 2046 { 2047 int minor = MINOR(disk_devt(md->disk)); 2048 2049 unlock_fs(md); 2050 bdput(md->bdev); 2051 destroy_workqueue(md->wq); 2052 if (md->tio_pool) 2053 mempool_destroy(md->tio_pool); 2054 if (md->io_pool) 2055 mempool_destroy(md->io_pool); 2056 if (md->bs) 2057 bioset_free(md->bs); 2058 blk_integrity_unregister(md->disk); 2059 del_gendisk(md->disk); 2060 free_minor(minor); 2061 2062 spin_lock(&_minor_lock); 2063 md->disk->private_data = NULL; 2064 spin_unlock(&_minor_lock); 2065 2066 put_disk(md->disk); 2067 blk_cleanup_queue(md->queue); 2068 module_put(THIS_MODULE); 2069 kfree(md); 2070 } 2071 2072 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2073 { 2074 struct dm_md_mempools *p; 2075 2076 if (md->io_pool && md->tio_pool && md->bs) 2077 /* the md already has necessary mempools */ 2078 goto out; 2079 2080 p = dm_table_get_md_mempools(t); 2081 BUG_ON(!p || md->io_pool || 
md->tio_pool || md->bs); 2082 2083 md->io_pool = p->io_pool; 2084 p->io_pool = NULL; 2085 md->tio_pool = p->tio_pool; 2086 p->tio_pool = NULL; 2087 md->bs = p->bs; 2088 p->bs = NULL; 2089 2090 out: 2091 /* mempool bind completed, now no need any mempools in the table */ 2092 dm_table_free_md_mempools(t); 2093 } 2094 2095 /* 2096 * Bind a table to the device. 2097 */ 2098 static void event_callback(void *context) 2099 { 2100 unsigned long flags; 2101 LIST_HEAD(uevents); 2102 struct mapped_device *md = (struct mapped_device *) context; 2103 2104 spin_lock_irqsave(&md->uevent_lock, flags); 2105 list_splice_init(&md->uevent_list, &uevents); 2106 spin_unlock_irqrestore(&md->uevent_lock, flags); 2107 2108 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2109 2110 atomic_inc(&md->event_nr); 2111 wake_up(&md->eventq); 2112 } 2113 2114 static void __set_size(struct mapped_device *md, sector_t size) 2115 { 2116 set_capacity(md->disk, size); 2117 2118 mutex_lock(&md->bdev->bd_inode->i_mutex); 2119 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2120 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2121 } 2122 2123 /* 2124 * Returns old map, which caller must destroy. 2125 */ 2126 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2127 struct queue_limits *limits) 2128 { 2129 struct dm_table *old_map; 2130 struct request_queue *q = md->queue; 2131 sector_t size; 2132 unsigned long flags; 2133 2134 size = dm_table_get_size(t); 2135 2136 /* 2137 * Wipe any geometry if the size of the table changed. 2138 */ 2139 if (size != get_capacity(md->disk)) 2140 memset(&md->geometry, 0, sizeof(md->geometry)); 2141 2142 __set_size(md, size); 2143 2144 dm_table_event_callback(t, event_callback, md); 2145 2146 /* 2147 * The queue hasn't been stopped yet, if the old table type wasn't 2148 * for request-based during suspension. So stop it to prevent 2149 * I/O mapping before resume. 2150 * This must be done before setting the queue restrictions, 2151 * because request-based dm may be run just after the setting. 2152 */ 2153 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2154 stop_queue(q); 2155 2156 __bind_mempools(md, t); 2157 2158 write_lock_irqsave(&md->map_lock, flags); 2159 old_map = md->map; 2160 md->map = t; 2161 dm_table_set_restrictions(t, q, limits); 2162 write_unlock_irqrestore(&md->map_lock, flags); 2163 2164 return old_map; 2165 } 2166 2167 /* 2168 * Returns unbound table for the caller to free. 2169 */ 2170 static struct dm_table *__unbind(struct mapped_device *md) 2171 { 2172 struct dm_table *map = md->map; 2173 unsigned long flags; 2174 2175 if (!map) 2176 return NULL; 2177 2178 dm_table_event_callback(map, NULL, NULL); 2179 write_lock_irqsave(&md->map_lock, flags); 2180 md->map = NULL; 2181 write_unlock_irqrestore(&md->map_lock, flags); 2182 2183 return map; 2184 } 2185 2186 /* 2187 * Constructor for a new device. 2188 */ 2189 int dm_create(int minor, struct mapped_device **result) 2190 { 2191 struct mapped_device *md; 2192 2193 md = alloc_dev(minor); 2194 if (!md) 2195 return -ENXIO; 2196 2197 dm_sysfs_init(md); 2198 2199 *result = md; 2200 return 0; 2201 } 2202 2203 /* 2204 * Functions to manage md->type. 2205 * All are required to hold md->type_lock. 
2206 */ 2207 void dm_lock_md_type(struct mapped_device *md) 2208 { 2209 mutex_lock(&md->type_lock); 2210 } 2211 2212 void dm_unlock_md_type(struct mapped_device *md) 2213 { 2214 mutex_unlock(&md->type_lock); 2215 } 2216 2217 void dm_set_md_type(struct mapped_device *md, unsigned type) 2218 { 2219 md->type = type; 2220 } 2221 2222 unsigned dm_get_md_type(struct mapped_device *md) 2223 { 2224 return md->type; 2225 } 2226 2227 /* 2228 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2229 */ 2230 static int dm_init_request_based_queue(struct mapped_device *md) 2231 { 2232 struct request_queue *q = NULL; 2233 2234 if (md->queue->elevator) 2235 return 1; 2236 2237 /* Fully initialize the queue */ 2238 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2239 if (!q) 2240 return 0; 2241 2242 md->queue = q; 2243 md->saved_make_request_fn = md->queue->make_request_fn; 2244 dm_init_md_queue(md); 2245 blk_queue_softirq_done(md->queue, dm_softirq_done); 2246 blk_queue_prep_rq(md->queue, dm_prep_fn); 2247 blk_queue_lld_busy(md->queue, dm_lld_busy); 2248 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); 2249 2250 elv_register_queue(md->queue); 2251 2252 return 1; 2253 } 2254 2255 /* 2256 * Setup the DM device's queue based on md's type 2257 */ 2258 int dm_setup_md_queue(struct mapped_device *md) 2259 { 2260 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2261 !dm_init_request_based_queue(md)) { 2262 DMWARN("Cannot initialize queue for request-based mapped device"); 2263 return -EINVAL; 2264 } 2265 2266 return 0; 2267 } 2268 2269 static struct mapped_device *dm_find_md(dev_t dev) 2270 { 2271 struct mapped_device *md; 2272 unsigned minor = MINOR(dev); 2273 2274 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2275 return NULL; 2276 2277 spin_lock(&_minor_lock); 2278 2279 md = idr_find(&_minor_idr, minor); 2280 if (md && (md == MINOR_ALLOCED || 2281 (MINOR(disk_devt(dm_disk(md))) != minor) || 2282 dm_deleting_md(md) || 2283 test_bit(DMF_FREEING, &md->flags))) { 2284 md = NULL; 2285 goto out; 2286 } 2287 2288 out: 2289 spin_unlock(&_minor_lock); 2290 2291 return md; 2292 } 2293 2294 struct mapped_device *dm_get_md(dev_t dev) 2295 { 2296 struct mapped_device *md = dm_find_md(dev); 2297 2298 if (md) 2299 dm_get(md); 2300 2301 return md; 2302 } 2303 2304 void *dm_get_mdptr(struct mapped_device *md) 2305 { 2306 return md->interface_ptr; 2307 } 2308 2309 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2310 { 2311 md->interface_ptr = ptr; 2312 } 2313 2314 void dm_get(struct mapped_device *md) 2315 { 2316 atomic_inc(&md->holders); 2317 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2318 } 2319 2320 const char *dm_device_name(struct mapped_device *md) 2321 { 2322 return md->name; 2323 } 2324 EXPORT_SYMBOL_GPL(dm_device_name); 2325 2326 static void __dm_destroy(struct mapped_device *md, bool wait) 2327 { 2328 struct dm_table *map; 2329 2330 might_sleep(); 2331 2332 spin_lock(&_minor_lock); 2333 map = dm_get_live_table(md); 2334 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2335 set_bit(DMF_FREEING, &md->flags); 2336 spin_unlock(&_minor_lock); 2337 2338 if (!dm_suspended_md(md)) { 2339 dm_table_presuspend_targets(map); 2340 dm_table_postsuspend_targets(map); 2341 } 2342 2343 /* 2344 * Rare, but there may be I/O requests still going to complete, 2345 * for example. Wait for all references to disappear. 2346 * No one should increment the reference count of the mapped_device, 2347 * after the mapped_device state becomes DMF_FREEING. 
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_put(map);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	dm_unplug_all(md->queue);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		smp_mb();
		if (!md_in_flight(md))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

static void dm_flush(struct mapped_device *md)
{
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);

	bio_init(&md->barrier_bio);
	md->barrier_bio.bi_bdev = md->bdev;
	md->barrier_bio.bi_rw = WRITE_BARRIER;
	__split_and_process_bio(md, &md->barrier_bio);

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}

static void process_barrier(struct mapped_device *md, struct bio *bio)
{
	md->barrier_error = 0;

	dm_flush(md);

	if (!bio_empty_barrier(bio)) {
		__split_and_process_bio(md, bio);
		/*
		 * If the request isn't supported, don't waste time with
		 * the second flush.
		 */
		if (md->barrier_error != -EOPNOTSUPP)
			dm_flush(md);
	}

	if (md->barrier_error != DM_ENDIO_REQUEUE)
		bio_endio(bio, md->barrier_error);
	else {
		spin_lock_irq(&md->deferred_lock);
		bio_list_add_head(&md->deferred, bio);
		spin_unlock_irq(&md->deferred_lock);
	}
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		up_write(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else {
			if (c->bi_rw & REQ_HARDBARRIER)
				process_barrier(md, c);
			else
				__split_and_process_bio(md, c);
		}

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	tio->info.target_request_nr = request_nr;
}

/* Issue barrier requests to targets and wait for their completion. */
static int dm_rq_barrier(struct mapped_device *md)
{
	int i, j;
	struct dm_table *map = dm_get_live_table(md);
	unsigned num_targets = dm_table_get_num_targets(map);
	struct dm_target *ti;
	struct request *clone;

	md->barrier_error = 0;

	for (i = 0; i < num_targets; i++) {
		ti = dm_table_get_target(map, i);
		for (j = 0; j < ti->num_flush_requests; j++) {
			clone = clone_rq(md->flush_request, md, GFP_NOIO);
			dm_rq_set_target_request_nr(clone, j);
			atomic_inc(&md->pending[rq_data_dir(clone)]);
			map_request(ti, clone, md);
		}
	}

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
	dm_table_put(map);

	return md->barrier_error;
}

static void dm_rq_barrier_work(struct work_struct *work)
{
	int error;
	struct mapped_device *md = container_of(work, struct mapped_device,
						barrier_work);
	struct request_queue *q = md->queue;
	struct request *rq;
	unsigned long flags;

	/*
	 * Hold the md reference here and release it only at the end, so that
	 * the md can't be deleted by a device opener while the barrier
	 * request is being processed.
	 */
	dm_get(md);

	error = dm_rq_barrier(md);

	rq = md->flush_request;
	md->flush_request = NULL;

	if (error == DM_ENDIO_REQUEUE) {
		spin_lock_irqsave(q->queue_lock, flags);
		blk_requeue_request(q, rq);
		spin_unlock_irqrestore(q->queue_lock, flags);
	} else
		blk_end_request_all(rq, error);

	blk_run_queue(q);

	dm_put(md);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
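 *
 * Illustrative sketch only (not from this file): a hypothetical caller is
 * expected to have suspended the device first, roughly
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	old_map = dm_swap_table(md, new_table);
 *	if (!IS_ERR(old_map))
 *		dm_table_destroy(old_map);
 *	dm_resume(md);
 *
 * where new_table is a table the caller has already built, and old_map,
 * on success, is the previous table which the caller must destroy.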
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r) {
		map = ERR_PTR(r);
		goto out;
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq).  flush_workqueue will wait until
	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Request-based dm uses md->wq for barriers (dm_rq_barrier_work),
	 * which can be kicked until md->queue is stopped.  So stop md->queue
	 * before flushing md->wq.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to the md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm queues its deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);
	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
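 *
 * Illustrative note (not from this file): a non-zero cookie adds an
 * environment string of the form DM_COOKIE_ENV_VAR_NAME "=<decimal cookie>"
 * to the uevent, so a hypothetical
 *
 *	dm_kobject_uevent(md, KOBJ_CHANGE, 12345);
 *
 * emits a CHANGE uevent carrying that extra string, which userspace
 * (udev rules, libdevmapper) typically uses to pair the event with the
 * operation that generated it.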
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c,
 * so use this check to verify that kobj is part of an md structure.
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = (type == DM_TYPE_BIO_BASED) ?
		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	return pools;

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
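
/*
 * Illustrative usage note (not from this file): the "major" module
 * parameter can be set at load time, e.g. a hypothetical
 *
 *	modprobe dm-mod major=240
 *
 * whereas leaving it at its default of 0 asks the block layer to pick a
 * dynamically allocated major number when the module registers itself.
 */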