/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"
#include "dm-ima.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/refcount.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

/*
 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying
 * dm_io into one list, and reuse bio->bi_private as the list head. Before
 * ending this fs bio, we will recover its ->bi_private.
 */
#define REQ_DM_POLL_LIST REQ_DRV

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}
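
/*
 * Illustrative sketch (not part of this file): a caller that wants to wait
 * for "something changed somewhere in DM" can snapshot the counter and then
 * sleep on the global waitqueue, roughly like:
 *
 *	unsigned int seen = atomic_read(&dm_global_event_nr);
 *	...
 *	wait_event_interruptible(dm_global_eventq,
 *				 atomic_read(&dm_global_event_nr) != seen);
 *
 * The ioctl layer uses this counter/waitqueue pair in a similar way to
 * support polling on the DM control device.
 */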

/*
 * One of these is allocated (on-stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
	bool submit_as_polled;
};

#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
#define DM_IO_BIO_OFFSET \
	(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))

static inline struct dm_target_io *clone_to_tio(struct bio *clone)
{
	return container_of(clone, struct dm_target_io, clone);
}

void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
	if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO))
		return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
	return (char *)bio - DM_IO_BIO_OFFSET - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);

struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
	if (io->magic == DM_IO_MAGIC)
		return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
	BUG_ON(io->magic != DM_TIO_MAGIC);
	return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);

unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
{
	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
}
EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
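
/*
 * Illustrative sketch (not part of this file): a hypothetical target that
 * declared per-bio data in its constructor with
 *
 *	ti->per_io_data_size = sizeof(struct example_per_bio_data);
 *
 * can convert between its clone bio and that data region:
 *
 *	struct example_per_bio_data *pb =
 *		dm_per_bio_data(bio, sizeof(struct example_per_bio_data));
 *	...
 *	struct bio *bio2 =
 *		dm_bio_from_per_bio_data(pb, sizeof(struct example_per_bio_data));
 *
 * The two helpers are only inverses for the same data_size the target
 * declared; "example_per_bio_data" is a made-up name for illustration.
 */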

#define MINOR_ALLOCED ((void *)-1)

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
static int swap_bios = DEFAULT_SWAP_BIOS;
static int get_swap_bios(void)
{
	int latch = READ_ONCE(swap_bios);
	if (unlikely(latch <= 0))
		latch = DEFAULT_SWAP_BIOS;
	return latch;
}

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	struct bio_set bs;
	struct bio_set io_bs;
};

struct table_device {
	struct list_head list;
	refcount_t count;
	struct dm_dev dm_dev;
};

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
#define RESERVED_BIO_BASED_IOS		16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;

static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

static unsigned dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}

static int __init local_init(void)
{
	int r;

	r = dm_uevent_init();
	if (r)
		return r;

	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
	if (!deferred_remove_workqueue) {
		r = -ENOMEM;
		goto out_uevent_exit;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_free_workqueue;

	if (!_major)
		_major = r;

	return 0;

out_free_workqueue:
	destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
	dm_uevent_exit();

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);
	int r, i;

#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
	DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
	       " Duplicate IMA measurements will not be recorded in the IMA log.");
#endif

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;
bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
307 */ 308 idr_destroy(&_minor_idr); 309 } 310 311 /* 312 * Block device functions 313 */ 314 int dm_deleting_md(struct mapped_device *md) 315 { 316 return test_bit(DMF_DELETING, &md->flags); 317 } 318 319 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 320 { 321 struct mapped_device *md; 322 323 spin_lock(&_minor_lock); 324 325 md = bdev->bd_disk->private_data; 326 if (!md) 327 goto out; 328 329 if (test_bit(DMF_FREEING, &md->flags) || 330 dm_deleting_md(md)) { 331 md = NULL; 332 goto out; 333 } 334 335 dm_get(md); 336 atomic_inc(&md->open_count); 337 out: 338 spin_unlock(&_minor_lock); 339 340 return md ? 0 : -ENXIO; 341 } 342 343 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 344 { 345 struct mapped_device *md; 346 347 spin_lock(&_minor_lock); 348 349 md = disk->private_data; 350 if (WARN_ON(!md)) 351 goto out; 352 353 if (atomic_dec_and_test(&md->open_count) && 354 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 355 queue_work(deferred_remove_workqueue, &deferred_remove_work); 356 357 dm_put(md); 358 out: 359 spin_unlock(&_minor_lock); 360 } 361 362 int dm_open_count(struct mapped_device *md) 363 { 364 return atomic_read(&md->open_count); 365 } 366 367 /* 368 * Guarantees nothing is using the device before it's deleted. 369 */ 370 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 371 { 372 int r = 0; 373 374 spin_lock(&_minor_lock); 375 376 if (dm_open_count(md)) { 377 r = -EBUSY; 378 if (mark_deferred) 379 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 380 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 381 r = -EEXIST; 382 else 383 set_bit(DMF_DELETING, &md->flags); 384 385 spin_unlock(&_minor_lock); 386 387 return r; 388 } 389 390 int dm_cancel_deferred_remove(struct mapped_device *md) 391 { 392 int r = 0; 393 394 spin_lock(&_minor_lock); 395 396 if (test_bit(DMF_DELETING, &md->flags)) 397 r = -EBUSY; 398 else 399 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 400 401 spin_unlock(&_minor_lock); 402 403 return r; 404 } 405 406 static void do_deferred_remove(struct work_struct *w) 407 { 408 dm_deferred_remove(); 409 } 410 411 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 412 { 413 struct mapped_device *md = bdev->bd_disk->private_data; 414 415 return dm_get_geometry(md, geo); 416 } 417 418 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, 419 struct block_device **bdev) 420 { 421 struct dm_target *tgt; 422 struct dm_table *map; 423 int r; 424 425 retry: 426 r = -ENOTTY; 427 map = dm_get_live_table(md, srcu_idx); 428 if (!map || !dm_table_get_size(map)) 429 return r; 430 431 /* We only support devices that have a single target */ 432 if (dm_table_get_num_targets(map) != 1) 433 return r; 434 435 tgt = dm_table_get_target(map, 0); 436 if (!tgt->type->prepare_ioctl) 437 return r; 438 439 if (dm_suspended_md(md)) 440 return -EAGAIN; 441 442 r = tgt->type->prepare_ioctl(tgt, bdev); 443 if (r == -ENOTCONN && !fatal_signal_pending(current)) { 444 dm_put_live_table(md, *srcu_idx); 445 msleep(10); 446 goto retry; 447 } 448 449 return r; 450 } 451 452 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) 453 { 454 dm_put_live_table(md, srcu_idx); 455 } 456 457 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 458 unsigned int cmd, unsigned long arg) 459 { 460 struct mapped_device *md = bdev->bd_disk->private_data; 461 int r, srcu_idx; 462 463 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 464 if (r < 0) 465 goto out; 466 467 if (r > 0) 
{
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
		if (!capable(CAP_SYS_RAWIO)) {
			DMDEBUG_LIMIT(
	"%s: sending ioctl %x to DM device without required privilege.",
				current->comm, cmd);
			r = -ENOIOCTLCMD;
			goto out;
		}
	}

	if (!bdev->bd_disk->fops->ioctl)
		r = -ENOTTY;
	else
		r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

u64 dm_start_time_ns_from_clone(struct bio *bio)
{
	return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
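
/*
 * Illustrative sketch (not part of this file): because io->start_time is
 * recorded in jiffies, a hypothetical target that wants a rough duration
 * for the original I/O can compare against the same clock base:
 *
 *	u64 dur_ns = jiffies_to_nsecs(jiffies) -
 *		     dm_start_time_ns_from_clone(bio);
 *
 * This is only as precise as the jiffies tick; it is not a high-resolution
 * timestamp.
 */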

static bool bio_is_flush_with_data(struct bio *bio)
{
	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
}

static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
		       unsigned long start_time, struct dm_stats_aux *stats_aux)
{
	bool is_flush_with_data;
	unsigned int bi_size;

	/* If REQ_PREFLUSH set, save any payload but do not account it */
	is_flush_with_data = bio_is_flush_with_data(bio);
	if (is_flush_with_data) {
		bi_size = bio->bi_iter.bi_size;
		bio->bi_iter.bi_size = 0;
	}

	if (!end)
		bio_start_io_acct_time(bio, start_time);
	else
		bio_end_io_acct(bio, start_time);

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    bio->bi_iter.bi_sector, bio_sectors(bio),
				    end, start_time, stats_aux);

	/* Restore bio's payload so it does get accounted upon requeue */
	if (is_flush_with_data)
		bio->bi_iter.bi_size = bi_size;
}

static void __dm_start_io_acct(struct dm_io *io, struct bio *bio)
{
	dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux);
}

static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
{
	/* Must account IO to DM device in terms of orig_bio */
	struct bio *bio = io->orig_bio;

	/*
	 * Ensure IO accounting is only ever started once.
	 * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO.
	 */
	if (!clone ||
	    likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) {
		if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED)))
			return;
		dm_io_set_flag(io, DM_IO_ACCOUNTED);
	} else {
		unsigned long flags;
		if (dm_io_flagged(io, DM_IO_ACCOUNTED))
			return;
		/* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
		spin_lock_irqsave(&io->lock, flags);
		dm_io_set_flag(io, DM_IO_ACCOUNTED);
		spin_unlock_irqrestore(&io->lock, flags);
	}

	__dm_start_io_acct(io, bio);
}

static void dm_end_io_acct(struct dm_io *io, struct bio *bio)
{
	dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux);
}

static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
{
	struct dm_io *io;
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);

	tio = clone_to_tio(clone);
	tio->flags = 0;
	dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
	tio->io = NULL;

	io = container_of(tio, struct dm_io, tio);
	io->magic = DM_IO_MAGIC;
	io->status = 0;
	atomic_set(&io->io_count, 1);
	this_cpu_inc(*md->pending_io);
	io->orig_bio = NULL;
	io->md = md;
	io->map_task = current;
	spin_lock_init(&io->lock);
	io->start_time = jiffies;
	io->flags = 0;

	dm_stats_record_start(&md->stats, &io->stats_aux);

	return io;
}

static void free_io(struct dm_io *io)
{
	bio_put(&io->tio.clone);
}

static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
			     unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
{
	struct dm_target_io *tio;
	struct bio *clone;

	if (!ci->io->tio.io) {
		/* the dm_target_io embedded in ci->io is available */
		tio = &ci->io->tio;
		/* alloc_io() already initialized embedded clone */
		clone = &tio->clone;
	} else {
		clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
					gfp_mask, &ci->io->md->bs);
		if (!clone)
			return NULL;

		/* REQ_DM_POLL_LIST shouldn't be inherited */
		clone->bi_opf &= ~REQ_DM_POLL_LIST;

		tio = clone_to_tio(clone);
		tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */
	}

	tio->magic = DM_TIO_MAGIC;
	tio->io = ci->io;
	tio->ti = ti;
	tio->target_bio_nr = target_bio_nr;
	tio->len_ptr = len;
	tio->old_sector = 0;

	if (len) {
		clone->bi_iter.bi_size = to_bytes(*len);
		if (bio_integrity(clone))
			bio_integrity_trim(clone);
	}

	return clone;
}

static void free_tio(struct bio *clone)
{
	if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
		return;
	bio_put(clone);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}

/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}

void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}
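
/*
 * Illustrative sketch (not part of this file) of the convention described
 * above, as a hypothetical helper that inspects the live table:
 *
 *	static sector_t example_live_table_size(struct mapped_device *md)
 *	{
 *		sector_t size = 0;
 *		int srcu_idx;
 *		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *		if (map)
 *			size = dm_table_get_size(map);
 *		dm_put_live_table(md, srcu_idx);
 *		return size;
 *	}
 *
 * The SRCU read lock taken here only guarantees the table is not freed
 * while it is being used; it does not prevent a new table from being
 * swapped in afterwards.
 */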

void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}

/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}

static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}

static char *_dm_claim_ptr = "I belong to device-mapper";

/*
 * Open a table device so we can use it as a map destination.
 */
static int open_table_device(struct table_device *td, dev_t dev,
			     struct mapped_device *md)
{
	struct block_device *bdev;
	u64 part_off;
	int r;

	BUG_ON(td->dm_dev.bdev);

	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_link_disk_holder(bdev, dm_disk(md));
	if (r) {
		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
		return r;
	}

	td->dm_dev.bdev = bdev;
	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
	return 0;
}

/*
 * Close a table device that we've been using.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	if (!td->dm_dev.bdev)
		return;

	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
	put_dax(td->dm_dev.dax_dev);
	td->dm_dev.bdev = NULL;
	td->dm_dev.dax_dev = NULL;
}

static struct table_device *find_table_device(struct list_head *l, dev_t dev,
					      fmode_t mode)
{
	struct table_device *td;

	list_for_each_entry(td, l, list)
		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
			return td;

	return NULL;
}

int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
			struct dm_dev **result)
{
	int r;
	struct table_device *td;

	mutex_lock(&md->table_devices_lock);
	td = find_table_device(&md->table_devices, dev, mode);
	if (!td) {
		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
		if (!td) {
			mutex_unlock(&md->table_devices_lock);
			return -ENOMEM;
		}

		td->dm_dev.mode = mode;
		td->dm_dev.bdev = NULL;

		if ((r = open_table_device(td, dev, md))) {
			mutex_unlock(&md->table_devices_lock);
			kfree(td);
			return r;
		}

		format_dev_t(td->dm_dev.name, dev);

		refcount_set(&td->count, 1);
		list_add(&td->list, &md->table_devices);
	} else {
		refcount_inc(&td->count);
	}
	mutex_unlock(&md->table_devices_lock);

	*result = &td->dm_dev;
	return 0;
}

void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
{
	struct table_device *td = container_of(d, struct table_device, dm_dev);

	mutex_lock(&md->table_devices_lock);
	if (refcount_dec_and_test(&td->count)) {
close_table_device(td, md); 799 list_del(&td->list); 800 kfree(td); 801 } 802 mutex_unlock(&md->table_devices_lock); 803 } 804 805 static void free_table_devices(struct list_head *devices) 806 { 807 struct list_head *tmp, *next; 808 809 list_for_each_safe(tmp, next, devices) { 810 struct table_device *td = list_entry(tmp, struct table_device, list); 811 812 DMWARN("dm_destroy: %s still exists with %d references", 813 td->dm_dev.name, refcount_read(&td->count)); 814 kfree(td); 815 } 816 } 817 818 /* 819 * Get the geometry associated with a dm device 820 */ 821 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 822 { 823 *geo = md->geometry; 824 825 return 0; 826 } 827 828 /* 829 * Set the geometry of a device. 830 */ 831 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 832 { 833 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 834 835 if (geo->start > sz) { 836 DMWARN("Start sector is beyond the geometry limits."); 837 return -EINVAL; 838 } 839 840 md->geometry = *geo; 841 842 return 0; 843 } 844 845 static int __noflush_suspending(struct mapped_device *md) 846 { 847 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 848 } 849 850 static void dm_io_complete(struct dm_io *io) 851 { 852 blk_status_t io_error; 853 struct mapped_device *md = io->md; 854 struct bio *bio = io->orig_bio; 855 856 if (io->status == BLK_STS_DM_REQUEUE) { 857 unsigned long flags; 858 /* 859 * Target requested pushing back the I/O. 860 */ 861 spin_lock_irqsave(&md->deferred_lock, flags); 862 if (__noflush_suspending(md) && 863 !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { 864 /* NOTE early return due to BLK_STS_DM_REQUEUE below */ 865 bio_list_add_head(&md->deferred, bio); 866 } else { 867 /* 868 * noflush suspend was interrupted or this is 869 * a write to a zoned target. 870 */ 871 io->status = BLK_STS_IOERR; 872 } 873 spin_unlock_irqrestore(&md->deferred_lock, flags); 874 } 875 876 io_error = io->status; 877 if (dm_io_flagged(io, DM_IO_ACCOUNTED)) 878 dm_end_io_acct(io, bio); 879 else if (!io_error) { 880 /* 881 * Must handle target that DM_MAPIO_SUBMITTED only to 882 * then bio_endio() rather than dm_submit_bio_remap() 883 */ 884 __dm_start_io_acct(io, bio); 885 dm_end_io_acct(io, bio); 886 } 887 free_io(io); 888 smp_wmb(); 889 this_cpu_dec(*md->pending_io); 890 891 /* nudge anyone waiting on suspend queue */ 892 if (unlikely(wq_has_sleeper(&md->wait))) 893 wake_up(&md->wait); 894 895 if (io_error == BLK_STS_DM_REQUEUE || io_error == BLK_STS_AGAIN) { 896 if (bio->bi_opf & REQ_POLLED) { 897 /* 898 * Upper layer won't help us poll split bio (io->orig_bio 899 * may only reflect a subset of the pre-split original) 900 * so clear REQ_POLLED in case of requeue. 901 */ 902 bio->bi_opf &= ~REQ_POLLED; 903 if (io_error == BLK_STS_AGAIN) { 904 /* io_uring doesn't handle BLK_STS_AGAIN (yet) */ 905 queue_io(md, bio); 906 } 907 } 908 return; 909 } 910 911 if (bio_is_flush_with_data(bio)) { 912 /* 913 * Preflush done for flush with data, reissue 914 * without REQ_PREFLUSH. 
915 */ 916 bio->bi_opf &= ~REQ_PREFLUSH; 917 queue_io(md, bio); 918 } else { 919 /* done with normal IO or empty flush */ 920 if (io_error) 921 bio->bi_status = io_error; 922 bio_endio(bio); 923 } 924 } 925 926 static inline bool dm_tio_is_normal(struct dm_target_io *tio) 927 { 928 return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && 929 !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); 930 } 931 932 /* 933 * Decrements the number of outstanding ios that a bio has been 934 * cloned into, completing the original io if necc. 935 */ 936 void dm_io_dec_pending(struct dm_io *io, blk_status_t error) 937 { 938 /* Push-back supersedes any I/O errors */ 939 if (unlikely(error)) { 940 unsigned long flags; 941 spin_lock_irqsave(&io->lock, flags); 942 if (!(io->status == BLK_STS_DM_REQUEUE && 943 __noflush_suspending(io->md))) 944 io->status = error; 945 spin_unlock_irqrestore(&io->lock, flags); 946 } 947 948 if (atomic_dec_and_test(&io->io_count)) 949 dm_io_complete(io); 950 } 951 952 void disable_discard(struct mapped_device *md) 953 { 954 struct queue_limits *limits = dm_get_queue_limits(md); 955 956 /* device doesn't really support DISCARD, disable it */ 957 limits->max_discard_sectors = 0; 958 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); 959 } 960 961 void disable_write_zeroes(struct mapped_device *md) 962 { 963 struct queue_limits *limits = dm_get_queue_limits(md); 964 965 /* device doesn't really support WRITE ZEROES, disable it */ 966 limits->max_write_zeroes_sectors = 0; 967 } 968 969 static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) 970 { 971 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); 972 } 973 974 static void clone_endio(struct bio *bio) 975 { 976 blk_status_t error = bio->bi_status; 977 struct dm_target_io *tio = clone_to_tio(bio); 978 struct dm_io *io = tio->io; 979 struct mapped_device *md = tio->io->md; 980 dm_endio_fn endio = tio->ti->type->end_io; 981 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 982 983 if (unlikely(error == BLK_STS_TARGET)) { 984 if (bio_op(bio) == REQ_OP_DISCARD && 985 !q->limits.max_discard_sectors) 986 disable_discard(md); 987 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && 988 !q->limits.max_write_zeroes_sectors) 989 disable_write_zeroes(md); 990 } 991 992 if (blk_queue_is_zoned(q)) 993 dm_zone_endio(io, bio); 994 995 if (endio) { 996 int r = endio(tio->ti, bio, &error); 997 switch (r) { 998 case DM_ENDIO_REQUEUE: 999 /* 1000 * Requeuing writes to a sequential zone of a zoned 1001 * target will break the sequential write pattern: 1002 * fail such IO. 1003 */ 1004 if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) 1005 error = BLK_STS_IOERR; 1006 else 1007 error = BLK_STS_DM_REQUEUE; 1008 fallthrough; 1009 case DM_ENDIO_DONE: 1010 break; 1011 case DM_ENDIO_INCOMPLETE: 1012 /* The target will handle the io */ 1013 return; 1014 default: 1015 DMWARN("unimplemented target endio return value: %d", r); 1016 BUG(); 1017 } 1018 } 1019 1020 if (unlikely(swap_bios_limit(tio->ti, bio))) { 1021 struct mapped_device *md = io->md; 1022 up(&md->swap_bios_semaphore); 1023 } 1024 1025 free_tio(bio); 1026 dm_io_dec_pending(io, error); 1027 } 1028 1029 /* 1030 * Return maximum size of I/O possible at the supplied sector up to the current 1031 * target boundary. 
 */
static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
						  sector_t target_offset)
{
	return ti->len - target_offset;
}

static sector_t max_io_len(struct dm_target *ti, sector_t sector)
{
	sector_t target_offset = dm_target_offset(ti, sector);
	sector_t len = max_io_len_target_boundary(ti, target_offset);
	sector_t max_len;

	/*
	 * Does the target need to split IO even further?
	 * - varied (per target) IO splitting is a tenet of DM; this
	 *   explains why stacked chunk_sectors based splitting via
	 *   blk_max_size_offset() isn't possible here. So pass in
	 *   ti->max_io_len to override stacked chunk_sectors.
	 */
	if (ti->max_io_len) {
		max_len = blk_max_size_offset(ti->table->md->queue,
					      target_offset, ti->max_io_len);
		if (len > max_len)
			len = max_len;
	}

	return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);

static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
						sector_t sector, int *srcu_idx)
	__acquires(md->io_barrier)
{
	struct dm_table *map;
	struct dm_target *ti;

	map = dm_get_live_table(md, srcu_idx);
	if (!map)
		return NULL;

	ti = dm_table_find_target(map, sector);
	if (!ti)
		return NULL;

	return ti;
}

static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
				 long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long len, ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->direct_access)
		goto out;
	len = max_io_len(ti, sector) / PAGE_SECTORS;
	if (len < 1)
		goto out;
	nr_pages = min(len, nr_pages);
	ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);

out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
				  size_t nr_pages)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	int ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (WARN_ON(!ti->type->dax_zero_page_range)) {
		/*
		 * ->zero_page_range() is a mandatory dax operation. If we are
		 * here, something is wrong.
		 */
		goto out;
	}
	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

/*
 * A target may call dm_accept_partial_bio only from the map routine. It is
 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
 * __send_duplicate_bios().
 *
 * dm_accept_partial_bio informs DM that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in the next bio.
 *
 * A diagram that explains the arithmetic:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or a similar function.
 *	(it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is to be sent in the next bio.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
 * the partially processed part (the sum of regions 1+2) must be the same for all
 * copies of the bio.
 */
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
	struct dm_target_io *tio = clone_to_tio(bio);
	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;

	BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
	BUG_ON(bi_size > *tio->len_ptr);
	BUG_ON(n_sectors > bi_size);

	*tio->len_ptr -= bi_size - n_sectors;
	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
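
/*
 * Illustrative sketch (not part of this file): a hypothetical target whose
 * ->map method can only service I/O within a power-of-two chunk might trim
 * the clone and let DM core resubmit the remainder:
 *
 *	static int example_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		struct example_ctx *ctx = ti->private;	// hypothetical context
 *		sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
 *		sector_t remaining = ctx->chunk_sectors -
 *			(offset & (ctx->chunk_sectors - 1));
 *
 *		if (bio_sectors(bio) > remaining)
 *			dm_accept_partial_bio(bio, remaining);
 *		bio_set_dev(bio, ctx->dev->bdev);
 *		return DM_MAPIO_REMAPPED;
 *	}
 *
 * DM core then sends the rest of the original bio (region 3 above) to the
 * target in a follow-up clone.
 */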
1225 */ 1226 if (io->map_task == current) { 1227 /* Still in target's map function */ 1228 dm_io_set_flag(io, DM_IO_START_ACCT); 1229 } else { 1230 /* 1231 * Called by another thread, managed by DM target, 1232 * wait for dm_split_and_process_bio() to store 1233 * io->orig_bio 1234 */ 1235 while (unlikely(!smp_load_acquire(&io->orig_bio))) 1236 msleep(1); 1237 dm_start_io_acct(io, clone); 1238 } 1239 1240 __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), 1241 tio->old_sector); 1242 } 1243 EXPORT_SYMBOL_GPL(dm_submit_bio_remap); 1244 1245 static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) 1246 { 1247 mutex_lock(&md->swap_bios_lock); 1248 while (latch < md->swap_bios) { 1249 cond_resched(); 1250 down(&md->swap_bios_semaphore); 1251 md->swap_bios--; 1252 } 1253 while (latch > md->swap_bios) { 1254 cond_resched(); 1255 up(&md->swap_bios_semaphore); 1256 md->swap_bios++; 1257 } 1258 mutex_unlock(&md->swap_bios_lock); 1259 } 1260 1261 static void __map_bio(struct bio *clone) 1262 { 1263 struct dm_target_io *tio = clone_to_tio(clone); 1264 int r; 1265 struct dm_io *io = tio->io; 1266 struct dm_target *ti = tio->ti; 1267 1268 clone->bi_end_io = clone_endio; 1269 1270 /* 1271 * Map the clone. 1272 */ 1273 dm_io_inc_pending(io); 1274 tio->old_sector = clone->bi_iter.bi_sector; 1275 1276 if (unlikely(swap_bios_limit(ti, clone))) { 1277 struct mapped_device *md = io->md; 1278 int latch = get_swap_bios(); 1279 if (unlikely(latch != md->swap_bios)) 1280 __set_swap_bios_limit(md, latch); 1281 down(&md->swap_bios_semaphore); 1282 } 1283 1284 /* 1285 * Check if the IO needs a special mapping due to zone append emulation 1286 * on zoned target. In this case, dm_zone_map_bio() calls the target 1287 * map operation. 1288 */ 1289 if (dm_emulate_zone_append(io->md)) 1290 r = dm_zone_map_bio(tio); 1291 else 1292 r = ti->type->map(ti, clone); 1293 1294 switch (r) { 1295 case DM_MAPIO_SUBMITTED: 1296 /* target has assumed ownership of this io */ 1297 if (!ti->accounts_remapped_io) 1298 dm_io_set_flag(io, DM_IO_START_ACCT); 1299 break; 1300 case DM_MAPIO_REMAPPED: 1301 /* 1302 * the bio has been remapped so dispatch it, but defer 1303 * dm_start_io_acct() until after possible bio_split(). 1304 */ 1305 __dm_submit_bio_remap(clone, disk_devt(io->md->disk), 1306 tio->old_sector); 1307 dm_io_set_flag(io, DM_IO_START_ACCT); 1308 break; 1309 case DM_MAPIO_KILL: 1310 case DM_MAPIO_REQUEUE: 1311 if (unlikely(swap_bios_limit(ti, clone))) 1312 up(&io->md->swap_bios_semaphore); 1313 free_tio(clone); 1314 if (r == DM_MAPIO_KILL) 1315 dm_io_dec_pending(io, BLK_STS_IOERR); 1316 else 1317 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); 1318 break; 1319 default: 1320 DMWARN("unimplemented target map return value: %d", r); 1321 BUG(); 1322 } 1323 } 1324 1325 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, 1326 struct dm_target *ti, unsigned num_bios, 1327 unsigned *len) 1328 { 1329 struct bio *bio; 1330 int try; 1331 1332 for (try = 0; try < 2; try++) { 1333 int bio_nr; 1334 1335 if (try) 1336 mutex_lock(&ci->io->md->table_devices_lock); 1337 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { 1338 bio = alloc_tio(ci, ti, bio_nr, len, 1339 try ? 

static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
				struct dm_target *ti, unsigned num_bios,
				unsigned *len)
{
	struct bio *bio;
	int try;

	for (try = 0; try < 2; try++) {
		int bio_nr;

		if (try)
			mutex_lock(&ci->io->md->table_devices_lock);
		for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
			bio = alloc_tio(ci, ti, bio_nr, len,
					try ? GFP_NOIO : GFP_NOWAIT);
			if (!bio)
				break;

			bio_list_add(blist, bio);
		}
		if (try)
			mutex_unlock(&ci->io->md->table_devices_lock);
		if (bio_nr == num_bios)
			return;

		while ((bio = bio_list_pop(blist)))
			free_tio(bio);
	}
}

static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, unsigned *len)
{
	struct bio_list blist = BIO_EMPTY_LIST;
	struct bio *clone;

	switch (num_bios) {
	case 0:
		break;
	case 1:
		clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
		dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
		__map_bio(clone);
		break;
	default:
		alloc_multiple_bios(&blist, ci, ti, num_bios, len);
		while ((clone = bio_list_pop(&blist))) {
			dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
			__map_bio(clone);
		}
		break;
	}
}

static void __send_empty_flush(struct clone_info *ci)
{
	unsigned target_nr = 0;
	struct dm_target *ti;
	struct bio flush_bio;

	/*
	 * Use an on-stack bio for this, it's safe since we don't
	 * need to reference it after submit. It's just used as
	 * the basis for the clone(s).
	 */
	bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
		 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);

	ci->bio = &flush_bio;
	ci->sector_count = 0;

	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);

	bio_uninit(ci->bio);
}

static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
					unsigned num_bios)
{
	unsigned len;

	len = min_t(sector_t, ci->sector_count,
		    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));

	/*
	 * dm_accept_partial_bio cannot be used with duplicate bios,
	 * so update clone_info cursor before __send_duplicate_bios().
	 */
	ci->sector += len;
	ci->sector_count -= len;

	__send_duplicate_bios(ci, ti, num_bios, &len);
}

static bool is_abnormal_io(struct bio *bio)
{
	bool r = false;

	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_ZEROES:
		r = true;
		break;
	}

	return r;
}

static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
				  int *result)
{
	unsigned num_bios = 0;

	switch (bio_op(ci->bio)) {
	case REQ_OP_DISCARD:
		num_bios = ti->num_discard_bios;
		break;
	case REQ_OP_SECURE_ERASE:
		num_bios = ti->num_secure_erase_bios;
		break;
	case REQ_OP_WRITE_ZEROES:
		num_bios = ti->num_write_zeroes_bios;
		break;
	default:
		return false;
	}

	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */
	if (!num_bios)
		*result = -EOPNOTSUPP;
	else {
		__send_changing_extent_only(ci, ti, num_bios);
		*result = 0;
	}
	return true;
}

/*
 * Reuse ->bi_private as hlist head for storing all dm_io instances
 * associated with this bio, and this bio's bi_private needs to be
 * stored in dm_io->data before the reuse.
1473 * 1474 * bio->bi_private is owned by fs or upper layer, so block layer won't 1475 * touch it after splitting. Meantime it won't be changed by anyone after 1476 * bio is submitted. So this reuse is safe. 1477 */ 1478 static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) 1479 { 1480 return (struct hlist_head *)&bio->bi_private; 1481 } 1482 1483 static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) 1484 { 1485 struct hlist_head *head = dm_get_bio_hlist_head(bio); 1486 1487 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { 1488 bio->bi_opf |= REQ_DM_POLL_LIST; 1489 /* 1490 * Save .bi_private into dm_io, so that we can reuse 1491 * .bi_private as hlist head for storing dm_io list 1492 */ 1493 io->data = bio->bi_private; 1494 1495 INIT_HLIST_HEAD(head); 1496 1497 /* tell block layer to poll for completion */ 1498 bio->bi_cookie = ~BLK_QC_T_NONE; 1499 } else { 1500 /* 1501 * bio recursed due to split, reuse original poll list, 1502 * and save bio->bi_private too. 1503 */ 1504 io->data = hlist_entry(head->first, struct dm_io, node)->data; 1505 } 1506 1507 hlist_add_head(&io->node, head); 1508 } 1509 1510 /* 1511 * Select the correct strategy for processing a non-flush bio. 1512 */ 1513 static int __split_and_process_bio(struct clone_info *ci) 1514 { 1515 struct bio *clone; 1516 struct dm_target *ti; 1517 unsigned len; 1518 int r; 1519 1520 ti = dm_table_find_target(ci->map, ci->sector); 1521 if (!ti) 1522 return -EIO; 1523 1524 if (__process_abnormal_io(ci, ti, &r)) 1525 return r; 1526 1527 /* 1528 * Only support bio polling for normal IO, and the target io is 1529 * exactly inside the dm_io instance (verified in dm_poll_dm_io) 1530 */ 1531 ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; 1532 1533 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); 1534 clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); 1535 __map_bio(clone); 1536 1537 ci->sector += len; 1538 ci->sector_count -= len; 1539 1540 return 0; 1541 } 1542 1543 static void init_clone_info(struct clone_info *ci, struct mapped_device *md, 1544 struct dm_table *map, struct bio *bio) 1545 { 1546 ci->map = map; 1547 ci->io = alloc_io(md, bio); 1548 ci->bio = bio; 1549 ci->submit_as_polled = false; 1550 ci->sector = bio->bi_iter.bi_sector; 1551 ci->sector_count = bio_sectors(bio); 1552 1553 /* Shouldn't happen but sector_count was being set to 0 so... */ 1554 if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) 1555 ci->sector_count = 0; 1556 } 1557 1558 /* 1559 * Entry point to split a bio into clones and submit them to the targets. 1560 */ 1561 static void dm_split_and_process_bio(struct mapped_device *md, 1562 struct dm_table *map, struct bio *bio) 1563 { 1564 struct clone_info ci; 1565 struct bio *orig_bio = NULL; 1566 int error = 0; 1567 1568 init_clone_info(&ci, md, map, bio); 1569 1570 if (bio->bi_opf & REQ_PREFLUSH) { 1571 __send_empty_flush(&ci); 1572 /* dm_io_complete submits any data associated with flush */ 1573 goto out; 1574 } 1575 1576 error = __split_and_process_bio(&ci); 1577 ci.io->map_task = NULL; 1578 if (error || !ci.sector_count) 1579 goto out; 1580 1581 /* 1582 * Remainder must be passed to submit_bio_noacct() so it gets handled 1583 * *after* bios already submitted have been completely processed. 1584 * We take a clone of the original to store in ci.io->orig_bio to be 1585 * used by dm_end_io_acct() and for dm_io_complete() to use for 1586 * completion handling. 
1587 */ 1588 orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, 1589 GFP_NOIO, &md->queue->bio_split); 1590 bio_chain(orig_bio, bio); 1591 trace_block_split(orig_bio, bio->bi_iter.bi_sector); 1592 submit_bio_noacct(bio); 1593 out: 1594 if (!orig_bio) 1595 orig_bio = bio; 1596 smp_store_release(&ci.io->orig_bio, orig_bio); 1597 if (dm_io_flagged(ci.io, DM_IO_START_ACCT)) 1598 dm_start_io_acct(ci.io, NULL); 1599 1600 /* 1601 * Drop the extra reference count for non-POLLED bio, and hold one 1602 * reference for POLLED bio, which will be released in dm_poll_bio 1603 * 1604 * Add every dm_io instance into the hlist_head which is stored in 1605 * bio->bi_private, so that dm_poll_bio can poll them all. 1606 */ 1607 if (error || !ci.submit_as_polled) 1608 dm_io_dec_pending(ci.io, errno_to_blk_status(error)); 1609 else 1610 dm_queue_poll_io(bio, ci.io); 1611 } 1612 1613 static void dm_submit_bio(struct bio *bio) 1614 { 1615 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; 1616 int srcu_idx; 1617 struct dm_table *map; 1618 1619 map = dm_get_live_table(md, &srcu_idx); 1620 1621 /* If suspended, or map not yet available, queue this IO for later */ 1622 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || 1623 unlikely(!map)) { 1624 if (bio->bi_opf & REQ_NOWAIT) 1625 bio_wouldblock_error(bio); 1626 else if (bio->bi_opf & REQ_RAHEAD) 1627 bio_io_error(bio); 1628 else 1629 queue_io(md, bio); 1630 goto out; 1631 } 1632 1633 /* 1634 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) 1635 * otherwise associated queue_limits won't be imposed. 1636 */ 1637 if (is_abnormal_io(bio)) 1638 blk_queue_split(&bio); 1639 1640 dm_split_and_process_bio(md, map, bio); 1641 out: 1642 dm_put_live_table(md, srcu_idx); 1643 } 1644 1645 static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, 1646 unsigned int flags) 1647 { 1648 WARN_ON_ONCE(!dm_tio_is_normal(&io->tio)); 1649 1650 /* don't poll if the mapped io is done */ 1651 if (atomic_read(&io->io_count) > 1) 1652 bio_poll(&io->tio.clone, iob, flags); 1653 1654 /* bio_poll holds the last reference */ 1655 return atomic_read(&io->io_count) == 1; 1656 } 1657 1658 static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, 1659 unsigned int flags) 1660 { 1661 struct hlist_head *head = dm_get_bio_hlist_head(bio); 1662 struct hlist_head tmp = HLIST_HEAD_INIT; 1663 struct hlist_node *next; 1664 struct dm_io *io; 1665 1666 /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ 1667 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) 1668 return 0; 1669 1670 WARN_ON_ONCE(hlist_empty(head)); 1671 1672 hlist_move_list(head, &tmp); 1673 1674 /* 1675 * Restore .bi_private before possibly completing dm_io. 1676 * 1677 * bio_poll() is only possible once @bio has been completely 1678 * submitted via submit_bio_noacct()'s depth-first submission. 1679 * So there is no dm_queue_poll_io() race associated with 1680 * clearing REQ_DM_POLL_LIST here. 1681 */ 1682 bio->bi_opf &= ~REQ_DM_POLL_LIST; 1683 bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; 1684 1685 hlist_for_each_entry_safe(io, next, &tmp, node) { 1686 if (dm_poll_dm_io(io, iob, flags)) { 1687 hlist_del_init(&io->node); 1688 /* 1689 * clone_endio() has already occurred, so passing 1690 * error as 0 here doesn't override io->status 1691 */ 1692 dm_io_dec_pending(io, 0); 1693 } 1694 } 1695 1696 /* Not done? 
*/ 1697 if (!hlist_empty(&tmp)) { 1698 bio->bi_opf |= REQ_DM_POLL_LIST; 1699 /* Reset bio->bi_private to dm_io list head */ 1700 hlist_move_list(&tmp, head); 1701 return 0; 1702 } 1703 return 1; 1704 } 1705 1706 /*----------------------------------------------------------------- 1707 * An IDR is used to keep track of allocated minor numbers. 1708 *---------------------------------------------------------------*/ 1709 static void free_minor(int minor) 1710 { 1711 spin_lock(&_minor_lock); 1712 idr_remove(&_minor_idr, minor); 1713 spin_unlock(&_minor_lock); 1714 } 1715 1716 /* 1717 * See if the device with a specific minor # is free. 1718 */ 1719 static int specific_minor(int minor) 1720 { 1721 int r; 1722 1723 if (minor >= (1 << MINORBITS)) 1724 return -EINVAL; 1725 1726 idr_preload(GFP_KERNEL); 1727 spin_lock(&_minor_lock); 1728 1729 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 1730 1731 spin_unlock(&_minor_lock); 1732 idr_preload_end(); 1733 if (r < 0) 1734 return r == -ENOSPC ? -EBUSY : r; 1735 return 0; 1736 } 1737 1738 static int next_free_minor(int *minor) 1739 { 1740 int r; 1741 1742 idr_preload(GFP_KERNEL); 1743 spin_lock(&_minor_lock); 1744 1745 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 1746 1747 spin_unlock(&_minor_lock); 1748 idr_preload_end(); 1749 if (r < 0) 1750 return r; 1751 *minor = r; 1752 return 0; 1753 } 1754 1755 static const struct block_device_operations dm_blk_dops; 1756 static const struct block_device_operations dm_rq_blk_dops; 1757 static const struct dax_operations dm_dax_ops; 1758 1759 static void dm_wq_work(struct work_struct *work); 1760 1761 #ifdef CONFIG_BLK_INLINE_ENCRYPTION 1762 static void dm_queue_destroy_crypto_profile(struct request_queue *q) 1763 { 1764 dm_destroy_crypto_profile(q->crypto_profile); 1765 } 1766 1767 #else /* CONFIG_BLK_INLINE_ENCRYPTION */ 1768 1769 static inline void dm_queue_destroy_crypto_profile(struct request_queue *q) 1770 { 1771 } 1772 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ 1773 1774 static void cleanup_mapped_device(struct mapped_device *md) 1775 { 1776 if (md->wq) 1777 destroy_workqueue(md->wq); 1778 bioset_exit(&md->bs); 1779 bioset_exit(&md->io_bs); 1780 1781 if (md->dax_dev) { 1782 dax_remove_host(md->disk); 1783 kill_dax(md->dax_dev); 1784 put_dax(md->dax_dev); 1785 md->dax_dev = NULL; 1786 } 1787 1788 dm_cleanup_zoned_dev(md); 1789 if (md->disk) { 1790 spin_lock(&_minor_lock); 1791 md->disk->private_data = NULL; 1792 spin_unlock(&_minor_lock); 1793 if (dm_get_md_type(md) != DM_TYPE_NONE) { 1794 dm_sysfs_exit(md); 1795 del_gendisk(md->disk); 1796 } 1797 dm_queue_destroy_crypto_profile(md->queue); 1798 blk_cleanup_disk(md->disk); 1799 } 1800 1801 if (md->pending_io) { 1802 free_percpu(md->pending_io); 1803 md->pending_io = NULL; 1804 } 1805 1806 cleanup_srcu_struct(&md->io_barrier); 1807 1808 mutex_destroy(&md->suspend_lock); 1809 mutex_destroy(&md->type_lock); 1810 mutex_destroy(&md->table_devices_lock); 1811 mutex_destroy(&md->swap_bios_lock); 1812 1813 dm_mq_cleanup_mapped_device(md); 1814 } 1815 1816 /* 1817 * Allocate and initialise a blank device with a given minor. 
1818 */ 1819 static struct mapped_device *alloc_dev(int minor) 1820 { 1821 int r, numa_node_id = dm_get_numa_node(); 1822 struct mapped_device *md; 1823 void *old_md; 1824 1825 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); 1826 if (!md) { 1827 DMWARN("unable to allocate device, out of memory."); 1828 return NULL; 1829 } 1830 1831 if (!try_module_get(THIS_MODULE)) 1832 goto bad_module_get; 1833 1834 /* get a minor number for the dev */ 1835 if (minor == DM_ANY_MINOR) 1836 r = next_free_minor(&minor); 1837 else 1838 r = specific_minor(minor); 1839 if (r < 0) 1840 goto bad_minor; 1841 1842 r = init_srcu_struct(&md->io_barrier); 1843 if (r < 0) 1844 goto bad_io_barrier; 1845 1846 md->numa_node_id = numa_node_id; 1847 md->init_tio_pdu = false; 1848 md->type = DM_TYPE_NONE; 1849 mutex_init(&md->suspend_lock); 1850 mutex_init(&md->type_lock); 1851 mutex_init(&md->table_devices_lock); 1852 spin_lock_init(&md->deferred_lock); 1853 atomic_set(&md->holders, 1); 1854 atomic_set(&md->open_count, 0); 1855 atomic_set(&md->event_nr, 0); 1856 atomic_set(&md->uevent_seq, 0); 1857 INIT_LIST_HEAD(&md->uevent_list); 1858 INIT_LIST_HEAD(&md->table_devices); 1859 spin_lock_init(&md->uevent_lock); 1860 1861 /* 1862 * default to bio-based until DM table is loaded and md->type 1863 * established. If request-based table is loaded: blk-mq will 1864 * override accordingly. 1865 */ 1866 md->disk = blk_alloc_disk(md->numa_node_id); 1867 if (!md->disk) 1868 goto bad; 1869 md->queue = md->disk->queue; 1870 1871 init_waitqueue_head(&md->wait); 1872 INIT_WORK(&md->work, dm_wq_work); 1873 init_waitqueue_head(&md->eventq); 1874 init_completion(&md->kobj_holder.completion); 1875 1876 md->swap_bios = get_swap_bios(); 1877 sema_init(&md->swap_bios_semaphore, md->swap_bios); 1878 mutex_init(&md->swap_bios_lock); 1879 1880 md->disk->major = _major; 1881 md->disk->first_minor = minor; 1882 md->disk->minors = 1; 1883 md->disk->flags |= GENHD_FL_NO_PART; 1884 md->disk->fops = &dm_blk_dops; 1885 md->disk->queue = md->queue; 1886 md->disk->private_data = md; 1887 sprintf(md->disk->disk_name, "dm-%d", minor); 1888 1889 if (IS_ENABLED(CONFIG_FS_DAX)) { 1890 md->dax_dev = alloc_dax(md, &dm_dax_ops); 1891 if (IS_ERR(md->dax_dev)) { 1892 md->dax_dev = NULL; 1893 goto bad; 1894 } 1895 set_dax_nocache(md->dax_dev); 1896 set_dax_nomc(md->dax_dev); 1897 if (dax_add_host(md->dax_dev, md->disk)) 1898 goto bad; 1899 } 1900 1901 format_dev_t(md->name, MKDEV(_major, minor)); 1902 1903 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name); 1904 if (!md->wq) 1905 goto bad; 1906 1907 md->pending_io = alloc_percpu(unsigned long); 1908 if (!md->pending_io) 1909 goto bad; 1910 1911 dm_stats_init(&md->stats); 1912 1913 /* Populate the mapping, nobody knows we exist yet */ 1914 spin_lock(&_minor_lock); 1915 old_md = idr_replace(&_minor_idr, md, minor); 1916 spin_unlock(&_minor_lock); 1917 1918 BUG_ON(old_md != MINOR_ALLOCED); 1919 1920 return md; 1921 1922 bad: 1923 cleanup_mapped_device(md); 1924 bad_io_barrier: 1925 free_minor(minor); 1926 bad_minor: 1927 module_put(THIS_MODULE); 1928 bad_module_get: 1929 kvfree(md); 1930 return NULL; 1931 } 1932 1933 static void unlock_fs(struct mapped_device *md); 1934 1935 static void free_dev(struct mapped_device *md) 1936 { 1937 int minor = MINOR(disk_devt(md->disk)); 1938 1939 unlock_fs(md); 1940 1941 cleanup_mapped_device(md); 1942 1943 free_table_devices(&md->table_devices); 1944 dm_stats_cleanup(&md->stats); 1945 free_minor(minor); 1946 1947 module_put(THIS_MODULE); 1948 kvfree(md); 
1949 } 1950 1951 static int __bind_mempools(struct mapped_device *md, struct dm_table *t) 1952 { 1953 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1954 int ret = 0; 1955 1956 if (dm_table_bio_based(t)) { 1957 /* 1958 * The md may already have mempools that need changing. 1959 * If so, reload bioset because front_pad may have changed 1960 * because a different table was loaded. 1961 */ 1962 bioset_exit(&md->bs); 1963 bioset_exit(&md->io_bs); 1964 1965 } else if (bioset_initialized(&md->bs)) { 1966 /* 1967 * There's no need to reload with request-based dm 1968 * because the size of front_pad doesn't change. 1969 * Note for future: If you are to reload bioset, 1970 * prep-ed requests in the queue may refer 1971 * to bio from the old bioset, so you must walk 1972 * through the queue to unprep. 1973 */ 1974 goto out; 1975 } 1976 1977 BUG_ON(!p || 1978 bioset_initialized(&md->bs) || 1979 bioset_initialized(&md->io_bs)); 1980 1981 ret = bioset_init_from_src(&md->bs, &p->bs); 1982 if (ret) 1983 goto out; 1984 ret = bioset_init_from_src(&md->io_bs, &p->io_bs); 1985 if (ret) 1986 bioset_exit(&md->bs); 1987 out: 1988 /* mempool bind completed, no longer need any mempools in the table */ 1989 dm_table_free_md_mempools(t); 1990 return ret; 1991 } 1992 1993 /* 1994 * Bind a table to the device. 1995 */ 1996 static void event_callback(void *context) 1997 { 1998 unsigned long flags; 1999 LIST_HEAD(uevents); 2000 struct mapped_device *md = (struct mapped_device *) context; 2001 2002 spin_lock_irqsave(&md->uevent_lock, flags); 2003 list_splice_init(&md->uevent_list, &uevents); 2004 spin_unlock_irqrestore(&md->uevent_lock, flags); 2005 2006 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2007 2008 atomic_inc(&md->event_nr); 2009 wake_up(&md->eventq); 2010 dm_issue_global_event(); 2011 } 2012 2013 /* 2014 * Returns old map, which caller must destroy. 2015 */ 2016 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2017 struct queue_limits *limits) 2018 { 2019 struct dm_table *old_map; 2020 sector_t size; 2021 int ret; 2022 2023 lockdep_assert_held(&md->suspend_lock); 2024 2025 size = dm_table_get_size(t); 2026 2027 /* 2028 * Wipe any geometry if the size of the table changed. 2029 */ 2030 if (size != dm_get_size(md)) 2031 memset(&md->geometry, 0, sizeof(md->geometry)); 2032 2033 if (!get_capacity(md->disk)) 2034 set_capacity(md->disk, size); 2035 else 2036 set_capacity_and_notify(md->disk, size); 2037 2038 dm_table_event_callback(t, event_callback, md); 2039 2040 if (dm_table_request_based(t)) { 2041 /* 2042 * Leverage the fact that request-based DM targets are 2043 * immutable singletons - used to optimize dm_mq_queue_rq. 2044 */ 2045 md->immutable_target = dm_table_get_immutable_target(t); 2046 } 2047 2048 ret = __bind_mempools(md, t); 2049 if (ret) { 2050 old_map = ERR_PTR(ret); 2051 goto out; 2052 } 2053 2054 ret = dm_table_set_restrictions(t, md->queue, limits); 2055 if (ret) { 2056 old_map = ERR_PTR(ret); 2057 goto out; 2058 } 2059 2060 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2061 rcu_assign_pointer(md->map, (void *)t); 2062 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2063 2064 if (old_map) 2065 dm_sync_table(md); 2066 out: 2067 return old_map; 2068 } 2069 2070 /* 2071 * Returns unbound table for the caller to free. 
2072 */ 2073 static struct dm_table *__unbind(struct mapped_device *md) 2074 { 2075 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2076 2077 if (!map) 2078 return NULL; 2079 2080 dm_table_event_callback(map, NULL, NULL); 2081 RCU_INIT_POINTER(md->map, NULL); 2082 dm_sync_table(md); 2083 2084 return map; 2085 } 2086 2087 /* 2088 * Constructor for a new device. 2089 */ 2090 int dm_create(int minor, struct mapped_device **result) 2091 { 2092 struct mapped_device *md; 2093 2094 md = alloc_dev(minor); 2095 if (!md) 2096 return -ENXIO; 2097 2098 dm_ima_reset_data(md); 2099 2100 *result = md; 2101 return 0; 2102 } 2103 2104 /* 2105 * Functions to manage md->type. 2106 * All are required to hold md->type_lock. 2107 */ 2108 void dm_lock_md_type(struct mapped_device *md) 2109 { 2110 mutex_lock(&md->type_lock); 2111 } 2112 2113 void dm_unlock_md_type(struct mapped_device *md) 2114 { 2115 mutex_unlock(&md->type_lock); 2116 } 2117 2118 void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) 2119 { 2120 BUG_ON(!mutex_is_locked(&md->type_lock)); 2121 md->type = type; 2122 } 2123 2124 enum dm_queue_mode dm_get_md_type(struct mapped_device *md) 2125 { 2126 return md->type; 2127 } 2128 2129 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2130 { 2131 return md->immutable_target_type; 2132 } 2133 2134 /* 2135 * The queue_limits are only valid as long as you have a reference 2136 * count on 'md'. 2137 */ 2138 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2139 { 2140 BUG_ON(!atomic_read(&md->holders)); 2141 return &md->queue->limits; 2142 } 2143 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2144 2145 /* 2146 * Setup the DM device's queue based on md's type 2147 */ 2148 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2149 { 2150 enum dm_queue_mode type = dm_table_get_type(t); 2151 struct queue_limits limits; 2152 int r; 2153 2154 switch (type) { 2155 case DM_TYPE_REQUEST_BASED: 2156 md->disk->fops = &dm_rq_blk_dops; 2157 r = dm_mq_init_request_queue(md, t); 2158 if (r) { 2159 DMERR("Cannot initialize queue for request-based dm mapped device"); 2160 return r; 2161 } 2162 break; 2163 case DM_TYPE_BIO_BASED: 2164 case DM_TYPE_DAX_BIO_BASED: 2165 break; 2166 case DM_TYPE_NONE: 2167 WARN_ON_ONCE(true); 2168 break; 2169 } 2170 2171 r = dm_calculate_queue_limits(t, &limits); 2172 if (r) { 2173 DMERR("Cannot calculate initial queue limits"); 2174 return r; 2175 } 2176 r = dm_table_set_restrictions(t, md->queue, &limits); 2177 if (r) 2178 return r; 2179 2180 r = add_disk(md->disk); 2181 if (r) 2182 return r; 2183 2184 r = dm_sysfs_init(md); 2185 if (r) { 2186 del_gendisk(md->disk); 2187 return r; 2188 } 2189 md->type = type; 2190 return 0; 2191 } 2192 2193 struct mapped_device *dm_get_md(dev_t dev) 2194 { 2195 struct mapped_device *md; 2196 unsigned minor = MINOR(dev); 2197 2198 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2199 return NULL; 2200 2201 spin_lock(&_minor_lock); 2202 2203 md = idr_find(&_minor_idr, minor); 2204 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || 2205 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2206 md = NULL; 2207 goto out; 2208 } 2209 dm_get(md); 2210 out: 2211 spin_unlock(&_minor_lock); 2212 2213 return md; 2214 } 2215 EXPORT_SYMBOL_GPL(dm_get_md); 2216 2217 void *dm_get_mdptr(struct mapped_device *md) 2218 { 2219 return md->interface_ptr; 2220 } 2221 2222 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2223 { 2224 md->interface_ptr = ptr; 
2225 } 2226 2227 void dm_get(struct mapped_device *md) 2228 { 2229 atomic_inc(&md->holders); 2230 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2231 } 2232 2233 int dm_hold(struct mapped_device *md) 2234 { 2235 spin_lock(&_minor_lock); 2236 if (test_bit(DMF_FREEING, &md->flags)) { 2237 spin_unlock(&_minor_lock); 2238 return -EBUSY; 2239 } 2240 dm_get(md); 2241 spin_unlock(&_minor_lock); 2242 return 0; 2243 } 2244 EXPORT_SYMBOL_GPL(dm_hold); 2245 2246 const char *dm_device_name(struct mapped_device *md) 2247 { 2248 return md->name; 2249 } 2250 EXPORT_SYMBOL_GPL(dm_device_name); 2251 2252 static void __dm_destroy(struct mapped_device *md, bool wait) 2253 { 2254 struct dm_table *map; 2255 int srcu_idx; 2256 2257 might_sleep(); 2258 2259 spin_lock(&_minor_lock); 2260 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2261 set_bit(DMF_FREEING, &md->flags); 2262 spin_unlock(&_minor_lock); 2263 2264 blk_mark_disk_dead(md->disk); 2265 2266 /* 2267 * Take suspend_lock so that presuspend and postsuspend methods 2268 * do not race with internal suspend. 2269 */ 2270 mutex_lock(&md->suspend_lock); 2271 map = dm_get_live_table(md, &srcu_idx); 2272 if (!dm_suspended_md(md)) { 2273 dm_table_presuspend_targets(map); 2274 set_bit(DMF_SUSPENDED, &md->flags); 2275 set_bit(DMF_POST_SUSPENDING, &md->flags); 2276 dm_table_postsuspend_targets(map); 2277 } 2278 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2279 dm_put_live_table(md, srcu_idx); 2280 mutex_unlock(&md->suspend_lock); 2281 2282 /* 2283 * Rare, but there may still be references held, for example by I/O 2284 * requests that have yet to complete. Wait for them all to disappear. 2285 * No one should increment the reference count of the mapped_device 2286 * once its state becomes DMF_FREEING. 2287 */ 2288 if (wait) 2289 while (atomic_read(&md->holders)) 2290 msleep(1); 2291 else if (atomic_read(&md->holders)) 2292 DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", 2293 dm_device_name(md), atomic_read(&md->holders)); 2294 2295 dm_table_destroy(__unbind(md)); 2296 free_dev(md); 2297 } 2298 2299 void dm_destroy(struct mapped_device *md) 2300 { 2301 __dm_destroy(md, true); 2302 } 2303 2304 void dm_destroy_immediate(struct mapped_device *md) 2305 { 2306 __dm_destroy(md, false); 2307 } 2308 2309 void dm_put(struct mapped_device *md) 2310 { 2311 atomic_dec(&md->holders); 2312 } 2313 EXPORT_SYMBOL_GPL(dm_put); 2314 2315 static bool dm_in_flight_bios(struct mapped_device *md) 2316 { 2317 int cpu; 2318 unsigned long sum = 0; 2319 2320 for_each_possible_cpu(cpu) 2321 sum += *per_cpu_ptr(md->pending_io, cpu); 2322 2323 return sum != 0; 2324 } 2325 2326 static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) 2327 { 2328 int r = 0; 2329 DEFINE_WAIT(wait); 2330 2331 while (true) { 2332 prepare_to_wait(&md->wait, &wait, task_state); 2333 2334 if (!dm_in_flight_bios(md)) 2335 break; 2336 2337 if (signal_pending_state(task_state, current)) { 2338 r = -EINTR; 2339 break; 2340 } 2341 2342 io_schedule(); 2343 } 2344 finish_wait(&md->wait, &wait); 2345 2346 smp_rmb(); 2347 2348 return r; 2349 } 2350 2351 static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) 2352 { 2353 int r = 0; 2354 2355 if (!queue_is_mq(md->queue)) 2356 return dm_wait_for_bios_completion(md, task_state); 2357 2358 while (true) { 2359 if (!blk_mq_queue_inflight(md->queue)) 2360 break; 2361 2362 if (signal_pending_state(task_state, current)) { 2363 r = -EINTR; 2364 break; 2365 } 2366 2367 msleep(5); 2368 } 2369 2370 return r; 2371 } 2372 2373 /* 2374 * Process the deferred bios 2375 */ 2376 static void dm_wq_work(struct work_struct *work) 2377 { 2378 struct mapped_device *md = container_of(work, struct mapped_device, work); 2379 struct bio *bio; 2380 2381 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2382 spin_lock_irq(&md->deferred_lock); 2383 bio = bio_list_pop(&md->deferred); 2384 spin_unlock_irq(&md->deferred_lock); 2385 2386 if (!bio) 2387 break; 2388 2389 submit_bio_noacct(bio); 2390 } 2391 } 2392 2393 static void dm_queue_flush(struct mapped_device *md) 2394 { 2395 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2396 smp_mb__after_atomic(); 2397 queue_work(md->wq, &md->work); 2398 } 2399 2400 /* 2401 * Swap in a new table, returning the old one for the caller to destroy. 2402 */ 2403 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2404 { 2405 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2406 struct queue_limits limits; 2407 int r; 2408 2409 mutex_lock(&md->suspend_lock); 2410 2411 /* device must be suspended */ 2412 if (!dm_suspended_md(md)) 2413 goto out; 2414 2415 /* 2416 * If the new table has no data devices, retain the existing limits. 2417 * This helps multipath with queue_if_no_path if all paths disappear, 2418 * then new I/O is queued based on these limits, and then some paths 2419 * reappear. 
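 * In that case the live table's existing queue limits (md->queue->limits)
 * are reused below instead of being recalculated from an empty device list.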
2420 */ 2421 if (dm_table_has_no_data_devices(table)) { 2422 live_map = dm_get_live_table_fast(md); 2423 if (live_map) 2424 limits = md->queue->limits; 2425 dm_put_live_table_fast(md); 2426 } 2427 2428 if (!live_map) { 2429 r = dm_calculate_queue_limits(table, &limits); 2430 if (r) { 2431 map = ERR_PTR(r); 2432 goto out; 2433 } 2434 } 2435 2436 map = __bind(md, table, &limits); 2437 dm_issue_global_event(); 2438 2439 out: 2440 mutex_unlock(&md->suspend_lock); 2441 return map; 2442 } 2443 2444 /* 2445 * Functions to lock and unlock any filesystem running on the 2446 * device. 2447 */ 2448 static int lock_fs(struct mapped_device *md) 2449 { 2450 int r; 2451 2452 WARN_ON(test_bit(DMF_FROZEN, &md->flags)); 2453 2454 r = freeze_bdev(md->disk->part0); 2455 if (!r) 2456 set_bit(DMF_FROZEN, &md->flags); 2457 return r; 2458 } 2459 2460 static void unlock_fs(struct mapped_device *md) 2461 { 2462 if (!test_bit(DMF_FROZEN, &md->flags)) 2463 return; 2464 thaw_bdev(md->disk->part0); 2465 clear_bit(DMF_FROZEN, &md->flags); 2466 } 2467 2468 /* 2469 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG 2470 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE 2471 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY 2472 * 2473 * If __dm_suspend returns 0, the device is completely quiescent 2474 * now. There is no request-processing activity. All new requests 2475 * are being added to md->deferred list. 2476 */ 2477 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2478 unsigned suspend_flags, unsigned int task_state, 2479 int dmf_suspended_flag) 2480 { 2481 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2482 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2483 int r; 2484 2485 lockdep_assert_held(&md->suspend_lock); 2486 2487 /* 2488 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2489 * This flag is cleared before dm_suspend returns. 2490 */ 2491 if (noflush) 2492 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2493 else 2494 DMDEBUG("%s: suspending with flush", dm_device_name(md)); 2495 2496 /* 2497 * This gets reverted if there's an error later and the targets 2498 * provide the .presuspend_undo hook. 2499 */ 2500 dm_table_presuspend_targets(map); 2501 2502 /* 2503 * Flush I/O to the device. 2504 * Any I/O submitted after lock_fs() may not be flushed. 2505 * noflush takes precedence over do_lockfs. 2506 * (lock_fs() flushes I/Os and waits for them to complete.) 2507 */ 2508 if (!noflush && do_lockfs) { 2509 r = lock_fs(md); 2510 if (r) { 2511 dm_table_presuspend_undo_targets(map); 2512 return r; 2513 } 2514 } 2515 2516 /* 2517 * Here we must make sure that no processes are submitting requests 2518 * to target drivers i.e. no one may be executing 2519 * dm_split_and_process_bio from dm_submit_bio. 2520 * 2521 * To get all processes out of dm_split_and_process_bio in dm_submit_bio, 2522 * we take the write lock. To prevent any process from reentering 2523 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread 2524 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2525 * flush_workqueue(md->wq). 2526 */ 2527 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2528 if (map) 2529 synchronize_srcu(&md->io_barrier); 2530 2531 /* 2532 * Stop md->queue before flushing md->wq in case request-based 2533 * dm defers requests to md->wq from md->queue. 
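 * (dm_stop_queue() quiesces the request_queue, so nothing new is
 * dispatched to the targets while flush_workqueue() drains md->wq below.)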
2534 */ 2535 if (dm_request_based(md)) 2536 dm_stop_queue(md->queue); 2537 2538 flush_workqueue(md->wq); 2539 2540 /* 2541 * At this point no more requests are entering target request routines. 2542 * We call dm_wait_for_completion to wait for all existing requests 2543 * to finish. 2544 */ 2545 r = dm_wait_for_completion(md, task_state); 2546 if (!r) 2547 set_bit(dmf_suspended_flag, &md->flags); 2548 2549 if (noflush) 2550 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2551 if (map) 2552 synchronize_srcu(&md->io_barrier); 2553 2554 /* were we interrupted ? */ 2555 if (r < 0) { 2556 dm_queue_flush(md); 2557 2558 if (dm_request_based(md)) 2559 dm_start_queue(md->queue); 2560 2561 unlock_fs(md); 2562 dm_table_presuspend_undo_targets(map); 2563 /* pushback list is already flushed, so skip flush */ 2564 } 2565 2566 return r; 2567 } 2568 2569 /* 2570 * We need to be able to change a mapping table under a mounted 2571 * filesystem. For example we might want to move some data in 2572 * the background. Before the table can be swapped with 2573 * dm_bind_table, dm_suspend must be called to flush any in 2574 * flight bios and ensure that any further io gets deferred. 2575 */ 2576 /* 2577 * Suspend mechanism in request-based dm. 2578 * 2579 * 1. Flush all I/Os by lock_fs() if needed. 2580 * 2. Stop dispatching any I/O by stopping the request_queue. 2581 * 3. Wait for all in-flight I/Os to be completed or requeued. 2582 * 2583 * To abort suspend, start the request_queue. 2584 */ 2585 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2586 { 2587 struct dm_table *map = NULL; 2588 int r = 0; 2589 2590 retry: 2591 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2592 2593 if (dm_suspended_md(md)) { 2594 r = -EINVAL; 2595 goto out_unlock; 2596 } 2597 2598 if (dm_suspended_internally_md(md)) { 2599 /* already internally suspended, wait for internal resume */ 2600 mutex_unlock(&md->suspend_lock); 2601 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2602 if (r) 2603 return r; 2604 goto retry; 2605 } 2606 2607 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2608 2609 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); 2610 if (r) 2611 goto out_unlock; 2612 2613 set_bit(DMF_POST_SUSPENDING, &md->flags); 2614 dm_table_postsuspend_targets(map); 2615 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2616 2617 out_unlock: 2618 mutex_unlock(&md->suspend_lock); 2619 return r; 2620 } 2621 2622 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 2623 { 2624 if (map) { 2625 int r = dm_table_resume_targets(map); 2626 if (r) 2627 return r; 2628 } 2629 2630 dm_queue_flush(md); 2631 2632 /* 2633 * Flushing deferred I/Os must be done after targets are resumed 2634 * so that mapping of targets can work correctly. 2635 * Request-based dm is queueing the deferred I/Os in its request_queue. 
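 * (Bio-based dm has already resubmitted its deferred bios via the
 * dm_queue_flush()/dm_wq_work() call above; request-based dm only needs
 * its request_queue restarted below.)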
2636 */ 2637 if (dm_request_based(md)) 2638 dm_start_queue(md->queue); 2639 2640 unlock_fs(md); 2641 2642 return 0; 2643 } 2644 2645 int dm_resume(struct mapped_device *md) 2646 { 2647 int r; 2648 struct dm_table *map = NULL; 2649 2650 retry: 2651 r = -EINVAL; 2652 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2653 2654 if (!dm_suspended_md(md)) 2655 goto out; 2656 2657 if (dm_suspended_internally_md(md)) { 2658 /* already internally suspended, wait for internal resume */ 2659 mutex_unlock(&md->suspend_lock); 2660 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2661 if (r) 2662 return r; 2663 goto retry; 2664 } 2665 2666 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2667 if (!map || !dm_table_get_size(map)) 2668 goto out; 2669 2670 r = __dm_resume(md, map); 2671 if (r) 2672 goto out; 2673 2674 clear_bit(DMF_SUSPENDED, &md->flags); 2675 out: 2676 mutex_unlock(&md->suspend_lock); 2677 2678 return r; 2679 } 2680 2681 /* 2682 * Internal suspend/resume works like userspace-driven suspend. It waits 2683 * until all bios finish and prevents issuing new bios to the target drivers. 2684 * It may be used only from the kernel. 2685 */ 2686 2687 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 2688 { 2689 struct dm_table *map = NULL; 2690 2691 lockdep_assert_held(&md->suspend_lock); 2692 2693 if (md->internal_suspend_count++) 2694 return; /* nested internal suspend */ 2695 2696 if (dm_suspended_md(md)) { 2697 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2698 return; /* nest suspend */ 2699 } 2700 2701 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2702 2703 /* 2704 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 2705 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 2706 * would require changing .presuspend to return an error -- avoid this 2707 * until there is a need for more elaborate variants of internal suspend. 
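 * (With TASK_UNINTERRUPTIBLE the wait below cannot return -EINTR, and a
 * noflush suspend never calls lock_fs(), which is why the return value of
 * __dm_suspend() is deliberately ignored here.)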
2708 */ 2709 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, 2710 DMF_SUSPENDED_INTERNALLY); 2711 2712 set_bit(DMF_POST_SUSPENDING, &md->flags); 2713 dm_table_postsuspend_targets(map); 2714 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2715 } 2716 2717 static void __dm_internal_resume(struct mapped_device *md) 2718 { 2719 BUG_ON(!md->internal_suspend_count); 2720 2721 if (--md->internal_suspend_count) 2722 return; /* resume from nested internal suspend */ 2723 2724 if (dm_suspended_md(md)) 2725 goto done; /* resume from nested suspend */ 2726 2727 /* 2728 * NOTE: existing callers don't need to call dm_table_resume_targets 2729 * (which may fail -- so best to avoid it for now by passing NULL map) 2730 */ 2731 (void) __dm_resume(md, NULL); 2732 2733 done: 2734 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2735 smp_mb__after_atomic(); 2736 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 2737 } 2738 2739 void dm_internal_suspend_noflush(struct mapped_device *md) 2740 { 2741 mutex_lock(&md->suspend_lock); 2742 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 2743 mutex_unlock(&md->suspend_lock); 2744 } 2745 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 2746 2747 void dm_internal_resume(struct mapped_device *md) 2748 { 2749 mutex_lock(&md->suspend_lock); 2750 __dm_internal_resume(md); 2751 mutex_unlock(&md->suspend_lock); 2752 } 2753 EXPORT_SYMBOL_GPL(dm_internal_resume); 2754 2755 /* 2756 * Fast variants of internal suspend/resume hold md->suspend_lock, 2757 * which prevents interaction with userspace-driven suspend. 2758 */ 2759 2760 void dm_internal_suspend_fast(struct mapped_device *md) 2761 { 2762 mutex_lock(&md->suspend_lock); 2763 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2764 return; 2765 2766 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2767 synchronize_srcu(&md->io_barrier); 2768 flush_workqueue(md->wq); 2769 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2770 } 2771 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); 2772 2773 void dm_internal_resume_fast(struct mapped_device *md) 2774 { 2775 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2776 goto done; 2777 2778 dm_queue_flush(md); 2779 2780 done: 2781 mutex_unlock(&md->suspend_lock); 2782 } 2783 EXPORT_SYMBOL_GPL(dm_internal_resume_fast); 2784 2785 /*----------------------------------------------------------------- 2786 * Event notification. 
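 * Userspace consumers are assumed to be dm-ioctl's DM_DEV_WAIT (which
 * sleeps on md->eventq until event_nr changes) and udev, which matches
 * the DM_COOKIE passed with each uevent for synchronization.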
2787 *---------------------------------------------------------------*/ 2788 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2789 unsigned cookie) 2790 { 2791 int r; 2792 unsigned noio_flag; 2793 char udev_cookie[DM_COOKIE_LENGTH]; 2794 char *envp[] = { udev_cookie, NULL }; 2795 2796 noio_flag = memalloc_noio_save(); 2797 2798 if (!cookie) 2799 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2800 else { 2801 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2802 DM_COOKIE_ENV_VAR_NAME, cookie); 2803 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2804 action, envp); 2805 } 2806 2807 memalloc_noio_restore(noio_flag); 2808 2809 return r; 2810 } 2811 2812 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2813 { 2814 return atomic_add_return(1, &md->uevent_seq); 2815 } 2816 2817 uint32_t dm_get_event_nr(struct mapped_device *md) 2818 { 2819 return atomic_read(&md->event_nr); 2820 } 2821 2822 int dm_wait_event(struct mapped_device *md, int event_nr) 2823 { 2824 return wait_event_interruptible(md->eventq, 2825 (event_nr != atomic_read(&md->event_nr))); 2826 } 2827 2828 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2829 { 2830 unsigned long flags; 2831 2832 spin_lock_irqsave(&md->uevent_lock, flags); 2833 list_add(elist, &md->uevent_list); 2834 spin_unlock_irqrestore(&md->uevent_lock, flags); 2835 } 2836 2837 /* 2838 * The gendisk is only valid as long as you have a reference 2839 * count on 'md'. 2840 */ 2841 struct gendisk *dm_disk(struct mapped_device *md) 2842 { 2843 return md->disk; 2844 } 2845 EXPORT_SYMBOL_GPL(dm_disk); 2846 2847 struct kobject *dm_kobject(struct mapped_device *md) 2848 { 2849 return &md->kobj_holder.kobj; 2850 } 2851 2852 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2853 { 2854 struct mapped_device *md; 2855 2856 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 2857 2858 spin_lock(&_minor_lock); 2859 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2860 md = NULL; 2861 goto out; 2862 } 2863 dm_get(md); 2864 out: 2865 spin_unlock(&_minor_lock); 2866 2867 return md; 2868 } 2869 2870 int dm_suspended_md(struct mapped_device *md) 2871 { 2872 return test_bit(DMF_SUSPENDED, &md->flags); 2873 } 2874 2875 static int dm_post_suspending_md(struct mapped_device *md) 2876 { 2877 return test_bit(DMF_POST_SUSPENDING, &md->flags); 2878 } 2879 2880 int dm_suspended_internally_md(struct mapped_device *md) 2881 { 2882 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2883 } 2884 2885 int dm_test_deferred_remove_flag(struct mapped_device *md) 2886 { 2887 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 2888 } 2889 2890 int dm_suspended(struct dm_target *ti) 2891 { 2892 return dm_suspended_md(ti->table->md); 2893 } 2894 EXPORT_SYMBOL_GPL(dm_suspended); 2895 2896 int dm_post_suspending(struct dm_target *ti) 2897 { 2898 return dm_post_suspending_md(ti->table->md); 2899 } 2900 EXPORT_SYMBOL_GPL(dm_post_suspending); 2901 2902 int dm_noflush_suspending(struct dm_target *ti) 2903 { 2904 return __noflush_suspending(ti->table->md); 2905 } 2906 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2907 2908 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, 2909 unsigned integrity, unsigned per_io_data_size, 2910 unsigned min_pool_size) 2911 { 2912 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 2913 unsigned int pool_size = 0; 2914 unsigned int front_pad, io_front_pad; 2915 int ret; 2916 2917 if 
(!pools) 2918 return NULL; 2919 2920 switch (type) { 2921 case DM_TYPE_BIO_BASED: 2922 case DM_TYPE_DAX_BIO_BASED: 2923 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); 2924 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; 2925 io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; 2926 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); 2927 if (ret) 2928 goto out; 2929 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) 2930 goto out; 2931 break; 2932 case DM_TYPE_REQUEST_BASED: 2933 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); 2934 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 2935 /* per_io_data_size is used for blk-mq pdu at queue allocation */ 2936 break; 2937 default: 2938 BUG(); 2939 } 2940 2941 ret = bioset_init(&pools->bs, pool_size, front_pad, 0); 2942 if (ret) 2943 goto out; 2944 2945 if (integrity && bioset_integrity_create(&pools->bs, pool_size)) 2946 goto out; 2947 2948 return pools; 2949 2950 out: 2951 dm_free_md_mempools(pools); 2952 2953 return NULL; 2954 } 2955 2956 void dm_free_md_mempools(struct dm_md_mempools *pools) 2957 { 2958 if (!pools) 2959 return; 2960 2961 bioset_exit(&pools->bs); 2962 bioset_exit(&pools->io_bs); 2963 2964 kfree(pools); 2965 } 2966 2967 struct dm_pr { 2968 u64 old_key; 2969 u64 new_key; 2970 u32 flags; 2971 bool fail_early; 2972 }; 2973 2974 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, 2975 void *data) 2976 { 2977 struct mapped_device *md = bdev->bd_disk->private_data; 2978 struct dm_table *table; 2979 struct dm_target *ti; 2980 int ret = -ENOTTY, srcu_idx; 2981 2982 table = dm_get_live_table(md, &srcu_idx); 2983 if (!table || !dm_table_get_size(table)) 2984 goto out; 2985 2986 /* We only support devices that have a single target */ 2987 if (dm_table_get_num_targets(table) != 1) 2988 goto out; 2989 ti = dm_table_get_target(table, 0); 2990 2991 ret = -EINVAL; 2992 if (!ti->type->iterate_devices) 2993 goto out; 2994 2995 ret = ti->type->iterate_devices(ti, fn, data); 2996 out: 2997 dm_put_live_table(md, srcu_idx); 2998 return ret; 2999 } 3000 3001 /* 3002 * For register / unregister we need to manually call out to every path. 
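 * dm_call_pr() walks the single target's underlying devices with
 * ->iterate_devices(); the remaining pr_ops handlers below act on one
 * path resolved through dm_prepare_ioctl() instead.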
3003 */ 3004 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, 3005 sector_t start, sector_t len, void *data) 3006 { 3007 struct dm_pr *pr = data; 3008 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; 3009 3010 if (!ops || !ops->pr_register) 3011 return -EOPNOTSUPP; 3012 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); 3013 } 3014 3015 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 3016 u32 flags) 3017 { 3018 struct dm_pr pr = { 3019 .old_key = old_key, 3020 .new_key = new_key, 3021 .flags = flags, 3022 .fail_early = true, 3023 }; 3024 int ret; 3025 3026 ret = dm_call_pr(bdev, __dm_pr_register, &pr); 3027 if (ret && new_key) { 3028 /* unregister all paths if we failed to register any path */ 3029 pr.old_key = new_key; 3030 pr.new_key = 0; 3031 pr.flags = 0; 3032 pr.fail_early = false; 3033 dm_call_pr(bdev, __dm_pr_register, &pr); 3034 } 3035 3036 return ret; 3037 } 3038 3039 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, 3040 u32 flags) 3041 { 3042 struct mapped_device *md = bdev->bd_disk->private_data; 3043 const struct pr_ops *ops; 3044 int r, srcu_idx; 3045 3046 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3047 if (r < 0) 3048 goto out; 3049 3050 ops = bdev->bd_disk->fops->pr_ops; 3051 if (ops && ops->pr_reserve) 3052 r = ops->pr_reserve(bdev, key, type, flags); 3053 else 3054 r = -EOPNOTSUPP; 3055 out: 3056 dm_unprepare_ioctl(md, srcu_idx); 3057 return r; 3058 } 3059 3060 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 3061 { 3062 struct mapped_device *md = bdev->bd_disk->private_data; 3063 const struct pr_ops *ops; 3064 int r, srcu_idx; 3065 3066 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3067 if (r < 0) 3068 goto out; 3069 3070 ops = bdev->bd_disk->fops->pr_ops; 3071 if (ops && ops->pr_release) 3072 r = ops->pr_release(bdev, key, type); 3073 else 3074 r = -EOPNOTSUPP; 3075 out: 3076 dm_unprepare_ioctl(md, srcu_idx); 3077 return r; 3078 } 3079 3080 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, 3081 enum pr_type type, bool abort) 3082 { 3083 struct mapped_device *md = bdev->bd_disk->private_data; 3084 const struct pr_ops *ops; 3085 int r, srcu_idx; 3086 3087 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3088 if (r < 0) 3089 goto out; 3090 3091 ops = bdev->bd_disk->fops->pr_ops; 3092 if (ops && ops->pr_preempt) 3093 r = ops->pr_preempt(bdev, old_key, new_key, type, abort); 3094 else 3095 r = -EOPNOTSUPP; 3096 out: 3097 dm_unprepare_ioctl(md, srcu_idx); 3098 return r; 3099 } 3100 3101 static int dm_pr_clear(struct block_device *bdev, u64 key) 3102 { 3103 struct mapped_device *md = bdev->bd_disk->private_data; 3104 const struct pr_ops *ops; 3105 int r, srcu_idx; 3106 3107 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3108 if (r < 0) 3109 goto out; 3110 3111 ops = bdev->bd_disk->fops->pr_ops; 3112 if (ops && ops->pr_clear) 3113 r = ops->pr_clear(bdev, key); 3114 else 3115 r = -EOPNOTSUPP; 3116 out: 3117 dm_unprepare_ioctl(md, srcu_idx); 3118 return r; 3119 } 3120 3121 static const struct pr_ops dm_pr_ops = { 3122 .pr_register = dm_pr_register, 3123 .pr_reserve = dm_pr_reserve, 3124 .pr_release = dm_pr_release, 3125 .pr_preempt = dm_pr_preempt, 3126 .pr_clear = dm_pr_clear, 3127 }; 3128 3129 static const struct block_device_operations dm_blk_dops = { 3130 .submit_bio = dm_submit_bio, 3131 .poll_bio = dm_poll_bio, 3132 .open = dm_blk_open, 3133 .release = dm_blk_close, 3134 .ioctl = dm_blk_ioctl, 3135 .getgeo = 
dm_blk_getgeo, 3136 .report_zones = dm_blk_report_zones, 3137 .pr_ops = &dm_pr_ops, 3138 .owner = THIS_MODULE 3139 }; 3140 3141 static const struct block_device_operations dm_rq_blk_dops = { 3142 .open = dm_blk_open, 3143 .release = dm_blk_close, 3144 .ioctl = dm_blk_ioctl, 3145 .getgeo = dm_blk_getgeo, 3146 .pr_ops = &dm_pr_ops, 3147 .owner = THIS_MODULE 3148 }; 3149 3150 static const struct dax_operations dm_dax_ops = { 3151 .direct_access = dm_dax_direct_access, 3152 .zero_page_range = dm_dax_zero_page_range, 3153 }; 3154 3155 /* 3156 * module hooks 3157 */ 3158 module_init(dm_init); 3159 module_exit(dm_exit); 3160 3161 module_param(major, uint, 0); 3162 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3163 3164 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3165 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3166 3167 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); 3168 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); 3169 3170 module_param(swap_bios, int, S_IRUGO | S_IWUSR); 3171 MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); 3172 3173 MODULE_DESCRIPTION(DM_NAME " driver"); 3174 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3175 MODULE_LICENSE("GPL"); 3176
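/*
 * Illustrative sketch (not part of the original file): how another
 * in-kernel user might take a reference on a mapped_device and bracket
 * some maintenance work with the internal suspend/resume helpers
 * exported above. "dev" and do_maintenance_work() are hypothetical
 * placeholders; the dm_* calls are the ones defined in this file.
 *
 *	struct mapped_device *md = dm_get_md(dev);
 *
 *	if (md) {
 *		dm_internal_suspend_noflush(md);	// quiesce all I/O
 *		do_maintenance_work(md);		// hypothetical callout
 *		dm_internal_resume(md);
 *		dm_put(md);				// drop the reference
 *	}
 */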