1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-core.h" 9 #include "dm-rq.h" 10 #include "dm-uevent.h" 11 #include "dm-ima.h" 12 13 #include <linux/init.h> 14 #include <linux/module.h> 15 #include <linux/mutex.h> 16 #include <linux/sched/mm.h> 17 #include <linux/sched/signal.h> 18 #include <linux/blkpg.h> 19 #include <linux/bio.h> 20 #include <linux/mempool.h> 21 #include <linux/dax.h> 22 #include <linux/slab.h> 23 #include <linux/idr.h> 24 #include <linux/uio.h> 25 #include <linux/hdreg.h> 26 #include <linux/delay.h> 27 #include <linux/wait.h> 28 #include <linux/pr.h> 29 #include <linux/refcount.h> 30 #include <linux/part_stat.h> 31 #include <linux/blk-crypto.h> 32 #include <linux/keyslot-manager.h> 33 34 #define DM_MSG_PREFIX "core" 35 36 /* 37 * Cookies are numeric values sent with CHANGE and REMOVE 38 * uevents while resuming, removing or renaming the device. 39 */ 40 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 41 #define DM_COOKIE_LENGTH 24 42 43 static const char *_name = DM_NAME; 44 45 static unsigned int major = 0; 46 static unsigned int _major = 0; 47 48 static DEFINE_IDR(_minor_idr); 49 50 static DEFINE_SPINLOCK(_minor_lock); 51 52 static void do_deferred_remove(struct work_struct *w); 53 54 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 55 56 static struct workqueue_struct *deferred_remove_workqueue; 57 58 atomic_t dm_global_event_nr = ATOMIC_INIT(0); 59 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); 60 61 void dm_issue_global_event(void) 62 { 63 atomic_inc(&dm_global_event_nr); 64 wake_up(&dm_global_eventq); 65 } 66 67 /* 68 * One of these is allocated (on-stack) per original bio. 69 */ 70 struct clone_info { 71 struct dm_table *map; 72 struct bio *bio; 73 struct dm_io *io; 74 sector_t sector; 75 unsigned sector_count; 76 }; 77 78 #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) 79 #define DM_IO_BIO_OFFSET \ 80 (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) 81 82 void *dm_per_bio_data(struct bio *bio, size_t data_size) 83 { 84 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 85 if (!tio->inside_dm_io) 86 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; 87 return (char *)bio - DM_IO_BIO_OFFSET - data_size; 88 } 89 EXPORT_SYMBOL_GPL(dm_per_bio_data); 90 91 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) 92 { 93 struct dm_io *io = (struct dm_io *)((char *)data + data_size); 94 if (io->magic == DM_IO_MAGIC) 95 return (struct bio *)((char *)io + DM_IO_BIO_OFFSET); 96 BUG_ON(io->magic != DM_TIO_MAGIC); 97 return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET); 98 } 99 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); 100 101 unsigned dm_bio_get_target_bio_nr(const struct bio *bio) 102 { 103 return container_of(bio, struct dm_target_io, clone)->target_bio_nr; 104 } 105 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); 106 107 #define MINOR_ALLOCED ((void *)-1) 108 109 #define DM_NUMA_NODE NUMA_NO_NODE 110 static int dm_numa_node = DM_NUMA_NODE; 111 112 #define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) 113 static int swap_bios = DEFAULT_SWAP_BIOS; 114 static int get_swap_bios(void) 115 { 116 int latch = READ_ONCE(swap_bios); 117 if (unlikely(latch <= 0)) 118 latch = DEFAULT_SWAP_BIOS; 119 return latch; 120 } 121 122 /* 123 * For mempools pre-allocation at the table loading time. 
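 *
 * Roughly: io_bs fronts the clone embedded in each struct dm_io (which
 * carries the first dm_target_io plus any per-bio data the target asked
 * for), while bs fronts the additional clones that only need a
 * dm_target_io; see dm_per_bio_data() and the DM_*_BIO_OFFSET macros
 * above, and alloc_io()/alloc_tio() below.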
124 */ 125 struct dm_md_mempools { 126 struct bio_set bs; 127 struct bio_set io_bs; 128 }; 129 130 struct table_device { 131 struct list_head list; 132 refcount_t count; 133 struct dm_dev dm_dev; 134 }; 135 136 /* 137 * Bio-based DM's mempools' reserved IOs set by the user. 138 */ 139 #define RESERVED_BIO_BASED_IOS 16 140 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 141 142 static int __dm_get_module_param_int(int *module_param, int min, int max) 143 { 144 int param = READ_ONCE(*module_param); 145 int modified_param = 0; 146 bool modified = true; 147 148 if (param < min) 149 modified_param = min; 150 else if (param > max) 151 modified_param = max; 152 else 153 modified = false; 154 155 if (modified) { 156 (void)cmpxchg(module_param, param, modified_param); 157 param = modified_param; 158 } 159 160 return param; 161 } 162 163 unsigned __dm_get_module_param(unsigned *module_param, 164 unsigned def, unsigned max) 165 { 166 unsigned param = READ_ONCE(*module_param); 167 unsigned modified_param = 0; 168 169 if (!param) 170 modified_param = def; 171 else if (param > max) 172 modified_param = max; 173 174 if (modified_param) { 175 (void)cmpxchg(module_param, param, modified_param); 176 param = modified_param; 177 } 178 179 return param; 180 } 181 182 unsigned dm_get_reserved_bio_based_ios(void) 183 { 184 return __dm_get_module_param(&reserved_bio_based_ios, 185 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS); 186 } 187 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 188 189 static unsigned dm_get_numa_node(void) 190 { 191 return __dm_get_module_param_int(&dm_numa_node, 192 DM_NUMA_NODE, num_online_nodes() - 1); 193 } 194 195 static int __init local_init(void) 196 { 197 int r; 198 199 r = dm_uevent_init(); 200 if (r) 201 return r; 202 203 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 204 if (!deferred_remove_workqueue) { 205 r = -ENOMEM; 206 goto out_uevent_exit; 207 } 208 209 _major = major; 210 r = register_blkdev(_major, _name); 211 if (r < 0) 212 goto out_free_workqueue; 213 214 if (!_major) 215 _major = r; 216 217 return 0; 218 219 out_free_workqueue: 220 destroy_workqueue(deferred_remove_workqueue); 221 out_uevent_exit: 222 dm_uevent_exit(); 223 224 return r; 225 } 226 227 static void local_exit(void) 228 { 229 flush_scheduled_work(); 230 destroy_workqueue(deferred_remove_workqueue); 231 232 unregister_blkdev(_major, _name); 233 dm_uevent_exit(); 234 235 _major = 0; 236 237 DMINFO("cleaned up"); 238 } 239 240 static int (*_inits[])(void) __initdata = { 241 local_init, 242 dm_target_init, 243 dm_linear_init, 244 dm_stripe_init, 245 dm_io_init, 246 dm_kcopyd_init, 247 dm_interface_init, 248 dm_statistics_init, 249 }; 250 251 static void (*_exits[])(void) = { 252 local_exit, 253 dm_target_exit, 254 dm_linear_exit, 255 dm_stripe_exit, 256 dm_io_exit, 257 dm_kcopyd_exit, 258 dm_interface_exit, 259 dm_statistics_exit, 260 }; 261 262 static int __init dm_init(void) 263 { 264 const int count = ARRAY_SIZE(_inits); 265 int r, i; 266 267 #if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) 268 DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled." 
269 " Duplicate IMA measurements will not be recorded in the IMA log."); 270 #endif 271 272 for (i = 0; i < count; i++) { 273 r = _inits[i](); 274 if (r) 275 goto bad; 276 } 277 278 return 0; 279 bad: 280 while (i--) 281 _exits[i](); 282 283 return r; 284 } 285 286 static void __exit dm_exit(void) 287 { 288 int i = ARRAY_SIZE(_exits); 289 290 while (i--) 291 _exits[i](); 292 293 /* 294 * Should be empty by this point. 295 */ 296 idr_destroy(&_minor_idr); 297 } 298 299 /* 300 * Block device functions 301 */ 302 int dm_deleting_md(struct mapped_device *md) 303 { 304 return test_bit(DMF_DELETING, &md->flags); 305 } 306 307 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 308 { 309 struct mapped_device *md; 310 311 spin_lock(&_minor_lock); 312 313 md = bdev->bd_disk->private_data; 314 if (!md) 315 goto out; 316 317 if (test_bit(DMF_FREEING, &md->flags) || 318 dm_deleting_md(md)) { 319 md = NULL; 320 goto out; 321 } 322 323 dm_get(md); 324 atomic_inc(&md->open_count); 325 out: 326 spin_unlock(&_minor_lock); 327 328 return md ? 0 : -ENXIO; 329 } 330 331 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 332 { 333 struct mapped_device *md; 334 335 spin_lock(&_minor_lock); 336 337 md = disk->private_data; 338 if (WARN_ON(!md)) 339 goto out; 340 341 if (atomic_dec_and_test(&md->open_count) && 342 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 343 queue_work(deferred_remove_workqueue, &deferred_remove_work); 344 345 dm_put(md); 346 out: 347 spin_unlock(&_minor_lock); 348 } 349 350 int dm_open_count(struct mapped_device *md) 351 { 352 return atomic_read(&md->open_count); 353 } 354 355 /* 356 * Guarantees nothing is using the device before it's deleted. 357 */ 358 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 359 { 360 int r = 0; 361 362 spin_lock(&_minor_lock); 363 364 if (dm_open_count(md)) { 365 r = -EBUSY; 366 if (mark_deferred) 367 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 368 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 369 r = -EEXIST; 370 else 371 set_bit(DMF_DELETING, &md->flags); 372 373 spin_unlock(&_minor_lock); 374 375 return r; 376 } 377 378 int dm_cancel_deferred_remove(struct mapped_device *md) 379 { 380 int r = 0; 381 382 spin_lock(&_minor_lock); 383 384 if (test_bit(DMF_DELETING, &md->flags)) 385 r = -EBUSY; 386 else 387 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 388 389 spin_unlock(&_minor_lock); 390 391 return r; 392 } 393 394 static void do_deferred_remove(struct work_struct *w) 395 { 396 dm_deferred_remove(); 397 } 398 399 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 400 { 401 struct mapped_device *md = bdev->bd_disk->private_data; 402 403 return dm_get_geometry(md, geo); 404 } 405 406 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, 407 struct block_device **bdev) 408 { 409 struct dm_target *tgt; 410 struct dm_table *map; 411 int r; 412 413 retry: 414 r = -ENOTTY; 415 map = dm_get_live_table(md, srcu_idx); 416 if (!map || !dm_table_get_size(map)) 417 return r; 418 419 /* We only support devices that have a single target */ 420 if (dm_table_get_num_targets(map) != 1) 421 return r; 422 423 tgt = dm_table_get_target(map, 0); 424 if (!tgt->type->prepare_ioctl) 425 return r; 426 427 if (dm_suspended_md(md)) 428 return -EAGAIN; 429 430 r = tgt->type->prepare_ioctl(tgt, bdev); 431 if (r == -ENOTCONN && !fatal_signal_pending(current)) { 432 dm_put_live_table(md, *srcu_idx); 433 msleep(10); 434 goto retry; 435 } 436 437 return r; 438 } 439 
440 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) 441 { 442 dm_put_live_table(md, srcu_idx); 443 } 444 445 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 446 unsigned int cmd, unsigned long arg) 447 { 448 struct mapped_device *md = bdev->bd_disk->private_data; 449 int r, srcu_idx; 450 451 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 452 if (r < 0) 453 goto out; 454 455 if (r > 0) { 456 /* 457 * Target determined this ioctl is being issued against a 458 * subset of the parent bdev; require extra privileges. 459 */ 460 if (!capable(CAP_SYS_RAWIO)) { 461 DMDEBUG_LIMIT( 462 "%s: sending ioctl %x to DM device without required privilege.", 463 current->comm, cmd); 464 r = -ENOIOCTLCMD; 465 goto out; 466 } 467 } 468 469 if (!bdev->bd_disk->fops->ioctl) 470 r = -ENOTTY; 471 else 472 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); 473 out: 474 dm_unprepare_ioctl(md, srcu_idx); 475 return r; 476 } 477 478 u64 dm_start_time_ns_from_clone(struct bio *bio) 479 { 480 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 481 struct dm_io *io = tio->io; 482 483 return jiffies_to_nsecs(io->start_time); 484 } 485 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); 486 487 static void start_io_acct(struct dm_io *io) 488 { 489 struct mapped_device *md = io->md; 490 struct bio *bio = io->orig_bio; 491 492 io->start_time = bio_start_io_acct(bio); 493 if (unlikely(dm_stats_used(&md->stats))) 494 dm_stats_account_io(&md->stats, bio_data_dir(bio), 495 bio->bi_iter.bi_sector, bio_sectors(bio), 496 false, 0, &io->stats_aux); 497 } 498 499 static void end_io_acct(struct dm_io *io) 500 { 501 struct mapped_device *md = io->md; 502 struct bio *bio = io->orig_bio; 503 unsigned long duration = jiffies - io->start_time; 504 505 bio_end_io_acct(bio, io->start_time); 506 507 if (unlikely(dm_stats_used(&md->stats))) 508 dm_stats_account_io(&md->stats, bio_data_dir(bio), 509 bio->bi_iter.bi_sector, bio_sectors(bio), 510 true, duration, &io->stats_aux); 511 512 /* nudge anyone waiting on suspend queue */ 513 if (unlikely(wq_has_sleeper(&md->wait))) 514 wake_up(&md->wait); 515 } 516 517 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) 518 { 519 struct dm_io *io; 520 struct dm_target_io *tio; 521 struct bio *clone; 522 523 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs); 524 if (!clone) 525 return NULL; 526 527 tio = container_of(clone, struct dm_target_io, clone); 528 tio->inside_dm_io = true; 529 tio->io = NULL; 530 531 io = container_of(tio, struct dm_io, tio); 532 io->magic = DM_IO_MAGIC; 533 io->status = 0; 534 atomic_set(&io->io_count, 1); 535 io->orig_bio = bio; 536 io->md = md; 537 spin_lock_init(&io->endio_lock); 538 539 start_io_acct(io); 540 541 return io; 542 } 543 544 static void free_io(struct mapped_device *md, struct dm_io *io) 545 { 546 bio_put(&io->tio.clone); 547 } 548 549 static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti, 550 unsigned target_bio_nr, gfp_t gfp_mask) 551 { 552 struct dm_target_io *tio; 553 554 if (!ci->io->tio.io) { 555 /* the dm_target_io embedded in ci->io is available */ 556 tio = &ci->io->tio; 557 } else { 558 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs); 559 if (!clone) 560 return NULL; 561 562 tio = container_of(clone, struct dm_target_io, clone); 563 tio->inside_dm_io = false; 564 } 565 566 tio->magic = DM_TIO_MAGIC; 567 tio->io = ci->io; 568 tio->ti = ti; 569 tio->target_bio_nr = target_bio_nr; 570 571 return tio; 572 } 573 574 static void 
free_tio(struct dm_target_io *tio) 575 { 576 if (tio->inside_dm_io) 577 return; 578 bio_put(&tio->clone); 579 } 580 581 /* 582 * Add the bio to the list of deferred io. 583 */ 584 static void queue_io(struct mapped_device *md, struct bio *bio) 585 { 586 unsigned long flags; 587 588 spin_lock_irqsave(&md->deferred_lock, flags); 589 bio_list_add(&md->deferred, bio); 590 spin_unlock_irqrestore(&md->deferred_lock, flags); 591 queue_work(md->wq, &md->work); 592 } 593 594 /* 595 * Everyone (including functions in this file), should use this 596 * function to access the md->map field, and make sure they call 597 * dm_put_live_table() when finished. 598 */ 599 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 600 { 601 *srcu_idx = srcu_read_lock(&md->io_barrier); 602 603 return srcu_dereference(md->map, &md->io_barrier); 604 } 605 606 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 607 { 608 srcu_read_unlock(&md->io_barrier, srcu_idx); 609 } 610 611 void dm_sync_table(struct mapped_device *md) 612 { 613 synchronize_srcu(&md->io_barrier); 614 synchronize_rcu_expedited(); 615 } 616 617 /* 618 * A fast alternative to dm_get_live_table/dm_put_live_table. 619 * The caller must not block between these two functions. 620 */ 621 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 622 { 623 rcu_read_lock(); 624 return rcu_dereference(md->map); 625 } 626 627 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 628 { 629 rcu_read_unlock(); 630 } 631 632 static char *_dm_claim_ptr = "I belong to device-mapper"; 633 634 /* 635 * Open a table device so we can use it as a map destination. 636 */ 637 static int open_table_device(struct table_device *td, dev_t dev, 638 struct mapped_device *md) 639 { 640 struct block_device *bdev; 641 642 int r; 643 644 BUG_ON(td->dm_dev.bdev); 645 646 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); 647 if (IS_ERR(bdev)) 648 return PTR_ERR(bdev); 649 650 r = bd_link_disk_holder(bdev, dm_disk(md)); 651 if (r) { 652 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 653 return r; 654 } 655 656 td->dm_dev.bdev = bdev; 657 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev); 658 return 0; 659 } 660 661 /* 662 * Close a table device that we've been using. 
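 * This undoes open_table_device(): the holder link is removed, the
 * FMODE_EXCL bdev reference is dropped and the DAX device reference is
 * put; its caller, dm_put_table_device(), holds md->table_devices_lock.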
663 */ 664 static void close_table_device(struct table_device *td, struct mapped_device *md) 665 { 666 if (!td->dm_dev.bdev) 667 return; 668 669 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 670 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 671 put_dax(td->dm_dev.dax_dev); 672 td->dm_dev.bdev = NULL; 673 td->dm_dev.dax_dev = NULL; 674 } 675 676 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 677 fmode_t mode) 678 { 679 struct table_device *td; 680 681 list_for_each_entry(td, l, list) 682 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 683 return td; 684 685 return NULL; 686 } 687 688 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 689 struct dm_dev **result) 690 { 691 int r; 692 struct table_device *td; 693 694 mutex_lock(&md->table_devices_lock); 695 td = find_table_device(&md->table_devices, dev, mode); 696 if (!td) { 697 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); 698 if (!td) { 699 mutex_unlock(&md->table_devices_lock); 700 return -ENOMEM; 701 } 702 703 td->dm_dev.mode = mode; 704 td->dm_dev.bdev = NULL; 705 706 if ((r = open_table_device(td, dev, md))) { 707 mutex_unlock(&md->table_devices_lock); 708 kfree(td); 709 return r; 710 } 711 712 format_dev_t(td->dm_dev.name, dev); 713 714 refcount_set(&td->count, 1); 715 list_add(&td->list, &md->table_devices); 716 } else { 717 refcount_inc(&td->count); 718 } 719 mutex_unlock(&md->table_devices_lock); 720 721 *result = &td->dm_dev; 722 return 0; 723 } 724 725 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 726 { 727 struct table_device *td = container_of(d, struct table_device, dm_dev); 728 729 mutex_lock(&md->table_devices_lock); 730 if (refcount_dec_and_test(&td->count)) { 731 close_table_device(td, md); 732 list_del(&td->list); 733 kfree(td); 734 } 735 mutex_unlock(&md->table_devices_lock); 736 } 737 738 static void free_table_devices(struct list_head *devices) 739 { 740 struct list_head *tmp, *next; 741 742 list_for_each_safe(tmp, next, devices) { 743 struct table_device *td = list_entry(tmp, struct table_device, list); 744 745 DMWARN("dm_destroy: %s still exists with %d references", 746 td->dm_dev.name, refcount_read(&td->count)); 747 kfree(td); 748 } 749 } 750 751 /* 752 * Get the geometry associated with a dm device 753 */ 754 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 755 { 756 *geo = md->geometry; 757 758 return 0; 759 } 760 761 /* 762 * Set the geometry of a device. 763 */ 764 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 765 { 766 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 767 768 if (geo->start > sz) { 769 DMWARN("Start sector is beyond the geometry limits."); 770 return -EINVAL; 771 } 772 773 md->geometry = *geo; 774 775 return 0; 776 } 777 778 static int __noflush_suspending(struct mapped_device *md) 779 { 780 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 781 } 782 783 /* 784 * Decrements the number of outstanding ios that a bio has been 785 * cloned into, completing the original io if necc. 
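 * The matching increment is dm_io_inc_pending(), issued by __map_bio()
 * for every clone that gets mapped; alloc_io() starts io_count at 1 and
 * __split_and_process_bio() drops that extra reference once all clones
 * of the original bio have been issued.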
786 */ 787 void dm_io_dec_pending(struct dm_io *io, blk_status_t error) 788 { 789 unsigned long flags; 790 blk_status_t io_error; 791 struct bio *bio; 792 struct mapped_device *md = io->md; 793 794 /* Push-back supersedes any I/O errors */ 795 if (unlikely(error)) { 796 spin_lock_irqsave(&io->endio_lock, flags); 797 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md))) 798 io->status = error; 799 spin_unlock_irqrestore(&io->endio_lock, flags); 800 } 801 802 if (atomic_dec_and_test(&io->io_count)) { 803 bio = io->orig_bio; 804 if (io->status == BLK_STS_DM_REQUEUE) { 805 /* 806 * Target requested pushing back the I/O. 807 */ 808 spin_lock_irqsave(&md->deferred_lock, flags); 809 if (__noflush_suspending(md) && 810 !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { 811 /* NOTE early return due to BLK_STS_DM_REQUEUE below */ 812 bio_list_add_head(&md->deferred, bio); 813 } else { 814 /* 815 * noflush suspend was interrupted or this is 816 * a write to a zoned target. 817 */ 818 io->status = BLK_STS_IOERR; 819 } 820 spin_unlock_irqrestore(&md->deferred_lock, flags); 821 } 822 823 io_error = io->status; 824 end_io_acct(io); 825 free_io(md, io); 826 827 if (io_error == BLK_STS_DM_REQUEUE) 828 return; 829 830 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { 831 /* 832 * Preflush done for flush with data, reissue 833 * without REQ_PREFLUSH. 834 */ 835 bio->bi_opf &= ~REQ_PREFLUSH; 836 queue_io(md, bio); 837 } else { 838 /* done with normal IO or empty flush */ 839 if (io_error) 840 bio->bi_status = io_error; 841 bio_endio(bio); 842 } 843 } 844 } 845 846 void disable_discard(struct mapped_device *md) 847 { 848 struct queue_limits *limits = dm_get_queue_limits(md); 849 850 /* device doesn't really support DISCARD, disable it */ 851 limits->max_discard_sectors = 0; 852 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); 853 } 854 855 void disable_write_same(struct mapped_device *md) 856 { 857 struct queue_limits *limits = dm_get_queue_limits(md); 858 859 /* device doesn't really support WRITE SAME, disable it */ 860 limits->max_write_same_sectors = 0; 861 } 862 863 void disable_write_zeroes(struct mapped_device *md) 864 { 865 struct queue_limits *limits = dm_get_queue_limits(md); 866 867 /* device doesn't really support WRITE ZEROES, disable it */ 868 limits->max_write_zeroes_sectors = 0; 869 } 870 871 static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) 872 { 873 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); 874 } 875 876 static void clone_endio(struct bio *bio) 877 { 878 blk_status_t error = bio->bi_status; 879 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 880 struct dm_io *io = tio->io; 881 struct mapped_device *md = tio->io->md; 882 dm_endio_fn endio = tio->ti->type->end_io; 883 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 884 885 if (unlikely(error == BLK_STS_TARGET)) { 886 if (bio_op(bio) == REQ_OP_DISCARD && 887 !q->limits.max_discard_sectors) 888 disable_discard(md); 889 else if (bio_op(bio) == REQ_OP_WRITE_SAME && 890 !q->limits.max_write_same_sectors) 891 disable_write_same(md); 892 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && 893 !q->limits.max_write_zeroes_sectors) 894 disable_write_zeroes(md); 895 } 896 897 if (blk_queue_is_zoned(q)) 898 dm_zone_endio(io, bio); 899 900 if (endio) { 901 int r = endio(tio->ti, bio, &error); 902 switch (r) { 903 case DM_ENDIO_REQUEUE: 904 /* 905 * Requeuing writes to a sequential zone of a zoned 906 * target will break the sequential write pattern: 
907 * fail such IO. 908 */ 909 if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) 910 error = BLK_STS_IOERR; 911 else 912 error = BLK_STS_DM_REQUEUE; 913 fallthrough; 914 case DM_ENDIO_DONE: 915 break; 916 case DM_ENDIO_INCOMPLETE: 917 /* The target will handle the io */ 918 return; 919 default: 920 DMWARN("unimplemented target endio return value: %d", r); 921 BUG(); 922 } 923 } 924 925 if (unlikely(swap_bios_limit(tio->ti, bio))) { 926 struct mapped_device *md = io->md; 927 up(&md->swap_bios_semaphore); 928 } 929 930 free_tio(tio); 931 dm_io_dec_pending(io, error); 932 } 933 934 /* 935 * Return maximum size of I/O possible at the supplied sector up to the current 936 * target boundary. 937 */ 938 static inline sector_t max_io_len_target_boundary(struct dm_target *ti, 939 sector_t target_offset) 940 { 941 return ti->len - target_offset; 942 } 943 944 static sector_t max_io_len(struct dm_target *ti, sector_t sector) 945 { 946 sector_t target_offset = dm_target_offset(ti, sector); 947 sector_t len = max_io_len_target_boundary(ti, target_offset); 948 sector_t max_len; 949 950 /* 951 * Does the target need to split IO even further? 952 * - varied (per target) IO splitting is a tenet of DM; this 953 * explains why stacked chunk_sectors based splitting via 954 * blk_max_size_offset() isn't possible here. So pass in 955 * ti->max_io_len to override stacked chunk_sectors. 956 */ 957 if (ti->max_io_len) { 958 max_len = blk_max_size_offset(ti->table->md->queue, 959 target_offset, ti->max_io_len); 960 if (len > max_len) 961 len = max_len; 962 } 963 964 return len; 965 } 966 967 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 968 { 969 if (len > UINT_MAX) { 970 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 971 (unsigned long long)len, UINT_MAX); 972 ti->error = "Maximum size of target IO is too large"; 973 return -EINVAL; 974 } 975 976 ti->max_io_len = (uint32_t) len; 977 978 return 0; 979 } 980 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 981 982 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, 983 sector_t sector, int *srcu_idx) 984 __acquires(md->io_barrier) 985 { 986 struct dm_table *map; 987 struct dm_target *ti; 988 989 map = dm_get_live_table(md, srcu_idx); 990 if (!map) 991 return NULL; 992 993 ti = dm_table_find_target(map, sector); 994 if (!ti) 995 return NULL; 996 997 return ti; 998 } 999 1000 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 1001 long nr_pages, void **kaddr, pfn_t *pfn) 1002 { 1003 struct mapped_device *md = dax_get_private(dax_dev); 1004 sector_t sector = pgoff * PAGE_SECTORS; 1005 struct dm_target *ti; 1006 long len, ret = -EIO; 1007 int srcu_idx; 1008 1009 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1010 1011 if (!ti) 1012 goto out; 1013 if (!ti->type->direct_access) 1014 goto out; 1015 len = max_io_len(ti, sector) / PAGE_SECTORS; 1016 if (len < 1) 1017 goto out; 1018 nr_pages = min(len, nr_pages); 1019 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); 1020 1021 out: 1022 dm_put_live_table(md, srcu_idx); 1023 1024 return ret; 1025 } 1026 1027 static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev, 1028 int blocksize, sector_t start, sector_t len) 1029 { 1030 struct mapped_device *md = dax_get_private(dax_dev); 1031 struct dm_table *map; 1032 bool ret = false; 1033 int srcu_idx; 1034 1035 map = dm_get_live_table(md, &srcu_idx); 1036 if (!map) 1037 goto out; 1038 1039 ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize); 
1040 1041 out: 1042 dm_put_live_table(md, srcu_idx); 1043 1044 return ret; 1045 } 1046 1047 static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, 1048 void *addr, size_t bytes, struct iov_iter *i) 1049 { 1050 struct mapped_device *md = dax_get_private(dax_dev); 1051 sector_t sector = pgoff * PAGE_SECTORS; 1052 struct dm_target *ti; 1053 long ret = 0; 1054 int srcu_idx; 1055 1056 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1057 1058 if (!ti) 1059 goto out; 1060 if (!ti->type->dax_copy_from_iter) { 1061 ret = copy_from_iter(addr, bytes, i); 1062 goto out; 1063 } 1064 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i); 1065 out: 1066 dm_put_live_table(md, srcu_idx); 1067 1068 return ret; 1069 } 1070 1071 static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, 1072 void *addr, size_t bytes, struct iov_iter *i) 1073 { 1074 struct mapped_device *md = dax_get_private(dax_dev); 1075 sector_t sector = pgoff * PAGE_SECTORS; 1076 struct dm_target *ti; 1077 long ret = 0; 1078 int srcu_idx; 1079 1080 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1081 1082 if (!ti) 1083 goto out; 1084 if (!ti->type->dax_copy_to_iter) { 1085 ret = copy_to_iter(addr, bytes, i); 1086 goto out; 1087 } 1088 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i); 1089 out: 1090 dm_put_live_table(md, srcu_idx); 1091 1092 return ret; 1093 } 1094 1095 static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, 1096 size_t nr_pages) 1097 { 1098 struct mapped_device *md = dax_get_private(dax_dev); 1099 sector_t sector = pgoff * PAGE_SECTORS; 1100 struct dm_target *ti; 1101 int ret = -EIO; 1102 int srcu_idx; 1103 1104 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1105 1106 if (!ti) 1107 goto out; 1108 if (WARN_ON(!ti->type->dax_zero_page_range)) { 1109 /* 1110 * ->zero_page_range() is mandatory dax operation. If we are 1111 * here, something is wrong. 1112 */ 1113 goto out; 1114 } 1115 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); 1116 out: 1117 dm_put_live_table(md, srcu_idx); 1118 1119 return ret; 1120 } 1121 1122 /* 1123 * A target may call dm_accept_partial_bio only from the map routine. It is 1124 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management 1125 * operations and REQ_OP_ZONE_APPEND (zone append writes). 1126 * 1127 * dm_accept_partial_bio informs the dm that the target only wants to process 1128 * additional n_sectors sectors of the bio and the rest of the data should be 1129 * sent in a next bio. 1130 * 1131 * A diagram that explains the arithmetics: 1132 * +--------------------+---------------+-------+ 1133 * | 1 | 2 | 3 | 1134 * +--------------------+---------------+-------+ 1135 * 1136 * <-------------- *tio->len_ptr ---------------> 1137 * <------- bi_size -------> 1138 * <-- n_sectors --> 1139 * 1140 * Region 1 was already iterated over with bio_advance or similar function. 1141 * (it may be empty if the target doesn't use bio_advance) 1142 * Region 2 is the remaining bio size that the target wants to process. 1143 * (it may be empty if region 1 is non-empty, although there is no reason 1144 * to make it empty) 1145 * The target requires that region 3 is to be sent in the next bio. 1146 * 1147 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1148 * the partially processed part (the sum of regions 1+2) must be the same for all 1149 * copies of the bio. 
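 *
 * Illustrative use from a target's ->map() method (a sketch only, not
 * taken from an in-tree target): a target that must not cross some
 * internal boundary could do
 *
 *	max = boundary - dm_target_offset(ti, bio->bi_iter.bi_sector);
 *	if (bio_sectors(bio) > max)
 *		dm_accept_partial_bio(bio, max);
 *
 * remap the first max sectors as usual, and DM will resubmit the
 * remainder (region 3 above) as a new bio.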
1150 */ 1151 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1152 { 1153 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1154 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1155 1156 BUG_ON(bio->bi_opf & REQ_PREFLUSH); 1157 BUG_ON(op_is_zone_mgmt(bio_op(bio))); 1158 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); 1159 BUG_ON(bi_size > *tio->len_ptr); 1160 BUG_ON(n_sectors > bi_size); 1161 1162 *tio->len_ptr -= bi_size - n_sectors; 1163 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1164 } 1165 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1166 1167 static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) 1168 { 1169 mutex_lock(&md->swap_bios_lock); 1170 while (latch < md->swap_bios) { 1171 cond_resched(); 1172 down(&md->swap_bios_semaphore); 1173 md->swap_bios--; 1174 } 1175 while (latch > md->swap_bios) { 1176 cond_resched(); 1177 up(&md->swap_bios_semaphore); 1178 md->swap_bios++; 1179 } 1180 mutex_unlock(&md->swap_bios_lock); 1181 } 1182 1183 static blk_qc_t __map_bio(struct dm_target_io *tio) 1184 { 1185 int r; 1186 sector_t sector; 1187 struct bio *clone = &tio->clone; 1188 struct dm_io *io = tio->io; 1189 struct dm_target *ti = tio->ti; 1190 blk_qc_t ret = BLK_QC_T_NONE; 1191 1192 clone->bi_end_io = clone_endio; 1193 1194 /* 1195 * Map the clone. If r == 0 we don't need to do 1196 * anything, the target has assumed ownership of 1197 * this io. 1198 */ 1199 dm_io_inc_pending(io); 1200 sector = clone->bi_iter.bi_sector; 1201 1202 if (unlikely(swap_bios_limit(ti, clone))) { 1203 struct mapped_device *md = io->md; 1204 int latch = get_swap_bios(); 1205 if (unlikely(latch != md->swap_bios)) 1206 __set_swap_bios_limit(md, latch); 1207 down(&md->swap_bios_semaphore); 1208 } 1209 1210 /* 1211 * Check if the IO needs a special mapping due to zone append emulation 1212 * on zoned target. In this case, dm_zone_map_bio() calls the target 1213 * map operation. 1214 */ 1215 if (dm_emulate_zone_append(io->md)) 1216 r = dm_zone_map_bio(tio); 1217 else 1218 r = ti->type->map(ti, clone); 1219 1220 switch (r) { 1221 case DM_MAPIO_SUBMITTED: 1222 break; 1223 case DM_MAPIO_REMAPPED: 1224 /* the bio has been remapped so dispatch it */ 1225 trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector); 1226 ret = submit_bio_noacct(clone); 1227 break; 1228 case DM_MAPIO_KILL: 1229 if (unlikely(swap_bios_limit(ti, clone))) { 1230 struct mapped_device *md = io->md; 1231 up(&md->swap_bios_semaphore); 1232 } 1233 free_tio(tio); 1234 dm_io_dec_pending(io, BLK_STS_IOERR); 1235 break; 1236 case DM_MAPIO_REQUEUE: 1237 if (unlikely(swap_bios_limit(ti, clone))) { 1238 struct mapped_device *md = io->md; 1239 up(&md->swap_bios_semaphore); 1240 } 1241 free_tio(tio); 1242 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); 1243 break; 1244 default: 1245 DMWARN("unimplemented target map return value: %d", r); 1246 BUG(); 1247 } 1248 1249 return ret; 1250 } 1251 1252 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1253 { 1254 bio->bi_iter.bi_sector = sector; 1255 bio->bi_iter.bi_size = to_bytes(len); 1256 } 1257 1258 /* 1259 * Creates a bio that consists of range of complete bvecs. 
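 * On top of the fast clone this copies the encryption context and, where
 * present, the integrity payload, then trims the clone to the requested
 * [sector, sector + len) range; errors are handed back so that
 * __clone_and_map_data_bio() can free the tio.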
1260 */ 1261 static int clone_bio(struct dm_target_io *tio, struct bio *bio, 1262 sector_t sector, unsigned len) 1263 { 1264 struct bio *clone = &tio->clone; 1265 int r; 1266 1267 __bio_clone_fast(clone, bio); 1268 1269 r = bio_crypt_clone(clone, bio, GFP_NOIO); 1270 if (r < 0) 1271 return r; 1272 1273 if (bio_integrity(bio)) { 1274 if (unlikely(!dm_target_has_integrity(tio->ti->type) && 1275 !dm_target_passes_integrity(tio->ti->type))) { 1276 DMWARN("%s: the target %s doesn't support integrity data.", 1277 dm_device_name(tio->io->md), 1278 tio->ti->type->name); 1279 return -EIO; 1280 } 1281 1282 r = bio_integrity_clone(clone, bio, GFP_NOIO); 1283 if (r < 0) 1284 return r; 1285 } 1286 1287 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1288 clone->bi_iter.bi_size = to_bytes(len); 1289 1290 if (bio_integrity(bio)) 1291 bio_integrity_trim(clone); 1292 1293 return 0; 1294 } 1295 1296 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, 1297 struct dm_target *ti, unsigned num_bios) 1298 { 1299 struct dm_target_io *tio; 1300 int try; 1301 1302 if (!num_bios) 1303 return; 1304 1305 if (num_bios == 1) { 1306 tio = alloc_tio(ci, ti, 0, GFP_NOIO); 1307 bio_list_add(blist, &tio->clone); 1308 return; 1309 } 1310 1311 for (try = 0; try < 2; try++) { 1312 int bio_nr; 1313 struct bio *bio; 1314 1315 if (try) 1316 mutex_lock(&ci->io->md->table_devices_lock); 1317 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { 1318 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT); 1319 if (!tio) 1320 break; 1321 1322 bio_list_add(blist, &tio->clone); 1323 } 1324 if (try) 1325 mutex_unlock(&ci->io->md->table_devices_lock); 1326 if (bio_nr == num_bios) 1327 return; 1328 1329 while ((bio = bio_list_pop(blist))) { 1330 tio = container_of(bio, struct dm_target_io, clone); 1331 free_tio(tio); 1332 } 1333 } 1334 } 1335 1336 static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci, 1337 struct dm_target_io *tio, unsigned *len) 1338 { 1339 struct bio *clone = &tio->clone; 1340 1341 tio->len_ptr = len; 1342 1343 __bio_clone_fast(clone, ci->bio); 1344 if (len) 1345 bio_setup_sector(clone, ci->sector, *len); 1346 1347 return __map_bio(tio); 1348 } 1349 1350 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1351 unsigned num_bios, unsigned *len) 1352 { 1353 struct bio_list blist = BIO_EMPTY_LIST; 1354 struct bio *bio; 1355 struct dm_target_io *tio; 1356 1357 alloc_multiple_bios(&blist, ci, ti, num_bios); 1358 1359 while ((bio = bio_list_pop(&blist))) { 1360 tio = container_of(bio, struct dm_target_io, clone); 1361 (void) __clone_and_map_simple_bio(ci, tio, len); 1362 } 1363 } 1364 1365 static int __send_empty_flush(struct clone_info *ci) 1366 { 1367 unsigned target_nr = 0; 1368 struct dm_target *ti; 1369 struct bio flush_bio; 1370 1371 /* 1372 * Use an on-stack bio for this, it's safe since we don't 1373 * need to reference it after submit. It's just used as 1374 * the basis for the clone(s). 
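 * Each target in the live table gets ti->num_flush_bios copies of this
 * empty flush via __send_duplicate_bios(), and bio_uninit() runs once
 * all of them have been mapped.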
1375 */ 1376 bio_init(&flush_bio, NULL, 0); 1377 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; 1378 bio_set_dev(&flush_bio, ci->io->md->disk->part0); 1379 1380 ci->bio = &flush_bio; 1381 ci->sector_count = 0; 1382 1383 BUG_ON(bio_has_data(ci->bio)); 1384 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1385 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1386 1387 bio_uninit(ci->bio); 1388 return 0; 1389 } 1390 1391 static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1392 sector_t sector, unsigned *len) 1393 { 1394 struct bio *bio = ci->bio; 1395 struct dm_target_io *tio; 1396 int r; 1397 1398 tio = alloc_tio(ci, ti, 0, GFP_NOIO); 1399 tio->len_ptr = len; 1400 r = clone_bio(tio, bio, sector, *len); 1401 if (r < 0) { 1402 free_tio(tio); 1403 return r; 1404 } 1405 (void) __map_bio(tio); 1406 1407 return 0; 1408 } 1409 1410 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, 1411 unsigned num_bios) 1412 { 1413 unsigned len; 1414 1415 /* 1416 * Even though the device advertised support for this type of 1417 * request, that does not mean every target supports it, and 1418 * reconfiguration might also have changed that since the 1419 * check was performed. 1420 */ 1421 if (!num_bios) 1422 return -EOPNOTSUPP; 1423 1424 len = min_t(sector_t, ci->sector_count, 1425 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); 1426 1427 __send_duplicate_bios(ci, ti, num_bios, &len); 1428 1429 ci->sector += len; 1430 ci->sector_count -= len; 1431 1432 return 0; 1433 } 1434 1435 static bool is_abnormal_io(struct bio *bio) 1436 { 1437 bool r = false; 1438 1439 switch (bio_op(bio)) { 1440 case REQ_OP_DISCARD: 1441 case REQ_OP_SECURE_ERASE: 1442 case REQ_OP_WRITE_SAME: 1443 case REQ_OP_WRITE_ZEROES: 1444 r = true; 1445 break; 1446 } 1447 1448 return r; 1449 } 1450 1451 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, 1452 int *result) 1453 { 1454 struct bio *bio = ci->bio; 1455 unsigned num_bios = 0; 1456 1457 switch (bio_op(bio)) { 1458 case REQ_OP_DISCARD: 1459 num_bios = ti->num_discard_bios; 1460 break; 1461 case REQ_OP_SECURE_ERASE: 1462 num_bios = ti->num_secure_erase_bios; 1463 break; 1464 case REQ_OP_WRITE_SAME: 1465 num_bios = ti->num_write_same_bios; 1466 break; 1467 case REQ_OP_WRITE_ZEROES: 1468 num_bios = ti->num_write_zeroes_bios; 1469 break; 1470 default: 1471 return false; 1472 } 1473 1474 *result = __send_changing_extent_only(ci, ti, num_bios); 1475 return true; 1476 } 1477 1478 /* 1479 * Select the correct strategy for processing a non-flush bio. 
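 * Abnormal requests (discard, secure erase, write same/zeroes) are
 * fanned out through __send_changing_extent_only(); everything else is
 * cloned with __clone_and_map_data_bio(), bounded by max_io_len(), and
 * any remainder left in ci->sector_count is split off and resubmitted
 * by __split_and_process_bio().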
1480 */ 1481 static int __split_and_process_non_flush(struct clone_info *ci) 1482 { 1483 struct dm_target *ti; 1484 unsigned len; 1485 int r; 1486 1487 ti = dm_table_find_target(ci->map, ci->sector); 1488 if (!ti) 1489 return -EIO; 1490 1491 if (__process_abnormal_io(ci, ti, &r)) 1492 return r; 1493 1494 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); 1495 1496 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1497 if (r < 0) 1498 return r; 1499 1500 ci->sector += len; 1501 ci->sector_count -= len; 1502 1503 return 0; 1504 } 1505 1506 static void init_clone_info(struct clone_info *ci, struct mapped_device *md, 1507 struct dm_table *map, struct bio *bio) 1508 { 1509 ci->map = map; 1510 ci->io = alloc_io(md, bio); 1511 ci->sector = bio->bi_iter.bi_sector; 1512 } 1513 1514 #define __dm_part_stat_sub(part, field, subnd) \ 1515 (part_stat_get(part, field) -= (subnd)) 1516 1517 /* 1518 * Entry point to split a bio into clones and submit them to the targets. 1519 */ 1520 static blk_qc_t __split_and_process_bio(struct mapped_device *md, 1521 struct dm_table *map, struct bio *bio) 1522 { 1523 struct clone_info ci; 1524 blk_qc_t ret = BLK_QC_T_NONE; 1525 int error = 0; 1526 1527 init_clone_info(&ci, md, map, bio); 1528 1529 if (bio->bi_opf & REQ_PREFLUSH) { 1530 error = __send_empty_flush(&ci); 1531 /* dm_io_dec_pending submits any data associated with flush */ 1532 } else if (op_is_zone_mgmt(bio_op(bio))) { 1533 ci.bio = bio; 1534 ci.sector_count = 0; 1535 error = __split_and_process_non_flush(&ci); 1536 } else { 1537 ci.bio = bio; 1538 ci.sector_count = bio_sectors(bio); 1539 error = __split_and_process_non_flush(&ci); 1540 if (ci.sector_count && !error) { 1541 /* 1542 * Remainder must be passed to submit_bio_noacct() 1543 * so that it gets handled *after* bios already submitted 1544 * have been completely processed. 1545 * We take a clone of the original to store in 1546 * ci.io->orig_bio to be used by end_io_acct() and 1547 * for dec_pending to use for completion handling. 1548 */ 1549 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count, 1550 GFP_NOIO, &md->queue->bio_split); 1551 ci.io->orig_bio = b; 1552 1553 /* 1554 * Adjust IO stats for each split, otherwise upon queue 1555 * reentry there will be redundant IO accounting. 
1556 * NOTE: this is a stop-gap fix, a proper fix involves 1557 * significant refactoring of DM core's bio splitting 1558 * (by eliminating DM's splitting and just using bio_split) 1559 */ 1560 part_stat_lock(); 1561 __dm_part_stat_sub(dm_disk(md)->part0, 1562 sectors[op_stat_group(bio_op(bio))], ci.sector_count); 1563 part_stat_unlock(); 1564 1565 bio_chain(b, bio); 1566 trace_block_split(b, bio->bi_iter.bi_sector); 1567 ret = submit_bio_noacct(bio); 1568 } 1569 } 1570 1571 /* drop the extra reference count */ 1572 dm_io_dec_pending(ci.io, errno_to_blk_status(error)); 1573 return ret; 1574 } 1575 1576 static blk_qc_t dm_submit_bio(struct bio *bio) 1577 { 1578 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; 1579 blk_qc_t ret = BLK_QC_T_NONE; 1580 int srcu_idx; 1581 struct dm_table *map; 1582 1583 map = dm_get_live_table(md, &srcu_idx); 1584 if (unlikely(!map)) { 1585 DMERR_LIMIT("%s: mapping table unavailable, erroring io", 1586 dm_device_name(md)); 1587 bio_io_error(bio); 1588 goto out; 1589 } 1590 1591 /* If suspended, queue this IO for later */ 1592 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1593 if (bio->bi_opf & REQ_NOWAIT) 1594 bio_wouldblock_error(bio); 1595 else if (bio->bi_opf & REQ_RAHEAD) 1596 bio_io_error(bio); 1597 else 1598 queue_io(md, bio); 1599 goto out; 1600 } 1601 1602 /* 1603 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) 1604 * otherwise associated queue_limits won't be imposed. 1605 */ 1606 if (is_abnormal_io(bio)) 1607 blk_queue_split(&bio); 1608 1609 ret = __split_and_process_bio(md, map, bio); 1610 out: 1611 dm_put_live_table(md, srcu_idx); 1612 return ret; 1613 } 1614 1615 /*----------------------------------------------------------------- 1616 * An IDR is used to keep track of allocated minor numbers. 1617 *---------------------------------------------------------------*/ 1618 static void free_minor(int minor) 1619 { 1620 spin_lock(&_minor_lock); 1621 idr_remove(&_minor_idr, minor); 1622 spin_unlock(&_minor_lock); 1623 } 1624 1625 /* 1626 * See if the device with a specific minor # is free. 1627 */ 1628 static int specific_minor(int minor) 1629 { 1630 int r; 1631 1632 if (minor >= (1 << MINORBITS)) 1633 return -EINVAL; 1634 1635 idr_preload(GFP_KERNEL); 1636 spin_lock(&_minor_lock); 1637 1638 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 1639 1640 spin_unlock(&_minor_lock); 1641 idr_preload_end(); 1642 if (r < 0) 1643 return r == -ENOSPC ? 
-EBUSY : r; 1644 return 0; 1645 } 1646 1647 static int next_free_minor(int *minor) 1648 { 1649 int r; 1650 1651 idr_preload(GFP_KERNEL); 1652 spin_lock(&_minor_lock); 1653 1654 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 1655 1656 spin_unlock(&_minor_lock); 1657 idr_preload_end(); 1658 if (r < 0) 1659 return r; 1660 *minor = r; 1661 return 0; 1662 } 1663 1664 static const struct block_device_operations dm_blk_dops; 1665 static const struct block_device_operations dm_rq_blk_dops; 1666 static const struct dax_operations dm_dax_ops; 1667 1668 static void dm_wq_work(struct work_struct *work); 1669 1670 #ifdef CONFIG_BLK_INLINE_ENCRYPTION 1671 static void dm_queue_destroy_keyslot_manager(struct request_queue *q) 1672 { 1673 dm_destroy_keyslot_manager(q->ksm); 1674 } 1675 1676 #else /* CONFIG_BLK_INLINE_ENCRYPTION */ 1677 1678 static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q) 1679 { 1680 } 1681 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ 1682 1683 static void cleanup_mapped_device(struct mapped_device *md) 1684 { 1685 if (md->wq) 1686 destroy_workqueue(md->wq); 1687 bioset_exit(&md->bs); 1688 bioset_exit(&md->io_bs); 1689 1690 if (md->dax_dev) { 1691 kill_dax(md->dax_dev); 1692 put_dax(md->dax_dev); 1693 md->dax_dev = NULL; 1694 } 1695 1696 if (md->disk) { 1697 spin_lock(&_minor_lock); 1698 md->disk->private_data = NULL; 1699 spin_unlock(&_minor_lock); 1700 if (dm_get_md_type(md) != DM_TYPE_NONE) { 1701 dm_sysfs_exit(md); 1702 del_gendisk(md->disk); 1703 } 1704 dm_queue_destroy_keyslot_manager(md->queue); 1705 blk_cleanup_disk(md->disk); 1706 } 1707 1708 cleanup_srcu_struct(&md->io_barrier); 1709 1710 mutex_destroy(&md->suspend_lock); 1711 mutex_destroy(&md->type_lock); 1712 mutex_destroy(&md->table_devices_lock); 1713 mutex_destroy(&md->swap_bios_lock); 1714 1715 dm_mq_cleanup_mapped_device(md); 1716 dm_cleanup_zoned_dev(md); 1717 } 1718 1719 /* 1720 * Allocate and initialise a blank device with a given minor. 1721 */ 1722 static struct mapped_device *alloc_dev(int minor) 1723 { 1724 int r, numa_node_id = dm_get_numa_node(); 1725 struct mapped_device *md; 1726 void *old_md; 1727 1728 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); 1729 if (!md) { 1730 DMWARN("unable to allocate device, out of memory."); 1731 return NULL; 1732 } 1733 1734 if (!try_module_get(THIS_MODULE)) 1735 goto bad_module_get; 1736 1737 /* get a minor number for the dev */ 1738 if (minor == DM_ANY_MINOR) 1739 r = next_free_minor(&minor); 1740 else 1741 r = specific_minor(minor); 1742 if (r < 0) 1743 goto bad_minor; 1744 1745 r = init_srcu_struct(&md->io_barrier); 1746 if (r < 0) 1747 goto bad_io_barrier; 1748 1749 md->numa_node_id = numa_node_id; 1750 md->init_tio_pdu = false; 1751 md->type = DM_TYPE_NONE; 1752 mutex_init(&md->suspend_lock); 1753 mutex_init(&md->type_lock); 1754 mutex_init(&md->table_devices_lock); 1755 spin_lock_init(&md->deferred_lock); 1756 atomic_set(&md->holders, 1); 1757 atomic_set(&md->open_count, 0); 1758 atomic_set(&md->event_nr, 0); 1759 atomic_set(&md->uevent_seq, 0); 1760 INIT_LIST_HEAD(&md->uevent_list); 1761 INIT_LIST_HEAD(&md->table_devices); 1762 spin_lock_init(&md->uevent_lock); 1763 1764 /* 1765 * default to bio-based until DM table is loaded and md->type 1766 * established. If request-based table is loaded: blk-mq will 1767 * override accordingly. 
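 * (blk_alloc_disk() below hands back the gendisk together with a
 * bio-based queue; request-based setup only happens later, from
 * dm_setup_md_queue().)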
1768 */ 1769 md->disk = blk_alloc_disk(md->numa_node_id); 1770 if (!md->disk) 1771 goto bad; 1772 md->queue = md->disk->queue; 1773 1774 init_waitqueue_head(&md->wait); 1775 INIT_WORK(&md->work, dm_wq_work); 1776 init_waitqueue_head(&md->eventq); 1777 init_completion(&md->kobj_holder.completion); 1778 1779 md->swap_bios = get_swap_bios(); 1780 sema_init(&md->swap_bios_semaphore, md->swap_bios); 1781 mutex_init(&md->swap_bios_lock); 1782 1783 md->disk->major = _major; 1784 md->disk->first_minor = minor; 1785 md->disk->minors = 1; 1786 md->disk->fops = &dm_blk_dops; 1787 md->disk->queue = md->queue; 1788 md->disk->private_data = md; 1789 sprintf(md->disk->disk_name, "dm-%d", minor); 1790 1791 if (IS_ENABLED(CONFIG_DAX_DRIVER)) { 1792 md->dax_dev = alloc_dax(md, md->disk->disk_name, 1793 &dm_dax_ops, 0); 1794 if (IS_ERR(md->dax_dev)) 1795 goto bad; 1796 } 1797 1798 format_dev_t(md->name, MKDEV(_major, minor)); 1799 1800 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 1801 if (!md->wq) 1802 goto bad; 1803 1804 dm_stats_init(&md->stats); 1805 1806 /* Populate the mapping, nobody knows we exist yet */ 1807 spin_lock(&_minor_lock); 1808 old_md = idr_replace(&_minor_idr, md, minor); 1809 spin_unlock(&_minor_lock); 1810 1811 BUG_ON(old_md != MINOR_ALLOCED); 1812 1813 return md; 1814 1815 bad: 1816 cleanup_mapped_device(md); 1817 bad_io_barrier: 1818 free_minor(minor); 1819 bad_minor: 1820 module_put(THIS_MODULE); 1821 bad_module_get: 1822 kvfree(md); 1823 return NULL; 1824 } 1825 1826 static void unlock_fs(struct mapped_device *md); 1827 1828 static void free_dev(struct mapped_device *md) 1829 { 1830 int minor = MINOR(disk_devt(md->disk)); 1831 1832 unlock_fs(md); 1833 1834 cleanup_mapped_device(md); 1835 1836 free_table_devices(&md->table_devices); 1837 dm_stats_cleanup(&md->stats); 1838 free_minor(minor); 1839 1840 module_put(THIS_MODULE); 1841 kvfree(md); 1842 } 1843 1844 static int __bind_mempools(struct mapped_device *md, struct dm_table *t) 1845 { 1846 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1847 int ret = 0; 1848 1849 if (dm_table_bio_based(t)) { 1850 /* 1851 * The md may already have mempools that need changing. 1852 * If so, reload bioset because front_pad may have changed 1853 * because a different table was loaded. 1854 */ 1855 bioset_exit(&md->bs); 1856 bioset_exit(&md->io_bs); 1857 1858 } else if (bioset_initialized(&md->bs)) { 1859 /* 1860 * There's no need to reload with request-based dm 1861 * because the size of front_pad doesn't change. 1862 * Note for future: If you are to reload bioset, 1863 * prep-ed requests in the queue may refer 1864 * to bio from the old bioset, so you must walk 1865 * through the queue to unprep. 1866 */ 1867 goto out; 1868 } 1869 1870 BUG_ON(!p || 1871 bioset_initialized(&md->bs) || 1872 bioset_initialized(&md->io_bs)); 1873 1874 ret = bioset_init_from_src(&md->bs, &p->bs); 1875 if (ret) 1876 goto out; 1877 ret = bioset_init_from_src(&md->io_bs, &p->io_bs); 1878 if (ret) 1879 bioset_exit(&md->bs); 1880 out: 1881 /* mempool bind completed, no longer need any mempools in the table */ 1882 dm_table_free_md_mempools(t); 1883 return ret; 1884 } 1885 1886 /* 1887 * Bind a table to the device. 
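 * The actual binding is done by __bind() further below; event_callback()
 * here is the table event hook that __bind() installs through
 * dm_table_event_callback().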
1888 */ 1889 static void event_callback(void *context) 1890 { 1891 unsigned long flags; 1892 LIST_HEAD(uevents); 1893 struct mapped_device *md = (struct mapped_device *) context; 1894 1895 spin_lock_irqsave(&md->uevent_lock, flags); 1896 list_splice_init(&md->uevent_list, &uevents); 1897 spin_unlock_irqrestore(&md->uevent_lock, flags); 1898 1899 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 1900 1901 atomic_inc(&md->event_nr); 1902 wake_up(&md->eventq); 1903 dm_issue_global_event(); 1904 } 1905 1906 /* 1907 * Returns old map, which caller must destroy. 1908 */ 1909 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 1910 struct queue_limits *limits) 1911 { 1912 struct dm_table *old_map; 1913 struct request_queue *q = md->queue; 1914 bool request_based = dm_table_request_based(t); 1915 sector_t size; 1916 int ret; 1917 1918 lockdep_assert_held(&md->suspend_lock); 1919 1920 size = dm_table_get_size(t); 1921 1922 /* 1923 * Wipe any geometry if the size of the table changed. 1924 */ 1925 if (size != dm_get_size(md)) 1926 memset(&md->geometry, 0, sizeof(md->geometry)); 1927 1928 if (!get_capacity(md->disk)) 1929 set_capacity(md->disk, size); 1930 else 1931 set_capacity_and_notify(md->disk, size); 1932 1933 dm_table_event_callback(t, event_callback, md); 1934 1935 /* 1936 * The queue hasn't been stopped yet, if the old table type wasn't 1937 * for request-based during suspension. So stop it to prevent 1938 * I/O mapping before resume. 1939 * This must be done before setting the queue restrictions, 1940 * because request-based dm may be run just after the setting. 1941 */ 1942 if (request_based) 1943 dm_stop_queue(q); 1944 1945 if (request_based) { 1946 /* 1947 * Leverage the fact that request-based DM targets are 1948 * immutable singletons - used to optimize dm_mq_queue_rq. 1949 */ 1950 md->immutable_target = dm_table_get_immutable_target(t); 1951 } 1952 1953 ret = __bind_mempools(md, t); 1954 if (ret) { 1955 old_map = ERR_PTR(ret); 1956 goto out; 1957 } 1958 1959 ret = dm_table_set_restrictions(t, q, limits); 1960 if (ret) { 1961 old_map = ERR_PTR(ret); 1962 goto out; 1963 } 1964 1965 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 1966 rcu_assign_pointer(md->map, (void *)t); 1967 md->immutable_target_type = dm_table_get_immutable_target_type(t); 1968 1969 if (old_map) 1970 dm_sync_table(md); 1971 1972 out: 1973 return old_map; 1974 } 1975 1976 /* 1977 * Returns unbound table for the caller to free. 1978 */ 1979 static struct dm_table *__unbind(struct mapped_device *md) 1980 { 1981 struct dm_table *map = rcu_dereference_protected(md->map, 1); 1982 1983 if (!map) 1984 return NULL; 1985 1986 dm_table_event_callback(map, NULL, NULL); 1987 RCU_INIT_POINTER(md->map, NULL); 1988 dm_sync_table(md); 1989 1990 return map; 1991 } 1992 1993 /* 1994 * Constructor for a new device. 1995 */ 1996 int dm_create(int minor, struct mapped_device **result) 1997 { 1998 struct mapped_device *md; 1999 2000 md = alloc_dev(minor); 2001 if (!md) 2002 return -ENXIO; 2003 2004 dm_ima_reset_data(md); 2005 2006 *result = md; 2007 return 0; 2008 } 2009 2010 /* 2011 * Functions to manage md->type. 2012 * All are required to hold md->type_lock. 
2013 */ 2014 void dm_lock_md_type(struct mapped_device *md) 2015 { 2016 mutex_lock(&md->type_lock); 2017 } 2018 2019 void dm_unlock_md_type(struct mapped_device *md) 2020 { 2021 mutex_unlock(&md->type_lock); 2022 } 2023 2024 void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) 2025 { 2026 BUG_ON(!mutex_is_locked(&md->type_lock)); 2027 md->type = type; 2028 } 2029 2030 enum dm_queue_mode dm_get_md_type(struct mapped_device *md) 2031 { 2032 return md->type; 2033 } 2034 2035 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2036 { 2037 return md->immutable_target_type; 2038 } 2039 2040 /* 2041 * The queue_limits are only valid as long as you have a reference 2042 * count on 'md'. 2043 */ 2044 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2045 { 2046 BUG_ON(!atomic_read(&md->holders)); 2047 return &md->queue->limits; 2048 } 2049 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2050 2051 /* 2052 * Setup the DM device's queue based on md's type 2053 */ 2054 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2055 { 2056 enum dm_queue_mode type = dm_table_get_type(t); 2057 struct queue_limits limits; 2058 int r; 2059 2060 switch (type) { 2061 case DM_TYPE_REQUEST_BASED: 2062 md->disk->fops = &dm_rq_blk_dops; 2063 r = dm_mq_init_request_queue(md, t); 2064 if (r) { 2065 DMERR("Cannot initialize queue for request-based dm mapped device"); 2066 return r; 2067 } 2068 break; 2069 case DM_TYPE_BIO_BASED: 2070 case DM_TYPE_DAX_BIO_BASED: 2071 break; 2072 case DM_TYPE_NONE: 2073 WARN_ON_ONCE(true); 2074 break; 2075 } 2076 2077 r = dm_calculate_queue_limits(t, &limits); 2078 if (r) { 2079 DMERR("Cannot calculate initial queue limits"); 2080 return r; 2081 } 2082 r = dm_table_set_restrictions(t, md->queue, &limits); 2083 if (r) 2084 return r; 2085 2086 add_disk(md->disk); 2087 2088 r = dm_sysfs_init(md); 2089 if (r) { 2090 del_gendisk(md->disk); 2091 return r; 2092 } 2093 md->type = type; 2094 return 0; 2095 } 2096 2097 struct mapped_device *dm_get_md(dev_t dev) 2098 { 2099 struct mapped_device *md; 2100 unsigned minor = MINOR(dev); 2101 2102 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2103 return NULL; 2104 2105 spin_lock(&_minor_lock); 2106 2107 md = idr_find(&_minor_idr, minor); 2108 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || 2109 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2110 md = NULL; 2111 goto out; 2112 } 2113 dm_get(md); 2114 out: 2115 spin_unlock(&_minor_lock); 2116 2117 return md; 2118 } 2119 EXPORT_SYMBOL_GPL(dm_get_md); 2120 2121 void *dm_get_mdptr(struct mapped_device *md) 2122 { 2123 return md->interface_ptr; 2124 } 2125 2126 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2127 { 2128 md->interface_ptr = ptr; 2129 } 2130 2131 void dm_get(struct mapped_device *md) 2132 { 2133 atomic_inc(&md->holders); 2134 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2135 } 2136 2137 int dm_hold(struct mapped_device *md) 2138 { 2139 spin_lock(&_minor_lock); 2140 if (test_bit(DMF_FREEING, &md->flags)) { 2141 spin_unlock(&_minor_lock); 2142 return -EBUSY; 2143 } 2144 dm_get(md); 2145 spin_unlock(&_minor_lock); 2146 return 0; 2147 } 2148 EXPORT_SYMBOL_GPL(dm_hold); 2149 2150 const char *dm_device_name(struct mapped_device *md) 2151 { 2152 return md->name; 2153 } 2154 EXPORT_SYMBOL_GPL(dm_device_name); 2155 2156 static void __dm_destroy(struct mapped_device *md, bool wait) 2157 { 2158 struct dm_table *map; 2159 int srcu_idx; 2160 2161 might_sleep(); 2162 2163 
spin_lock(&_minor_lock); 2164 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2165 set_bit(DMF_FREEING, &md->flags); 2166 spin_unlock(&_minor_lock); 2167 2168 blk_set_queue_dying(md->queue); 2169 2170 /* 2171 * Take suspend_lock so that presuspend and postsuspend methods 2172 * do not race with internal suspend. 2173 */ 2174 mutex_lock(&md->suspend_lock); 2175 map = dm_get_live_table(md, &srcu_idx); 2176 if (!dm_suspended_md(md)) { 2177 dm_table_presuspend_targets(map); 2178 set_bit(DMF_SUSPENDED, &md->flags); 2179 set_bit(DMF_POST_SUSPENDING, &md->flags); 2180 dm_table_postsuspend_targets(map); 2181 } 2182 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2183 dm_put_live_table(md, srcu_idx); 2184 mutex_unlock(&md->suspend_lock); 2185 2186 /* 2187 * Rare, but there may be I/O requests still going to complete, 2188 * for example. Wait for all references to disappear. 2189 * No one should increment the reference count of the mapped_device, 2190 * after the mapped_device state becomes DMF_FREEING. 2191 */ 2192 if (wait) 2193 while (atomic_read(&md->holders)) 2194 msleep(1); 2195 else if (atomic_read(&md->holders)) 2196 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2197 dm_device_name(md), atomic_read(&md->holders)); 2198 2199 dm_table_destroy(__unbind(md)); 2200 free_dev(md); 2201 } 2202 2203 void dm_destroy(struct mapped_device *md) 2204 { 2205 __dm_destroy(md, true); 2206 } 2207 2208 void dm_destroy_immediate(struct mapped_device *md) 2209 { 2210 __dm_destroy(md, false); 2211 } 2212 2213 void dm_put(struct mapped_device *md) 2214 { 2215 atomic_dec(&md->holders); 2216 } 2217 EXPORT_SYMBOL_GPL(dm_put); 2218 2219 static bool md_in_flight_bios(struct mapped_device *md) 2220 { 2221 int cpu; 2222 struct block_device *part = dm_disk(md)->part0; 2223 long sum = 0; 2224 2225 for_each_possible_cpu(cpu) { 2226 sum += part_stat_local_read_cpu(part, in_flight[0], cpu); 2227 sum += part_stat_local_read_cpu(part, in_flight[1], cpu); 2228 } 2229 2230 return sum != 0; 2231 } 2232 2233 static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) 2234 { 2235 int r = 0; 2236 DEFINE_WAIT(wait); 2237 2238 while (true) { 2239 prepare_to_wait(&md->wait, &wait, task_state); 2240 2241 if (!md_in_flight_bios(md)) 2242 break; 2243 2244 if (signal_pending_state(task_state, current)) { 2245 r = -EINTR; 2246 break; 2247 } 2248 2249 io_schedule(); 2250 } 2251 finish_wait(&md->wait, &wait); 2252 2253 return r; 2254 } 2255 2256 static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) 2257 { 2258 int r = 0; 2259 2260 if (!queue_is_mq(md->queue)) 2261 return dm_wait_for_bios_completion(md, task_state); 2262 2263 while (true) { 2264 if (!blk_mq_queue_inflight(md->queue)) 2265 break; 2266 2267 if (signal_pending_state(task_state, current)) { 2268 r = -EINTR; 2269 break; 2270 } 2271 2272 msleep(5); 2273 } 2274 2275 return r; 2276 } 2277 2278 /* 2279 * Process the deferred bios 2280 */ 2281 static void dm_wq_work(struct work_struct *work) 2282 { 2283 struct mapped_device *md = container_of(work, struct mapped_device, work); 2284 struct bio *bio; 2285 2286 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2287 spin_lock_irq(&md->deferred_lock); 2288 bio = bio_list_pop(&md->deferred); 2289 spin_unlock_irq(&md->deferred_lock); 2290 2291 if (!bio) 2292 break; 2293 2294 submit_bio_noacct(bio); 2295 } 2296 } 2297 2298 static void dm_queue_flush(struct mapped_device *md) 2299 { 2300 
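	/*
	 * Clear the flag before queueing the work so that dm_wq_work(),
	 * which rechecks DMF_BLOCK_IO_FOR_SUSPEND before draining
	 * md->deferred, is guaranteed to observe it cleared; the barrier
	 * below orders the clear_bit() against the queue_work().
	 */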
clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2301 smp_mb__after_atomic(); 2302 queue_work(md->wq, &md->work); 2303 } 2304 2305 /* 2306 * Swap in a new table, returning the old one for the caller to destroy. 2307 */ 2308 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2309 { 2310 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2311 struct queue_limits limits; 2312 int r; 2313 2314 mutex_lock(&md->suspend_lock); 2315 2316 /* device must be suspended */ 2317 if (!dm_suspended_md(md)) 2318 goto out; 2319 2320 /* 2321 * If the new table has no data devices, retain the existing limits. 2322 * This helps multipath with queue_if_no_path if all paths disappear, 2323 * then new I/O is queued based on these limits, and then some paths 2324 * reappear. 2325 */ 2326 if (dm_table_has_no_data_devices(table)) { 2327 live_map = dm_get_live_table_fast(md); 2328 if (live_map) 2329 limits = md->queue->limits; 2330 dm_put_live_table_fast(md); 2331 } 2332 2333 if (!live_map) { 2334 r = dm_calculate_queue_limits(table, &limits); 2335 if (r) { 2336 map = ERR_PTR(r); 2337 goto out; 2338 } 2339 } 2340 2341 map = __bind(md, table, &limits); 2342 dm_issue_global_event(); 2343 2344 out: 2345 mutex_unlock(&md->suspend_lock); 2346 return map; 2347 } 2348 2349 /* 2350 * Functions to lock and unlock any filesystem running on the 2351 * device. 2352 */ 2353 static int lock_fs(struct mapped_device *md) 2354 { 2355 int r; 2356 2357 WARN_ON(test_bit(DMF_FROZEN, &md->flags)); 2358 2359 r = freeze_bdev(md->disk->part0); 2360 if (!r) 2361 set_bit(DMF_FROZEN, &md->flags); 2362 return r; 2363 } 2364 2365 static void unlock_fs(struct mapped_device *md) 2366 { 2367 if (!test_bit(DMF_FROZEN, &md->flags)) 2368 return; 2369 thaw_bdev(md->disk->part0); 2370 clear_bit(DMF_FROZEN, &md->flags); 2371 } 2372 2373 /* 2374 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG 2375 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE 2376 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY 2377 * 2378 * If __dm_suspend returns 0, the device is completely quiescent 2379 * now. There is no request-processing activity. All new requests 2380 * are being added to md->deferred list. 2381 */ 2382 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2383 unsigned suspend_flags, unsigned int task_state, 2384 int dmf_suspended_flag) 2385 { 2386 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2387 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2388 int r; 2389 2390 lockdep_assert_held(&md->suspend_lock); 2391 2392 /* 2393 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2394 * This flag is cleared before dm_suspend returns. 2395 */ 2396 if (noflush) 2397 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2398 else 2399 DMDEBUG("%s: suspending with flush", dm_device_name(md)); 2400 2401 /* 2402 * This gets reverted if there's an error later and the targets 2403 * provide the .presuspend_undo hook. 2404 */ 2405 dm_table_presuspend_targets(map); 2406 2407 /* 2408 * Flush I/O to the device. 2409 * Any I/O submitted after lock_fs() may not be flushed. 2410 * noflush takes precedence over do_lockfs. 2411 * (lock_fs() flushes I/Os and waits for them to complete.) 2412 */ 2413 if (!noflush && do_lockfs) { 2414 r = lock_fs(md); 2415 if (r) { 2416 dm_table_presuspend_undo_targets(map); 2417 return r; 2418 } 2419 } 2420 2421 /* 2422 * Here we must make sure that no processes are submitting requests 2423 * to target drivers i.e. 
no one may be executing 2424 * __split_and_process_bio from dm_submit_bio. 2425 * 2426 * To get all processes out of __split_and_process_bio in dm_submit_bio, 2427 * we take the write lock. To prevent any process from reentering 2428 * __split_and_process_bio from dm_submit_bio and quiesce the thread 2429 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2430 * flush_workqueue(md->wq). 2431 */ 2432 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2433 if (map) 2434 synchronize_srcu(&md->io_barrier); 2435 2436 /* 2437 * Stop md->queue before flushing md->wq in case request-based 2438 * dm defers requests to md->wq from md->queue. 2439 */ 2440 if (dm_request_based(md)) 2441 dm_stop_queue(md->queue); 2442 2443 flush_workqueue(md->wq); 2444 2445 /* 2446 * At this point no more requests are entering target request routines. 2447 * We call dm_wait_for_completion to wait for all existing requests 2448 * to finish. 2449 */ 2450 r = dm_wait_for_completion(md, task_state); 2451 if (!r) 2452 set_bit(dmf_suspended_flag, &md->flags); 2453 2454 if (noflush) 2455 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2456 if (map) 2457 synchronize_srcu(&md->io_barrier); 2458 2459 /* were we interrupted ? */ 2460 if (r < 0) { 2461 dm_queue_flush(md); 2462 2463 if (dm_request_based(md)) 2464 dm_start_queue(md->queue); 2465 2466 unlock_fs(md); 2467 dm_table_presuspend_undo_targets(map); 2468 /* pushback list is already flushed, so skip flush */ 2469 } 2470 2471 return r; 2472 } 2473 2474 /* 2475 * We need to be able to change a mapping table under a mounted 2476 * filesystem. For example we might want to move some data in 2477 * the background. Before the table can be swapped with 2478 * dm_bind_table, dm_suspend must be called to flush any in 2479 * flight bios and ensure that any further io gets deferred. 2480 */ 2481 /* 2482 * Suspend mechanism in request-based dm. 2483 * 2484 * 1. Flush all I/Os by lock_fs() if needed. 2485 * 2. Stop dispatching any I/O by stopping the request_queue. 2486 * 3. Wait for all in-flight I/Os to be completed or requeued. 2487 * 2488 * To abort suspend, start the request_queue. 2489 */ 2490 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2491 { 2492 struct dm_table *map = NULL; 2493 int r = 0; 2494 2495 retry: 2496 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2497 2498 if (dm_suspended_md(md)) { 2499 r = -EINVAL; 2500 goto out_unlock; 2501 } 2502 2503 if (dm_suspended_internally_md(md)) { 2504 /* already internally suspended, wait for internal resume */ 2505 mutex_unlock(&md->suspend_lock); 2506 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2507 if (r) 2508 return r; 2509 goto retry; 2510 } 2511 2512 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2513 2514 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); 2515 if (r) 2516 goto out_unlock; 2517 2518 set_bit(DMF_POST_SUSPENDING, &md->flags); 2519 dm_table_postsuspend_targets(map); 2520 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2521 2522 out_unlock: 2523 mutex_unlock(&md->suspend_lock); 2524 return r; 2525 } 2526 2527 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 2528 { 2529 if (map) { 2530 int r = dm_table_resume_targets(map); 2531 if (r) 2532 return r; 2533 } 2534 2535 dm_queue_flush(md); 2536 2537 /* 2538 * Flushing deferred I/Os must be done after targets are resumed 2539 * so that mapping of targets can work correctly. 
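	 * Bio-based dm keeps them on md->deferred and resubmits them from
	 * dm_wq_work() via the dm_queue_flush() call above.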
2540 * Request-based dm queues the deferred I/Os in its request_queue. 2541 */ 2542 if (dm_request_based(md)) 2543 dm_start_queue(md->queue); 2544 2545 unlock_fs(md); 2546 2547 return 0; 2548 } 2549 2550 int dm_resume(struct mapped_device *md) 2551 { 2552 int r; 2553 struct dm_table *map = NULL; 2554 2555 retry: 2556 r = -EINVAL; 2557 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2558 2559 if (!dm_suspended_md(md)) 2560 goto out; 2561 2562 if (dm_suspended_internally_md(md)) { 2563 /* already internally suspended, wait for internal resume */ 2564 mutex_unlock(&md->suspend_lock); 2565 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2566 if (r) 2567 return r; 2568 goto retry; 2569 } 2570 2571 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2572 if (!map || !dm_table_get_size(map)) 2573 goto out; 2574 2575 r = __dm_resume(md, map); 2576 if (r) 2577 goto out; 2578 2579 clear_bit(DMF_SUSPENDED, &md->flags); 2580 out: 2581 mutex_unlock(&md->suspend_lock); 2582 2583 return r; 2584 } 2585 2586 /* 2587 * Internal suspend/resume works like userspace-driven suspend. It waits 2588 * until all bios finish and prevents issuing new bios to the target drivers. 2589 * It may be used only from the kernel. 2590 */ 2591 2592 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 2593 { 2594 struct dm_table *map = NULL; 2595 2596 lockdep_assert_held(&md->suspend_lock); 2597 2598 if (md->internal_suspend_count++) 2599 return; /* nested internal suspend */ 2600 2601 if (dm_suspended_md(md)) { 2602 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2603 return; /* nested suspend */ 2604 } 2605 2606 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2607 2608 /* 2609 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 2610 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 2611 * would require changing .presuspend to return an error -- avoid this 2612 * until there is a need for more elaborate variants of internal suspend.
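	 *
	 * Note that the only caller, dm_internal_suspend_noflush() below,
	 * passes DM_SUSPEND_NOFLUSH_FLAG: lock_fs() is skipped and targets
	 * that check dm_noflush_suspending() may requeue I/O rather than
	 * flush it.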
2613 */ 2614 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, 2615 DMF_SUSPENDED_INTERNALLY); 2616 2617 set_bit(DMF_POST_SUSPENDING, &md->flags); 2618 dm_table_postsuspend_targets(map); 2619 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2620 } 2621 2622 static void __dm_internal_resume(struct mapped_device *md) 2623 { 2624 BUG_ON(!md->internal_suspend_count); 2625 2626 if (--md->internal_suspend_count) 2627 return; /* resume from nested internal suspend */ 2628 2629 if (dm_suspended_md(md)) 2630 goto done; /* resume from nested suspend */ 2631 2632 /* 2633 * NOTE: existing callers don't need to call dm_table_resume_targets 2634 * (which may fail -- so best to avoid it for now by passing NULL map) 2635 */ 2636 (void) __dm_resume(md, NULL); 2637 2638 done: 2639 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2640 smp_mb__after_atomic(); 2641 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 2642 } 2643 2644 void dm_internal_suspend_noflush(struct mapped_device *md) 2645 { 2646 mutex_lock(&md->suspend_lock); 2647 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 2648 mutex_unlock(&md->suspend_lock); 2649 } 2650 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 2651 2652 void dm_internal_resume(struct mapped_device *md) 2653 { 2654 mutex_lock(&md->suspend_lock); 2655 __dm_internal_resume(md); 2656 mutex_unlock(&md->suspend_lock); 2657 } 2658 EXPORT_SYMBOL_GPL(dm_internal_resume); 2659 2660 /* 2661 * Fast variants of internal suspend/resume hold md->suspend_lock, 2662 * which prevents interaction with userspace-driven suspend. 2663 */ 2664 2665 void dm_internal_suspend_fast(struct mapped_device *md) 2666 { 2667 mutex_lock(&md->suspend_lock); 2668 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2669 return; 2670 2671 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2672 synchronize_srcu(&md->io_barrier); 2673 flush_workqueue(md->wq); 2674 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2675 } 2676 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); 2677 2678 void dm_internal_resume_fast(struct mapped_device *md) 2679 { 2680 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2681 goto done; 2682 2683 dm_queue_flush(md); 2684 2685 done: 2686 mutex_unlock(&md->suspend_lock); 2687 } 2688 EXPORT_SYMBOL_GPL(dm_internal_resume_fast); 2689 2690 /*----------------------------------------------------------------- 2691 * Event notification. 
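 *
 * dm_kobject_uevent() emits CHANGE/REMOVE uevents on the gendisk; when a
 * non-zero cookie is supplied it is exported to userspace as an environment
 * variable (named by DM_COOKIE_ENV_VAR_NAME) so udev rules can synchronise
 * on completion of the operation.  The uevent is emitted under
 * memalloc_noio_save()/restore() so that any allocation it triggers cannot
 * recurse into block I/O against the device being changed.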
2692 *---------------------------------------------------------------*/ 2693 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2694 unsigned cookie) 2695 { 2696 int r; 2697 unsigned noio_flag; 2698 char udev_cookie[DM_COOKIE_LENGTH]; 2699 char *envp[] = { udev_cookie, NULL }; 2700 2701 noio_flag = memalloc_noio_save(); 2702 2703 if (!cookie) 2704 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2705 else { 2706 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2707 DM_COOKIE_ENV_VAR_NAME, cookie); 2708 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2709 action, envp); 2710 } 2711 2712 memalloc_noio_restore(noio_flag); 2713 2714 return r; 2715 } 2716 2717 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2718 { 2719 return atomic_add_return(1, &md->uevent_seq); 2720 } 2721 2722 uint32_t dm_get_event_nr(struct mapped_device *md) 2723 { 2724 return atomic_read(&md->event_nr); 2725 } 2726 2727 int dm_wait_event(struct mapped_device *md, int event_nr) 2728 { 2729 return wait_event_interruptible(md->eventq, 2730 (event_nr != atomic_read(&md->event_nr))); 2731 } 2732 2733 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2734 { 2735 unsigned long flags; 2736 2737 spin_lock_irqsave(&md->uevent_lock, flags); 2738 list_add(elist, &md->uevent_list); 2739 spin_unlock_irqrestore(&md->uevent_lock, flags); 2740 } 2741 2742 /* 2743 * The gendisk is only valid as long as you have a reference 2744 * count on 'md'. 2745 */ 2746 struct gendisk *dm_disk(struct mapped_device *md) 2747 { 2748 return md->disk; 2749 } 2750 EXPORT_SYMBOL_GPL(dm_disk); 2751 2752 struct kobject *dm_kobject(struct mapped_device *md) 2753 { 2754 return &md->kobj_holder.kobj; 2755 } 2756 2757 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2758 { 2759 struct mapped_device *md; 2760 2761 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 2762 2763 spin_lock(&_minor_lock); 2764 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2765 md = NULL; 2766 goto out; 2767 } 2768 dm_get(md); 2769 out: 2770 spin_unlock(&_minor_lock); 2771 2772 return md; 2773 } 2774 2775 int dm_suspended_md(struct mapped_device *md) 2776 { 2777 return test_bit(DMF_SUSPENDED, &md->flags); 2778 } 2779 2780 static int dm_post_suspending_md(struct mapped_device *md) 2781 { 2782 return test_bit(DMF_POST_SUSPENDING, &md->flags); 2783 } 2784 2785 int dm_suspended_internally_md(struct mapped_device *md) 2786 { 2787 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2788 } 2789 2790 int dm_test_deferred_remove_flag(struct mapped_device *md) 2791 { 2792 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 2793 } 2794 2795 int dm_suspended(struct dm_target *ti) 2796 { 2797 return dm_suspended_md(ti->table->md); 2798 } 2799 EXPORT_SYMBOL_GPL(dm_suspended); 2800 2801 int dm_post_suspending(struct dm_target *ti) 2802 { 2803 return dm_post_suspending_md(ti->table->md); 2804 } 2805 EXPORT_SYMBOL_GPL(dm_post_suspending); 2806 2807 int dm_noflush_suspending(struct dm_target *ti) 2808 { 2809 return __noflush_suspending(ti->table->md); 2810 } 2811 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2812 2813 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, 2814 unsigned integrity, unsigned per_io_data_size, 2815 unsigned min_pool_size) 2816 { 2817 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 2818 unsigned int pool_size = 0; 2819 unsigned int front_pad, io_front_pad; 2820 int ret; 2821 2822 if 
(!pools) 2823 return NULL; 2824 2825 switch (type) { 2826 case DM_TYPE_BIO_BASED: 2827 case DM_TYPE_DAX_BIO_BASED: 2828 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); 2829 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; 2830 io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; 2831 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); 2832 if (ret) 2833 goto out; 2834 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) 2835 goto out; 2836 break; 2837 case DM_TYPE_REQUEST_BASED: 2838 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); 2839 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 2840 /* per_io_data_size is used for blk-mq pdu at queue allocation */ 2841 break; 2842 default: 2843 BUG(); 2844 } 2845 2846 ret = bioset_init(&pools->bs, pool_size, front_pad, 0); 2847 if (ret) 2848 goto out; 2849 2850 if (integrity && bioset_integrity_create(&pools->bs, pool_size)) 2851 goto out; 2852 2853 return pools; 2854 2855 out: 2856 dm_free_md_mempools(pools); 2857 2858 return NULL; 2859 } 2860 2861 void dm_free_md_mempools(struct dm_md_mempools *pools) 2862 { 2863 if (!pools) 2864 return; 2865 2866 bioset_exit(&pools->bs); 2867 bioset_exit(&pools->io_bs); 2868 2869 kfree(pools); 2870 } 2871 2872 struct dm_pr { 2873 u64 old_key; 2874 u64 new_key; 2875 u32 flags; 2876 bool fail_early; 2877 }; 2878 2879 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, 2880 void *data) 2881 { 2882 struct mapped_device *md = bdev->bd_disk->private_data; 2883 struct dm_table *table; 2884 struct dm_target *ti; 2885 int ret = -ENOTTY, srcu_idx; 2886 2887 table = dm_get_live_table(md, &srcu_idx); 2888 if (!table || !dm_table_get_size(table)) 2889 goto out; 2890 2891 /* We only support devices that have a single target */ 2892 if (dm_table_get_num_targets(table) != 1) 2893 goto out; 2894 ti = dm_table_get_target(table, 0); 2895 2896 ret = -EINVAL; 2897 if (!ti->type->iterate_devices) 2898 goto out; 2899 2900 ret = ti->type->iterate_devices(ti, fn, data); 2901 out: 2902 dm_put_live_table(md, srcu_idx); 2903 return ret; 2904 } 2905 2906 /* 2907 * For register / unregister we need to manually call out to every path. 
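 * A persistent reservation key is registered per I_T nexus, so for a
 * multipath device the registration has to be replicated down every
 * underlying path (hence dm_call_pr() + iterate_devices), whereas
 * reserve/release/preempt/clear act on the reservation itself and are
 * simply routed to one usable path via dm_prepare_ioctl().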
2908 */ 2909 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, 2910 sector_t start, sector_t len, void *data) 2911 { 2912 struct dm_pr *pr = data; 2913 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; 2914 2915 if (!ops || !ops->pr_register) 2916 return -EOPNOTSUPP; 2917 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); 2918 } 2919 2920 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 2921 u32 flags) 2922 { 2923 struct dm_pr pr = { 2924 .old_key = old_key, 2925 .new_key = new_key, 2926 .flags = flags, 2927 .fail_early = true, 2928 }; 2929 int ret; 2930 2931 ret = dm_call_pr(bdev, __dm_pr_register, &pr); 2932 if (ret && new_key) { 2933 /* unregister all paths if we failed to register any path */ 2934 pr.old_key = new_key; 2935 pr.new_key = 0; 2936 pr.flags = 0; 2937 pr.fail_early = false; 2938 dm_call_pr(bdev, __dm_pr_register, &pr); 2939 } 2940 2941 return ret; 2942 } 2943 2944 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, 2945 u32 flags) 2946 { 2947 struct mapped_device *md = bdev->bd_disk->private_data; 2948 const struct pr_ops *ops; 2949 int r, srcu_idx; 2950 2951 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 2952 if (r < 0) 2953 goto out; 2954 2955 ops = bdev->bd_disk->fops->pr_ops; 2956 if (ops && ops->pr_reserve) 2957 r = ops->pr_reserve(bdev, key, type, flags); 2958 else 2959 r = -EOPNOTSUPP; 2960 out: 2961 dm_unprepare_ioctl(md, srcu_idx); 2962 return r; 2963 } 2964 2965 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 2966 { 2967 struct mapped_device *md = bdev->bd_disk->private_data; 2968 const struct pr_ops *ops; 2969 int r, srcu_idx; 2970 2971 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 2972 if (r < 0) 2973 goto out; 2974 2975 ops = bdev->bd_disk->fops->pr_ops; 2976 if (ops && ops->pr_release) 2977 r = ops->pr_release(bdev, key, type); 2978 else 2979 r = -EOPNOTSUPP; 2980 out: 2981 dm_unprepare_ioctl(md, srcu_idx); 2982 return r; 2983 } 2984 2985 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, 2986 enum pr_type type, bool abort) 2987 { 2988 struct mapped_device *md = bdev->bd_disk->private_data; 2989 const struct pr_ops *ops; 2990 int r, srcu_idx; 2991 2992 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 2993 if (r < 0) 2994 goto out; 2995 2996 ops = bdev->bd_disk->fops->pr_ops; 2997 if (ops && ops->pr_preempt) 2998 r = ops->pr_preempt(bdev, old_key, new_key, type, abort); 2999 else 3000 r = -EOPNOTSUPP; 3001 out: 3002 dm_unprepare_ioctl(md, srcu_idx); 3003 return r; 3004 } 3005 3006 static int dm_pr_clear(struct block_device *bdev, u64 key) 3007 { 3008 struct mapped_device *md = bdev->bd_disk->private_data; 3009 const struct pr_ops *ops; 3010 int r, srcu_idx; 3011 3012 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3013 if (r < 0) 3014 goto out; 3015 3016 ops = bdev->bd_disk->fops->pr_ops; 3017 if (ops && ops->pr_clear) 3018 r = ops->pr_clear(bdev, key); 3019 else 3020 r = -EOPNOTSUPP; 3021 out: 3022 dm_unprepare_ioctl(md, srcu_idx); 3023 return r; 3024 } 3025 3026 static const struct pr_ops dm_pr_ops = { 3027 .pr_register = dm_pr_register, 3028 .pr_reserve = dm_pr_reserve, 3029 .pr_release = dm_pr_release, 3030 .pr_preempt = dm_pr_preempt, 3031 .pr_clear = dm_pr_clear, 3032 }; 3033 3034 static const struct block_device_operations dm_blk_dops = { 3035 .submit_bio = dm_submit_bio, 3036 .open = dm_blk_open, 3037 .release = dm_blk_close, 3038 .ioctl = dm_blk_ioctl, 3039 .getgeo = dm_blk_getgeo, 3040 
.report_zones = dm_blk_report_zones, 3041 .pr_ops = &dm_pr_ops, 3042 .owner = THIS_MODULE 3043 }; 3044 3045 static const struct block_device_operations dm_rq_blk_dops = { 3046 .open = dm_blk_open, 3047 .release = dm_blk_close, 3048 .ioctl = dm_blk_ioctl, 3049 .getgeo = dm_blk_getgeo, 3050 .pr_ops = &dm_pr_ops, 3051 .owner = THIS_MODULE 3052 }; 3053 3054 static const struct dax_operations dm_dax_ops = { 3055 .direct_access = dm_dax_direct_access, 3056 .dax_supported = dm_dax_supported, 3057 .copy_from_iter = dm_dax_copy_from_iter, 3058 .copy_to_iter = dm_dax_copy_to_iter, 3059 .zero_page_range = dm_dax_zero_page_range, 3060 }; 3061 3062 /* 3063 * module hooks 3064 */ 3065 module_init(dm_init); 3066 module_exit(dm_exit); 3067 3068 module_param(major, uint, 0); 3069 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3070 3071 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3072 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3073 3074 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); 3075 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); 3076 3077 module_param(swap_bios, int, S_IRUGO | S_IWUSR); 3078 MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); 3079 3080 MODULE_DESCRIPTION(DM_NAME " driver"); 3081 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3082 MODULE_LICENSE("GPL"); 3083
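/*
 * Usage sketch (illustrative, not part of the driver): with device-mapper
 * built as a module (commonly dm_mod), the writable parameters declared
 * above are exposed under /sys/module/dm_mod/parameters/, e.g.
 *
 *   cat /sys/module/dm_mod/parameters/swap_bios
 *   echo 32 > /sys/module/dm_mod/parameters/reserved_bio_based_ios
 *
 * "major" is registered with permissions 0, so it has no sysfs entry and
 * can only be set at load time, e.g. "modprobe dm_mod major=240" (240 is
 * only an example value; leaving it 0 lets the kernel pick a major
 * dynamically).
 */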