1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison-v2.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/rwsem.h> 19 #include <linux/slab.h> 20 #include <linux/vmalloc.h> 21 22 #define DM_MSG_PREFIX "cache" 23 24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 25 "A percentage of time allocated for copying to and/or from cache"); 26 27 /*----------------------------------------------------------------*/ 28 29 /* 30 * Glossary: 31 * 32 * oblock: index of an origin block 33 * cblock: index of a cache block 34 * promotion: movement of a block from origin to cache 35 * demotion: movement of a block from cache to origin 36 * migration: movement of a block between the origin and cache device, 37 * either direction 38 */ 39 40 /*----------------------------------------------------------------*/ 41 42 struct io_tracker { 43 spinlock_t lock; 44 45 /* 46 * Sectors of in-flight IO. 47 */ 48 sector_t in_flight; 49 50 /* 51 * The time, in jiffies, when this device became idle (if it is 52 * indeed idle). 53 */ 54 unsigned long idle_time; 55 unsigned long last_update_time; 56 }; 57 58 static void iot_init(struct io_tracker *iot) 59 { 60 spin_lock_init(&iot->lock); 61 iot->in_flight = 0ul; 62 iot->idle_time = 0ul; 63 iot->last_update_time = jiffies; 64 } 65 66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 67 { 68 if (iot->in_flight) 69 return false; 70 71 return time_after(jiffies, iot->idle_time + jifs); 72 } 73 74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 75 { 76 bool r; 77 unsigned long flags; 78 79 spin_lock_irqsave(&iot->lock, flags); 80 r = __iot_idle_for(iot, jifs); 81 spin_unlock_irqrestore(&iot->lock, flags); 82 83 return r; 84 } 85 86 static void iot_io_begin(struct io_tracker *iot, sector_t len) 87 { 88 unsigned long flags; 89 90 spin_lock_irqsave(&iot->lock, flags); 91 iot->in_flight += len; 92 spin_unlock_irqrestore(&iot->lock, flags); 93 } 94 95 static void __iot_io_end(struct io_tracker *iot, sector_t len) 96 { 97 if (!len) 98 return; 99 100 iot->in_flight -= len; 101 if (!iot->in_flight) 102 iot->idle_time = jiffies; 103 } 104 105 static void iot_io_end(struct io_tracker *iot, sector_t len) 106 { 107 unsigned long flags; 108 109 spin_lock_irqsave(&iot->lock, flags); 110 __iot_io_end(iot, len); 111 spin_unlock_irqrestore(&iot->lock, flags); 112 } 113 114 /*----------------------------------------------------------------*/ 115 116 /* 117 * Represents a chunk of future work. 'input' allows continuations to pass 118 * values between themselves, typically error values. 119 */ 120 struct continuation { 121 struct work_struct ws; 122 blk_status_t input; 123 }; 124 125 static inline void init_continuation(struct continuation *k, 126 void (*fn)(struct work_struct *)) 127 { 128 INIT_WORK(&k->ws, fn); 129 k->input = 0; 130 } 131 132 static inline void queue_continuation(struct workqueue_struct *wq, 133 struct continuation *k) 134 { 135 queue_work(wq, &k->ws); 136 } 137 138 /*----------------------------------------------------------------*/ 139 140 /* 141 * The batcher collects together pieces of work that need a particular 142 * operation to occur before they can proceed (typically a commit). 143 */ 144 struct batcher { 145 /* 146 * The operation that everyone is waiting for. 147 */ 148 blk_status_t (*commit_op)(void *context); 149 void *commit_context; 150 151 /* 152 * This is how bios should be issued once the commit op is complete 153 * (accounted_request). 154 */ 155 void (*issue_op)(struct bio *bio, void *context); 156 void *issue_context; 157 158 /* 159 * Queued work gets put on here after commit. 160 */ 161 struct workqueue_struct *wq; 162 163 spinlock_t lock; 164 struct list_head work_items; 165 struct bio_list bios; 166 struct work_struct commit_work; 167 168 bool commit_scheduled; 169 }; 170 171 static void __commit(struct work_struct *_ws) 172 { 173 struct batcher *b = container_of(_ws, struct batcher, commit_work); 174 blk_status_t r; 175 unsigned long flags; 176 struct list_head work_items; 177 struct work_struct *ws, *tmp; 178 struct continuation *k; 179 struct bio *bio; 180 struct bio_list bios; 181 182 INIT_LIST_HEAD(&work_items); 183 bio_list_init(&bios); 184 185 /* 186 * We have to grab these before the commit_op to avoid a race 187 * condition. 188 */ 189 spin_lock_irqsave(&b->lock, flags); 190 list_splice_init(&b->work_items, &work_items); 191 bio_list_merge(&bios, &b->bios); 192 bio_list_init(&b->bios); 193 b->commit_scheduled = false; 194 spin_unlock_irqrestore(&b->lock, flags); 195 196 r = b->commit_op(b->commit_context); 197 198 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 199 k = container_of(ws, struct continuation, ws); 200 k->input = r; 201 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 202 queue_work(b->wq, ws); 203 } 204 205 while ((bio = bio_list_pop(&bios))) { 206 if (r) { 207 bio->bi_status = r; 208 bio_endio(bio); 209 } else 210 b->issue_op(bio, b->issue_context); 211 } 212 } 213 214 static void batcher_init(struct batcher *b, 215 blk_status_t (*commit_op)(void *), 216 void *commit_context, 217 void (*issue_op)(struct bio *bio, void *), 218 void *issue_context, 219 struct workqueue_struct *wq) 220 { 221 b->commit_op = commit_op; 222 b->commit_context = commit_context; 223 b->issue_op = issue_op; 224 b->issue_context = issue_context; 225 b->wq = wq; 226 227 spin_lock_init(&b->lock); 228 INIT_LIST_HEAD(&b->work_items); 229 bio_list_init(&b->bios); 230 INIT_WORK(&b->commit_work, __commit); 231 b->commit_scheduled = false; 232 } 233 234 static void async_commit(struct batcher *b) 235 { 236 queue_work(b->wq, &b->commit_work); 237 } 238 239 static void continue_after_commit(struct batcher *b, struct continuation *k) 240 { 241 unsigned long flags; 242 bool commit_scheduled; 243 244 spin_lock_irqsave(&b->lock, flags); 245 commit_scheduled = b->commit_scheduled; 246 list_add_tail(&k->ws.entry, &b->work_items); 247 spin_unlock_irqrestore(&b->lock, flags); 248 249 if (commit_scheduled) 250 async_commit(b); 251 } 252 253 /* 254 * Bios are errored if commit failed. 255 */ 256 static void issue_after_commit(struct batcher *b, struct bio *bio) 257 { 258 unsigned long flags; 259 bool commit_scheduled; 260 261 spin_lock_irqsave(&b->lock, flags); 262 commit_scheduled = b->commit_scheduled; 263 bio_list_add(&b->bios, bio); 264 spin_unlock_irqrestore(&b->lock, flags); 265 266 if (commit_scheduled) 267 async_commit(b); 268 } 269 270 /* 271 * Call this if some urgent work is waiting for the commit to complete. 272 */ 273 static void schedule_commit(struct batcher *b) 274 { 275 bool immediate; 276 unsigned long flags; 277 278 spin_lock_irqsave(&b->lock, flags); 279 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 280 b->commit_scheduled = true; 281 spin_unlock_irqrestore(&b->lock, flags); 282 283 if (immediate) 284 async_commit(b); 285 } 286 287 /* 288 * There are a couple of places where we let a bio run, but want to do some 289 * work before calling its endio function. We do this by temporarily 290 * changing the endio fn. 291 */ 292 struct dm_hook_info { 293 bio_end_io_t *bi_end_io; 294 }; 295 296 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 297 bio_end_io_t *bi_end_io, void *bi_private) 298 { 299 h->bi_end_io = bio->bi_end_io; 300 301 bio->bi_end_io = bi_end_io; 302 bio->bi_private = bi_private; 303 } 304 305 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 306 { 307 bio->bi_end_io = h->bi_end_io; 308 } 309 310 /*----------------------------------------------------------------*/ 311 312 #define MIGRATION_POOL_SIZE 128 313 #define COMMIT_PERIOD HZ 314 #define MIGRATION_COUNT_WINDOW 10 315 316 /* 317 * The block size of the device holding cache data must be 318 * between 32KB and 1GB. 319 */ 320 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 321 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 322 323 enum cache_metadata_mode { 324 CM_WRITE, /* metadata may be changed */ 325 CM_READ_ONLY, /* metadata may not be changed */ 326 CM_FAIL 327 }; 328 329 enum cache_io_mode { 330 /* 331 * Data is written to cached blocks only. These blocks are marked 332 * dirty. If you lose the cache device you will lose data. 333 * Potential performance increase for both reads and writes. 334 */ 335 CM_IO_WRITEBACK, 336 337 /* 338 * Data is written to both cache and origin. Blocks are never 339 * dirty. Potential performance benfit for reads only. 340 */ 341 CM_IO_WRITETHROUGH, 342 343 /* 344 * A degraded mode useful for various cache coherency situations 345 * (eg, rolling back snapshots). Reads and writes always go to the 346 * origin. If a write goes to a cached oblock, then the cache 347 * block is invalidated. 348 */ 349 CM_IO_PASSTHROUGH 350 }; 351 352 struct cache_features { 353 enum cache_metadata_mode mode; 354 enum cache_io_mode io_mode; 355 unsigned metadata_version; 356 }; 357 358 struct cache_stats { 359 atomic_t read_hit; 360 atomic_t read_miss; 361 atomic_t write_hit; 362 atomic_t write_miss; 363 atomic_t demotion; 364 atomic_t promotion; 365 atomic_t writeback; 366 atomic_t copies_avoided; 367 atomic_t cache_cell_clash; 368 atomic_t commit_count; 369 atomic_t discard_count; 370 }; 371 372 struct cache { 373 struct dm_target *ti; 374 spinlock_t lock; 375 376 /* 377 * Fields for converting from sectors to blocks. 378 */ 379 int sectors_per_block_shift; 380 sector_t sectors_per_block; 381 382 struct dm_cache_metadata *cmd; 383 384 /* 385 * Metadata is written to this device. 386 */ 387 struct dm_dev *metadata_dev; 388 389 /* 390 * The slower of the two data devices. Typically a spindle. 391 */ 392 struct dm_dev *origin_dev; 393 394 /* 395 * The faster of the two data devices. Typically an SSD. 396 */ 397 struct dm_dev *cache_dev; 398 399 /* 400 * Size of the origin device in _complete_ blocks and native sectors. 401 */ 402 dm_oblock_t origin_blocks; 403 sector_t origin_sectors; 404 405 /* 406 * Size of the cache device in blocks. 407 */ 408 dm_cblock_t cache_size; 409 410 /* 411 * Invalidation fields. 412 */ 413 spinlock_t invalidation_lock; 414 struct list_head invalidation_requests; 415 416 sector_t migration_threshold; 417 wait_queue_head_t migration_wait; 418 atomic_t nr_allocated_migrations; 419 420 /* 421 * The number of in flight migrations that are performing 422 * background io. eg, promotion, writeback. 423 */ 424 atomic_t nr_io_migrations; 425 426 struct bio_list deferred_bios; 427 428 struct rw_semaphore quiesce_lock; 429 430 struct dm_target_callbacks callbacks; 431 432 /* 433 * origin_blocks entries, discarded if set. 434 */ 435 dm_dblock_t discard_nr_blocks; 436 unsigned long *discard_bitset; 437 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 438 439 /* 440 * Rather than reconstructing the table line for the status we just 441 * save it and regurgitate. 442 */ 443 unsigned nr_ctr_args; 444 const char **ctr_args; 445 446 struct dm_kcopyd_client *copier; 447 struct work_struct deferred_bio_worker; 448 struct work_struct migration_worker; 449 struct workqueue_struct *wq; 450 struct delayed_work waker; 451 struct dm_bio_prison_v2 *prison; 452 453 /* 454 * cache_size entries, dirty if set 455 */ 456 unsigned long *dirty_bitset; 457 atomic_t nr_dirty; 458 459 unsigned policy_nr_args; 460 struct dm_cache_policy *policy; 461 462 /* 463 * Cache features such as write-through. 464 */ 465 struct cache_features features; 466 467 struct cache_stats stats; 468 469 bool need_tick_bio:1; 470 bool sized:1; 471 bool invalidate:1; 472 bool commit_requested:1; 473 bool loaded_mappings:1; 474 bool loaded_discards:1; 475 476 struct rw_semaphore background_work_lock; 477 478 struct batcher committer; 479 struct work_struct commit_ws; 480 481 struct io_tracker tracker; 482 483 mempool_t migration_pool; 484 485 struct bio_set bs; 486 }; 487 488 struct per_bio_data { 489 bool tick:1; 490 unsigned req_nr:2; 491 struct dm_bio_prison_cell_v2 *cell; 492 struct dm_hook_info hook_info; 493 sector_t len; 494 }; 495 496 struct dm_cache_migration { 497 struct continuation k; 498 struct cache *cache; 499 500 struct policy_work *op; 501 struct bio *overwrite_bio; 502 struct dm_bio_prison_cell_v2 *cell; 503 504 dm_cblock_t invalidate_cblock; 505 dm_oblock_t invalidate_oblock; 506 }; 507 508 /*----------------------------------------------------------------*/ 509 510 static bool writethrough_mode(struct cache *cache) 511 { 512 return cache->features.io_mode == CM_IO_WRITETHROUGH; 513 } 514 515 static bool writeback_mode(struct cache *cache) 516 { 517 return cache->features.io_mode == CM_IO_WRITEBACK; 518 } 519 520 static inline bool passthrough_mode(struct cache *cache) 521 { 522 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 523 } 524 525 /*----------------------------------------------------------------*/ 526 527 static void wake_deferred_bio_worker(struct cache *cache) 528 { 529 queue_work(cache->wq, &cache->deferred_bio_worker); 530 } 531 532 static void wake_migration_worker(struct cache *cache) 533 { 534 if (passthrough_mode(cache)) 535 return; 536 537 queue_work(cache->wq, &cache->migration_worker); 538 } 539 540 /*----------------------------------------------------------------*/ 541 542 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 543 { 544 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); 545 } 546 547 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 548 { 549 dm_bio_prison_free_cell_v2(cache->prison, cell); 550 } 551 552 static struct dm_cache_migration *alloc_migration(struct cache *cache) 553 { 554 struct dm_cache_migration *mg; 555 556 mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT); 557 if (!mg) 558 return NULL; 559 560 memset(mg, 0, sizeof(*mg)); 561 562 mg->cache = cache; 563 atomic_inc(&cache->nr_allocated_migrations); 564 565 return mg; 566 } 567 568 static void free_migration(struct dm_cache_migration *mg) 569 { 570 struct cache *cache = mg->cache; 571 572 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 573 wake_up(&cache->migration_wait); 574 575 mempool_free(mg, &cache->migration_pool); 576 } 577 578 /*----------------------------------------------------------------*/ 579 580 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 581 { 582 return to_oblock(from_oblock(b) + 1ull); 583 } 584 585 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 586 { 587 key->virtual = 0; 588 key->dev = 0; 589 key->block_begin = from_oblock(begin); 590 key->block_end = from_oblock(end); 591 } 592 593 /* 594 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 595 * level 1 which prevents *both* READs and WRITEs. 596 */ 597 #define WRITE_LOCK_LEVEL 0 598 #define READ_WRITE_LOCK_LEVEL 1 599 600 static unsigned lock_level(struct bio *bio) 601 { 602 return bio_data_dir(bio) == WRITE ? 603 WRITE_LOCK_LEVEL : 604 READ_WRITE_LOCK_LEVEL; 605 } 606 607 /*---------------------------------------------------------------- 608 * Per bio data 609 *--------------------------------------------------------------*/ 610 611 static struct per_bio_data *get_per_bio_data(struct bio *bio) 612 { 613 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 614 BUG_ON(!pb); 615 return pb; 616 } 617 618 static struct per_bio_data *init_per_bio_data(struct bio *bio) 619 { 620 struct per_bio_data *pb = get_per_bio_data(bio); 621 622 pb->tick = false; 623 pb->req_nr = dm_bio_get_target_bio_nr(bio); 624 pb->cell = NULL; 625 pb->len = 0; 626 627 return pb; 628 } 629 630 /*----------------------------------------------------------------*/ 631 632 static void defer_bio(struct cache *cache, struct bio *bio) 633 { 634 unsigned long flags; 635 636 spin_lock_irqsave(&cache->lock, flags); 637 bio_list_add(&cache->deferred_bios, bio); 638 spin_unlock_irqrestore(&cache->lock, flags); 639 640 wake_deferred_bio_worker(cache); 641 } 642 643 static void defer_bios(struct cache *cache, struct bio_list *bios) 644 { 645 unsigned long flags; 646 647 spin_lock_irqsave(&cache->lock, flags); 648 bio_list_merge(&cache->deferred_bios, bios); 649 bio_list_init(bios); 650 spin_unlock_irqrestore(&cache->lock, flags); 651 652 wake_deferred_bio_worker(cache); 653 } 654 655 /*----------------------------------------------------------------*/ 656 657 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 658 { 659 bool r; 660 struct per_bio_data *pb; 661 struct dm_cell_key_v2 key; 662 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 663 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 664 665 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 666 if (!cell_prealloc) { 667 defer_bio(cache, bio); 668 return false; 669 } 670 671 build_key(oblock, end, &key); 672 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 673 if (!r) { 674 /* 675 * Failed to get the lock. 676 */ 677 free_prison_cell(cache, cell_prealloc); 678 return r; 679 } 680 681 if (cell != cell_prealloc) 682 free_prison_cell(cache, cell_prealloc); 683 684 pb = get_per_bio_data(bio); 685 pb->cell = cell; 686 687 return r; 688 } 689 690 /*----------------------------------------------------------------*/ 691 692 static bool is_dirty(struct cache *cache, dm_cblock_t b) 693 { 694 return test_bit(from_cblock(b), cache->dirty_bitset); 695 } 696 697 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 698 { 699 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 700 atomic_inc(&cache->nr_dirty); 701 policy_set_dirty(cache->policy, cblock); 702 } 703 } 704 705 /* 706 * These two are called when setting after migrations to force the policy 707 * and dirty bitset to be in sync. 708 */ 709 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 710 { 711 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 712 atomic_inc(&cache->nr_dirty); 713 policy_set_dirty(cache->policy, cblock); 714 } 715 716 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 717 { 718 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 719 if (atomic_dec_return(&cache->nr_dirty) == 0) 720 dm_table_event(cache->ti->table); 721 } 722 723 policy_clear_dirty(cache->policy, cblock); 724 } 725 726 /*----------------------------------------------------------------*/ 727 728 static bool block_size_is_power_of_two(struct cache *cache) 729 { 730 return cache->sectors_per_block_shift >= 0; 731 } 732 733 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 734 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 735 __always_inline 736 #endif 737 static dm_block_t block_div(dm_block_t b, uint32_t n) 738 { 739 do_div(b, n); 740 741 return b; 742 } 743 744 static dm_block_t oblocks_per_dblock(struct cache *cache) 745 { 746 dm_block_t oblocks = cache->discard_block_size; 747 748 if (block_size_is_power_of_two(cache)) 749 oblocks >>= cache->sectors_per_block_shift; 750 else 751 oblocks = block_div(oblocks, cache->sectors_per_block); 752 753 return oblocks; 754 } 755 756 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 757 { 758 return to_dblock(block_div(from_oblock(oblock), 759 oblocks_per_dblock(cache))); 760 } 761 762 static void set_discard(struct cache *cache, dm_dblock_t b) 763 { 764 unsigned long flags; 765 766 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 767 atomic_inc(&cache->stats.discard_count); 768 769 spin_lock_irqsave(&cache->lock, flags); 770 set_bit(from_dblock(b), cache->discard_bitset); 771 spin_unlock_irqrestore(&cache->lock, flags); 772 } 773 774 static void clear_discard(struct cache *cache, dm_dblock_t b) 775 { 776 unsigned long flags; 777 778 spin_lock_irqsave(&cache->lock, flags); 779 clear_bit(from_dblock(b), cache->discard_bitset); 780 spin_unlock_irqrestore(&cache->lock, flags); 781 } 782 783 static bool is_discarded(struct cache *cache, dm_dblock_t b) 784 { 785 int r; 786 unsigned long flags; 787 788 spin_lock_irqsave(&cache->lock, flags); 789 r = test_bit(from_dblock(b), cache->discard_bitset); 790 spin_unlock_irqrestore(&cache->lock, flags); 791 792 return r; 793 } 794 795 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 796 { 797 int r; 798 unsigned long flags; 799 800 spin_lock_irqsave(&cache->lock, flags); 801 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 802 cache->discard_bitset); 803 spin_unlock_irqrestore(&cache->lock, flags); 804 805 return r; 806 } 807 808 /*---------------------------------------------------------------- 809 * Remapping 810 *--------------------------------------------------------------*/ 811 static void remap_to_origin(struct cache *cache, struct bio *bio) 812 { 813 bio_set_dev(bio, cache->origin_dev->bdev); 814 } 815 816 static void remap_to_cache(struct cache *cache, struct bio *bio, 817 dm_cblock_t cblock) 818 { 819 sector_t bi_sector = bio->bi_iter.bi_sector; 820 sector_t block = from_cblock(cblock); 821 822 bio_set_dev(bio, cache->cache_dev->bdev); 823 if (!block_size_is_power_of_two(cache)) 824 bio->bi_iter.bi_sector = 825 (block * cache->sectors_per_block) + 826 sector_div(bi_sector, cache->sectors_per_block); 827 else 828 bio->bi_iter.bi_sector = 829 (block << cache->sectors_per_block_shift) | 830 (bi_sector & (cache->sectors_per_block - 1)); 831 } 832 833 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 834 { 835 unsigned long flags; 836 struct per_bio_data *pb; 837 838 spin_lock_irqsave(&cache->lock, flags); 839 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 840 bio_op(bio) != REQ_OP_DISCARD) { 841 pb = get_per_bio_data(bio); 842 pb->tick = true; 843 cache->need_tick_bio = false; 844 } 845 spin_unlock_irqrestore(&cache->lock, flags); 846 } 847 848 static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 849 dm_oblock_t oblock, bool bio_has_pbd) 850 { 851 if (bio_has_pbd) 852 check_if_tick_bio_needed(cache, bio); 853 remap_to_origin(cache, bio); 854 if (bio_data_dir(bio) == WRITE) 855 clear_discard(cache, oblock_to_dblock(cache, oblock)); 856 } 857 858 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 859 dm_oblock_t oblock) 860 { 861 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 862 __remap_to_origin_clear_discard(cache, bio, oblock, true); 863 } 864 865 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 866 dm_oblock_t oblock, dm_cblock_t cblock) 867 { 868 check_if_tick_bio_needed(cache, bio); 869 remap_to_cache(cache, bio, cblock); 870 if (bio_data_dir(bio) == WRITE) { 871 set_dirty(cache, cblock); 872 clear_discard(cache, oblock_to_dblock(cache, oblock)); 873 } 874 } 875 876 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 877 { 878 sector_t block_nr = bio->bi_iter.bi_sector; 879 880 if (!block_size_is_power_of_two(cache)) 881 (void) sector_div(block_nr, cache->sectors_per_block); 882 else 883 block_nr >>= cache->sectors_per_block_shift; 884 885 return to_oblock(block_nr); 886 } 887 888 static bool accountable_bio(struct cache *cache, struct bio *bio) 889 { 890 return bio_op(bio) != REQ_OP_DISCARD; 891 } 892 893 static void accounted_begin(struct cache *cache, struct bio *bio) 894 { 895 struct per_bio_data *pb; 896 897 if (accountable_bio(cache, bio)) { 898 pb = get_per_bio_data(bio); 899 pb->len = bio_sectors(bio); 900 iot_io_begin(&cache->tracker, pb->len); 901 } 902 } 903 904 static void accounted_complete(struct cache *cache, struct bio *bio) 905 { 906 struct per_bio_data *pb = get_per_bio_data(bio); 907 908 iot_io_end(&cache->tracker, pb->len); 909 } 910 911 static void accounted_request(struct cache *cache, struct bio *bio) 912 { 913 accounted_begin(cache, bio); 914 generic_make_request(bio); 915 } 916 917 static void issue_op(struct bio *bio, void *context) 918 { 919 struct cache *cache = context; 920 accounted_request(cache, bio); 921 } 922 923 /* 924 * When running in writethrough mode we need to send writes to clean blocks 925 * to both the cache and origin devices. Clone the bio and send them in parallel. 926 */ 927 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 928 dm_oblock_t oblock, dm_cblock_t cblock) 929 { 930 struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs); 931 932 BUG_ON(!origin_bio); 933 934 bio_chain(origin_bio, bio); 935 /* 936 * Passing false to __remap_to_origin_clear_discard() skips 937 * all code that might use per_bio_data (since clone doesn't have it) 938 */ 939 __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); 940 submit_bio(origin_bio); 941 942 remap_to_cache(cache, bio, cblock); 943 } 944 945 /*---------------------------------------------------------------- 946 * Failure modes 947 *--------------------------------------------------------------*/ 948 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 949 { 950 return cache->features.mode; 951 } 952 953 static const char *cache_device_name(struct cache *cache) 954 { 955 return dm_device_name(dm_table_get_md(cache->ti->table)); 956 } 957 958 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 959 { 960 const char *descs[] = { 961 "write", 962 "read-only", 963 "fail" 964 }; 965 966 dm_table_event(cache->ti->table); 967 DMINFO("%s: switching cache to %s mode", 968 cache_device_name(cache), descs[(int)mode]); 969 } 970 971 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 972 { 973 bool needs_check; 974 enum cache_metadata_mode old_mode = get_cache_mode(cache); 975 976 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 977 DMERR("%s: unable to read needs_check flag, setting failure mode.", 978 cache_device_name(cache)); 979 new_mode = CM_FAIL; 980 } 981 982 if (new_mode == CM_WRITE && needs_check) { 983 DMERR("%s: unable to switch cache to write mode until repaired.", 984 cache_device_name(cache)); 985 if (old_mode != new_mode) 986 new_mode = old_mode; 987 else 988 new_mode = CM_READ_ONLY; 989 } 990 991 /* Never move out of fail mode */ 992 if (old_mode == CM_FAIL) 993 new_mode = CM_FAIL; 994 995 switch (new_mode) { 996 case CM_FAIL: 997 case CM_READ_ONLY: 998 dm_cache_metadata_set_read_only(cache->cmd); 999 break; 1000 1001 case CM_WRITE: 1002 dm_cache_metadata_set_read_write(cache->cmd); 1003 break; 1004 } 1005 1006 cache->features.mode = new_mode; 1007 1008 if (new_mode != old_mode) 1009 notify_mode_switch(cache, new_mode); 1010 } 1011 1012 static void abort_transaction(struct cache *cache) 1013 { 1014 const char *dev_name = cache_device_name(cache); 1015 1016 if (get_cache_mode(cache) >= CM_READ_ONLY) 1017 return; 1018 1019 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 1020 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 1021 set_cache_mode(cache, CM_FAIL); 1022 } 1023 1024 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1025 if (dm_cache_metadata_abort(cache->cmd)) { 1026 DMERR("%s: failed to abort metadata transaction", dev_name); 1027 set_cache_mode(cache, CM_FAIL); 1028 } 1029 } 1030 1031 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1032 { 1033 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1034 cache_device_name(cache), op, r); 1035 abort_transaction(cache); 1036 set_cache_mode(cache, CM_READ_ONLY); 1037 } 1038 1039 /*----------------------------------------------------------------*/ 1040 1041 static void load_stats(struct cache *cache) 1042 { 1043 struct dm_cache_statistics stats; 1044 1045 dm_cache_metadata_get_stats(cache->cmd, &stats); 1046 atomic_set(&cache->stats.read_hit, stats.read_hits); 1047 atomic_set(&cache->stats.read_miss, stats.read_misses); 1048 atomic_set(&cache->stats.write_hit, stats.write_hits); 1049 atomic_set(&cache->stats.write_miss, stats.write_misses); 1050 } 1051 1052 static void save_stats(struct cache *cache) 1053 { 1054 struct dm_cache_statistics stats; 1055 1056 if (get_cache_mode(cache) >= CM_READ_ONLY) 1057 return; 1058 1059 stats.read_hits = atomic_read(&cache->stats.read_hit); 1060 stats.read_misses = atomic_read(&cache->stats.read_miss); 1061 stats.write_hits = atomic_read(&cache->stats.write_hit); 1062 stats.write_misses = atomic_read(&cache->stats.write_miss); 1063 1064 dm_cache_metadata_set_stats(cache->cmd, &stats); 1065 } 1066 1067 static void update_stats(struct cache_stats *stats, enum policy_operation op) 1068 { 1069 switch (op) { 1070 case POLICY_PROMOTE: 1071 atomic_inc(&stats->promotion); 1072 break; 1073 1074 case POLICY_DEMOTE: 1075 atomic_inc(&stats->demotion); 1076 break; 1077 1078 case POLICY_WRITEBACK: 1079 atomic_inc(&stats->writeback); 1080 break; 1081 } 1082 } 1083 1084 /*---------------------------------------------------------------- 1085 * Migration processing 1086 * 1087 * Migration covers moving data from the origin device to the cache, or 1088 * vice versa. 1089 *--------------------------------------------------------------*/ 1090 1091 static void inc_io_migrations(struct cache *cache) 1092 { 1093 atomic_inc(&cache->nr_io_migrations); 1094 } 1095 1096 static void dec_io_migrations(struct cache *cache) 1097 { 1098 atomic_dec(&cache->nr_io_migrations); 1099 } 1100 1101 static bool discard_or_flush(struct bio *bio) 1102 { 1103 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1104 } 1105 1106 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1107 dm_dblock_t *b, dm_dblock_t *e) 1108 { 1109 sector_t sb = bio->bi_iter.bi_sector; 1110 sector_t se = bio_end_sector(bio); 1111 1112 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1113 1114 if (se - sb < cache->discard_block_size) 1115 *e = *b; 1116 else 1117 *e = to_dblock(block_div(se, cache->discard_block_size)); 1118 } 1119 1120 /*----------------------------------------------------------------*/ 1121 1122 static void prevent_background_work(struct cache *cache) 1123 { 1124 lockdep_off(); 1125 down_write(&cache->background_work_lock); 1126 lockdep_on(); 1127 } 1128 1129 static void allow_background_work(struct cache *cache) 1130 { 1131 lockdep_off(); 1132 up_write(&cache->background_work_lock); 1133 lockdep_on(); 1134 } 1135 1136 static bool background_work_begin(struct cache *cache) 1137 { 1138 bool r; 1139 1140 lockdep_off(); 1141 r = down_read_trylock(&cache->background_work_lock); 1142 lockdep_on(); 1143 1144 return r; 1145 } 1146 1147 static void background_work_end(struct cache *cache) 1148 { 1149 lockdep_off(); 1150 up_read(&cache->background_work_lock); 1151 lockdep_on(); 1152 } 1153 1154 /*----------------------------------------------------------------*/ 1155 1156 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1157 { 1158 return (bio_data_dir(bio) == WRITE) && 1159 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1160 } 1161 1162 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1163 { 1164 return writeback_mode(cache) && 1165 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1166 } 1167 1168 static void quiesce(struct dm_cache_migration *mg, 1169 void (*continuation)(struct work_struct *)) 1170 { 1171 init_continuation(&mg->k, continuation); 1172 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1173 } 1174 1175 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1176 { 1177 struct continuation *k = container_of(ws, struct continuation, ws); 1178 return container_of(k, struct dm_cache_migration, k); 1179 } 1180 1181 static void copy_complete(int read_err, unsigned long write_err, void *context) 1182 { 1183 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1184 1185 if (read_err || write_err) 1186 mg->k.input = BLK_STS_IOERR; 1187 1188 queue_continuation(mg->cache->wq, &mg->k); 1189 } 1190 1191 static void copy(struct dm_cache_migration *mg, bool promote) 1192 { 1193 struct dm_io_region o_region, c_region; 1194 struct cache *cache = mg->cache; 1195 1196 o_region.bdev = cache->origin_dev->bdev; 1197 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1198 o_region.count = cache->sectors_per_block; 1199 1200 c_region.bdev = cache->cache_dev->bdev; 1201 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1202 c_region.count = cache->sectors_per_block; 1203 1204 if (promote) 1205 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1206 else 1207 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1208 } 1209 1210 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1211 { 1212 struct per_bio_data *pb = get_per_bio_data(bio); 1213 1214 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1215 free_prison_cell(cache, pb->cell); 1216 pb->cell = NULL; 1217 } 1218 1219 static void overwrite_endio(struct bio *bio) 1220 { 1221 struct dm_cache_migration *mg = bio->bi_private; 1222 struct cache *cache = mg->cache; 1223 struct per_bio_data *pb = get_per_bio_data(bio); 1224 1225 dm_unhook_bio(&pb->hook_info, bio); 1226 1227 if (bio->bi_status) 1228 mg->k.input = bio->bi_status; 1229 1230 queue_continuation(cache->wq, &mg->k); 1231 } 1232 1233 static void overwrite(struct dm_cache_migration *mg, 1234 void (*continuation)(struct work_struct *)) 1235 { 1236 struct bio *bio = mg->overwrite_bio; 1237 struct per_bio_data *pb = get_per_bio_data(bio); 1238 1239 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1240 1241 /* 1242 * The overwrite bio is part of the copy operation, as such it does 1243 * not set/clear discard or dirty flags. 1244 */ 1245 if (mg->op->op == POLICY_PROMOTE) 1246 remap_to_cache(mg->cache, bio, mg->op->cblock); 1247 else 1248 remap_to_origin(mg->cache, bio); 1249 1250 init_continuation(&mg->k, continuation); 1251 accounted_request(mg->cache, bio); 1252 } 1253 1254 /* 1255 * Migration steps: 1256 * 1257 * 1) exclusive lock preventing WRITEs 1258 * 2) quiesce 1259 * 3) copy or issue overwrite bio 1260 * 4) upgrade to exclusive lock preventing READs and WRITEs 1261 * 5) quiesce 1262 * 6) update metadata and commit 1263 * 7) unlock 1264 */ 1265 static void mg_complete(struct dm_cache_migration *mg, bool success) 1266 { 1267 struct bio_list bios; 1268 struct cache *cache = mg->cache; 1269 struct policy_work *op = mg->op; 1270 dm_cblock_t cblock = op->cblock; 1271 1272 if (success) 1273 update_stats(&cache->stats, op->op); 1274 1275 switch (op->op) { 1276 case POLICY_PROMOTE: 1277 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1278 policy_complete_background_work(cache->policy, op, success); 1279 1280 if (mg->overwrite_bio) { 1281 if (success) 1282 force_set_dirty(cache, cblock); 1283 else if (mg->k.input) 1284 mg->overwrite_bio->bi_status = mg->k.input; 1285 else 1286 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1287 bio_endio(mg->overwrite_bio); 1288 } else { 1289 if (success) 1290 force_clear_dirty(cache, cblock); 1291 dec_io_migrations(cache); 1292 } 1293 break; 1294 1295 case POLICY_DEMOTE: 1296 /* 1297 * We clear dirty here to update the nr_dirty counter. 1298 */ 1299 if (success) 1300 force_clear_dirty(cache, cblock); 1301 policy_complete_background_work(cache->policy, op, success); 1302 dec_io_migrations(cache); 1303 break; 1304 1305 case POLICY_WRITEBACK: 1306 if (success) 1307 force_clear_dirty(cache, cblock); 1308 policy_complete_background_work(cache->policy, op, success); 1309 dec_io_migrations(cache); 1310 break; 1311 } 1312 1313 bio_list_init(&bios); 1314 if (mg->cell) { 1315 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1316 free_prison_cell(cache, mg->cell); 1317 } 1318 1319 free_migration(mg); 1320 defer_bios(cache, &bios); 1321 wake_migration_worker(cache); 1322 1323 background_work_end(cache); 1324 } 1325 1326 static void mg_success(struct work_struct *ws) 1327 { 1328 struct dm_cache_migration *mg = ws_to_mg(ws); 1329 mg_complete(mg, mg->k.input == 0); 1330 } 1331 1332 static void mg_update_metadata(struct work_struct *ws) 1333 { 1334 int r; 1335 struct dm_cache_migration *mg = ws_to_mg(ws); 1336 struct cache *cache = mg->cache; 1337 struct policy_work *op = mg->op; 1338 1339 switch (op->op) { 1340 case POLICY_PROMOTE: 1341 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1342 if (r) { 1343 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1344 cache_device_name(cache)); 1345 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1346 1347 mg_complete(mg, false); 1348 return; 1349 } 1350 mg_complete(mg, true); 1351 break; 1352 1353 case POLICY_DEMOTE: 1354 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1355 if (r) { 1356 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1357 cache_device_name(cache)); 1358 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1359 1360 mg_complete(mg, false); 1361 return; 1362 } 1363 1364 /* 1365 * It would be nice if we only had to commit when a REQ_FLUSH 1366 * comes through. But there's one scenario that we have to 1367 * look out for: 1368 * 1369 * - vblock x in a cache block 1370 * - domotion occurs 1371 * - cache block gets reallocated and over written 1372 * - crash 1373 * 1374 * When we recover, because there was no commit the cache will 1375 * rollback to having the data for vblock x in the cache block. 1376 * But the cache block has since been overwritten, so it'll end 1377 * up pointing to data that was never in 'x' during the history 1378 * of the device. 1379 * 1380 * To avoid this issue we require a commit as part of the 1381 * demotion operation. 1382 */ 1383 init_continuation(&mg->k, mg_success); 1384 continue_after_commit(&cache->committer, &mg->k); 1385 schedule_commit(&cache->committer); 1386 break; 1387 1388 case POLICY_WRITEBACK: 1389 mg_complete(mg, true); 1390 break; 1391 } 1392 } 1393 1394 static void mg_update_metadata_after_copy(struct work_struct *ws) 1395 { 1396 struct dm_cache_migration *mg = ws_to_mg(ws); 1397 1398 /* 1399 * Did the copy succeed? 1400 */ 1401 if (mg->k.input) 1402 mg_complete(mg, false); 1403 else 1404 mg_update_metadata(ws); 1405 } 1406 1407 static void mg_upgrade_lock(struct work_struct *ws) 1408 { 1409 int r; 1410 struct dm_cache_migration *mg = ws_to_mg(ws); 1411 1412 /* 1413 * Did the copy succeed? 1414 */ 1415 if (mg->k.input) 1416 mg_complete(mg, false); 1417 1418 else { 1419 /* 1420 * Now we want the lock to prevent both reads and writes. 1421 */ 1422 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, 1423 READ_WRITE_LOCK_LEVEL); 1424 if (r < 0) 1425 mg_complete(mg, false); 1426 1427 else if (r) 1428 quiesce(mg, mg_update_metadata); 1429 1430 else 1431 mg_update_metadata(ws); 1432 } 1433 } 1434 1435 static void mg_full_copy(struct work_struct *ws) 1436 { 1437 struct dm_cache_migration *mg = ws_to_mg(ws); 1438 struct cache *cache = mg->cache; 1439 struct policy_work *op = mg->op; 1440 bool is_policy_promote = (op->op == POLICY_PROMOTE); 1441 1442 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || 1443 is_discarded_oblock(cache, op->oblock)) { 1444 mg_upgrade_lock(ws); 1445 return; 1446 } 1447 1448 init_continuation(&mg->k, mg_upgrade_lock); 1449 copy(mg, is_policy_promote); 1450 } 1451 1452 static void mg_copy(struct work_struct *ws) 1453 { 1454 struct dm_cache_migration *mg = ws_to_mg(ws); 1455 1456 if (mg->overwrite_bio) { 1457 /* 1458 * No exclusive lock was held when we last checked if the bio 1459 * was optimisable. So we have to check again in case things 1460 * have changed (eg, the block may no longer be discarded). 1461 */ 1462 if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) { 1463 /* 1464 * Fallback to a real full copy after doing some tidying up. 1465 */ 1466 bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio); 1467 BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */ 1468 mg->overwrite_bio = NULL; 1469 inc_io_migrations(mg->cache); 1470 mg_full_copy(ws); 1471 return; 1472 } 1473 1474 /* 1475 * It's safe to do this here, even though it's new data 1476 * because all IO has been locked out of the block. 1477 * 1478 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL 1479 * so _not_ using mg_upgrade_lock() as continutation. 1480 */ 1481 overwrite(mg, mg_update_metadata_after_copy); 1482 1483 } else 1484 mg_full_copy(ws); 1485 } 1486 1487 static int mg_lock_writes(struct dm_cache_migration *mg) 1488 { 1489 int r; 1490 struct dm_cell_key_v2 key; 1491 struct cache *cache = mg->cache; 1492 struct dm_bio_prison_cell_v2 *prealloc; 1493 1494 prealloc = alloc_prison_cell(cache); 1495 if (!prealloc) { 1496 DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); 1497 mg_complete(mg, false); 1498 return -ENOMEM; 1499 } 1500 1501 /* 1502 * Prevent writes to the block, but allow reads to continue. 1503 * Unless we're using an overwrite bio, in which case we lock 1504 * everything. 1505 */ 1506 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1507 r = dm_cell_lock_v2(cache->prison, &key, 1508 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1509 prealloc, &mg->cell); 1510 if (r < 0) { 1511 free_prison_cell(cache, prealloc); 1512 mg_complete(mg, false); 1513 return r; 1514 } 1515 1516 if (mg->cell != prealloc) 1517 free_prison_cell(cache, prealloc); 1518 1519 if (r == 0) 1520 mg_copy(&mg->k.ws); 1521 else 1522 quiesce(mg, mg_copy); 1523 1524 return 0; 1525 } 1526 1527 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1528 { 1529 struct dm_cache_migration *mg; 1530 1531 if (!background_work_begin(cache)) { 1532 policy_complete_background_work(cache->policy, op, false); 1533 return -EPERM; 1534 } 1535 1536 mg = alloc_migration(cache); 1537 if (!mg) { 1538 policy_complete_background_work(cache->policy, op, false); 1539 background_work_end(cache); 1540 return -ENOMEM; 1541 } 1542 1543 mg->op = op; 1544 mg->overwrite_bio = bio; 1545 1546 if (!bio) 1547 inc_io_migrations(cache); 1548 1549 return mg_lock_writes(mg); 1550 } 1551 1552 /*---------------------------------------------------------------- 1553 * invalidation processing 1554 *--------------------------------------------------------------*/ 1555 1556 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1557 { 1558 struct bio_list bios; 1559 struct cache *cache = mg->cache; 1560 1561 bio_list_init(&bios); 1562 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1563 free_prison_cell(cache, mg->cell); 1564 1565 if (!success && mg->overwrite_bio) 1566 bio_io_error(mg->overwrite_bio); 1567 1568 free_migration(mg); 1569 defer_bios(cache, &bios); 1570 1571 background_work_end(cache); 1572 } 1573 1574 static void invalidate_completed(struct work_struct *ws) 1575 { 1576 struct dm_cache_migration *mg = ws_to_mg(ws); 1577 invalidate_complete(mg, !mg->k.input); 1578 } 1579 1580 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1581 { 1582 int r = policy_invalidate_mapping(cache->policy, cblock); 1583 if (!r) { 1584 r = dm_cache_remove_mapping(cache->cmd, cblock); 1585 if (r) { 1586 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1587 cache_device_name(cache)); 1588 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1589 } 1590 1591 } else if (r == -ENODATA) { 1592 /* 1593 * Harmless, already unmapped. 1594 */ 1595 r = 0; 1596 1597 } else 1598 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1599 1600 return r; 1601 } 1602 1603 static void invalidate_remove(struct work_struct *ws) 1604 { 1605 int r; 1606 struct dm_cache_migration *mg = ws_to_mg(ws); 1607 struct cache *cache = mg->cache; 1608 1609 r = invalidate_cblock(cache, mg->invalidate_cblock); 1610 if (r) { 1611 invalidate_complete(mg, false); 1612 return; 1613 } 1614 1615 init_continuation(&mg->k, invalidate_completed); 1616 continue_after_commit(&cache->committer, &mg->k); 1617 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1618 mg->overwrite_bio = NULL; 1619 schedule_commit(&cache->committer); 1620 } 1621 1622 static int invalidate_lock(struct dm_cache_migration *mg) 1623 { 1624 int r; 1625 struct dm_cell_key_v2 key; 1626 struct cache *cache = mg->cache; 1627 struct dm_bio_prison_cell_v2 *prealloc; 1628 1629 prealloc = alloc_prison_cell(cache); 1630 if (!prealloc) { 1631 invalidate_complete(mg, false); 1632 return -ENOMEM; 1633 } 1634 1635 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1636 r = dm_cell_lock_v2(cache->prison, &key, 1637 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1638 if (r < 0) { 1639 free_prison_cell(cache, prealloc); 1640 invalidate_complete(mg, false); 1641 return r; 1642 } 1643 1644 if (mg->cell != prealloc) 1645 free_prison_cell(cache, prealloc); 1646 1647 if (r) 1648 quiesce(mg, invalidate_remove); 1649 1650 else { 1651 /* 1652 * We can't call invalidate_remove() directly here because we 1653 * might still be in request context. 1654 */ 1655 init_continuation(&mg->k, invalidate_remove); 1656 queue_work(cache->wq, &mg->k.ws); 1657 } 1658 1659 return 0; 1660 } 1661 1662 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1663 dm_oblock_t oblock, struct bio *bio) 1664 { 1665 struct dm_cache_migration *mg; 1666 1667 if (!background_work_begin(cache)) 1668 return -EPERM; 1669 1670 mg = alloc_migration(cache); 1671 if (!mg) { 1672 background_work_end(cache); 1673 return -ENOMEM; 1674 } 1675 1676 mg->overwrite_bio = bio; 1677 mg->invalidate_cblock = cblock; 1678 mg->invalidate_oblock = oblock; 1679 1680 return invalidate_lock(mg); 1681 } 1682 1683 /*---------------------------------------------------------------- 1684 * bio processing 1685 *--------------------------------------------------------------*/ 1686 1687 enum busy { 1688 IDLE, 1689 BUSY 1690 }; 1691 1692 static enum busy spare_migration_bandwidth(struct cache *cache) 1693 { 1694 bool idle = iot_idle_for(&cache->tracker, HZ); 1695 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1696 cache->sectors_per_block; 1697 1698 if (idle && current_volume <= cache->migration_threshold) 1699 return IDLE; 1700 else 1701 return BUSY; 1702 } 1703 1704 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1705 { 1706 atomic_inc(bio_data_dir(bio) == READ ? 1707 &cache->stats.read_hit : &cache->stats.write_hit); 1708 } 1709 1710 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1711 { 1712 atomic_inc(bio_data_dir(bio) == READ ? 1713 &cache->stats.read_miss : &cache->stats.write_miss); 1714 } 1715 1716 /*----------------------------------------------------------------*/ 1717 1718 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1719 bool *commit_needed) 1720 { 1721 int r, data_dir; 1722 bool rb, background_queued; 1723 dm_cblock_t cblock; 1724 1725 *commit_needed = false; 1726 1727 rb = bio_detain_shared(cache, block, bio); 1728 if (!rb) { 1729 /* 1730 * An exclusive lock is held for this block, so we have to 1731 * wait. We set the commit_needed flag so the current 1732 * transaction will be committed asap, allowing this lock 1733 * to be dropped. 1734 */ 1735 *commit_needed = true; 1736 return DM_MAPIO_SUBMITTED; 1737 } 1738 1739 data_dir = bio_data_dir(bio); 1740 1741 if (optimisable_bio(cache, bio, block)) { 1742 struct policy_work *op = NULL; 1743 1744 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1745 if (unlikely(r && r != -ENOENT)) { 1746 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1747 cache_device_name(cache), r); 1748 bio_io_error(bio); 1749 return DM_MAPIO_SUBMITTED; 1750 } 1751 1752 if (r == -ENOENT && op) { 1753 bio_drop_shared_lock(cache, bio); 1754 BUG_ON(op->op != POLICY_PROMOTE); 1755 mg_start(cache, op, bio); 1756 return DM_MAPIO_SUBMITTED; 1757 } 1758 } else { 1759 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1760 if (unlikely(r && r != -ENOENT)) { 1761 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1762 cache_device_name(cache), r); 1763 bio_io_error(bio); 1764 return DM_MAPIO_SUBMITTED; 1765 } 1766 1767 if (background_queued) 1768 wake_migration_worker(cache); 1769 } 1770 1771 if (r == -ENOENT) { 1772 struct per_bio_data *pb = get_per_bio_data(bio); 1773 1774 /* 1775 * Miss. 1776 */ 1777 inc_miss_counter(cache, bio); 1778 if (pb->req_nr == 0) { 1779 accounted_begin(cache, bio); 1780 remap_to_origin_clear_discard(cache, bio, block); 1781 } else { 1782 /* 1783 * This is a duplicate writethrough io that is no 1784 * longer needed because the block has been demoted. 1785 */ 1786 bio_endio(bio); 1787 return DM_MAPIO_SUBMITTED; 1788 } 1789 } else { 1790 /* 1791 * Hit. 1792 */ 1793 inc_hit_counter(cache, bio); 1794 1795 /* 1796 * Passthrough always maps to the origin, invalidating any 1797 * cache blocks that are written to. 1798 */ 1799 if (passthrough_mode(cache)) { 1800 if (bio_data_dir(bio) == WRITE) { 1801 bio_drop_shared_lock(cache, bio); 1802 atomic_inc(&cache->stats.demotion); 1803 invalidate_start(cache, cblock, block, bio); 1804 } else 1805 remap_to_origin_clear_discard(cache, bio, block); 1806 } else { 1807 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1808 !is_dirty(cache, cblock)) { 1809 remap_to_origin_and_cache(cache, bio, block, cblock); 1810 accounted_begin(cache, bio); 1811 } else 1812 remap_to_cache_dirty(cache, bio, block, cblock); 1813 } 1814 } 1815 1816 /* 1817 * dm core turns FUA requests into a separate payload and FLUSH req. 1818 */ 1819 if (bio->bi_opf & REQ_FUA) { 1820 /* 1821 * issue_after_commit will call accounted_begin a second time. So 1822 * we call accounted_complete() to avoid double accounting. 1823 */ 1824 accounted_complete(cache, bio); 1825 issue_after_commit(&cache->committer, bio); 1826 *commit_needed = true; 1827 return DM_MAPIO_SUBMITTED; 1828 } 1829 1830 return DM_MAPIO_REMAPPED; 1831 } 1832 1833 static bool process_bio(struct cache *cache, struct bio *bio) 1834 { 1835 bool commit_needed; 1836 1837 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1838 generic_make_request(bio); 1839 1840 return commit_needed; 1841 } 1842 1843 /* 1844 * A non-zero return indicates read_only or fail_io mode. 1845 */ 1846 static int commit(struct cache *cache, bool clean_shutdown) 1847 { 1848 int r; 1849 1850 if (get_cache_mode(cache) >= CM_READ_ONLY) 1851 return -EINVAL; 1852 1853 atomic_inc(&cache->stats.commit_count); 1854 r = dm_cache_commit(cache->cmd, clean_shutdown); 1855 if (r) 1856 metadata_operation_failed(cache, "dm_cache_commit", r); 1857 1858 return r; 1859 } 1860 1861 /* 1862 * Used by the batcher. 1863 */ 1864 static blk_status_t commit_op(void *context) 1865 { 1866 struct cache *cache = context; 1867 1868 if (dm_cache_changed_this_transaction(cache->cmd)) 1869 return errno_to_blk_status(commit(cache, false)); 1870 1871 return 0; 1872 } 1873 1874 /*----------------------------------------------------------------*/ 1875 1876 static bool process_flush_bio(struct cache *cache, struct bio *bio) 1877 { 1878 struct per_bio_data *pb = get_per_bio_data(bio); 1879 1880 if (!pb->req_nr) 1881 remap_to_origin(cache, bio); 1882 else 1883 remap_to_cache(cache, bio, 0); 1884 1885 issue_after_commit(&cache->committer, bio); 1886 return true; 1887 } 1888 1889 static bool process_discard_bio(struct cache *cache, struct bio *bio) 1890 { 1891 dm_dblock_t b, e; 1892 1893 // FIXME: do we need to lock the region? Or can we just assume the 1894 // user wont be so foolish as to issue discard concurrently with 1895 // other IO? 1896 calc_discard_block_range(cache, bio, &b, &e); 1897 while (b != e) { 1898 set_discard(cache, b); 1899 b = to_dblock(from_dblock(b) + 1); 1900 } 1901 1902 bio_endio(bio); 1903 1904 return false; 1905 } 1906 1907 static void process_deferred_bios(struct work_struct *ws) 1908 { 1909 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1910 1911 unsigned long flags; 1912 bool commit_needed = false; 1913 struct bio_list bios; 1914 struct bio *bio; 1915 1916 bio_list_init(&bios); 1917 1918 spin_lock_irqsave(&cache->lock, flags); 1919 bio_list_merge(&bios, &cache->deferred_bios); 1920 bio_list_init(&cache->deferred_bios); 1921 spin_unlock_irqrestore(&cache->lock, flags); 1922 1923 while ((bio = bio_list_pop(&bios))) { 1924 if (bio->bi_opf & REQ_PREFLUSH) 1925 commit_needed = process_flush_bio(cache, bio) || commit_needed; 1926 1927 else if (bio_op(bio) == REQ_OP_DISCARD) 1928 commit_needed = process_discard_bio(cache, bio) || commit_needed; 1929 1930 else 1931 commit_needed = process_bio(cache, bio) || commit_needed; 1932 } 1933 1934 if (commit_needed) 1935 schedule_commit(&cache->committer); 1936 } 1937 1938 /*---------------------------------------------------------------- 1939 * Main worker loop 1940 *--------------------------------------------------------------*/ 1941 1942 static void requeue_deferred_bios(struct cache *cache) 1943 { 1944 struct bio *bio; 1945 struct bio_list bios; 1946 1947 bio_list_init(&bios); 1948 bio_list_merge(&bios, &cache->deferred_bios); 1949 bio_list_init(&cache->deferred_bios); 1950 1951 while ((bio = bio_list_pop(&bios))) { 1952 bio->bi_status = BLK_STS_DM_REQUEUE; 1953 bio_endio(bio); 1954 } 1955 } 1956 1957 /* 1958 * We want to commit periodically so that not too much 1959 * unwritten metadata builds up. 1960 */ 1961 static void do_waker(struct work_struct *ws) 1962 { 1963 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1964 1965 policy_tick(cache->policy, true); 1966 wake_migration_worker(cache); 1967 schedule_commit(&cache->committer); 1968 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1969 } 1970 1971 static void check_migrations(struct work_struct *ws) 1972 { 1973 int r; 1974 struct policy_work *op; 1975 struct cache *cache = container_of(ws, struct cache, migration_worker); 1976 enum busy b; 1977 1978 for (;;) { 1979 b = spare_migration_bandwidth(cache); 1980 1981 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1982 if (r == -ENODATA) 1983 break; 1984 1985 if (r) { 1986 DMERR_LIMIT("%s: policy_background_work failed", 1987 cache_device_name(cache)); 1988 break; 1989 } 1990 1991 r = mg_start(cache, op, NULL); 1992 if (r) 1993 break; 1994 } 1995 } 1996 1997 /*---------------------------------------------------------------- 1998 * Target methods 1999 *--------------------------------------------------------------*/ 2000 2001 /* 2002 * This function gets called on the error paths of the constructor, so we 2003 * have to cope with a partially initialised struct. 2004 */ 2005 static void destroy(struct cache *cache) 2006 { 2007 unsigned i; 2008 2009 mempool_exit(&cache->migration_pool); 2010 2011 if (cache->prison) 2012 dm_bio_prison_destroy_v2(cache->prison); 2013 2014 if (cache->wq) 2015 destroy_workqueue(cache->wq); 2016 2017 if (cache->dirty_bitset) 2018 free_bitset(cache->dirty_bitset); 2019 2020 if (cache->discard_bitset) 2021 free_bitset(cache->discard_bitset); 2022 2023 if (cache->copier) 2024 dm_kcopyd_client_destroy(cache->copier); 2025 2026 if (cache->cmd) 2027 dm_cache_metadata_close(cache->cmd); 2028 2029 if (cache->metadata_dev) 2030 dm_put_device(cache->ti, cache->metadata_dev); 2031 2032 if (cache->origin_dev) 2033 dm_put_device(cache->ti, cache->origin_dev); 2034 2035 if (cache->cache_dev) 2036 dm_put_device(cache->ti, cache->cache_dev); 2037 2038 if (cache->policy) 2039 dm_cache_policy_destroy(cache->policy); 2040 2041 for (i = 0; i < cache->nr_ctr_args ; i++) 2042 kfree(cache->ctr_args[i]); 2043 kfree(cache->ctr_args); 2044 2045 bioset_exit(&cache->bs); 2046 2047 kfree(cache); 2048 } 2049 2050 static void cache_dtr(struct dm_target *ti) 2051 { 2052 struct cache *cache = ti->private; 2053 2054 destroy(cache); 2055 } 2056 2057 static sector_t get_dev_size(struct dm_dev *dev) 2058 { 2059 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2060 } 2061 2062 /*----------------------------------------------------------------*/ 2063 2064 /* 2065 * Construct a cache device mapping. 2066 * 2067 * cache <metadata dev> <cache dev> <origin dev> <block size> 2068 * <#feature args> [<feature arg>]* 2069 * <policy> <#policy args> [<policy arg>]* 2070 * 2071 * metadata dev : fast device holding the persistent metadata 2072 * cache dev : fast device holding cached data blocks 2073 * origin dev : slow device holding original data blocks 2074 * block size : cache unit size in sectors 2075 * 2076 * #feature args : number of feature arguments passed 2077 * feature args : writethrough. (The default is writeback.) 2078 * 2079 * policy : the replacement policy to use 2080 * #policy args : an even number of policy arguments corresponding 2081 * to key/value pairs passed to the policy 2082 * policy args : key/value pairs passed to the policy 2083 * E.g. 'sequential_threshold 1024' 2084 * See cache-policies.txt for details. 2085 * 2086 * Optional feature arguments are: 2087 * writethrough : write through caching that prohibits cache block 2088 * content from being different from origin block content. 2089 * Without this argument, the default behaviour is to write 2090 * back cache block contents later for performance reasons, 2091 * so they may differ from the corresponding origin blocks. 2092 */ 2093 struct cache_args { 2094 struct dm_target *ti; 2095 2096 struct dm_dev *metadata_dev; 2097 2098 struct dm_dev *cache_dev; 2099 sector_t cache_sectors; 2100 2101 struct dm_dev *origin_dev; 2102 sector_t origin_sectors; 2103 2104 uint32_t block_size; 2105 2106 const char *policy_name; 2107 int policy_argc; 2108 const char **policy_argv; 2109 2110 struct cache_features features; 2111 }; 2112 2113 static void destroy_cache_args(struct cache_args *ca) 2114 { 2115 if (ca->metadata_dev) 2116 dm_put_device(ca->ti, ca->metadata_dev); 2117 2118 if (ca->cache_dev) 2119 dm_put_device(ca->ti, ca->cache_dev); 2120 2121 if (ca->origin_dev) 2122 dm_put_device(ca->ti, ca->origin_dev); 2123 2124 kfree(ca); 2125 } 2126 2127 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2128 { 2129 if (!as->argc) { 2130 *error = "Insufficient args"; 2131 return false; 2132 } 2133 2134 return true; 2135 } 2136 2137 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2138 char **error) 2139 { 2140 int r; 2141 sector_t metadata_dev_size; 2142 char b[BDEVNAME_SIZE]; 2143 2144 if (!at_least_one_arg(as, error)) 2145 return -EINVAL; 2146 2147 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2148 &ca->metadata_dev); 2149 if (r) { 2150 *error = "Error opening metadata device"; 2151 return r; 2152 } 2153 2154 metadata_dev_size = get_dev_size(ca->metadata_dev); 2155 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2156 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2157 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2158 2159 return 0; 2160 } 2161 2162 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2163 char **error) 2164 { 2165 int r; 2166 2167 if (!at_least_one_arg(as, error)) 2168 return -EINVAL; 2169 2170 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2171 &ca->cache_dev); 2172 if (r) { 2173 *error = "Error opening cache device"; 2174 return r; 2175 } 2176 ca->cache_sectors = get_dev_size(ca->cache_dev); 2177 2178 return 0; 2179 } 2180 2181 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2182 char **error) 2183 { 2184 int r; 2185 2186 if (!at_least_one_arg(as, error)) 2187 return -EINVAL; 2188 2189 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2190 &ca->origin_dev); 2191 if (r) { 2192 *error = "Error opening origin device"; 2193 return r; 2194 } 2195 2196 ca->origin_sectors = get_dev_size(ca->origin_dev); 2197 if (ca->ti->len > ca->origin_sectors) { 2198 *error = "Device size larger than cached device"; 2199 return -EINVAL; 2200 } 2201 2202 return 0; 2203 } 2204 2205 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2206 char **error) 2207 { 2208 unsigned long block_size; 2209 2210 if (!at_least_one_arg(as, error)) 2211 return -EINVAL; 2212 2213 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2214 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2215 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2216 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2217 *error = "Invalid data block size"; 2218 return -EINVAL; 2219 } 2220 2221 if (block_size > ca->cache_sectors) { 2222 *error = "Data block size is larger than the cache device"; 2223 return -EINVAL; 2224 } 2225 2226 ca->block_size = block_size; 2227 2228 return 0; 2229 } 2230 2231 static void init_features(struct cache_features *cf) 2232 { 2233 cf->mode = CM_WRITE; 2234 cf->io_mode = CM_IO_WRITEBACK; 2235 cf->metadata_version = 1; 2236 } 2237 2238 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2239 char **error) 2240 { 2241 static const struct dm_arg _args[] = { 2242 {0, 2, "Invalid number of cache feature arguments"}, 2243 }; 2244 2245 int r, mode_ctr = 0; 2246 unsigned argc; 2247 const char *arg; 2248 struct cache_features *cf = &ca->features; 2249 2250 init_features(cf); 2251 2252 r = dm_read_arg_group(_args, as, &argc, error); 2253 if (r) 2254 return -EINVAL; 2255 2256 while (argc--) { 2257 arg = dm_shift_arg(as); 2258 2259 if (!strcasecmp(arg, "writeback")) { 2260 cf->io_mode = CM_IO_WRITEBACK; 2261 mode_ctr++; 2262 } 2263 2264 else if (!strcasecmp(arg, "writethrough")) { 2265 cf->io_mode = CM_IO_WRITETHROUGH; 2266 mode_ctr++; 2267 } 2268 2269 else if (!strcasecmp(arg, "passthrough")) { 2270 cf->io_mode = CM_IO_PASSTHROUGH; 2271 mode_ctr++; 2272 } 2273 2274 else if (!strcasecmp(arg, "metadata2")) 2275 cf->metadata_version = 2; 2276 2277 else { 2278 *error = "Unrecognised cache feature requested"; 2279 return -EINVAL; 2280 } 2281 } 2282 2283 if (mode_ctr > 1) { 2284 *error = "Duplicate cache io_mode features requested"; 2285 return -EINVAL; 2286 } 2287 2288 return 0; 2289 } 2290 2291 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2292 char **error) 2293 { 2294 static const struct dm_arg _args[] = { 2295 {0, 1024, "Invalid number of policy arguments"}, 2296 }; 2297 2298 int r; 2299 2300 if (!at_least_one_arg(as, error)) 2301 return -EINVAL; 2302 2303 ca->policy_name = dm_shift_arg(as); 2304 2305 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2306 if (r) 2307 return -EINVAL; 2308 2309 ca->policy_argv = (const char **)as->argv; 2310 dm_consume_args(as, ca->policy_argc); 2311 2312 return 0; 2313 } 2314 2315 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2316 char **error) 2317 { 2318 int r; 2319 struct dm_arg_set as; 2320 2321 as.argc = argc; 2322 as.argv = argv; 2323 2324 r = parse_metadata_dev(ca, &as, error); 2325 if (r) 2326 return r; 2327 2328 r = parse_cache_dev(ca, &as, error); 2329 if (r) 2330 return r; 2331 2332 r = parse_origin_dev(ca, &as, error); 2333 if (r) 2334 return r; 2335 2336 r = parse_block_size(ca, &as, error); 2337 if (r) 2338 return r; 2339 2340 r = parse_features(ca, &as, error); 2341 if (r) 2342 return r; 2343 2344 r = parse_policy(ca, &as, error); 2345 if (r) 2346 return r; 2347 2348 return 0; 2349 } 2350 2351 /*----------------------------------------------------------------*/ 2352 2353 static struct kmem_cache *migration_cache; 2354 2355 #define NOT_CORE_OPTION 1 2356 2357 static int process_config_option(struct cache *cache, const char *key, const char *value) 2358 { 2359 unsigned long tmp; 2360 2361 if (!strcasecmp(key, "migration_threshold")) { 2362 if (kstrtoul(value, 10, &tmp)) 2363 return -EINVAL; 2364 2365 cache->migration_threshold = tmp; 2366 return 0; 2367 } 2368 2369 return NOT_CORE_OPTION; 2370 } 2371 2372 static int set_config_value(struct cache *cache, const char *key, const char *value) 2373 { 2374 int r = process_config_option(cache, key, value); 2375 2376 if (r == NOT_CORE_OPTION) 2377 r = policy_set_config_value(cache->policy, key, value); 2378 2379 if (r) 2380 DMWARN("bad config value for %s: %s", key, value); 2381 2382 return r; 2383 } 2384 2385 static int set_config_values(struct cache *cache, int argc, const char **argv) 2386 { 2387 int r = 0; 2388 2389 if (argc & 1) { 2390 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2391 return -EINVAL; 2392 } 2393 2394 while (argc) { 2395 r = set_config_value(cache, argv[0], argv[1]); 2396 if (r) 2397 break; 2398 2399 argc -= 2; 2400 argv += 2; 2401 } 2402 2403 return r; 2404 } 2405 2406 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2407 char **error) 2408 { 2409 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2410 cache->cache_size, 2411 cache->origin_sectors, 2412 cache->sectors_per_block); 2413 if (IS_ERR(p)) { 2414 *error = "Error creating cache's policy"; 2415 return PTR_ERR(p); 2416 } 2417 cache->policy = p; 2418 BUG_ON(!cache->policy); 2419 2420 return 0; 2421 } 2422 2423 /* 2424 * We want the discard block size to be at least the size of the cache 2425 * block size and have no more than 2^14 discard blocks across the origin. 2426 */ 2427 #define MAX_DISCARD_BLOCKS (1 << 14) 2428 2429 static bool too_many_discard_blocks(sector_t discard_block_size, 2430 sector_t origin_size) 2431 { 2432 (void) sector_div(origin_size, discard_block_size); 2433 2434 return origin_size > MAX_DISCARD_BLOCKS; 2435 } 2436 2437 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2438 sector_t origin_size) 2439 { 2440 sector_t discard_block_size = cache_block_size; 2441 2442 if (origin_size) 2443 while (too_many_discard_blocks(discard_block_size, origin_size)) 2444 discard_block_size *= 2; 2445 2446 return discard_block_size; 2447 } 2448 2449 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2450 { 2451 dm_block_t nr_blocks = from_cblock(size); 2452 2453 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2454 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2455 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2456 "Please consider increasing the cache block size to reduce the overall cache block count.", 2457 (unsigned long long) nr_blocks); 2458 2459 cache->cache_size = size; 2460 } 2461 2462 static int is_congested(struct dm_dev *dev, int bdi_bits) 2463 { 2464 struct request_queue *q = bdev_get_queue(dev->bdev); 2465 return bdi_congested(q->backing_dev_info, bdi_bits); 2466 } 2467 2468 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2469 { 2470 struct cache *cache = container_of(cb, struct cache, callbacks); 2471 2472 return is_congested(cache->origin_dev, bdi_bits) || 2473 is_congested(cache->cache_dev, bdi_bits); 2474 } 2475 2476 #define DEFAULT_MIGRATION_THRESHOLD 2048 2477 2478 static int cache_create(struct cache_args *ca, struct cache **result) 2479 { 2480 int r = 0; 2481 char **error = &ca->ti->error; 2482 struct cache *cache; 2483 struct dm_target *ti = ca->ti; 2484 dm_block_t origin_blocks; 2485 struct dm_cache_metadata *cmd; 2486 bool may_format = ca->features.mode == CM_WRITE; 2487 2488 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2489 if (!cache) 2490 return -ENOMEM; 2491 2492 cache->ti = ca->ti; 2493 ti->private = cache; 2494 ti->num_flush_bios = 2; 2495 ti->flush_supported = true; 2496 2497 ti->num_discard_bios = 1; 2498 ti->discards_supported = true; 2499 ti->split_discard_bios = false; 2500 2501 ti->per_io_data_size = sizeof(struct per_bio_data); 2502 2503 cache->features = ca->features; 2504 if (writethrough_mode(cache)) { 2505 /* Create bioset for writethrough bios issued to origin */ 2506 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2507 if (r) 2508 goto bad; 2509 } 2510 2511 cache->callbacks.congested_fn = cache_is_congested; 2512 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2513 2514 cache->metadata_dev = ca->metadata_dev; 2515 cache->origin_dev = ca->origin_dev; 2516 cache->cache_dev = ca->cache_dev; 2517 2518 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2519 2520 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2521 origin_blocks = block_div(origin_blocks, ca->block_size); 2522 cache->origin_blocks = to_oblock(origin_blocks); 2523 2524 cache->sectors_per_block = ca->block_size; 2525 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2526 r = -EINVAL; 2527 goto bad; 2528 } 2529 2530 if (ca->block_size & (ca->block_size - 1)) { 2531 dm_block_t cache_size = ca->cache_sectors; 2532 2533 cache->sectors_per_block_shift = -1; 2534 cache_size = block_div(cache_size, ca->block_size); 2535 set_cache_size(cache, to_cblock(cache_size)); 2536 } else { 2537 cache->sectors_per_block_shift = __ffs(ca->block_size); 2538 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2539 } 2540 2541 r = create_cache_policy(cache, ca, error); 2542 if (r) 2543 goto bad; 2544 2545 cache->policy_nr_args = ca->policy_argc; 2546 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2547 2548 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2549 if (r) { 2550 *error = "Error setting cache policy's config values"; 2551 goto bad; 2552 } 2553 2554 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2555 ca->block_size, may_format, 2556 dm_cache_policy_get_hint_size(cache->policy), 2557 ca->features.metadata_version); 2558 if (IS_ERR(cmd)) { 2559 *error = "Error creating metadata object"; 2560 r = PTR_ERR(cmd); 2561 goto bad; 2562 } 2563 cache->cmd = cmd; 2564 set_cache_mode(cache, CM_WRITE); 2565 if (get_cache_mode(cache) != CM_WRITE) { 2566 *error = "Unable to get write access to metadata, please check/repair metadata."; 2567 r = -EINVAL; 2568 goto bad; 2569 } 2570 2571 if (passthrough_mode(cache)) { 2572 bool all_clean; 2573 2574 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2575 if (r) { 2576 *error = "dm_cache_metadata_all_clean() failed"; 2577 goto bad; 2578 } 2579 2580 if (!all_clean) { 2581 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2582 r = -EINVAL; 2583 goto bad; 2584 } 2585 2586 policy_allow_migrations(cache->policy, false); 2587 } 2588 2589 spin_lock_init(&cache->lock); 2590 bio_list_init(&cache->deferred_bios); 2591 atomic_set(&cache->nr_allocated_migrations, 0); 2592 atomic_set(&cache->nr_io_migrations, 0); 2593 init_waitqueue_head(&cache->migration_wait); 2594 2595 r = -ENOMEM; 2596 atomic_set(&cache->nr_dirty, 0); 2597 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2598 if (!cache->dirty_bitset) { 2599 *error = "could not allocate dirty bitset"; 2600 goto bad; 2601 } 2602 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2603 2604 cache->discard_block_size = 2605 calculate_discard_block_size(cache->sectors_per_block, 2606 cache->origin_sectors); 2607 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2608 cache->discard_block_size)); 2609 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2610 if (!cache->discard_bitset) { 2611 *error = "could not allocate discard bitset"; 2612 goto bad; 2613 } 2614 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2615 2616 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2617 if (IS_ERR(cache->copier)) { 2618 *error = "could not create kcopyd client"; 2619 r = PTR_ERR(cache->copier); 2620 goto bad; 2621 } 2622 2623 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2624 if (!cache->wq) { 2625 *error = "could not create workqueue for metadata object"; 2626 goto bad; 2627 } 2628 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2629 INIT_WORK(&cache->migration_worker, check_migrations); 2630 INIT_DELAYED_WORK(&cache->waker, do_waker); 2631 2632 cache->prison = dm_bio_prison_create_v2(cache->wq); 2633 if (!cache->prison) { 2634 *error = "could not create bio prison"; 2635 goto bad; 2636 } 2637 2638 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2639 migration_cache); 2640 if (r) { 2641 *error = "Error creating cache's migration mempool"; 2642 goto bad; 2643 } 2644 2645 cache->need_tick_bio = true; 2646 cache->sized = false; 2647 cache->invalidate = false; 2648 cache->commit_requested = false; 2649 cache->loaded_mappings = false; 2650 cache->loaded_discards = false; 2651 2652 load_stats(cache); 2653 2654 atomic_set(&cache->stats.demotion, 0); 2655 atomic_set(&cache->stats.promotion, 0); 2656 atomic_set(&cache->stats.copies_avoided, 0); 2657 atomic_set(&cache->stats.cache_cell_clash, 0); 2658 atomic_set(&cache->stats.commit_count, 0); 2659 atomic_set(&cache->stats.discard_count, 0); 2660 2661 spin_lock_init(&cache->invalidation_lock); 2662 INIT_LIST_HEAD(&cache->invalidation_requests); 2663 2664 batcher_init(&cache->committer, commit_op, cache, 2665 issue_op, cache, cache->wq); 2666 iot_init(&cache->tracker); 2667 2668 init_rwsem(&cache->background_work_lock); 2669 prevent_background_work(cache); 2670 2671 *result = cache; 2672 return 0; 2673 bad: 2674 destroy(cache); 2675 return r; 2676 } 2677 2678 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2679 { 2680 unsigned i; 2681 const char **copy; 2682 2683 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2684 if (!copy) 2685 return -ENOMEM; 2686 for (i = 0; i < argc; i++) { 2687 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2688 if (!copy[i]) { 2689 while (i--) 2690 kfree(copy[i]); 2691 kfree(copy); 2692 return -ENOMEM; 2693 } 2694 } 2695 2696 cache->nr_ctr_args = argc; 2697 cache->ctr_args = copy; 2698 2699 return 0; 2700 } 2701 2702 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2703 { 2704 int r = -EINVAL; 2705 struct cache_args *ca; 2706 struct cache *cache = NULL; 2707 2708 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2709 if (!ca) { 2710 ti->error = "Error allocating memory for cache"; 2711 return -ENOMEM; 2712 } 2713 ca->ti = ti; 2714 2715 r = parse_cache_args(ca, argc, argv, &ti->error); 2716 if (r) 2717 goto out; 2718 2719 r = cache_create(ca, &cache); 2720 if (r) 2721 goto out; 2722 2723 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2724 if (r) { 2725 destroy(cache); 2726 goto out; 2727 } 2728 2729 ti->private = cache; 2730 out: 2731 destroy_cache_args(ca); 2732 return r; 2733 } 2734 2735 /*----------------------------------------------------------------*/ 2736 2737 static int cache_map(struct dm_target *ti, struct bio *bio) 2738 { 2739 struct cache *cache = ti->private; 2740 2741 int r; 2742 bool commit_needed; 2743 dm_oblock_t block = get_bio_block(cache, bio); 2744 2745 init_per_bio_data(bio); 2746 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2747 /* 2748 * This can only occur if the io goes to a partial block at 2749 * the end of the origin device. We don't cache these. 2750 * Just remap to the origin and carry on. 2751 */ 2752 remap_to_origin(cache, bio); 2753 accounted_begin(cache, bio); 2754 return DM_MAPIO_REMAPPED; 2755 } 2756 2757 if (discard_or_flush(bio)) { 2758 defer_bio(cache, bio); 2759 return DM_MAPIO_SUBMITTED; 2760 } 2761 2762 r = map_bio(cache, bio, block, &commit_needed); 2763 if (commit_needed) 2764 schedule_commit(&cache->committer); 2765 2766 return r; 2767 } 2768 2769 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2770 { 2771 struct cache *cache = ti->private; 2772 unsigned long flags; 2773 struct per_bio_data *pb = get_per_bio_data(bio); 2774 2775 if (pb->tick) { 2776 policy_tick(cache->policy, false); 2777 2778 spin_lock_irqsave(&cache->lock, flags); 2779 cache->need_tick_bio = true; 2780 spin_unlock_irqrestore(&cache->lock, flags); 2781 } 2782 2783 bio_drop_shared_lock(cache, bio); 2784 accounted_complete(cache, bio); 2785 2786 return DM_ENDIO_DONE; 2787 } 2788 2789 static int write_dirty_bitset(struct cache *cache) 2790 { 2791 int r; 2792 2793 if (get_cache_mode(cache) >= CM_READ_ONLY) 2794 return -EINVAL; 2795 2796 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2797 if (r) 2798 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2799 2800 return r; 2801 } 2802 2803 static int write_discard_bitset(struct cache *cache) 2804 { 2805 unsigned i, r; 2806 2807 if (get_cache_mode(cache) >= CM_READ_ONLY) 2808 return -EINVAL; 2809 2810 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2811 cache->discard_nr_blocks); 2812 if (r) { 2813 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2814 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2815 return r; 2816 } 2817 2818 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2819 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2820 is_discarded(cache, to_dblock(i))); 2821 if (r) { 2822 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2823 return r; 2824 } 2825 } 2826 2827 return 0; 2828 } 2829 2830 static int write_hints(struct cache *cache) 2831 { 2832 int r; 2833 2834 if (get_cache_mode(cache) >= CM_READ_ONLY) 2835 return -EINVAL; 2836 2837 r = dm_cache_write_hints(cache->cmd, cache->policy); 2838 if (r) { 2839 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2840 return r; 2841 } 2842 2843 return 0; 2844 } 2845 2846 /* 2847 * returns true on success 2848 */ 2849 static bool sync_metadata(struct cache *cache) 2850 { 2851 int r1, r2, r3, r4; 2852 2853 r1 = write_dirty_bitset(cache); 2854 if (r1) 2855 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2856 2857 r2 = write_discard_bitset(cache); 2858 if (r2) 2859 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2860 2861 save_stats(cache); 2862 2863 r3 = write_hints(cache); 2864 if (r3) 2865 DMERR("%s: could not write hints", cache_device_name(cache)); 2866 2867 /* 2868 * If writing the above metadata failed, we still commit, but don't 2869 * set the clean shutdown flag. This will effectively force every 2870 * dirty bit to be set on reload. 2871 */ 2872 r4 = commit(cache, !r1 && !r2 && !r3); 2873 if (r4) 2874 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2875 2876 return !r1 && !r2 && !r3 && !r4; 2877 } 2878 2879 static void cache_postsuspend(struct dm_target *ti) 2880 { 2881 struct cache *cache = ti->private; 2882 2883 prevent_background_work(cache); 2884 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2885 2886 cancel_delayed_work(&cache->waker); 2887 flush_workqueue(cache->wq); 2888 WARN_ON(cache->tracker.in_flight); 2889 2890 /* 2891 * If it's a flush suspend there won't be any deferred bios, so this 2892 * call is harmless. 2893 */ 2894 requeue_deferred_bios(cache); 2895 2896 if (get_cache_mode(cache) == CM_WRITE) 2897 (void) sync_metadata(cache); 2898 } 2899 2900 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2901 bool dirty, uint32_t hint, bool hint_valid) 2902 { 2903 int r; 2904 struct cache *cache = context; 2905 2906 if (dirty) { 2907 set_bit(from_cblock(cblock), cache->dirty_bitset); 2908 atomic_inc(&cache->nr_dirty); 2909 } else 2910 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2911 2912 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2913 if (r) 2914 return r; 2915 2916 return 0; 2917 } 2918 2919 /* 2920 * The discard block size in the on disk metadata is not 2921 * neccessarily the same as we're currently using. So we have to 2922 * be careful to only set the discarded attribute if we know it 2923 * covers a complete block of the new size. 2924 */ 2925 struct discard_load_info { 2926 struct cache *cache; 2927 2928 /* 2929 * These blocks are sized using the on disk dblock size, rather 2930 * than the current one. 2931 */ 2932 dm_block_t block_size; 2933 dm_block_t discard_begin, discard_end; 2934 }; 2935 2936 static void discard_load_info_init(struct cache *cache, 2937 struct discard_load_info *li) 2938 { 2939 li->cache = cache; 2940 li->discard_begin = li->discard_end = 0; 2941 } 2942 2943 static void set_discard_range(struct discard_load_info *li) 2944 { 2945 sector_t b, e; 2946 2947 if (li->discard_begin == li->discard_end) 2948 return; 2949 2950 /* 2951 * Convert to sectors. 2952 */ 2953 b = li->discard_begin * li->block_size; 2954 e = li->discard_end * li->block_size; 2955 2956 /* 2957 * Then convert back to the current dblock size. 2958 */ 2959 b = dm_sector_div_up(b, li->cache->discard_block_size); 2960 sector_div(e, li->cache->discard_block_size); 2961 2962 /* 2963 * The origin may have shrunk, so we need to check we're still in 2964 * bounds. 2965 */ 2966 if (e > from_dblock(li->cache->discard_nr_blocks)) 2967 e = from_dblock(li->cache->discard_nr_blocks); 2968 2969 for (; b < e; b++) 2970 set_discard(li->cache, to_dblock(b)); 2971 } 2972 2973 static int load_discard(void *context, sector_t discard_block_size, 2974 dm_dblock_t dblock, bool discard) 2975 { 2976 struct discard_load_info *li = context; 2977 2978 li->block_size = discard_block_size; 2979 2980 if (discard) { 2981 if (from_dblock(dblock) == li->discard_end) 2982 /* 2983 * We're already in a discard range, just extend it. 2984 */ 2985 li->discard_end = li->discard_end + 1ULL; 2986 2987 else { 2988 /* 2989 * Emit the old range and start a new one. 2990 */ 2991 set_discard_range(li); 2992 li->discard_begin = from_dblock(dblock); 2993 li->discard_end = li->discard_begin + 1ULL; 2994 } 2995 } else { 2996 set_discard_range(li); 2997 li->discard_begin = li->discard_end = 0; 2998 } 2999 3000 return 0; 3001 } 3002 3003 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3004 { 3005 sector_t size = get_dev_size(cache->cache_dev); 3006 (void) sector_div(size, cache->sectors_per_block); 3007 return to_cblock(size); 3008 } 3009 3010 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3011 { 3012 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3013 return true; 3014 3015 /* 3016 * We can't drop a dirty block when shrinking the cache. 3017 */ 3018 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3019 new_size = to_cblock(from_cblock(new_size) + 1); 3020 if (is_dirty(cache, new_size)) { 3021 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3022 cache_device_name(cache), 3023 (unsigned long long) from_cblock(new_size)); 3024 return false; 3025 } 3026 } 3027 3028 return true; 3029 } 3030 3031 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3032 { 3033 int r; 3034 3035 r = dm_cache_resize(cache->cmd, new_size); 3036 if (r) { 3037 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3038 metadata_operation_failed(cache, "dm_cache_resize", r); 3039 return r; 3040 } 3041 3042 set_cache_size(cache, new_size); 3043 3044 return 0; 3045 } 3046 3047 static int cache_preresume(struct dm_target *ti) 3048 { 3049 int r = 0; 3050 struct cache *cache = ti->private; 3051 dm_cblock_t csize = get_cache_dev_size(cache); 3052 3053 /* 3054 * Check to see if the cache has resized. 3055 */ 3056 if (!cache->sized) { 3057 r = resize_cache_dev(cache, csize); 3058 if (r) 3059 return r; 3060 3061 cache->sized = true; 3062 3063 } else if (csize != cache->cache_size) { 3064 if (!can_resize(cache, csize)) 3065 return -EINVAL; 3066 3067 r = resize_cache_dev(cache, csize); 3068 if (r) 3069 return r; 3070 } 3071 3072 if (!cache->loaded_mappings) { 3073 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3074 load_mapping, cache); 3075 if (r) { 3076 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3077 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3078 return r; 3079 } 3080 3081 cache->loaded_mappings = true; 3082 } 3083 3084 if (!cache->loaded_discards) { 3085 struct discard_load_info li; 3086 3087 /* 3088 * The discard bitset could have been resized, or the 3089 * discard block size changed. To be safe we start by 3090 * setting every dblock to not discarded. 3091 */ 3092 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3093 3094 discard_load_info_init(cache, &li); 3095 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3096 if (r) { 3097 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3098 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3099 return r; 3100 } 3101 set_discard_range(&li); 3102 3103 cache->loaded_discards = true; 3104 } 3105 3106 return r; 3107 } 3108 3109 static void cache_resume(struct dm_target *ti) 3110 { 3111 struct cache *cache = ti->private; 3112 3113 cache->need_tick_bio = true; 3114 allow_background_work(cache); 3115 do_waker(&cache->waker.work); 3116 } 3117 3118 /* 3119 * Status format: 3120 * 3121 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3122 * <cache block size> <#used cache blocks>/<#total cache blocks> 3123 * <#read hits> <#read misses> <#write hits> <#write misses> 3124 * <#demotions> <#promotions> <#dirty> 3125 * <#features> <features>* 3126 * <#core args> <core args> 3127 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3128 */ 3129 static void cache_status(struct dm_target *ti, status_type_t type, 3130 unsigned status_flags, char *result, unsigned maxlen) 3131 { 3132 int r = 0; 3133 unsigned i; 3134 ssize_t sz = 0; 3135 dm_block_t nr_free_blocks_metadata = 0; 3136 dm_block_t nr_blocks_metadata = 0; 3137 char buf[BDEVNAME_SIZE]; 3138 struct cache *cache = ti->private; 3139 dm_cblock_t residency; 3140 bool needs_check; 3141 3142 switch (type) { 3143 case STATUSTYPE_INFO: 3144 if (get_cache_mode(cache) == CM_FAIL) { 3145 DMEMIT("Fail"); 3146 break; 3147 } 3148 3149 /* Commit to ensure statistics aren't out-of-date */ 3150 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3151 (void) commit(cache, false); 3152 3153 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3154 if (r) { 3155 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3156 cache_device_name(cache), r); 3157 goto err; 3158 } 3159 3160 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3161 if (r) { 3162 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3163 cache_device_name(cache), r); 3164 goto err; 3165 } 3166 3167 residency = policy_residency(cache->policy); 3168 3169 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3170 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3171 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3172 (unsigned long long)nr_blocks_metadata, 3173 (unsigned long long)cache->sectors_per_block, 3174 (unsigned long long) from_cblock(residency), 3175 (unsigned long long) from_cblock(cache->cache_size), 3176 (unsigned) atomic_read(&cache->stats.read_hit), 3177 (unsigned) atomic_read(&cache->stats.read_miss), 3178 (unsigned) atomic_read(&cache->stats.write_hit), 3179 (unsigned) atomic_read(&cache->stats.write_miss), 3180 (unsigned) atomic_read(&cache->stats.demotion), 3181 (unsigned) atomic_read(&cache->stats.promotion), 3182 (unsigned long) atomic_read(&cache->nr_dirty)); 3183 3184 if (cache->features.metadata_version == 2) 3185 DMEMIT("2 metadata2 "); 3186 else 3187 DMEMIT("1 "); 3188 3189 if (writethrough_mode(cache)) 3190 DMEMIT("writethrough "); 3191 3192 else if (passthrough_mode(cache)) 3193 DMEMIT("passthrough "); 3194 3195 else if (writeback_mode(cache)) 3196 DMEMIT("writeback "); 3197 3198 else { 3199 DMERR("%s: internal error: unknown io mode: %d", 3200 cache_device_name(cache), (int) cache->features.io_mode); 3201 goto err; 3202 } 3203 3204 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3205 3206 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3207 if (sz < maxlen) { 3208 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3209 if (r) 3210 DMERR("%s: policy_emit_config_values returned %d", 3211 cache_device_name(cache), r); 3212 } 3213 3214 if (get_cache_mode(cache) == CM_READ_ONLY) 3215 DMEMIT("ro "); 3216 else 3217 DMEMIT("rw "); 3218 3219 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3220 3221 if (r || needs_check) 3222 DMEMIT("needs_check "); 3223 else 3224 DMEMIT("- "); 3225 3226 break; 3227 3228 case STATUSTYPE_TABLE: 3229 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3230 DMEMIT("%s ", buf); 3231 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3232 DMEMIT("%s ", buf); 3233 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3234 DMEMIT("%s", buf); 3235 3236 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3237 DMEMIT(" %s", cache->ctr_args[i]); 3238 if (cache->nr_ctr_args) 3239 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3240 } 3241 3242 return; 3243 3244 err: 3245 DMEMIT("Error"); 3246 } 3247 3248 /* 3249 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3250 * the one-past-the-end value. 3251 */ 3252 struct cblock_range { 3253 dm_cblock_t begin; 3254 dm_cblock_t end; 3255 }; 3256 3257 /* 3258 * A cache block range can take two forms: 3259 * 3260 * i) A single cblock, eg. '3456' 3261 * ii) A begin and end cblock with a dash between, eg. 123-234 3262 */ 3263 static int parse_cblock_range(struct cache *cache, const char *str, 3264 struct cblock_range *result) 3265 { 3266 char dummy; 3267 uint64_t b, e; 3268 int r; 3269 3270 /* 3271 * Try and parse form (ii) first. 3272 */ 3273 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3274 if (r < 0) 3275 return r; 3276 3277 if (r == 2) { 3278 result->begin = to_cblock(b); 3279 result->end = to_cblock(e); 3280 return 0; 3281 } 3282 3283 /* 3284 * That didn't work, try form (i). 3285 */ 3286 r = sscanf(str, "%llu%c", &b, &dummy); 3287 if (r < 0) 3288 return r; 3289 3290 if (r == 1) { 3291 result->begin = to_cblock(b); 3292 result->end = to_cblock(from_cblock(result->begin) + 1u); 3293 return 0; 3294 } 3295 3296 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3297 return -EINVAL; 3298 } 3299 3300 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3301 { 3302 uint64_t b = from_cblock(range->begin); 3303 uint64_t e = from_cblock(range->end); 3304 uint64_t n = from_cblock(cache->cache_size); 3305 3306 if (b >= n) { 3307 DMERR("%s: begin cblock out of range: %llu >= %llu", 3308 cache_device_name(cache), b, n); 3309 return -EINVAL; 3310 } 3311 3312 if (e > n) { 3313 DMERR("%s: end cblock out of range: %llu > %llu", 3314 cache_device_name(cache), e, n); 3315 return -EINVAL; 3316 } 3317 3318 if (b >= e) { 3319 DMERR("%s: invalid cblock range: %llu >= %llu", 3320 cache_device_name(cache), b, e); 3321 return -EINVAL; 3322 } 3323 3324 return 0; 3325 } 3326 3327 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3328 { 3329 return to_cblock(from_cblock(b) + 1); 3330 } 3331 3332 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3333 { 3334 int r = 0; 3335 3336 /* 3337 * We don't need to do any locking here because we know we're in 3338 * passthrough mode. There's is potential for a race between an 3339 * invalidation triggered by an io and an invalidation message. This 3340 * is harmless, we must not worry if the policy call fails. 3341 */ 3342 while (range->begin != range->end) { 3343 r = invalidate_cblock(cache, range->begin); 3344 if (r) 3345 return r; 3346 3347 range->begin = cblock_succ(range->begin); 3348 } 3349 3350 cache->commit_requested = true; 3351 return r; 3352 } 3353 3354 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3355 const char **cblock_ranges) 3356 { 3357 int r = 0; 3358 unsigned i; 3359 struct cblock_range range; 3360 3361 if (!passthrough_mode(cache)) { 3362 DMERR("%s: cache has to be in passthrough mode for invalidation", 3363 cache_device_name(cache)); 3364 return -EPERM; 3365 } 3366 3367 for (i = 0; i < count; i++) { 3368 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3369 if (r) 3370 break; 3371 3372 r = validate_cblock_range(cache, &range); 3373 if (r) 3374 break; 3375 3376 /* 3377 * Pass begin and end origin blocks to the worker and wake it. 3378 */ 3379 r = request_invalidation(cache, &range); 3380 if (r) 3381 break; 3382 } 3383 3384 return r; 3385 } 3386 3387 /* 3388 * Supports 3389 * "<key> <value>" 3390 * and 3391 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3392 * 3393 * The key migration_threshold is supported by the cache target core. 3394 */ 3395 static int cache_message(struct dm_target *ti, unsigned argc, char **argv, 3396 char *result, unsigned maxlen) 3397 { 3398 struct cache *cache = ti->private; 3399 3400 if (!argc) 3401 return -EINVAL; 3402 3403 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3404 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3405 cache_device_name(cache)); 3406 return -EOPNOTSUPP; 3407 } 3408 3409 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3410 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3411 3412 if (argc != 2) 3413 return -EINVAL; 3414 3415 return set_config_value(cache, argv[0], argv[1]); 3416 } 3417 3418 static int cache_iterate_devices(struct dm_target *ti, 3419 iterate_devices_callout_fn fn, void *data) 3420 { 3421 int r = 0; 3422 struct cache *cache = ti->private; 3423 3424 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3425 if (!r) 3426 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3427 3428 return r; 3429 } 3430 3431 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3432 { 3433 /* 3434 * FIXME: these limits may be incompatible with the cache device 3435 */ 3436 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3437 cache->origin_sectors); 3438 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3439 } 3440 3441 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3442 { 3443 struct cache *cache = ti->private; 3444 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3445 3446 /* 3447 * If the system-determined stacked limits are compatible with the 3448 * cache's blocksize (io_opt is a factor) do not override them. 3449 */ 3450 if (io_opt_sectors < cache->sectors_per_block || 3451 do_div(io_opt_sectors, cache->sectors_per_block)) { 3452 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3453 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3454 } 3455 set_discard_limits(cache, limits); 3456 } 3457 3458 /*----------------------------------------------------------------*/ 3459 3460 static struct target_type cache_target = { 3461 .name = "cache", 3462 .version = {2, 0, 0}, 3463 .module = THIS_MODULE, 3464 .ctr = cache_ctr, 3465 .dtr = cache_dtr, 3466 .map = cache_map, 3467 .end_io = cache_end_io, 3468 .postsuspend = cache_postsuspend, 3469 .preresume = cache_preresume, 3470 .resume = cache_resume, 3471 .status = cache_status, 3472 .message = cache_message, 3473 .iterate_devices = cache_iterate_devices, 3474 .io_hints = cache_io_hints, 3475 }; 3476 3477 static int __init dm_cache_init(void) 3478 { 3479 int r; 3480 3481 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3482 if (!migration_cache) { 3483 dm_unregister_target(&cache_target); 3484 return -ENOMEM; 3485 } 3486 3487 r = dm_register_target(&cache_target); 3488 if (r) { 3489 DMERR("cache target registration failed: %d", r); 3490 return r; 3491 } 3492 3493 return 0; 3494 } 3495 3496 static void __exit dm_cache_exit(void) 3497 { 3498 dm_unregister_target(&cache_target); 3499 kmem_cache_destroy(migration_cache); 3500 } 3501 3502 module_init(dm_cache_init); 3503 module_exit(dm_cache_exit); 3504 3505 MODULE_DESCRIPTION(DM_NAME " cache target"); 3506 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3507 MODULE_LICENSE("GPL"); 3508