// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include "dm-io-tracker.h"
#include "dm-cache-background-tracker.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

/*
 * Represents a chunk of future work.  'input' allows continuations to pass
 * values between themselves, typically error values.
 */
struct continuation {
	struct work_struct ws;
	blk_status_t input;
};

static inline void init_continuation(struct continuation *k,
				     void (*fn)(struct work_struct *))
{
	INIT_WORK(&k->ws, fn);
	k->input = 0;
}

static inline void queue_continuation(struct workqueue_struct *wq,
				      struct continuation *k)
{
	queue_work(wq, &k->ws);
}

/*----------------------------------------------------------------*/

/*
 * The batcher collects together pieces of work that need a particular
 * operation to occur before they can proceed (typically a commit).
 */
struct batcher {
	/*
	 * The operation that everyone is waiting for.
	 */
	blk_status_t (*commit_op)(void *context);
	void *commit_context;

	/*
	 * This is how bios should be issued once the commit op is complete
	 * (accounted_request).
	 */
	void (*issue_op)(struct bio *bio, void *context);
	void *issue_context;

	/*
	 * Queued work gets put on here after commit.
	 */
	struct workqueue_struct *wq;

	spinlock_t lock;
	struct list_head work_items;
	struct bio_list bios;
	struct work_struct commit_work;

	bool commit_scheduled;
};

static void __commit(struct work_struct *_ws)
{
	struct batcher *b = container_of(_ws, struct batcher, commit_work);
	blk_status_t r;
	struct list_head work_items;
	struct work_struct *ws, *tmp;
	struct continuation *k;
	struct bio *bio;
	struct bio_list bios;

	INIT_LIST_HEAD(&work_items);
	bio_list_init(&bios);

	/*
	 * We have to grab these before the commit_op to avoid a race
	 * condition.
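	 * If we sampled the work_items and bios lists after the commit_op
	 * instead, items queued in that window could be released as though
	 * the commit covered them when it did not.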
116 */ 117 spin_lock_irq(&b->lock); 118 list_splice_init(&b->work_items, &work_items); 119 bio_list_merge_init(&bios, &b->bios); 120 b->commit_scheduled = false; 121 spin_unlock_irq(&b->lock); 122 123 r = b->commit_op(b->commit_context); 124 125 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 126 k = container_of(ws, struct continuation, ws); 127 k->input = r; 128 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 129 queue_work(b->wq, ws); 130 } 131 132 while ((bio = bio_list_pop(&bios))) { 133 if (r) { 134 bio->bi_status = r; 135 bio_endio(bio); 136 } else 137 b->issue_op(bio, b->issue_context); 138 } 139 } 140 141 static void batcher_init(struct batcher *b, 142 blk_status_t (*commit_op)(void *), 143 void *commit_context, 144 void (*issue_op)(struct bio *bio, void *), 145 void *issue_context, 146 struct workqueue_struct *wq) 147 { 148 b->commit_op = commit_op; 149 b->commit_context = commit_context; 150 b->issue_op = issue_op; 151 b->issue_context = issue_context; 152 b->wq = wq; 153 154 spin_lock_init(&b->lock); 155 INIT_LIST_HEAD(&b->work_items); 156 bio_list_init(&b->bios); 157 INIT_WORK(&b->commit_work, __commit); 158 b->commit_scheduled = false; 159 } 160 161 static void async_commit(struct batcher *b) 162 { 163 queue_work(b->wq, &b->commit_work); 164 } 165 166 static void continue_after_commit(struct batcher *b, struct continuation *k) 167 { 168 bool commit_scheduled; 169 170 spin_lock_irq(&b->lock); 171 commit_scheduled = b->commit_scheduled; 172 list_add_tail(&k->ws.entry, &b->work_items); 173 spin_unlock_irq(&b->lock); 174 175 if (commit_scheduled) 176 async_commit(b); 177 } 178 179 /* 180 * Bios are errored if commit failed. 181 */ 182 static void issue_after_commit(struct batcher *b, struct bio *bio) 183 { 184 bool commit_scheduled; 185 186 spin_lock_irq(&b->lock); 187 commit_scheduled = b->commit_scheduled; 188 bio_list_add(&b->bios, bio); 189 spin_unlock_irq(&b->lock); 190 191 if (commit_scheduled) 192 async_commit(b); 193 } 194 195 /* 196 * Call this if some urgent work is waiting for the commit to complete. 197 */ 198 static void schedule_commit(struct batcher *b) 199 { 200 bool immediate; 201 202 spin_lock_irq(&b->lock); 203 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 204 b->commit_scheduled = true; 205 spin_unlock_irq(&b->lock); 206 207 if (immediate) 208 async_commit(b); 209 } 210 211 /* 212 * There are a couple of places where we let a bio run, but want to do some 213 * work before calling its endio function. We do this by temporarily 214 * changing the endio fn. 215 */ 216 struct dm_hook_info { 217 bio_end_io_t *bi_end_io; 218 }; 219 220 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 221 bio_end_io_t *bi_end_io, void *bi_private) 222 { 223 h->bi_end_io = bio->bi_end_io; 224 225 bio->bi_end_io = bi_end_io; 226 bio->bi_private = bi_private; 227 } 228 229 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 230 { 231 bio->bi_end_io = h->bi_end_io; 232 } 233 234 /*----------------------------------------------------------------*/ 235 236 #define MIGRATION_POOL_SIZE 128 237 #define COMMIT_PERIOD HZ 238 #define MIGRATION_COUNT_WINDOW 10 239 240 /* 241 * The block size of the device holding cache data must be 242 * between 32KB and 1GB. 
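 * (that is 64 to 2097152 of the block layer's 512 byte sectors).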
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned int metadata_version;
	bool discard_passdown:1;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	spinlock_t lock;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	int sectors_per_block_shift;
	sector_t sectors_per_block;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io.  eg, promotion, writeback.
	 */
	atomic_t nr_io_migrations;

	struct bio_list deferred_bios;

	struct rw_semaphore quiesce_lock;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
365 */ 366 unsigned int nr_ctr_args; 367 const char **ctr_args; 368 369 struct dm_kcopyd_client *copier; 370 struct work_struct deferred_bio_worker; 371 struct work_struct migration_worker; 372 struct workqueue_struct *wq; 373 struct delayed_work waker; 374 struct dm_bio_prison_v2 *prison; 375 376 /* 377 * cache_size entries, dirty if set 378 */ 379 unsigned long *dirty_bitset; 380 atomic_t nr_dirty; 381 382 unsigned int policy_nr_args; 383 struct dm_cache_policy *policy; 384 385 /* 386 * Cache features such as write-through. 387 */ 388 struct cache_features features; 389 390 struct cache_stats stats; 391 392 bool need_tick_bio:1; 393 bool sized:1; 394 bool invalidate:1; 395 bool commit_requested:1; 396 bool loaded_mappings:1; 397 bool loaded_discards:1; 398 399 struct rw_semaphore background_work_lock; 400 401 struct batcher committer; 402 struct work_struct commit_ws; 403 404 struct dm_io_tracker tracker; 405 406 mempool_t migration_pool; 407 408 struct bio_set bs; 409 410 /* 411 * Cache_size entries. Set bits indicate blocks mapped beyond the 412 * target length, which are marked for invalidation. 413 */ 414 unsigned long *invalid_bitset; 415 }; 416 417 struct per_bio_data { 418 bool tick:1; 419 unsigned int req_nr:2; 420 struct dm_bio_prison_cell_v2 *cell; 421 struct dm_hook_info hook_info; 422 sector_t len; 423 }; 424 425 struct dm_cache_migration { 426 struct continuation k; 427 struct cache *cache; 428 429 struct policy_work *op; 430 struct bio *overwrite_bio; 431 struct dm_bio_prison_cell_v2 *cell; 432 433 dm_cblock_t invalidate_cblock; 434 dm_oblock_t invalidate_oblock; 435 }; 436 437 /*----------------------------------------------------------------*/ 438 439 static bool writethrough_mode(struct cache *cache) 440 { 441 return cache->features.io_mode == CM_IO_WRITETHROUGH; 442 } 443 444 static bool writeback_mode(struct cache *cache) 445 { 446 return cache->features.io_mode == CM_IO_WRITEBACK; 447 } 448 449 static inline bool passthrough_mode(struct cache *cache) 450 { 451 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 452 } 453 454 /*----------------------------------------------------------------*/ 455 456 static void wake_deferred_bio_worker(struct cache *cache) 457 { 458 queue_work(cache->wq, &cache->deferred_bio_worker); 459 } 460 461 static void wake_migration_worker(struct cache *cache) 462 { 463 if (passthrough_mode(cache)) 464 return; 465 466 queue_work(cache->wq, &cache->migration_worker); 467 } 468 469 /*----------------------------------------------------------------*/ 470 471 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 472 { 473 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 474 } 475 476 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 477 { 478 dm_bio_prison_free_cell_v2(cache->prison, cell); 479 } 480 481 static struct dm_cache_migration *alloc_migration(struct cache *cache) 482 { 483 struct dm_cache_migration *mg; 484 485 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 486 487 memset(mg, 0, sizeof(*mg)); 488 489 mg->cache = cache; 490 atomic_inc(&cache->nr_allocated_migrations); 491 492 return mg; 493 } 494 495 static void free_migration(struct dm_cache_migration *mg) 496 { 497 struct cache *cache = mg->cache; 498 499 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 500 wake_up(&cache->migration_wait); 501 502 mempool_free(mg, &cache->migration_pool); 503 } 504 505 /*----------------------------------------------------------------*/ 506 507 static 
inline dm_oblock_t oblock_succ(dm_oblock_t b) 508 { 509 return to_oblock(from_oblock(b) + 1ull); 510 } 511 512 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 513 { 514 key->virtual = 0; 515 key->dev = 0; 516 key->block_begin = from_oblock(begin); 517 key->block_end = from_oblock(end); 518 } 519 520 /* 521 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 522 * level 1 which prevents *both* READs and WRITEs. 523 */ 524 #define WRITE_LOCK_LEVEL 0 525 #define READ_WRITE_LOCK_LEVEL 1 526 527 static unsigned int lock_level(struct bio *bio) 528 { 529 return bio_data_dir(bio) == WRITE ? 530 WRITE_LOCK_LEVEL : 531 READ_WRITE_LOCK_LEVEL; 532 } 533 534 /* 535 *-------------------------------------------------------------- 536 * Per bio data 537 *-------------------------------------------------------------- 538 */ 539 540 static struct per_bio_data *get_per_bio_data(struct bio *bio) 541 { 542 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 543 544 BUG_ON(!pb); 545 return pb; 546 } 547 548 static struct per_bio_data *init_per_bio_data(struct bio *bio) 549 { 550 struct per_bio_data *pb = get_per_bio_data(bio); 551 552 pb->tick = false; 553 pb->req_nr = dm_bio_get_target_bio_nr(bio); 554 pb->cell = NULL; 555 pb->len = 0; 556 557 return pb; 558 } 559 560 /*----------------------------------------------------------------*/ 561 562 static void defer_bio(struct cache *cache, struct bio *bio) 563 { 564 spin_lock_irq(&cache->lock); 565 bio_list_add(&cache->deferred_bios, bio); 566 spin_unlock_irq(&cache->lock); 567 568 wake_deferred_bio_worker(cache); 569 } 570 571 static void defer_bios(struct cache *cache, struct bio_list *bios) 572 { 573 spin_lock_irq(&cache->lock); 574 bio_list_merge_init(&cache->deferred_bios, bios); 575 spin_unlock_irq(&cache->lock); 576 577 wake_deferred_bio_worker(cache); 578 } 579 580 /*----------------------------------------------------------------*/ 581 582 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 583 { 584 bool r; 585 struct per_bio_data *pb; 586 struct dm_cell_key_v2 key; 587 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 588 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 589 590 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 591 592 build_key(oblock, end, &key); 593 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 594 if (!r) { 595 /* 596 * Failed to get the lock. 597 */ 598 free_prison_cell(cache, cell_prealloc); 599 return r; 600 } 601 602 if (cell != cell_prealloc) 603 free_prison_cell(cache, cell_prealloc); 604 605 pb = get_per_bio_data(bio); 606 pb->cell = cell; 607 608 return r; 609 } 610 611 /*----------------------------------------------------------------*/ 612 613 static bool is_dirty(struct cache *cache, dm_cblock_t b) 614 { 615 return test_bit(from_cblock(b), cache->dirty_bitset); 616 } 617 618 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 619 { 620 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 621 atomic_inc(&cache->nr_dirty); 622 policy_set_dirty(cache->policy, cblock); 623 } 624 } 625 626 /* 627 * These two are called when setting after migrations to force the policy 628 * and dirty bitset to be in sync. 
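 * Unlike set_dirty() above, they always notify the policy, even when the
 * bit was already in the requested state.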
629 */ 630 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 631 { 632 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 633 atomic_inc(&cache->nr_dirty); 634 policy_set_dirty(cache->policy, cblock); 635 } 636 637 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 638 { 639 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 640 if (atomic_dec_return(&cache->nr_dirty) == 0) 641 dm_table_event(cache->ti->table); 642 } 643 644 policy_clear_dirty(cache->policy, cblock); 645 } 646 647 /*----------------------------------------------------------------*/ 648 649 static bool block_size_is_power_of_two(struct cache *cache) 650 { 651 return cache->sectors_per_block_shift >= 0; 652 } 653 654 static dm_block_t block_div(dm_block_t b, uint32_t n) 655 { 656 do_div(b, n); 657 658 return b; 659 } 660 661 static dm_block_t oblocks_per_dblock(struct cache *cache) 662 { 663 dm_block_t oblocks = cache->discard_block_size; 664 665 if (block_size_is_power_of_two(cache)) 666 oblocks >>= cache->sectors_per_block_shift; 667 else 668 oblocks = block_div(oblocks, cache->sectors_per_block); 669 670 return oblocks; 671 } 672 673 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 674 { 675 return to_dblock(block_div(from_oblock(oblock), 676 oblocks_per_dblock(cache))); 677 } 678 679 static void set_discard(struct cache *cache, dm_dblock_t b) 680 { 681 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 682 atomic_inc(&cache->stats.discard_count); 683 684 spin_lock_irq(&cache->lock); 685 set_bit(from_dblock(b), cache->discard_bitset); 686 spin_unlock_irq(&cache->lock); 687 } 688 689 static void clear_discard(struct cache *cache, dm_dblock_t b) 690 { 691 spin_lock_irq(&cache->lock); 692 clear_bit(from_dblock(b), cache->discard_bitset); 693 spin_unlock_irq(&cache->lock); 694 } 695 696 static bool is_discarded(struct cache *cache, dm_dblock_t b) 697 { 698 int r; 699 700 spin_lock_irq(&cache->lock); 701 r = test_bit(from_dblock(b), cache->discard_bitset); 702 spin_unlock_irq(&cache->lock); 703 704 return r; 705 } 706 707 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 708 { 709 int r; 710 711 spin_lock_irq(&cache->lock); 712 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 713 cache->discard_bitset); 714 spin_unlock_irq(&cache->lock); 715 716 return r; 717 } 718 719 /* 720 * ------------------------------------------------------------- 721 * Remapping 722 *-------------------------------------------------------------- 723 */ 724 static void remap_to_origin(struct cache *cache, struct bio *bio) 725 { 726 bio_set_dev(bio, cache->origin_dev->bdev); 727 } 728 729 static void remap_to_cache(struct cache *cache, struct bio *bio, 730 dm_cblock_t cblock) 731 { 732 sector_t bi_sector = bio->bi_iter.bi_sector; 733 sector_t block = from_cblock(cblock); 734 735 bio_set_dev(bio, cache->cache_dev->bdev); 736 if (!block_size_is_power_of_two(cache)) 737 bio->bi_iter.bi_sector = 738 (block * cache->sectors_per_block) + 739 sector_div(bi_sector, cache->sectors_per_block); 740 else 741 bio->bi_iter.bi_sector = 742 (block << cache->sectors_per_block_shift) | 743 (bi_sector & (cache->sectors_per_block - 1)); 744 } 745 746 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 747 { 748 struct per_bio_data *pb; 749 750 spin_lock_irq(&cache->lock); 751 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 752 bio_op(bio) != REQ_OP_DISCARD) { 753 pb = get_per_bio_data(bio); 754 pb->tick = 
true; 755 cache->need_tick_bio = false; 756 } 757 spin_unlock_irq(&cache->lock); 758 } 759 760 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 761 dm_oblock_t oblock) 762 { 763 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 764 check_if_tick_bio_needed(cache, bio); 765 remap_to_origin(cache, bio); 766 if (bio_data_dir(bio) == WRITE) 767 clear_discard(cache, oblock_to_dblock(cache, oblock)); 768 } 769 770 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 771 dm_oblock_t oblock, dm_cblock_t cblock) 772 { 773 check_if_tick_bio_needed(cache, bio); 774 remap_to_cache(cache, bio, cblock); 775 if (bio_data_dir(bio) == WRITE) { 776 set_dirty(cache, cblock); 777 clear_discard(cache, oblock_to_dblock(cache, oblock)); 778 } 779 } 780 781 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 782 { 783 sector_t block_nr = bio->bi_iter.bi_sector; 784 785 if (!block_size_is_power_of_two(cache)) 786 (void) sector_div(block_nr, cache->sectors_per_block); 787 else 788 block_nr >>= cache->sectors_per_block_shift; 789 790 return to_oblock(block_nr); 791 } 792 793 static bool accountable_bio(struct cache *cache, struct bio *bio) 794 { 795 return bio_op(bio) != REQ_OP_DISCARD; 796 } 797 798 static void accounted_begin(struct cache *cache, struct bio *bio) 799 { 800 struct per_bio_data *pb; 801 802 if (accountable_bio(cache, bio)) { 803 pb = get_per_bio_data(bio); 804 pb->len = bio_sectors(bio); 805 dm_iot_io_begin(&cache->tracker, pb->len); 806 } 807 } 808 809 static void accounted_complete(struct cache *cache, struct bio *bio) 810 { 811 struct per_bio_data *pb = get_per_bio_data(bio); 812 813 dm_iot_io_end(&cache->tracker, pb->len); 814 } 815 816 static void accounted_request(struct cache *cache, struct bio *bio) 817 { 818 accounted_begin(cache, bio); 819 dm_submit_bio_remap(bio, NULL); 820 } 821 822 static void issue_op(struct bio *bio, void *context) 823 { 824 struct cache *cache = context; 825 826 accounted_request(cache, bio); 827 } 828 829 /* 830 * When running in writethrough mode we need to send writes to clean blocks 831 * to both the cache and origin devices. Clone the bio and send them in parallel. 
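 * bio_chain() ensures the original bio does not complete until the clone
 * submitted to the origin device has also completed.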
832 */ 833 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 834 dm_oblock_t oblock, dm_cblock_t cblock) 835 { 836 struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio, 837 GFP_NOIO, &cache->bs); 838 839 BUG_ON(!origin_bio); 840 841 bio_chain(origin_bio, bio); 842 843 if (bio_data_dir(origin_bio) == WRITE) 844 clear_discard(cache, oblock_to_dblock(cache, oblock)); 845 submit_bio(origin_bio); 846 847 remap_to_cache(cache, bio, cblock); 848 } 849 850 /* 851 *-------------------------------------------------------------- 852 * Failure modes 853 *-------------------------------------------------------------- 854 */ 855 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 856 { 857 return cache->features.mode; 858 } 859 860 static const char *cache_device_name(struct cache *cache) 861 { 862 return dm_table_device_name(cache->ti->table); 863 } 864 865 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 866 { 867 static const char *descs[] = { 868 "write", 869 "read-only", 870 "fail" 871 }; 872 873 dm_table_event(cache->ti->table); 874 DMINFO("%s: switching cache to %s mode", 875 cache_device_name(cache), descs[(int)mode]); 876 } 877 878 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 879 { 880 bool needs_check; 881 enum cache_metadata_mode old_mode = get_cache_mode(cache); 882 883 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 884 DMERR("%s: unable to read needs_check flag, setting failure mode.", 885 cache_device_name(cache)); 886 new_mode = CM_FAIL; 887 } 888 889 if (new_mode == CM_WRITE && needs_check) { 890 DMERR("%s: unable to switch cache to write mode until repaired.", 891 cache_device_name(cache)); 892 if (old_mode != new_mode) 893 new_mode = old_mode; 894 else 895 new_mode = CM_READ_ONLY; 896 } 897 898 /* Never move out of fail mode */ 899 if (old_mode == CM_FAIL) 900 new_mode = CM_FAIL; 901 902 switch (new_mode) { 903 case CM_FAIL: 904 case CM_READ_ONLY: 905 dm_cache_metadata_set_read_only(cache->cmd); 906 break; 907 908 case CM_WRITE: 909 dm_cache_metadata_set_read_write(cache->cmd); 910 break; 911 } 912 913 cache->features.mode = new_mode; 914 915 if (new_mode != old_mode) 916 notify_mode_switch(cache, new_mode); 917 } 918 919 static void abort_transaction(struct cache *cache) 920 { 921 const char *dev_name = cache_device_name(cache); 922 923 if (get_cache_mode(cache) >= CM_READ_ONLY) 924 return; 925 926 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 927 if (dm_cache_metadata_abort(cache->cmd)) { 928 DMERR("%s: failed to abort metadata transaction", dev_name); 929 set_cache_mode(cache, CM_FAIL); 930 } 931 932 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 933 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 934 set_cache_mode(cache, CM_FAIL); 935 } 936 } 937 938 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 939 { 940 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 941 cache_device_name(cache), op, r); 942 abort_transaction(cache); 943 set_cache_mode(cache, CM_READ_ONLY); 944 } 945 946 /*----------------------------------------------------------------*/ 947 948 static void load_stats(struct cache *cache) 949 { 950 struct dm_cache_statistics stats; 951 952 dm_cache_metadata_get_stats(cache->cmd, &stats); 953 atomic_set(&cache->stats.read_hit, stats.read_hits); 954 atomic_set(&cache->stats.read_miss, stats.read_misses); 955 
atomic_set(&cache->stats.write_hit, stats.write_hits); 956 atomic_set(&cache->stats.write_miss, stats.write_misses); 957 } 958 959 static void save_stats(struct cache *cache) 960 { 961 struct dm_cache_statistics stats; 962 963 if (get_cache_mode(cache) >= CM_READ_ONLY) 964 return; 965 966 stats.read_hits = atomic_read(&cache->stats.read_hit); 967 stats.read_misses = atomic_read(&cache->stats.read_miss); 968 stats.write_hits = atomic_read(&cache->stats.write_hit); 969 stats.write_misses = atomic_read(&cache->stats.write_miss); 970 971 dm_cache_metadata_set_stats(cache->cmd, &stats); 972 } 973 974 static void update_stats(struct cache_stats *stats, enum policy_operation op) 975 { 976 switch (op) { 977 case POLICY_PROMOTE: 978 atomic_inc(&stats->promotion); 979 break; 980 981 case POLICY_DEMOTE: 982 atomic_inc(&stats->demotion); 983 break; 984 985 case POLICY_WRITEBACK: 986 atomic_inc(&stats->writeback); 987 break; 988 } 989 } 990 991 /* 992 *--------------------------------------------------------------------- 993 * Migration processing 994 * 995 * Migration covers moving data from the origin device to the cache, or 996 * vice versa. 997 *--------------------------------------------------------------------- 998 */ 999 static void inc_io_migrations(struct cache *cache) 1000 { 1001 atomic_inc(&cache->nr_io_migrations); 1002 } 1003 1004 static void dec_io_migrations(struct cache *cache) 1005 { 1006 atomic_dec(&cache->nr_io_migrations); 1007 } 1008 1009 static bool discard_or_flush(struct bio *bio) 1010 { 1011 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1012 } 1013 1014 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1015 dm_dblock_t *b, dm_dblock_t *e) 1016 { 1017 sector_t sb = bio->bi_iter.bi_sector; 1018 sector_t se = bio_end_sector(bio); 1019 1020 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1021 1022 if (se - sb < cache->discard_block_size) 1023 *e = *b; 1024 else 1025 *e = to_dblock(block_div(se, cache->discard_block_size)); 1026 } 1027 1028 /*----------------------------------------------------------------*/ 1029 1030 static void prevent_background_work(struct cache *cache) 1031 { 1032 lockdep_off(); 1033 down_write(&cache->background_work_lock); 1034 lockdep_on(); 1035 } 1036 1037 static void allow_background_work(struct cache *cache) 1038 { 1039 lockdep_off(); 1040 up_write(&cache->background_work_lock); 1041 lockdep_on(); 1042 } 1043 1044 static bool background_work_begin(struct cache *cache) 1045 { 1046 bool r; 1047 1048 lockdep_off(); 1049 r = down_read_trylock(&cache->background_work_lock); 1050 lockdep_on(); 1051 1052 return r; 1053 } 1054 1055 static void background_work_end(struct cache *cache) 1056 { 1057 lockdep_off(); 1058 up_read(&cache->background_work_lock); 1059 lockdep_on(); 1060 } 1061 1062 /*----------------------------------------------------------------*/ 1063 1064 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1065 { 1066 return (bio_data_dir(bio) == WRITE) && 1067 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1068 } 1069 1070 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1071 { 1072 return writeback_mode(cache) && 1073 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1074 } 1075 1076 static void quiesce(struct dm_cache_migration *mg, 1077 void (*continuation)(struct work_struct *)) 1078 { 1079 init_continuation(&mg->k, continuation); 1080 dm_cell_quiesce_v2(mg->cache->prison, 
mg->cell, &mg->k.ws); 1081 } 1082 1083 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1084 { 1085 struct continuation *k = container_of(ws, struct continuation, ws); 1086 1087 return container_of(k, struct dm_cache_migration, k); 1088 } 1089 1090 static void copy_complete(int read_err, unsigned long write_err, void *context) 1091 { 1092 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1093 1094 if (read_err || write_err) 1095 mg->k.input = BLK_STS_IOERR; 1096 1097 queue_continuation(mg->cache->wq, &mg->k); 1098 } 1099 1100 static void copy(struct dm_cache_migration *mg, bool promote) 1101 { 1102 struct dm_io_region o_region, c_region; 1103 struct cache *cache = mg->cache; 1104 1105 o_region.bdev = cache->origin_dev->bdev; 1106 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1107 o_region.count = cache->sectors_per_block; 1108 1109 c_region.bdev = cache->cache_dev->bdev; 1110 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1111 c_region.count = cache->sectors_per_block; 1112 1113 if (promote) 1114 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1115 else 1116 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1117 } 1118 1119 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1120 { 1121 struct per_bio_data *pb = get_per_bio_data(bio); 1122 1123 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1124 free_prison_cell(cache, pb->cell); 1125 pb->cell = NULL; 1126 } 1127 1128 static void overwrite_endio(struct bio *bio) 1129 { 1130 struct dm_cache_migration *mg = bio->bi_private; 1131 struct cache *cache = mg->cache; 1132 struct per_bio_data *pb = get_per_bio_data(bio); 1133 1134 dm_unhook_bio(&pb->hook_info, bio); 1135 1136 if (bio->bi_status) 1137 mg->k.input = bio->bi_status; 1138 1139 queue_continuation(cache->wq, &mg->k); 1140 } 1141 1142 static void overwrite(struct dm_cache_migration *mg, 1143 void (*continuation)(struct work_struct *)) 1144 { 1145 struct bio *bio = mg->overwrite_bio; 1146 struct per_bio_data *pb = get_per_bio_data(bio); 1147 1148 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1149 1150 /* 1151 * The overwrite bio is part of the copy operation, as such it does 1152 * not set/clear discard or dirty flags. 
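	 * Those flags are brought back in sync by mg_complete() once the
	 * migration as a whole has finished.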
1153 */ 1154 if (mg->op->op == POLICY_PROMOTE) 1155 remap_to_cache(mg->cache, bio, mg->op->cblock); 1156 else 1157 remap_to_origin(mg->cache, bio); 1158 1159 init_continuation(&mg->k, continuation); 1160 accounted_request(mg->cache, bio); 1161 } 1162 1163 /* 1164 * Migration steps: 1165 * 1166 * 1) exclusive lock preventing WRITEs 1167 * 2) quiesce 1168 * 3) copy or issue overwrite bio 1169 * 4) upgrade to exclusive lock preventing READs and WRITEs 1170 * 5) quiesce 1171 * 6) update metadata and commit 1172 * 7) unlock 1173 */ 1174 static void mg_complete(struct dm_cache_migration *mg, bool success) 1175 { 1176 struct bio_list bios; 1177 struct cache *cache = mg->cache; 1178 struct policy_work *op = mg->op; 1179 dm_cblock_t cblock = op->cblock; 1180 1181 if (success) 1182 update_stats(&cache->stats, op->op); 1183 1184 switch (op->op) { 1185 case POLICY_PROMOTE: 1186 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1187 policy_complete_background_work(cache->policy, op, success); 1188 1189 if (mg->overwrite_bio) { 1190 if (success) 1191 force_set_dirty(cache, cblock); 1192 else if (mg->k.input) 1193 mg->overwrite_bio->bi_status = mg->k.input; 1194 else 1195 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1196 bio_endio(mg->overwrite_bio); 1197 } else { 1198 if (success) 1199 force_clear_dirty(cache, cblock); 1200 dec_io_migrations(cache); 1201 } 1202 break; 1203 1204 case POLICY_DEMOTE: 1205 /* 1206 * We clear dirty here to update the nr_dirty counter. 1207 */ 1208 if (success) 1209 force_clear_dirty(cache, cblock); 1210 policy_complete_background_work(cache->policy, op, success); 1211 dec_io_migrations(cache); 1212 break; 1213 1214 case POLICY_WRITEBACK: 1215 if (success) 1216 force_clear_dirty(cache, cblock); 1217 policy_complete_background_work(cache->policy, op, success); 1218 dec_io_migrations(cache); 1219 break; 1220 } 1221 1222 bio_list_init(&bios); 1223 if (mg->cell) { 1224 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1225 free_prison_cell(cache, mg->cell); 1226 } 1227 1228 free_migration(mg); 1229 defer_bios(cache, &bios); 1230 wake_migration_worker(cache); 1231 1232 background_work_end(cache); 1233 } 1234 1235 static void mg_success(struct work_struct *ws) 1236 { 1237 struct dm_cache_migration *mg = ws_to_mg(ws); 1238 1239 mg_complete(mg, mg->k.input == 0); 1240 } 1241 1242 static void mg_update_metadata(struct work_struct *ws) 1243 { 1244 int r; 1245 struct dm_cache_migration *mg = ws_to_mg(ws); 1246 struct cache *cache = mg->cache; 1247 struct policy_work *op = mg->op; 1248 1249 switch (op->op) { 1250 case POLICY_PROMOTE: 1251 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1252 if (r) { 1253 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1254 cache_device_name(cache)); 1255 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1256 1257 mg_complete(mg, false); 1258 return; 1259 } 1260 mg_complete(mg, true); 1261 break; 1262 1263 case POLICY_DEMOTE: 1264 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1265 if (r) { 1266 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1267 cache_device_name(cache)); 1268 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1269 1270 mg_complete(mg, false); 1271 return; 1272 } 1273 1274 /* 1275 * It would be nice if we only had to commit when a REQ_FLUSH 1276 * comes through. 
		 * But there's one scenario that we have to look out for:
		 *
		 * - oblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * roll back to having the data for oblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_full_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;
	bool is_policy_promote = (op->op == POLICY_PROMOTE);

	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
	    is_discarded_oblock(cache, op->oblock)) {
		mg_upgrade_lock(ws);
		return;
	}

	init_continuation(&mg->k, mg_upgrade_lock);
	copy(mg, is_policy_promote);
}

static void mg_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * No exclusive lock was held when we last checked if the bio
		 * was optimisable.  So we have to check again in case things
		 * have changed (eg, the block may no longer be discarded).
		 */
		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
			/*
			 * Fallback to a real full copy after doing some tidying up.
			 */
			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);

			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
			mg->overwrite_bio = NULL;
			inc_io_migrations(mg->cache);
			mg_full_copy(ws);
			return;
		}

		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL,
		 * so we are _not_ using mg_upgrade_lock() as the continuation.
1391 */ 1392 overwrite(mg, mg_update_metadata_after_copy); 1393 1394 } else 1395 mg_full_copy(ws); 1396 } 1397 1398 static int mg_lock_writes(struct dm_cache_migration *mg) 1399 { 1400 int r; 1401 struct dm_cell_key_v2 key; 1402 struct cache *cache = mg->cache; 1403 struct dm_bio_prison_cell_v2 *prealloc; 1404 1405 prealloc = alloc_prison_cell(cache); 1406 1407 /* 1408 * Prevent writes to the block, but allow reads to continue. 1409 * Unless we're using an overwrite bio, in which case we lock 1410 * everything. 1411 */ 1412 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1413 r = dm_cell_lock_v2(cache->prison, &key, 1414 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1415 prealloc, &mg->cell); 1416 if (r < 0) { 1417 free_prison_cell(cache, prealloc); 1418 mg_complete(mg, false); 1419 return r; 1420 } 1421 1422 if (mg->cell != prealloc) 1423 free_prison_cell(cache, prealloc); 1424 1425 if (r == 0) 1426 mg_copy(&mg->k.ws); 1427 else 1428 quiesce(mg, mg_copy); 1429 1430 return 0; 1431 } 1432 1433 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1434 { 1435 struct dm_cache_migration *mg; 1436 1437 if (!background_work_begin(cache)) { 1438 policy_complete_background_work(cache->policy, op, false); 1439 return -EPERM; 1440 } 1441 1442 mg = alloc_migration(cache); 1443 1444 mg->op = op; 1445 mg->overwrite_bio = bio; 1446 1447 if (!bio) 1448 inc_io_migrations(cache); 1449 1450 return mg_lock_writes(mg); 1451 } 1452 1453 /* 1454 *-------------------------------------------------------------- 1455 * invalidation processing 1456 *-------------------------------------------------------------- 1457 */ 1458 1459 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1460 { 1461 struct bio_list bios; 1462 struct cache *cache = mg->cache; 1463 1464 bio_list_init(&bios); 1465 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1466 free_prison_cell(cache, mg->cell); 1467 1468 if (!success && mg->overwrite_bio) 1469 bio_io_error(mg->overwrite_bio); 1470 1471 free_migration(mg); 1472 defer_bios(cache, &bios); 1473 1474 background_work_end(cache); 1475 } 1476 1477 static void invalidate_completed(struct work_struct *ws) 1478 { 1479 struct dm_cache_migration *mg = ws_to_mg(ws); 1480 1481 invalidate_complete(mg, !mg->k.input); 1482 } 1483 1484 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1485 { 1486 int r; 1487 1488 r = policy_invalidate_mapping(cache->policy, cblock); 1489 if (!r) { 1490 r = dm_cache_remove_mapping(cache->cmd, cblock); 1491 if (r) { 1492 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1493 cache_device_name(cache)); 1494 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1495 } 1496 1497 } else if (r == -ENODATA) { 1498 /* 1499 * Harmless, already unmapped. 
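		 * (the policy has no mapping for this cblock, so there is
		 * nothing to remove).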
1500 */ 1501 r = 0; 1502 1503 } else 1504 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1505 1506 return r; 1507 } 1508 1509 static void invalidate_remove(struct work_struct *ws) 1510 { 1511 int r; 1512 struct dm_cache_migration *mg = ws_to_mg(ws); 1513 struct cache *cache = mg->cache; 1514 1515 r = invalidate_cblock(cache, mg->invalidate_cblock); 1516 if (r) { 1517 invalidate_complete(mg, false); 1518 return; 1519 } 1520 1521 init_continuation(&mg->k, invalidate_completed); 1522 continue_after_commit(&cache->committer, &mg->k); 1523 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1524 mg->overwrite_bio = NULL; 1525 schedule_commit(&cache->committer); 1526 } 1527 1528 static int invalidate_lock(struct dm_cache_migration *mg) 1529 { 1530 int r; 1531 struct dm_cell_key_v2 key; 1532 struct cache *cache = mg->cache; 1533 struct dm_bio_prison_cell_v2 *prealloc; 1534 1535 prealloc = alloc_prison_cell(cache); 1536 1537 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1538 r = dm_cell_lock_v2(cache->prison, &key, 1539 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1540 if (r < 0) { 1541 free_prison_cell(cache, prealloc); 1542 invalidate_complete(mg, false); 1543 return r; 1544 } 1545 1546 if (mg->cell != prealloc) 1547 free_prison_cell(cache, prealloc); 1548 1549 if (r) 1550 quiesce(mg, invalidate_remove); 1551 1552 else { 1553 /* 1554 * We can't call invalidate_remove() directly here because we 1555 * might still be in request context. 1556 */ 1557 init_continuation(&mg->k, invalidate_remove); 1558 queue_work(cache->wq, &mg->k.ws); 1559 } 1560 1561 return 0; 1562 } 1563 1564 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1565 dm_oblock_t oblock, struct bio *bio) 1566 { 1567 struct dm_cache_migration *mg; 1568 1569 if (!background_work_begin(cache)) 1570 return -EPERM; 1571 1572 mg = alloc_migration(cache); 1573 1574 mg->overwrite_bio = bio; 1575 mg->invalidate_cblock = cblock; 1576 mg->invalidate_oblock = oblock; 1577 1578 return invalidate_lock(mg); 1579 } 1580 1581 /* 1582 *-------------------------------------------------------------- 1583 * bio processing 1584 *-------------------------------------------------------------- 1585 */ 1586 1587 enum busy { 1588 IDLE, 1589 BUSY 1590 }; 1591 1592 static enum busy spare_migration_bandwidth(struct cache *cache) 1593 { 1594 bool idle = dm_iot_idle_for(&cache->tracker, HZ); 1595 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1596 cache->sectors_per_block; 1597 1598 if (idle && current_volume <= cache->migration_threshold) 1599 return IDLE; 1600 else 1601 return BUSY; 1602 } 1603 1604 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1605 { 1606 atomic_inc(bio_data_dir(bio) == READ ? 1607 &cache->stats.read_hit : &cache->stats.write_hit); 1608 } 1609 1610 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1611 { 1612 atomic_inc(bio_data_dir(bio) == READ ? 1613 &cache->stats.read_miss : &cache->stats.write_miss); 1614 } 1615 1616 /*----------------------------------------------------------------*/ 1617 1618 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1619 bool *commit_needed) 1620 { 1621 int r, data_dir; 1622 bool rb, background_queued; 1623 dm_cblock_t cblock; 1624 1625 *commit_needed = false; 1626 1627 rb = bio_detain_shared(cache, block, bio); 1628 if (!rb) { 1629 /* 1630 * An exclusive lock is held for this block, so we have to 1631 * wait. 
We set the commit_needed flag so the current 1632 * transaction will be committed asap, allowing this lock 1633 * to be dropped. 1634 */ 1635 *commit_needed = true; 1636 return DM_MAPIO_SUBMITTED; 1637 } 1638 1639 data_dir = bio_data_dir(bio); 1640 1641 if (optimisable_bio(cache, bio, block)) { 1642 struct policy_work *op = NULL; 1643 1644 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1645 if (unlikely(r && r != -ENOENT)) { 1646 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1647 cache_device_name(cache), r); 1648 bio_io_error(bio); 1649 return DM_MAPIO_SUBMITTED; 1650 } 1651 1652 if (r == -ENOENT && op) { 1653 bio_drop_shared_lock(cache, bio); 1654 BUG_ON(op->op != POLICY_PROMOTE); 1655 mg_start(cache, op, bio); 1656 return DM_MAPIO_SUBMITTED; 1657 } 1658 } else { 1659 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1660 if (unlikely(r && r != -ENOENT)) { 1661 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1662 cache_device_name(cache), r); 1663 bio_io_error(bio); 1664 return DM_MAPIO_SUBMITTED; 1665 } 1666 1667 if (background_queued) 1668 wake_migration_worker(cache); 1669 } 1670 1671 if (r == -ENOENT) { 1672 struct per_bio_data *pb = get_per_bio_data(bio); 1673 1674 /* 1675 * Miss. 1676 */ 1677 inc_miss_counter(cache, bio); 1678 if (pb->req_nr == 0) { 1679 accounted_begin(cache, bio); 1680 remap_to_origin_clear_discard(cache, bio, block); 1681 } else { 1682 /* 1683 * This is a duplicate writethrough io that is no 1684 * longer needed because the block has been demoted. 1685 */ 1686 bio_endio(bio); 1687 return DM_MAPIO_SUBMITTED; 1688 } 1689 } else { 1690 /* 1691 * Hit. 1692 */ 1693 inc_hit_counter(cache, bio); 1694 1695 /* 1696 * Passthrough always maps to the origin, invalidating any 1697 * cache blocks that are written to. 1698 */ 1699 if (passthrough_mode(cache)) { 1700 if (bio_data_dir(bio) == WRITE) { 1701 bio_drop_shared_lock(cache, bio); 1702 atomic_inc(&cache->stats.demotion); 1703 invalidate_start(cache, cblock, block, bio); 1704 } else 1705 remap_to_origin_clear_discard(cache, bio, block); 1706 } else { 1707 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1708 !is_dirty(cache, cblock)) { 1709 remap_to_origin_and_cache(cache, bio, block, cblock); 1710 accounted_begin(cache, bio); 1711 } else 1712 remap_to_cache_dirty(cache, bio, block, cblock); 1713 } 1714 } 1715 1716 /* 1717 * dm core turns FUA requests into a separate payload and FLUSH req. 1718 */ 1719 if (bio->bi_opf & REQ_FUA) { 1720 /* 1721 * issue_after_commit will call accounted_begin a second time. So 1722 * we call accounted_complete() to avoid double accounting. 1723 */ 1724 accounted_complete(cache, bio); 1725 issue_after_commit(&cache->committer, bio); 1726 *commit_needed = true; 1727 return DM_MAPIO_SUBMITTED; 1728 } 1729 1730 return DM_MAPIO_REMAPPED; 1731 } 1732 1733 static bool process_bio(struct cache *cache, struct bio *bio) 1734 { 1735 bool commit_needed; 1736 1737 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1738 dm_submit_bio_remap(bio, NULL); 1739 1740 return commit_needed; 1741 } 1742 1743 /* 1744 * A non-zero return indicates read_only or fail_io mode. 
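 * commit() refuses to touch the metadata once the cache mode has degraded
 * to read-only or fail.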
 */
static int commit(struct cache *cache, bool clean_shutdown)
{
	int r;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return -EINVAL;

	atomic_inc(&cache->stats.commit_count);
	r = dm_cache_commit(cache->cmd, clean_shutdown);
	if (r)
		metadata_operation_failed(cache, "dm_cache_commit", r);

	return r;
}

/*
 * Used by the batcher.
 */
static blk_status_t commit_op(void *context)
{
	struct cache *cache = context;

	if (dm_cache_changed_this_transaction(cache->cmd))
		return errno_to_blk_status(commit(cache, false));

	return 0;
}

/*----------------------------------------------------------------*/

static bool process_flush_bio(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue_after_commit(&cache->committer, bio);
	return true;
}

static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_dblock_t b, e;

	/*
	 * FIXME: do we need to lock the region?  Or can we just assume the
	 * user won't be so foolish as to issue discard concurrently with
	 * other IO?
	 */
	calc_discard_block_range(cache, bio, &b, &e);
	while (b != e) {
		set_discard(cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	if (cache->features.discard_passdown) {
		remap_to_origin(cache, bio);
		dm_submit_bio_remap(bio, NULL);
	} else
		bio_endio(bio);

	return false;
}

static void process_deferred_bios(struct work_struct *ws)
{
	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);

	bool commit_needed = false;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irq(&cache->lock);
	bio_list_merge_init(&bios, &cache->deferred_bios);
	spin_unlock_irq(&cache->lock);

	while ((bio = bio_list_pop(&bios))) {
		if (bio->bi_opf & REQ_PREFLUSH)
			commit_needed = process_flush_bio(cache, bio) || commit_needed;

		else if (bio_op(bio) == REQ_OP_DISCARD)
			commit_needed = process_discard_bio(cache, bio) || commit_needed;

		else
			commit_needed = process_bio(cache, bio) || commit_needed;
		cond_resched();
	}

	if (commit_needed)
		schedule_commit(&cache->committer);
}

/*
 *--------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------
 */
static void requeue_deferred_bios(struct cache *cache)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge_init(&bios, &cache->deferred_bios);

	while ((bio = bio_list_pop(&bios))) {
		bio->bi_status = BLK_STS_DM_REQUEUE;
		bio_endio(bio);
		cond_resched();
	}
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
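 * COMMIT_PERIOD is HZ jiffies, so the waker fires roughly once a second.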
1866 */ 1867 static void do_waker(struct work_struct *ws) 1868 { 1869 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1870 1871 policy_tick(cache->policy, true); 1872 wake_migration_worker(cache); 1873 schedule_commit(&cache->committer); 1874 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1875 } 1876 1877 static void check_migrations(struct work_struct *ws) 1878 { 1879 int r; 1880 struct policy_work *op; 1881 struct cache *cache = container_of(ws, struct cache, migration_worker); 1882 enum busy b; 1883 1884 for (;;) { 1885 b = spare_migration_bandwidth(cache); 1886 1887 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1888 if (r == -ENODATA) 1889 break; 1890 1891 if (r) { 1892 DMERR_LIMIT("%s: policy_background_work failed", 1893 cache_device_name(cache)); 1894 break; 1895 } 1896 1897 r = mg_start(cache, op, NULL); 1898 if (r) 1899 break; 1900 1901 cond_resched(); 1902 } 1903 } 1904 1905 /* 1906 *-------------------------------------------------------------- 1907 * Target methods 1908 *-------------------------------------------------------------- 1909 */ 1910 1911 /* 1912 * This function gets called on the error paths of the constructor, so we 1913 * have to cope with a partially initialised struct. 1914 */ 1915 static void __destroy(struct cache *cache) 1916 { 1917 mempool_exit(&cache->migration_pool); 1918 1919 if (cache->prison) 1920 dm_bio_prison_destroy_v2(cache->prison); 1921 1922 if (cache->wq) 1923 destroy_workqueue(cache->wq); 1924 1925 if (cache->dirty_bitset) 1926 free_bitset(cache->dirty_bitset); 1927 1928 if (cache->discard_bitset) 1929 free_bitset(cache->discard_bitset); 1930 1931 if (cache->invalid_bitset) 1932 free_bitset(cache->invalid_bitset); 1933 1934 if (cache->copier) 1935 dm_kcopyd_client_destroy(cache->copier); 1936 1937 if (cache->cmd) 1938 dm_cache_metadata_close(cache->cmd); 1939 1940 if (cache->metadata_dev) 1941 dm_put_device(cache->ti, cache->metadata_dev); 1942 1943 if (cache->origin_dev) 1944 dm_put_device(cache->ti, cache->origin_dev); 1945 1946 if (cache->cache_dev) 1947 dm_put_device(cache->ti, cache->cache_dev); 1948 1949 if (cache->policy) 1950 dm_cache_policy_destroy(cache->policy); 1951 1952 bioset_exit(&cache->bs); 1953 1954 kfree(cache); 1955 } 1956 1957 static void destroy(struct cache *cache) 1958 { 1959 unsigned int i; 1960 1961 cancel_delayed_work_sync(&cache->waker); 1962 1963 for (i = 0; i < cache->nr_ctr_args ; i++) 1964 kfree(cache->ctr_args[i]); 1965 kfree(cache->ctr_args); 1966 1967 __destroy(cache); 1968 } 1969 1970 static void cache_dtr(struct dm_target *ti) 1971 { 1972 struct cache *cache = ti->private; 1973 1974 destroy(cache); 1975 } 1976 1977 static sector_t get_dev_size(struct dm_dev *dev) 1978 { 1979 return bdev_nr_sectors(dev->bdev); 1980 } 1981 1982 /*----------------------------------------------------------------*/ 1983 1984 /* 1985 * Construct a cache device mapping. 1986 * 1987 * cache <metadata dev> <cache dev> <origin dev> <block size> 1988 * <#feature args> [<feature arg>]* 1989 * <policy> <#policy args> [<policy arg>]* 1990 * 1991 * metadata dev : fast device holding the persistent metadata 1992 * cache dev : fast device holding cached data blocks 1993 * origin dev : slow device holding original data blocks 1994 * block size : cache unit size in sectors 1995 * 1996 * #feature args : number of feature arguments passed 1997 * feature args : writethrough. (The default is writeback.) 
1998 * 1999 * policy : the replacement policy to use 2000 * #policy args : an even number of policy arguments corresponding 2001 * to key/value pairs passed to the policy 2002 * policy args : key/value pairs passed to the policy 2003 * E.g. 'sequential_threshold 1024' 2004 * See cache-policies.txt for details. 2005 * 2006 * Optional feature arguments are: 2007 * writethrough : write through caching that prohibits cache block 2008 * content from being different from origin block content. 2009 * Without this argument, the default behaviour is to write 2010 * back cache block contents later for performance reasons, 2011 * so they may differ from the corresponding origin blocks. 2012 */ 2013 struct cache_args { 2014 struct dm_target *ti; 2015 2016 struct dm_dev *metadata_dev; 2017 2018 struct dm_dev *cache_dev; 2019 sector_t cache_sectors; 2020 2021 struct dm_dev *origin_dev; 2022 2023 uint32_t block_size; 2024 2025 const char *policy_name; 2026 int policy_argc; 2027 const char **policy_argv; 2028 2029 struct cache_features features; 2030 }; 2031 2032 static void destroy_cache_args(struct cache_args *ca) 2033 { 2034 if (ca->metadata_dev) 2035 dm_put_device(ca->ti, ca->metadata_dev); 2036 2037 if (ca->cache_dev) 2038 dm_put_device(ca->ti, ca->cache_dev); 2039 2040 if (ca->origin_dev) 2041 dm_put_device(ca->ti, ca->origin_dev); 2042 2043 kfree(ca); 2044 } 2045 2046 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2047 { 2048 if (!as->argc) { 2049 *error = "Insufficient args"; 2050 return false; 2051 } 2052 2053 return true; 2054 } 2055 2056 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2057 char **error) 2058 { 2059 int r; 2060 sector_t metadata_dev_size; 2061 2062 if (!at_least_one_arg(as, error)) 2063 return -EINVAL; 2064 2065 r = dm_get_device(ca->ti, dm_shift_arg(as), 2066 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev); 2067 if (r) { 2068 *error = "Error opening metadata device"; 2069 return r; 2070 } 2071 2072 metadata_dev_size = get_dev_size(ca->metadata_dev); 2073 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2074 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", 2075 ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS); 2076 2077 return 0; 2078 } 2079 2080 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2081 char **error) 2082 { 2083 int r; 2084 2085 if (!at_least_one_arg(as, error)) 2086 return -EINVAL; 2087 2088 r = dm_get_device(ca->ti, dm_shift_arg(as), 2089 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev); 2090 if (r) { 2091 *error = "Error opening cache device"; 2092 return r; 2093 } 2094 ca->cache_sectors = get_dev_size(ca->cache_dev); 2095 2096 return 0; 2097 } 2098 2099 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2100 char **error) 2101 { 2102 int r; 2103 2104 if (!at_least_one_arg(as, error)) 2105 return -EINVAL; 2106 2107 r = dm_get_device(ca->ti, dm_shift_arg(as), 2108 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev); 2109 if (r) { 2110 *error = "Error opening origin device"; 2111 return r; 2112 } 2113 2114 return 0; 2115 } 2116 2117 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2118 char **error) 2119 { 2120 unsigned long block_size; 2121 2122 if (!at_least_one_arg(as, error)) 2123 return -EINVAL; 2124 2125 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2126 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2127 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2128 block_size 
& (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2129 *error = "Invalid data block size"; 2130 return -EINVAL; 2131 } 2132 2133 if (block_size > ca->cache_sectors) { 2134 *error = "Data block size is larger than the cache device"; 2135 return -EINVAL; 2136 } 2137 2138 ca->block_size = block_size; 2139 2140 return 0; 2141 } 2142 2143 static void init_features(struct cache_features *cf) 2144 { 2145 cf->mode = CM_WRITE; 2146 cf->io_mode = CM_IO_WRITEBACK; 2147 cf->metadata_version = 1; 2148 cf->discard_passdown = true; 2149 } 2150 2151 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2152 char **error) 2153 { 2154 static const struct dm_arg _args[] = { 2155 {0, 3, "Invalid number of cache feature arguments"}, 2156 }; 2157 2158 int r, mode_ctr = 0; 2159 unsigned int argc; 2160 const char *arg; 2161 struct cache_features *cf = &ca->features; 2162 2163 init_features(cf); 2164 2165 r = dm_read_arg_group(_args, as, &argc, error); 2166 if (r) 2167 return -EINVAL; 2168 2169 while (argc--) { 2170 arg = dm_shift_arg(as); 2171 2172 if (!strcasecmp(arg, "writeback")) { 2173 cf->io_mode = CM_IO_WRITEBACK; 2174 mode_ctr++; 2175 } 2176 2177 else if (!strcasecmp(arg, "writethrough")) { 2178 cf->io_mode = CM_IO_WRITETHROUGH; 2179 mode_ctr++; 2180 } 2181 2182 else if (!strcasecmp(arg, "passthrough")) { 2183 cf->io_mode = CM_IO_PASSTHROUGH; 2184 mode_ctr++; 2185 } 2186 2187 else if (!strcasecmp(arg, "metadata2")) 2188 cf->metadata_version = 2; 2189 2190 else if (!strcasecmp(arg, "no_discard_passdown")) 2191 cf->discard_passdown = false; 2192 2193 else { 2194 *error = "Unrecognised cache feature requested"; 2195 return -EINVAL; 2196 } 2197 } 2198 2199 if (mode_ctr > 1) { 2200 *error = "Duplicate cache io_mode features requested"; 2201 return -EINVAL; 2202 } 2203 2204 return 0; 2205 } 2206 2207 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2208 char **error) 2209 { 2210 static const struct dm_arg _args[] = { 2211 {0, 1024, "Invalid number of policy arguments"}, 2212 }; 2213 2214 int r; 2215 2216 if (!at_least_one_arg(as, error)) 2217 return -EINVAL; 2218 2219 ca->policy_name = dm_shift_arg(as); 2220 2221 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2222 if (r) 2223 return -EINVAL; 2224 2225 ca->policy_argv = (const char **)as->argv; 2226 dm_consume_args(as, ca->policy_argc); 2227 2228 return 0; 2229 } 2230 2231 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2232 char **error) 2233 { 2234 int r; 2235 struct dm_arg_set as; 2236 2237 as.argc = argc; 2238 as.argv = argv; 2239 2240 r = parse_metadata_dev(ca, &as, error); 2241 if (r) 2242 return r; 2243 2244 r = parse_cache_dev(ca, &as, error); 2245 if (r) 2246 return r; 2247 2248 r = parse_origin_dev(ca, &as, error); 2249 if (r) 2250 return r; 2251 2252 r = parse_block_size(ca, &as, error); 2253 if (r) 2254 return r; 2255 2256 r = parse_features(ca, &as, error); 2257 if (r) 2258 return r; 2259 2260 r = parse_policy(ca, &as, error); 2261 if (r) 2262 return r; 2263 2264 return 0; 2265 } 2266 2267 /*----------------------------------------------------------------*/ 2268 2269 static struct kmem_cache *migration_cache = NULL; 2270 2271 #define NOT_CORE_OPTION 1 2272 2273 static int process_config_option(struct cache *cache, const char *key, const char *value) 2274 { 2275 unsigned long tmp; 2276 2277 if (!strcasecmp(key, "migration_threshold")) { 2278 if (kstrtoul(value, 10, &tmp)) 2279 return -EINVAL; 2280 2281 cache->migration_threshold = tmp; 2282 return 0; 2283 } 2284 2285 return 
NOT_CORE_OPTION; 2286 } 2287 2288 static int set_config_value(struct cache *cache, const char *key, const char *value) 2289 { 2290 int r = process_config_option(cache, key, value); 2291 2292 if (r == NOT_CORE_OPTION) 2293 r = policy_set_config_value(cache->policy, key, value); 2294 2295 if (r) 2296 DMWARN("bad config value for %s: %s", key, value); 2297 2298 return r; 2299 } 2300 2301 static int set_config_values(struct cache *cache, int argc, const char **argv) 2302 { 2303 int r = 0; 2304 2305 if (argc & 1) { 2306 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2307 return -EINVAL; 2308 } 2309 2310 while (argc) { 2311 r = set_config_value(cache, argv[0], argv[1]); 2312 if (r) 2313 break; 2314 2315 argc -= 2; 2316 argv += 2; 2317 } 2318 2319 return r; 2320 } 2321 2322 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2323 char **error) 2324 { 2325 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2326 cache->cache_size, 2327 cache->origin_sectors, 2328 cache->sectors_per_block); 2329 if (IS_ERR(p)) { 2330 *error = "Error creating cache's policy"; 2331 return PTR_ERR(p); 2332 } 2333 cache->policy = p; 2334 BUG_ON(!cache->policy); 2335 2336 return 0; 2337 } 2338 2339 /* 2340 * We want the discard block size to be at least the size of the cache 2341 * block size and have no more than 2^14 discard blocks across the origin. 2342 */ 2343 #define MAX_DISCARD_BLOCKS (1 << 14) 2344 2345 static bool too_many_discard_blocks(sector_t discard_block_size, 2346 sector_t origin_size) 2347 { 2348 (void) sector_div(origin_size, discard_block_size); 2349 2350 return origin_size > MAX_DISCARD_BLOCKS; 2351 } 2352 2353 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2354 sector_t origin_size) 2355 { 2356 sector_t discard_block_size = cache_block_size; 2357 2358 if (origin_size) 2359 while (too_many_discard_blocks(discard_block_size, origin_size)) 2360 discard_block_size *= 2; 2361 2362 return discard_block_size; 2363 } 2364 2365 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2366 { 2367 dm_block_t nr_blocks = from_cblock(size); 2368 2369 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2370 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2371 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2372 "Please consider increasing the cache block size to reduce the overall cache block count.", 2373 (unsigned long long) nr_blocks); 2374 2375 cache->cache_size = size; 2376 } 2377 2378 #define DEFAULT_MIGRATION_THRESHOLD 2048 2379 2380 static int cache_create(struct cache_args *ca, struct cache **result) 2381 { 2382 int r = 0; 2383 char **error = &ca->ti->error; 2384 struct cache *cache; 2385 struct dm_target *ti = ca->ti; 2386 dm_block_t origin_blocks; 2387 struct dm_cache_metadata *cmd; 2388 bool may_format = ca->features.mode == CM_WRITE; 2389 2390 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2391 if (!cache) 2392 return -ENOMEM; 2393 2394 cache->ti = ca->ti; 2395 ti->private = cache; 2396 ti->accounts_remapped_io = true; 2397 ti->num_flush_bios = 2; 2398 ti->flush_supported = true; 2399 2400 ti->num_discard_bios = 1; 2401 ti->discards_supported = true; 2402 2403 ti->per_io_data_size = sizeof(struct per_bio_data); 2404 2405 cache->features = ca->features; 2406 if (writethrough_mode(cache)) { 2407 /* Create bioset for writethrough bios issued to origin */ 2408 r = bioset_init(&cache->bs, 
BIO_POOL_SIZE, 0, 0); 2409 if (r) 2410 goto bad; 2411 } 2412 2413 cache->metadata_dev = ca->metadata_dev; 2414 cache->origin_dev = ca->origin_dev; 2415 cache->cache_dev = ca->cache_dev; 2416 2417 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2418 2419 origin_blocks = cache->origin_sectors = ti->len; 2420 origin_blocks = block_div(origin_blocks, ca->block_size); 2421 cache->origin_blocks = to_oblock(origin_blocks); 2422 2423 cache->sectors_per_block = ca->block_size; 2424 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2425 r = -EINVAL; 2426 goto bad; 2427 } 2428 2429 if (ca->block_size & (ca->block_size - 1)) { 2430 dm_block_t cache_size = ca->cache_sectors; 2431 2432 cache->sectors_per_block_shift = -1; 2433 cache_size = block_div(cache_size, ca->block_size); 2434 set_cache_size(cache, to_cblock(cache_size)); 2435 } else { 2436 cache->sectors_per_block_shift = __ffs(ca->block_size); 2437 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2438 } 2439 2440 r = create_cache_policy(cache, ca, error); 2441 if (r) 2442 goto bad; 2443 2444 cache->policy_nr_args = ca->policy_argc; 2445 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2446 2447 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2448 if (r) { 2449 *error = "Error setting cache policy's config values"; 2450 goto bad; 2451 } 2452 2453 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2454 ca->block_size, may_format, 2455 dm_cache_policy_get_hint_size(cache->policy), 2456 ca->features.metadata_version); 2457 if (IS_ERR(cmd)) { 2458 *error = "Error creating metadata object"; 2459 r = PTR_ERR(cmd); 2460 goto bad; 2461 } 2462 cache->cmd = cmd; 2463 set_cache_mode(cache, CM_WRITE); 2464 if (get_cache_mode(cache) != CM_WRITE) { 2465 *error = "Unable to get write access to metadata, please check/repair metadata."; 2466 r = -EINVAL; 2467 goto bad; 2468 } 2469 2470 if (passthrough_mode(cache)) { 2471 bool all_clean; 2472 2473 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2474 if (r) { 2475 *error = "dm_cache_metadata_all_clean() failed"; 2476 goto bad; 2477 } 2478 2479 if (!all_clean) { 2480 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2481 r = -EINVAL; 2482 goto bad; 2483 } 2484 2485 policy_allow_migrations(cache->policy, false); 2486 } 2487 2488 spin_lock_init(&cache->lock); 2489 bio_list_init(&cache->deferred_bios); 2490 atomic_set(&cache->nr_allocated_migrations, 0); 2491 atomic_set(&cache->nr_io_migrations, 0); 2492 init_waitqueue_head(&cache->migration_wait); 2493 2494 r = -ENOMEM; 2495 atomic_set(&cache->nr_dirty, 0); 2496 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2497 if (!cache->dirty_bitset) { 2498 *error = "could not allocate dirty bitset"; 2499 goto bad; 2500 } 2501 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2502 2503 cache->discard_block_size = 2504 calculate_discard_block_size(cache->sectors_per_block, 2505 cache->origin_sectors); 2506 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2507 cache->discard_block_size)); 2508 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2509 if (!cache->discard_bitset) { 2510 *error = "could not allocate discard bitset"; 2511 goto bad; 2512 } 2513 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2514 2515 cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2516 if (!cache->invalid_bitset) { 2517 *error = "could not allocate 
bitset for invalid blocks";
2518 goto bad;
2519 }
2520 clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size));
2521
2522 cache->copier = dm_kcopyd_client_create(&cache_copy_throttle);
2523 if (IS_ERR(cache->copier)) {
2524 *error = "could not create kcopyd client";
2525 r = PTR_ERR(cache->copier);
2526 goto bad;
2527 }
2528
2529 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX,
2530 WQ_MEM_RECLAIM | WQ_PERCPU, 0);
2531 if (!cache->wq) {
2532 *error = "could not create workqueue for metadata object";
2533 goto bad;
2534 }
2535 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2536 INIT_WORK(&cache->migration_worker, check_migrations);
2537 INIT_DELAYED_WORK(&cache->waker, do_waker);
2538
2539 cache->prison = dm_bio_prison_create_v2(cache->wq);
2540 if (!cache->prison) {
2541 *error = "could not create bio prison";
2542 goto bad;
2543 }
2544
2545 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
2546 migration_cache);
2547 if (r) {
2548 *error = "Error creating cache's migration mempool";
2549 goto bad;
2550 }
2551
2552 cache->need_tick_bio = true;
2553 cache->sized = false;
2554 cache->invalidate = false;
2555 cache->commit_requested = false;
2556 cache->loaded_mappings = false;
2557 cache->loaded_discards = false;
2558
2559 load_stats(cache);
2560
2561 atomic_set(&cache->stats.demotion, 0);
2562 atomic_set(&cache->stats.promotion, 0);
2563 atomic_set(&cache->stats.copies_avoided, 0);
2564 atomic_set(&cache->stats.cache_cell_clash, 0);
2565 atomic_set(&cache->stats.commit_count, 0);
2566 atomic_set(&cache->stats.discard_count, 0);
2567
2568 spin_lock_init(&cache->invalidation_lock);
2569 INIT_LIST_HEAD(&cache->invalidation_requests);
2570
2571 batcher_init(&cache->committer, commit_op, cache,
2572 issue_op, cache, cache->wq);
2573 dm_iot_init(&cache->tracker);
2574
2575 init_rwsem(&cache->background_work_lock);
2576 prevent_background_work(cache);
2577
2578 *result = cache;
2579 return 0;
2580 bad:
2581 __destroy(cache);
2582 return r;
2583 }
2584
2585 static int copy_ctr_args(struct cache *cache, unsigned int argc, const char **argv)
2586 {
2587 unsigned int i;
2588 const char **copy;
2589
2590 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2591 if (!copy)
2592 return -ENOMEM;
2593 for (i = 0; i < argc; i++) {
2594 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2595 if (!copy[i]) {
2596 while (i--)
2597 kfree(copy[i]);
2598 kfree(copy);
2599 return -ENOMEM;
2600 }
2601 }
2602
2603 cache->nr_ctr_args = argc;
2604 cache->ctr_args = copy;
2605
2606 return 0;
2607 }
2608
2609 static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2610 {
2611 int r = -EINVAL;
2612 struct cache_args *ca;
2613 struct cache *cache = NULL;
2614
2615 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2616 if (!ca) {
2617 ti->error = "Error allocating memory for cache";
2618 return -ENOMEM;
2619 }
2620 ca->ti = ti;
2621
2622 r = parse_cache_args(ca, argc, argv, &ti->error);
2623 if (r)
2624 goto out;
2625
2626 r = cache_create(ca, &cache);
2627 if (r)
2628 goto out;
2629
2630 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2631 if (r) {
2632 __destroy(cache);
2633 goto out;
2634 }
2635
2636 ti->private = cache;
2637 out:
2638 destroy_cache_args(ca);
2639 return r;
2640 }
2641
2642 /*----------------------------------------------------------------*/
2643
2644 static int cache_map(struct dm_target *ti, struct bio *bio)
2645 {
2646 struct cache *cache = ti->private;
2647
2648 int r;
2649 bool commit_needed;
2650 dm_oblock_t block = get_bio_block(cache, bio);
2651
2652
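 /*
  * Initialise the per-bio data before any remap or defer decision;
  * cache_end_io() relies on it for every bio that passes through the target.
  */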
init_per_bio_data(bio); 2653 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2654 /* 2655 * This can only occur if the io goes to a partial block at 2656 * the end of the origin device. We don't cache these. 2657 * Just remap to the origin and carry on. 2658 */ 2659 remap_to_origin(cache, bio); 2660 accounted_begin(cache, bio); 2661 return DM_MAPIO_REMAPPED; 2662 } 2663 2664 if (discard_or_flush(bio)) { 2665 defer_bio(cache, bio); 2666 return DM_MAPIO_SUBMITTED; 2667 } 2668 2669 r = map_bio(cache, bio, block, &commit_needed); 2670 if (commit_needed) 2671 schedule_commit(&cache->committer); 2672 2673 return r; 2674 } 2675 2676 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2677 { 2678 struct cache *cache = ti->private; 2679 unsigned long flags; 2680 struct per_bio_data *pb = get_per_bio_data(bio); 2681 2682 if (pb->tick) { 2683 policy_tick(cache->policy, false); 2684 2685 spin_lock_irqsave(&cache->lock, flags); 2686 cache->need_tick_bio = true; 2687 spin_unlock_irqrestore(&cache->lock, flags); 2688 } 2689 2690 bio_drop_shared_lock(cache, bio); 2691 accounted_complete(cache, bio); 2692 2693 return DM_ENDIO_DONE; 2694 } 2695 2696 static int write_dirty_bitset(struct cache *cache) 2697 { 2698 int r; 2699 2700 if (get_cache_mode(cache) >= CM_READ_ONLY) 2701 return -EINVAL; 2702 2703 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2704 if (r) 2705 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2706 2707 return r; 2708 } 2709 2710 static int write_discard_bitset(struct cache *cache) 2711 { 2712 unsigned int i, r; 2713 2714 if (get_cache_mode(cache) >= CM_READ_ONLY) 2715 return -EINVAL; 2716 2717 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2718 cache->discard_nr_blocks); 2719 if (r) { 2720 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2721 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2722 return r; 2723 } 2724 2725 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2726 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2727 is_discarded(cache, to_dblock(i))); 2728 if (r) { 2729 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2730 return r; 2731 } 2732 } 2733 2734 return 0; 2735 } 2736 2737 static int write_hints(struct cache *cache) 2738 { 2739 int r; 2740 2741 if (get_cache_mode(cache) >= CM_READ_ONLY) 2742 return -EINVAL; 2743 2744 r = dm_cache_write_hints(cache->cmd, cache->policy); 2745 if (r) { 2746 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2747 return r; 2748 } 2749 2750 return 0; 2751 } 2752 2753 /* 2754 * returns true on success 2755 */ 2756 static bool sync_metadata(struct cache *cache) 2757 { 2758 int r1, r2, r3, r4; 2759 2760 r1 = write_dirty_bitset(cache); 2761 if (r1) 2762 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2763 2764 r2 = write_discard_bitset(cache); 2765 if (r2) 2766 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2767 2768 save_stats(cache); 2769 2770 r3 = write_hints(cache); 2771 if (r3) 2772 DMERR("%s: could not write hints", cache_device_name(cache)); 2773 2774 /* 2775 * If writing the above metadata failed, we still commit, but don't 2776 * set the clean shutdown flag. This will effectively force every 2777 * dirty bit to be set on reload. 
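 * Forcing every dirty bit costs extra writeback work on the next
 * activation, but it is the safe option: no cache block is then assumed
 * to already match the origin.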
2778 */ 2779 r4 = commit(cache, !r1 && !r2 && !r3); 2780 if (r4) 2781 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2782 2783 return !r1 && !r2 && !r3 && !r4; 2784 } 2785 2786 static void cache_postsuspend(struct dm_target *ti) 2787 { 2788 struct cache *cache = ti->private; 2789 2790 prevent_background_work(cache); 2791 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2792 2793 cancel_delayed_work_sync(&cache->waker); 2794 drain_workqueue(cache->wq); 2795 WARN_ON(cache->tracker.in_flight); 2796 2797 /* 2798 * If it's a flush suspend there won't be any deferred bios, so this 2799 * call is harmless. 2800 */ 2801 requeue_deferred_bios(cache); 2802 2803 if (get_cache_mode(cache) == CM_WRITE) 2804 (void) sync_metadata(cache); 2805 } 2806 2807 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2808 bool dirty, uint32_t hint, bool hint_valid) 2809 { 2810 struct cache *cache = context; 2811 2812 if (dirty) { 2813 set_bit(from_cblock(cblock), cache->dirty_bitset); 2814 atomic_inc(&cache->nr_dirty); 2815 } else 2816 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2817 2818 return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2819 } 2820 2821 static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2822 bool dirty, uint32_t hint, bool hint_valid) 2823 { 2824 struct cache *cache = context; 2825 2826 if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { 2827 if (dirty) { 2828 DMERR("%s: unable to shrink origin; cache block %u is dirty", 2829 cache_device_name(cache), from_cblock(cblock)); 2830 return -EFBIG; 2831 } 2832 set_bit(from_cblock(cblock), cache->invalid_bitset); 2833 return 0; 2834 } 2835 2836 return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); 2837 } 2838 2839 /* 2840 * The discard block size in the on disk metadata is not 2841 * necessarily the same as we're currently using. So we have to 2842 * be careful to only set the discarded attribute if we know it 2843 * covers a complete block of the new size. 2844 */ 2845 struct discard_load_info { 2846 struct cache *cache; 2847 2848 /* 2849 * These blocks are sized using the on disk dblock size, rather 2850 * than the current one. 2851 */ 2852 dm_block_t block_size; 2853 dm_block_t discard_begin, discard_end; 2854 }; 2855 2856 static void discard_load_info_init(struct cache *cache, 2857 struct discard_load_info *li) 2858 { 2859 li->cache = cache; 2860 li->discard_begin = li->discard_end = 0; 2861 } 2862 2863 static void set_discard_range(struct discard_load_info *li) 2864 { 2865 sector_t b, e; 2866 2867 if (li->discard_begin == li->discard_end) 2868 return; 2869 2870 /* 2871 * Convert to sectors. 2872 */ 2873 b = li->discard_begin * li->block_size; 2874 e = li->discard_end * li->block_size; 2875 2876 /* 2877 * Then convert back to the current dblock size. 2878 */ 2879 b = dm_sector_div_up(b, li->cache->discard_block_size); 2880 sector_div(e, li->cache->discard_block_size); 2881 2882 /* 2883 * The origin may have shrunk, so we need to check we're still in 2884 * bounds. 
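 * Clamping 'e' is sufficient: if the whole range now lies beyond the
 * end, b >= e and the loop below simply does nothing.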
2885 */ 2886 if (e > from_dblock(li->cache->discard_nr_blocks)) 2887 e = from_dblock(li->cache->discard_nr_blocks); 2888 2889 for (; b < e; b++) 2890 set_discard(li->cache, to_dblock(b)); 2891 } 2892 2893 static int load_discard(void *context, sector_t discard_block_size, 2894 dm_dblock_t dblock, bool discard) 2895 { 2896 struct discard_load_info *li = context; 2897 2898 li->block_size = discard_block_size; 2899 2900 if (discard) { 2901 if (from_dblock(dblock) == li->discard_end) 2902 /* 2903 * We're already in a discard range, just extend it. 2904 */ 2905 li->discard_end = li->discard_end + 1ULL; 2906 2907 else { 2908 /* 2909 * Emit the old range and start a new one. 2910 */ 2911 set_discard_range(li); 2912 li->discard_begin = from_dblock(dblock); 2913 li->discard_end = li->discard_begin + 1ULL; 2914 } 2915 } else { 2916 set_discard_range(li); 2917 li->discard_begin = li->discard_end = 0; 2918 } 2919 2920 return 0; 2921 } 2922 2923 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2924 { 2925 sector_t size = get_dev_size(cache->cache_dev); 2926 (void) sector_div(size, cache->sectors_per_block); 2927 return to_cblock(size); 2928 } 2929 2930 static bool can_resume(struct cache *cache) 2931 { 2932 /* 2933 * Disallow retrying the resume operation for devices that failed the 2934 * first resume attempt, as the failure leaves the policy object partially 2935 * initialized. Retrying could trigger BUG_ON when loading cache mappings 2936 * into the incomplete policy object. 2937 */ 2938 if (cache->sized && !cache->loaded_mappings) { 2939 if (get_cache_mode(cache) != CM_WRITE) 2940 DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", 2941 cache_device_name(cache)); 2942 else 2943 DMERR("%s: unable to resume cache due to missing proper cache table reload", 2944 cache_device_name(cache)); 2945 return false; 2946 } 2947 2948 return true; 2949 } 2950 2951 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2952 { 2953 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { 2954 DMERR("%s: unable to extend cache due to missing cache table reload", 2955 cache_device_name(cache)); 2956 return false; 2957 } 2958 2959 /* 2960 * We can't drop a dirty block when shrinking the cache. 
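 * A dirty block holds data that has not yet been written back to the
 * origin, so find_next_bit() below rejects the resize if any dirty bit
 * is set at or beyond the new size.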
2961 */ 2962 if (cache->loaded_mappings) { 2963 new_size = to_cblock(find_next_bit(cache->dirty_bitset, 2964 from_cblock(cache->cache_size), 2965 from_cblock(new_size))); 2966 if (new_size != cache->cache_size) { 2967 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 2968 cache_device_name(cache), 2969 (unsigned long long) from_cblock(new_size)); 2970 return false; 2971 } 2972 } 2973 2974 return true; 2975 } 2976 2977 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 2978 { 2979 int r; 2980 2981 r = dm_cache_resize(cache->cmd, new_size); 2982 if (r) { 2983 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 2984 metadata_operation_failed(cache, "dm_cache_resize", r); 2985 return r; 2986 } 2987 2988 set_cache_size(cache, new_size); 2989 2990 return 0; 2991 } 2992 2993 static int truncate_oblocks(struct cache *cache) 2994 { 2995 uint32_t nr_blocks = from_cblock(cache->cache_size); 2996 uint32_t i; 2997 int r; 2998 2999 for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { 3000 r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); 3001 if (r) { 3002 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 3003 cache_device_name(cache)); 3004 return r; 3005 } 3006 } 3007 3008 return 0; 3009 } 3010 3011 static int cache_preresume(struct dm_target *ti) 3012 { 3013 int r = 0; 3014 struct cache *cache = ti->private; 3015 dm_cblock_t csize = get_cache_dev_size(cache); 3016 3017 if (!can_resume(cache)) 3018 return -EINVAL; 3019 3020 /* 3021 * Check to see if the cache has resized. 3022 */ 3023 if (!cache->sized || csize != cache->cache_size) { 3024 if (!can_resize(cache, csize)) 3025 return -EINVAL; 3026 3027 r = resize_cache_dev(cache, csize); 3028 if (r) 3029 return r; 3030 3031 cache->sized = true; 3032 } 3033 3034 if (!cache->loaded_mappings) { 3035 /* 3036 * The fast device could have been resized since the last 3037 * failed preresume attempt. To be safe we start by a blank 3038 * bitset for cache blocks. 3039 */ 3040 clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); 3041 3042 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3043 load_filtered_mapping, cache); 3044 if (r) { 3045 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3046 if (r != -EFBIG) 3047 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3048 return r; 3049 } 3050 3051 r = truncate_oblocks(cache); 3052 if (r) { 3053 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 3054 return r; 3055 } 3056 3057 cache->loaded_mappings = true; 3058 } 3059 3060 if (!cache->loaded_discards) { 3061 struct discard_load_info li; 3062 3063 /* 3064 * The discard bitset could have been resized, or the 3065 * discard block size changed. To be safe we start by 3066 * setting every dblock to not discarded. 
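 * load_discard() below rebuilds the bitset from the on-disk ranges,
 * converting from the metadata's discard block size to the current one.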
3067 */ 3068 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3069 3070 discard_load_info_init(cache, &li); 3071 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3072 if (r) { 3073 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3074 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3075 return r; 3076 } 3077 set_discard_range(&li); 3078 3079 cache->loaded_discards = true; 3080 } 3081 3082 return r; 3083 } 3084 3085 static void cache_resume(struct dm_target *ti) 3086 { 3087 struct cache *cache = ti->private; 3088 3089 cache->need_tick_bio = true; 3090 allow_background_work(cache); 3091 do_waker(&cache->waker.work); 3092 } 3093 3094 static void emit_flags(struct cache *cache, char *result, 3095 unsigned int maxlen, ssize_t *sz_ptr) 3096 { 3097 ssize_t sz = *sz_ptr; 3098 struct cache_features *cf = &cache->features; 3099 unsigned int count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3100 3101 DMEMIT("%u ", count); 3102 3103 if (cf->metadata_version == 2) 3104 DMEMIT("metadata2 "); 3105 3106 if (writethrough_mode(cache)) 3107 DMEMIT("writethrough "); 3108 3109 else if (passthrough_mode(cache)) 3110 DMEMIT("passthrough "); 3111 3112 else if (writeback_mode(cache)) 3113 DMEMIT("writeback "); 3114 3115 else { 3116 DMEMIT("unknown "); 3117 DMERR("%s: internal error: unknown io mode: %d", 3118 cache_device_name(cache), (int) cf->io_mode); 3119 } 3120 3121 if (!cf->discard_passdown) 3122 DMEMIT("no_discard_passdown "); 3123 3124 *sz_ptr = sz; 3125 } 3126 3127 /* 3128 * Status format: 3129 * 3130 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3131 * <cache block size> <#used cache blocks>/<#total cache blocks> 3132 * <#read hits> <#read misses> <#write hits> <#write misses> 3133 * <#demotions> <#promotions> <#dirty> 3134 * <#features> <features>* 3135 * <#core args> <core args> 3136 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3137 */ 3138 static void cache_status(struct dm_target *ti, status_type_t type, 3139 unsigned int status_flags, char *result, unsigned int maxlen) 3140 { 3141 int r = 0; 3142 unsigned int i; 3143 ssize_t sz = 0; 3144 dm_block_t nr_free_blocks_metadata = 0; 3145 dm_block_t nr_blocks_metadata = 0; 3146 char buf[BDEVNAME_SIZE]; 3147 struct cache *cache = ti->private; 3148 dm_cblock_t residency; 3149 bool needs_check; 3150 3151 switch (type) { 3152 case STATUSTYPE_INFO: 3153 if (get_cache_mode(cache) == CM_FAIL) { 3154 DMEMIT("Fail"); 3155 break; 3156 } 3157 3158 /* Commit to ensure statistics aren't out-of-date */ 3159 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3160 (void) commit(cache, false); 3161 3162 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3163 if (r) { 3164 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3165 cache_device_name(cache), r); 3166 goto err; 3167 } 3168 3169 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3170 if (r) { 3171 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3172 cache_device_name(cache), r); 3173 goto err; 3174 } 3175 3176 residency = policy_residency(cache->policy); 3177 3178 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3179 (unsigned int)DM_CACHE_METADATA_BLOCK_SIZE, 3180 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3181 (unsigned long long)nr_blocks_metadata, 3182 (unsigned long long)cache->sectors_per_block, 3183 (unsigned long long) 
from_cblock(residency), 3184 (unsigned long long) from_cblock(cache->cache_size), 3185 (unsigned int) atomic_read(&cache->stats.read_hit), 3186 (unsigned int) atomic_read(&cache->stats.read_miss), 3187 (unsigned int) atomic_read(&cache->stats.write_hit), 3188 (unsigned int) atomic_read(&cache->stats.write_miss), 3189 (unsigned int) atomic_read(&cache->stats.demotion), 3190 (unsigned int) atomic_read(&cache->stats.promotion), 3191 (unsigned long) atomic_read(&cache->nr_dirty)); 3192 3193 emit_flags(cache, result, maxlen, &sz); 3194 3195 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3196 3197 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3198 if (sz < maxlen) { 3199 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3200 if (r) 3201 DMERR("%s: policy_emit_config_values returned %d", 3202 cache_device_name(cache), r); 3203 } 3204 3205 if (get_cache_mode(cache) == CM_READ_ONLY) 3206 DMEMIT("ro "); 3207 else 3208 DMEMIT("rw "); 3209 3210 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3211 3212 if (r || needs_check) 3213 DMEMIT("needs_check "); 3214 else 3215 DMEMIT("- "); 3216 3217 break; 3218 3219 case STATUSTYPE_TABLE: 3220 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3221 DMEMIT("%s ", buf); 3222 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3223 DMEMIT("%s ", buf); 3224 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3225 DMEMIT("%s", buf); 3226 3227 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3228 DMEMIT(" %s", cache->ctr_args[i]); 3229 if (cache->nr_ctr_args) 3230 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3231 break; 3232 3233 case STATUSTYPE_IMA: 3234 DMEMIT_TARGET_NAME_VERSION(ti->type); 3235 if (get_cache_mode(cache) == CM_FAIL) 3236 DMEMIT(",metadata_mode=fail"); 3237 else if (get_cache_mode(cache) == CM_READ_ONLY) 3238 DMEMIT(",metadata_mode=ro"); 3239 else 3240 DMEMIT(",metadata_mode=rw"); 3241 3242 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3243 DMEMIT(",cache_metadata_device=%s", buf); 3244 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3245 DMEMIT(",cache_device=%s", buf); 3246 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3247 DMEMIT(",cache_origin_device=%s", buf); 3248 DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n'); 3249 DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n'); 3250 DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n'); 3251 DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n'); 3252 DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y'); 3253 DMEMIT(";"); 3254 break; 3255 } 3256 3257 return; 3258 3259 err: 3260 DMEMIT("Error"); 3261 } 3262 3263 /* 3264 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3265 * the one-past-the-end value. 3266 */ 3267 struct cblock_range { 3268 dm_cblock_t begin; 3269 dm_cblock_t end; 3270 }; 3271 3272 /* 3273 * A cache block range can take two forms: 3274 * 3275 * i) A single cblock, eg. '3456' 3276 * ii) A begin and end cblock with a dash between, eg. 123-234 3277 */ 3278 static int parse_cblock_range(struct cache *cache, const char *str, 3279 struct cblock_range *result) 3280 { 3281 char dummy; 3282 uint64_t b, e; 3283 int r; 3284 3285 /* 3286 * Try and parse form (ii) first. 
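 * sscanf() returns the number of fields converted, so r == 2 means we
 * matched '<begin>-<end>' with nothing trailing (a trailing character
 * would have been captured by the %c and bumped r to 3).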
3287 */
3288 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3289
3290 if (r == 2) {
3291 result->begin = to_cblock(b);
3292 result->end = to_cblock(e);
3293 return 0;
3294 }
3295
3296 /*
3297 * That didn't work, try form (i).
3298 */
3299 r = sscanf(str, "%llu%c", &b, &dummy);
3300
3301 if (r == 1) {
3302 result->begin = to_cblock(b);
3303 result->end = to_cblock(from_cblock(result->begin) + 1u);
3304 return 0;
3305 }
3306
3307 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3308 return -EINVAL;
3309 }
3310
3311 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3312 {
3313 uint64_t b = from_cblock(range->begin);
3314 uint64_t e = from_cblock(range->end);
3315 uint64_t n = from_cblock(cache->cache_size);
3316
3317 if (b >= n) {
3318 DMERR("%s: begin cblock out of range: %llu >= %llu",
3319 cache_device_name(cache), b, n);
3320 return -EINVAL;
3321 }
3322
3323 if (e > n) {
3324 DMERR("%s: end cblock out of range: %llu > %llu",
3325 cache_device_name(cache), e, n);
3326 return -EINVAL;
3327 }
3328
3329 if (b >= e) {
3330 DMERR("%s: invalid cblock range: %llu >= %llu",
3331 cache_device_name(cache), b, e);
3332 return -EINVAL;
3333 }
3334
3335 return 0;
3336 }
3337
3338 static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3339 {
3340 return to_cblock(from_cblock(b) + 1);
3341 }
3342
3343 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3344 {
3345 int r = 0;
3346
3347 /*
3348 * We don't need to do any locking here because we know we're in
3349 * passthrough mode. There is potential for a race between an
3350 * invalidation triggered by an io and an invalidation message. This
3351 * is harmless; we need not worry if the policy call fails.
3352 */
3353 while (range->begin != range->end) {
3354 r = invalidate_cblock(cache, range->begin);
3355 if (r)
3356 return r;
3357
3358 range->begin = cblock_succ(range->begin);
3359 }
3360
3361 cache->commit_requested = true;
3362 return r;
3363 }
3364
3365 static int process_invalidate_cblocks_message(struct cache *cache, unsigned int count,
3366 const char **cblock_ranges)
3367 {
3368 int r = 0;
3369 unsigned int i;
3370 struct cblock_range range;
3371
3372 if (!passthrough_mode(cache)) {
3373 DMERR("%s: cache has to be in passthrough mode for invalidation",
3374 cache_device_name(cache));
3375 return -EPERM;
3376 }
3377
3378 for (i = 0; i < count; i++) {
3379 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3380 if (r)
3381 break;
3382
3383 r = validate_cblock_range(cache, &range);
3384 if (r)
3385 break;
3386
3387 /*
3388 * Invalidate every cblock in this validated range.
3389 */
3390 r = request_invalidation(cache, &range);
3391 if (r)
3392 break;
3393 }
3394
3395 return r;
3396 }
3397
3398 /*
3399 * Supports
3400 * "<key> <value>"
3401 * and
3402 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3403 *
3404 * The key migration_threshold is supported by the cache target core.
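 *
 * E.g.
 *    dmsetup message <mapped-device> 0 migration_threshold 204800
 *    dmsetup message <mapped-device> 0 invalidate_cblocks 3456 7890-8900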
3405 */ 3406 static int cache_message(struct dm_target *ti, unsigned int argc, char **argv, 3407 char *result, unsigned int maxlen) 3408 { 3409 struct cache *cache = ti->private; 3410 3411 if (!argc) 3412 return -EINVAL; 3413 3414 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3415 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3416 cache_device_name(cache)); 3417 return -EOPNOTSUPP; 3418 } 3419 3420 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3421 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3422 3423 if (argc != 2) 3424 return -EINVAL; 3425 3426 return set_config_value(cache, argv[0], argv[1]); 3427 } 3428 3429 static int cache_iterate_devices(struct dm_target *ti, 3430 iterate_devices_callout_fn fn, void *data) 3431 { 3432 int r = 0; 3433 struct cache *cache = ti->private; 3434 3435 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3436 if (!r) 3437 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3438 3439 return r; 3440 } 3441 3442 /* 3443 * If discard_passdown was enabled verify that the origin device 3444 * supports discards. Disable discard_passdown if not. 3445 */ 3446 static void disable_passdown_if_not_supported(struct cache *cache) 3447 { 3448 struct block_device *origin_bdev = cache->origin_dev->bdev; 3449 struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3450 const char *reason = NULL; 3451 3452 if (!cache->features.discard_passdown) 3453 return; 3454 3455 if (!bdev_max_discard_sectors(origin_bdev)) 3456 reason = "discard unsupported"; 3457 3458 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3459 reason = "max discard sectors smaller than a block"; 3460 3461 if (reason) { 3462 DMWARN("Origin device (%pg) %s: Disabling discard passdown.", 3463 origin_bdev, reason); 3464 cache->features.discard_passdown = false; 3465 } 3466 } 3467 3468 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3469 { 3470 struct block_device *origin_bdev = cache->origin_dev->bdev; 3471 struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3472 3473 if (!cache->features.discard_passdown) { 3474 /* No passdown is done so setting own virtual limits */ 3475 limits->max_hw_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3476 cache->origin_sectors); 3477 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3478 return; 3479 } 3480 3481 /* 3482 * cache_iterate_devices() is stacking both origin and fast device limits 3483 * but discards aren't passed to fast device, so inherit origin's limits. 3484 */ 3485 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3486 limits->discard_granularity = origin_limits->discard_granularity; 3487 limits->discard_alignment = origin_limits->discard_alignment; 3488 } 3489 3490 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3491 { 3492 struct cache *cache = ti->private; 3493 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3494 3495 /* 3496 * If the system-determined stacked limits are compatible with the 3497 * cache's blocksize (io_opt is a factor) do not override them. 
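 * Otherwise advertise the cache block size as both io_min and io_opt so
 * that upper layers tend to issue block-aligned, block-sized I/O.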
3498 */ 3499 if (io_opt_sectors < cache->sectors_per_block || 3500 do_div(io_opt_sectors, cache->sectors_per_block)) { 3501 limits->io_min = cache->sectors_per_block << SECTOR_SHIFT; 3502 limits->io_opt = cache->sectors_per_block << SECTOR_SHIFT; 3503 } 3504 3505 disable_passdown_if_not_supported(cache); 3506 set_discard_limits(cache, limits); 3507 } 3508 3509 /*----------------------------------------------------------------*/ 3510 3511 static struct target_type cache_target = { 3512 .name = "cache", 3513 .version = {2, 3, 0}, 3514 .module = THIS_MODULE, 3515 .ctr = cache_ctr, 3516 .dtr = cache_dtr, 3517 .map = cache_map, 3518 .end_io = cache_end_io, 3519 .postsuspend = cache_postsuspend, 3520 .preresume = cache_preresume, 3521 .resume = cache_resume, 3522 .status = cache_status, 3523 .message = cache_message, 3524 .iterate_devices = cache_iterate_devices, 3525 .io_hints = cache_io_hints, 3526 }; 3527 3528 static int __init dm_cache_init(void) 3529 { 3530 int r; 3531 3532 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3533 if (!migration_cache) { 3534 r = -ENOMEM; 3535 goto err; 3536 } 3537 3538 btracker_work_cache = kmem_cache_create("dm_cache_bt_work", 3539 sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL); 3540 if (!btracker_work_cache) { 3541 r = -ENOMEM; 3542 goto err; 3543 } 3544 3545 r = dm_register_target(&cache_target); 3546 if (r) { 3547 goto err; 3548 } 3549 3550 return 0; 3551 3552 err: 3553 kmem_cache_destroy(migration_cache); 3554 kmem_cache_destroy(btracker_work_cache); 3555 return r; 3556 } 3557 3558 static void __exit dm_cache_exit(void) 3559 { 3560 dm_unregister_target(&cache_target); 3561 kmem_cache_destroy(migration_cache); 3562 kmem_cache_destroy(btracker_work_cache); 3563 } 3564 3565 module_init(dm_cache_init); 3566 module_exit(dm_cache_exit); 3567 3568 MODULE_DESCRIPTION(DM_NAME " cache target"); 3569 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3570 MODULE_LICENSE("GPL"); 3571