1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 Red Hat. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-bio-prison-v2.h" 10 #include "dm-bio-record.h" 11 #include "dm-cache-metadata.h" 12 #include "dm-io-tracker.h" 13 #include "dm-cache-background-tracker.h" 14 15 #include <linux/dm-io.h> 16 #include <linux/dm-kcopyd.h> 17 #include <linux/jiffies.h> 18 #include <linux/init.h> 19 #include <linux/mempool.h> 20 #include <linux/module.h> 21 #include <linux/rwsem.h> 22 #include <linux/slab.h> 23 #include <linux/vmalloc.h> 24 25 #define DM_MSG_PREFIX "cache" 26 27 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 28 "A percentage of time allocated for copying to and/or from cache"); 29 30 /*----------------------------------------------------------------*/ 31 32 /* 33 * Glossary: 34 * 35 * oblock: index of an origin block 36 * cblock: index of a cache block 37 * promotion: movement of a block from origin to cache 38 * demotion: movement of a block from cache to origin 39 * migration: movement of a block between the origin and cache device, 40 * either direction 41 */ 42 43 /*----------------------------------------------------------------*/ 44 45 /* 46 * Represents a chunk of future work. 'input' allows continuations to pass 47 * values between themselves, typically error values. 48 */ 49 struct continuation { 50 struct work_struct ws; 51 blk_status_t input; 52 }; 53 54 static inline void init_continuation(struct continuation *k, 55 void (*fn)(struct work_struct *)) 56 { 57 INIT_WORK(&k->ws, fn); 58 k->input = 0; 59 } 60 61 static inline void queue_continuation(struct workqueue_struct *wq, 62 struct continuation *k) 63 { 64 queue_work(wq, &k->ws); 65 } 66 67 /*----------------------------------------------------------------*/ 68 69 /* 70 * The batcher collects together pieces of work that need a particular 71 * operation to occur before they can proceed (typically a commit). 72 */ 73 struct batcher { 74 /* 75 * The operation that everyone is waiting for. 76 */ 77 blk_status_t (*commit_op)(void *context); 78 void *commit_context; 79 80 /* 81 * This is how bios should be issued once the commit op is complete 82 * (accounted_request). 83 */ 84 void (*issue_op)(struct bio *bio, void *context); 85 void *issue_context; 86 87 /* 88 * Queued work gets put on here after commit. 89 */ 90 struct workqueue_struct *wq; 91 92 spinlock_t lock; 93 struct list_head work_items; 94 struct bio_list bios; 95 struct work_struct commit_work; 96 97 bool commit_scheduled; 98 }; 99 100 static void __commit(struct work_struct *_ws) 101 { 102 struct batcher *b = container_of(_ws, struct batcher, commit_work); 103 blk_status_t r; 104 struct list_head work_items; 105 struct work_struct *ws, *tmp; 106 struct continuation *k; 107 struct bio *bio; 108 struct bio_list bios; 109 110 INIT_LIST_HEAD(&work_items); 111 bio_list_init(&bios); 112 113 /* 114 * We have to grab these before the commit_op to avoid a race 115 * condition. 
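 * Anything added to the lists after this point will be covered by a
 * later __commit() before it is issued.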
116 */ 117 spin_lock_irq(&b->lock); 118 list_splice_init(&b->work_items, &work_items); 119 bio_list_merge_init(&bios, &b->bios); 120 b->commit_scheduled = false; 121 spin_unlock_irq(&b->lock); 122 123 r = b->commit_op(b->commit_context); 124 125 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 126 k = container_of(ws, struct continuation, ws); 127 k->input = r; 128 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 129 queue_work(b->wq, ws); 130 } 131 132 while ((bio = bio_list_pop(&bios))) { 133 if (r) { 134 bio->bi_status = r; 135 bio_endio(bio); 136 } else 137 b->issue_op(bio, b->issue_context); 138 } 139 } 140 141 static void batcher_init(struct batcher *b, 142 blk_status_t (*commit_op)(void *), 143 void *commit_context, 144 void (*issue_op)(struct bio *bio, void *), 145 void *issue_context, 146 struct workqueue_struct *wq) 147 { 148 b->commit_op = commit_op; 149 b->commit_context = commit_context; 150 b->issue_op = issue_op; 151 b->issue_context = issue_context; 152 b->wq = wq; 153 154 spin_lock_init(&b->lock); 155 INIT_LIST_HEAD(&b->work_items); 156 bio_list_init(&b->bios); 157 INIT_WORK(&b->commit_work, __commit); 158 b->commit_scheduled = false; 159 } 160 161 static void async_commit(struct batcher *b) 162 { 163 queue_work(b->wq, &b->commit_work); 164 } 165 166 static void continue_after_commit(struct batcher *b, struct continuation *k) 167 { 168 bool commit_scheduled; 169 170 spin_lock_irq(&b->lock); 171 commit_scheduled = b->commit_scheduled; 172 list_add_tail(&k->ws.entry, &b->work_items); 173 spin_unlock_irq(&b->lock); 174 175 if (commit_scheduled) 176 async_commit(b); 177 } 178 179 /* 180 * Bios are errored if commit failed. 181 */ 182 static void issue_after_commit(struct batcher *b, struct bio *bio) 183 { 184 bool commit_scheduled; 185 186 spin_lock_irq(&b->lock); 187 commit_scheduled = b->commit_scheduled; 188 bio_list_add(&b->bios, bio); 189 spin_unlock_irq(&b->lock); 190 191 if (commit_scheduled) 192 async_commit(b); 193 } 194 195 /* 196 * Call this if some urgent work is waiting for the commit to complete. 197 */ 198 static void schedule_commit(struct batcher *b) 199 { 200 bool immediate; 201 202 spin_lock_irq(&b->lock); 203 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 204 b->commit_scheduled = true; 205 spin_unlock_irq(&b->lock); 206 207 if (immediate) 208 async_commit(b); 209 } 210 211 /* 212 * There are a couple of places where we let a bio run, but want to do some 213 * work before calling its endio function. We do this by temporarily 214 * changing the endio fn. 215 */ 216 struct dm_hook_info { 217 bio_end_io_t *bi_end_io; 218 }; 219 220 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 221 bio_end_io_t *bi_end_io, void *bi_private) 222 { 223 h->bi_end_io = bio->bi_end_io; 224 225 bio->bi_end_io = bi_end_io; 226 bio->bi_private = bi_private; 227 } 228 229 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 230 { 231 bio->bi_end_io = h->bi_end_io; 232 } 233 234 /*----------------------------------------------------------------*/ 235 236 #define MIGRATION_POOL_SIZE 128 237 #define COMMIT_PERIOD HZ 238 #define MIGRATION_COUNT_WINDOW 10 239 240 /* 241 * The block size of the device holding cache data must be 242 * between 32KB and 1GB. 
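 * The limits below are expressed in 512-byte sectors, e.g. a 256KB
 * cache block is 512 sectors.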
243 */ 244 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 245 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 246 247 enum cache_metadata_mode { 248 CM_WRITE, /* metadata may be changed */ 249 CM_READ_ONLY, /* metadata may not be changed */ 250 CM_FAIL 251 }; 252 253 enum cache_io_mode { 254 /* 255 * Data is written to cached blocks only. These blocks are marked 256 * dirty. If you lose the cache device you will lose data. 257 * Potential performance increase for both reads and writes. 258 */ 259 CM_IO_WRITEBACK, 260 261 /* 262 * Data is written to both cache and origin. Blocks are never 263 * dirty. Potential performance benfit for reads only. 264 */ 265 CM_IO_WRITETHROUGH, 266 267 /* 268 * A degraded mode useful for various cache coherency situations 269 * (eg, rolling back snapshots). Reads and writes always go to the 270 * origin. If a write goes to a cached oblock, then the cache 271 * block is invalidated. 272 */ 273 CM_IO_PASSTHROUGH 274 }; 275 276 struct cache_features { 277 enum cache_metadata_mode mode; 278 enum cache_io_mode io_mode; 279 unsigned int metadata_version; 280 bool discard_passdown:1; 281 }; 282 283 struct cache_stats { 284 atomic_t read_hit; 285 atomic_t read_miss; 286 atomic_t write_hit; 287 atomic_t write_miss; 288 atomic_t demotion; 289 atomic_t promotion; 290 atomic_t writeback; 291 atomic_t copies_avoided; 292 atomic_t cache_cell_clash; 293 atomic_t commit_count; 294 atomic_t discard_count; 295 }; 296 297 struct cache { 298 struct dm_target *ti; 299 spinlock_t lock; 300 301 /* 302 * Fields for converting from sectors to blocks. 303 */ 304 int sectors_per_block_shift; 305 sector_t sectors_per_block; 306 307 struct dm_cache_metadata *cmd; 308 309 /* 310 * Metadata is written to this device. 311 */ 312 struct dm_dev *metadata_dev; 313 314 /* 315 * The slower of the two data devices. Typically a spindle. 316 */ 317 struct dm_dev *origin_dev; 318 319 /* 320 * The faster of the two data devices. Typically an SSD. 321 */ 322 struct dm_dev *cache_dev; 323 324 /* 325 * Size of the origin device in _complete_ blocks and native sectors. 326 */ 327 dm_oblock_t origin_blocks; 328 sector_t origin_sectors; 329 330 /* 331 * Size of the cache device in blocks. 332 */ 333 dm_cblock_t cache_size; 334 335 /* 336 * Invalidation fields. 337 */ 338 spinlock_t invalidation_lock; 339 struct list_head invalidation_requests; 340 341 sector_t migration_threshold; 342 wait_queue_head_t migration_wait; 343 atomic_t nr_allocated_migrations; 344 345 /* 346 * The number of in flight migrations that are performing 347 * background io. eg, promotion, writeback. 348 */ 349 atomic_t nr_io_migrations; 350 351 struct bio_list deferred_bios; 352 353 struct rw_semaphore quiesce_lock; 354 355 /* 356 * origin_blocks entries, discarded if set. 357 */ 358 dm_dblock_t discard_nr_blocks; 359 unsigned long *discard_bitset; 360 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 361 362 /* 363 * Rather than reconstructing the table line for the status we just 364 * save it and regurgitate. 
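 * (The saved copies are freed again in destroy().)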
365 */ 366 unsigned int nr_ctr_args; 367 const char **ctr_args; 368 369 struct dm_kcopyd_client *copier; 370 struct work_struct deferred_bio_worker; 371 struct work_struct migration_worker; 372 struct workqueue_struct *wq; 373 struct delayed_work waker; 374 struct dm_bio_prison_v2 *prison; 375 376 /* 377 * cache_size entries, dirty if set 378 */ 379 unsigned long *dirty_bitset; 380 atomic_t nr_dirty; 381 382 unsigned int policy_nr_args; 383 struct dm_cache_policy *policy; 384 385 /* 386 * Cache features such as write-through. 387 */ 388 struct cache_features features; 389 390 struct cache_stats stats; 391 392 bool need_tick_bio:1; 393 bool sized:1; 394 bool invalidate:1; 395 bool commit_requested:1; 396 bool loaded_mappings:1; 397 bool loaded_discards:1; 398 399 struct rw_semaphore background_work_lock; 400 401 struct batcher committer; 402 struct work_struct commit_ws; 403 404 struct dm_io_tracker tracker; 405 406 mempool_t migration_pool; 407 408 struct bio_set bs; 409 }; 410 411 struct per_bio_data { 412 bool tick:1; 413 unsigned int req_nr:2; 414 struct dm_bio_prison_cell_v2 *cell; 415 struct dm_hook_info hook_info; 416 sector_t len; 417 }; 418 419 struct dm_cache_migration { 420 struct continuation k; 421 struct cache *cache; 422 423 struct policy_work *op; 424 struct bio *overwrite_bio; 425 struct dm_bio_prison_cell_v2 *cell; 426 427 dm_cblock_t invalidate_cblock; 428 dm_oblock_t invalidate_oblock; 429 }; 430 431 /*----------------------------------------------------------------*/ 432 433 static bool writethrough_mode(struct cache *cache) 434 { 435 return cache->features.io_mode == CM_IO_WRITETHROUGH; 436 } 437 438 static bool writeback_mode(struct cache *cache) 439 { 440 return cache->features.io_mode == CM_IO_WRITEBACK; 441 } 442 443 static inline bool passthrough_mode(struct cache *cache) 444 { 445 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 446 } 447 448 /*----------------------------------------------------------------*/ 449 450 static void wake_deferred_bio_worker(struct cache *cache) 451 { 452 queue_work(cache->wq, &cache->deferred_bio_worker); 453 } 454 455 static void wake_migration_worker(struct cache *cache) 456 { 457 if (passthrough_mode(cache)) 458 return; 459 460 queue_work(cache->wq, &cache->migration_worker); 461 } 462 463 /*----------------------------------------------------------------*/ 464 465 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 466 { 467 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 468 } 469 470 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 471 { 472 dm_bio_prison_free_cell_v2(cache->prison, cell); 473 } 474 475 static struct dm_cache_migration *alloc_migration(struct cache *cache) 476 { 477 struct dm_cache_migration *mg; 478 479 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 480 481 memset(mg, 0, sizeof(*mg)); 482 483 mg->cache = cache; 484 atomic_inc(&cache->nr_allocated_migrations); 485 486 return mg; 487 } 488 489 static void free_migration(struct dm_cache_migration *mg) 490 { 491 struct cache *cache = mg->cache; 492 493 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 494 wake_up(&cache->migration_wait); 495 496 mempool_free(mg, &cache->migration_pool); 497 } 498 499 /*----------------------------------------------------------------*/ 500 501 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 502 { 503 return to_oblock(from_oblock(b) + 1ull); 504 } 505 506 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct 
dm_cell_key_v2 *key) 507 { 508 key->virtual = 0; 509 key->dev = 0; 510 key->block_begin = from_oblock(begin); 511 key->block_end = from_oblock(end); 512 } 513 514 /* 515 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 516 * level 1 which prevents *both* READs and WRITEs. 517 */ 518 #define WRITE_LOCK_LEVEL 0 519 #define READ_WRITE_LOCK_LEVEL 1 520 521 static unsigned int lock_level(struct bio *bio) 522 { 523 return bio_data_dir(bio) == WRITE ? 524 WRITE_LOCK_LEVEL : 525 READ_WRITE_LOCK_LEVEL; 526 } 527 528 /* 529 *-------------------------------------------------------------- 530 * Per bio data 531 *-------------------------------------------------------------- 532 */ 533 534 static struct per_bio_data *get_per_bio_data(struct bio *bio) 535 { 536 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 537 538 BUG_ON(!pb); 539 return pb; 540 } 541 542 static struct per_bio_data *init_per_bio_data(struct bio *bio) 543 { 544 struct per_bio_data *pb = get_per_bio_data(bio); 545 546 pb->tick = false; 547 pb->req_nr = dm_bio_get_target_bio_nr(bio); 548 pb->cell = NULL; 549 pb->len = 0; 550 551 return pb; 552 } 553 554 /*----------------------------------------------------------------*/ 555 556 static void defer_bio(struct cache *cache, struct bio *bio) 557 { 558 spin_lock_irq(&cache->lock); 559 bio_list_add(&cache->deferred_bios, bio); 560 spin_unlock_irq(&cache->lock); 561 562 wake_deferred_bio_worker(cache); 563 } 564 565 static void defer_bios(struct cache *cache, struct bio_list *bios) 566 { 567 spin_lock_irq(&cache->lock); 568 bio_list_merge_init(&cache->deferred_bios, bios); 569 spin_unlock_irq(&cache->lock); 570 571 wake_deferred_bio_worker(cache); 572 } 573 574 /*----------------------------------------------------------------*/ 575 576 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 577 { 578 bool r; 579 struct per_bio_data *pb; 580 struct dm_cell_key_v2 key; 581 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 582 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 583 584 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 585 586 build_key(oblock, end, &key); 587 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 588 if (!r) { 589 /* 590 * Failed to get the lock. 591 */ 592 free_prison_cell(cache, cell_prealloc); 593 return r; 594 } 595 596 if (cell != cell_prealloc) 597 free_prison_cell(cache, cell_prealloc); 598 599 pb = get_per_bio_data(bio); 600 pb->cell = cell; 601 602 return r; 603 } 604 605 /*----------------------------------------------------------------*/ 606 607 static bool is_dirty(struct cache *cache, dm_cblock_t b) 608 { 609 return test_bit(from_cblock(b), cache->dirty_bitset); 610 } 611 612 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 613 { 614 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 615 atomic_inc(&cache->nr_dirty); 616 policy_set_dirty(cache->policy, cblock); 617 } 618 } 619 620 /* 621 * These two are called when setting after migrations to force the policy 622 * and dirty bitset to be in sync. 
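 * Unlike set_dirty() above, they inform the policy even if the bit was
 * already in the requested state.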
623 */ 624 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 625 { 626 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 627 atomic_inc(&cache->nr_dirty); 628 policy_set_dirty(cache->policy, cblock); 629 } 630 631 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 632 { 633 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 634 if (atomic_dec_return(&cache->nr_dirty) == 0) 635 dm_table_event(cache->ti->table); 636 } 637 638 policy_clear_dirty(cache->policy, cblock); 639 } 640 641 /*----------------------------------------------------------------*/ 642 643 static bool block_size_is_power_of_two(struct cache *cache) 644 { 645 return cache->sectors_per_block_shift >= 0; 646 } 647 648 static dm_block_t block_div(dm_block_t b, uint32_t n) 649 { 650 do_div(b, n); 651 652 return b; 653 } 654 655 static dm_block_t oblocks_per_dblock(struct cache *cache) 656 { 657 dm_block_t oblocks = cache->discard_block_size; 658 659 if (block_size_is_power_of_two(cache)) 660 oblocks >>= cache->sectors_per_block_shift; 661 else 662 oblocks = block_div(oblocks, cache->sectors_per_block); 663 664 return oblocks; 665 } 666 667 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 668 { 669 return to_dblock(block_div(from_oblock(oblock), 670 oblocks_per_dblock(cache))); 671 } 672 673 static void set_discard(struct cache *cache, dm_dblock_t b) 674 { 675 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 676 atomic_inc(&cache->stats.discard_count); 677 678 spin_lock_irq(&cache->lock); 679 set_bit(from_dblock(b), cache->discard_bitset); 680 spin_unlock_irq(&cache->lock); 681 } 682 683 static void clear_discard(struct cache *cache, dm_dblock_t b) 684 { 685 spin_lock_irq(&cache->lock); 686 clear_bit(from_dblock(b), cache->discard_bitset); 687 spin_unlock_irq(&cache->lock); 688 } 689 690 static bool is_discarded(struct cache *cache, dm_dblock_t b) 691 { 692 int r; 693 694 spin_lock_irq(&cache->lock); 695 r = test_bit(from_dblock(b), cache->discard_bitset); 696 spin_unlock_irq(&cache->lock); 697 698 return r; 699 } 700 701 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 702 { 703 int r; 704 705 spin_lock_irq(&cache->lock); 706 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 707 cache->discard_bitset); 708 spin_unlock_irq(&cache->lock); 709 710 return r; 711 } 712 713 /* 714 * ------------------------------------------------------------- 715 * Remapping 716 *-------------------------------------------------------------- 717 */ 718 static void remap_to_origin(struct cache *cache, struct bio *bio) 719 { 720 bio_set_dev(bio, cache->origin_dev->bdev); 721 } 722 723 static void remap_to_cache(struct cache *cache, struct bio *bio, 724 dm_cblock_t cblock) 725 { 726 sector_t bi_sector = bio->bi_iter.bi_sector; 727 sector_t block = from_cblock(cblock); 728 729 bio_set_dev(bio, cache->cache_dev->bdev); 730 if (!block_size_is_power_of_two(cache)) 731 bio->bi_iter.bi_sector = 732 (block * cache->sectors_per_block) + 733 sector_div(bi_sector, cache->sectors_per_block); 734 else 735 bio->bi_iter.bi_sector = 736 (block << cache->sectors_per_block_shift) | 737 (bi_sector & (cache->sectors_per_block - 1)); 738 } 739 740 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 741 { 742 struct per_bio_data *pb; 743 744 spin_lock_irq(&cache->lock); 745 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 746 bio_op(bio) != REQ_OP_DISCARD) { 747 pb = get_per_bio_data(bio); 748 pb->tick = 
true; 749 cache->need_tick_bio = false; 750 } 751 spin_unlock_irq(&cache->lock); 752 } 753 754 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 755 dm_oblock_t oblock) 756 { 757 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 758 check_if_tick_bio_needed(cache, bio); 759 remap_to_origin(cache, bio); 760 if (bio_data_dir(bio) == WRITE) 761 clear_discard(cache, oblock_to_dblock(cache, oblock)); 762 } 763 764 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 765 dm_oblock_t oblock, dm_cblock_t cblock) 766 { 767 check_if_tick_bio_needed(cache, bio); 768 remap_to_cache(cache, bio, cblock); 769 if (bio_data_dir(bio) == WRITE) { 770 set_dirty(cache, cblock); 771 clear_discard(cache, oblock_to_dblock(cache, oblock)); 772 } 773 } 774 775 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 776 { 777 sector_t block_nr = bio->bi_iter.bi_sector; 778 779 if (!block_size_is_power_of_two(cache)) 780 (void) sector_div(block_nr, cache->sectors_per_block); 781 else 782 block_nr >>= cache->sectors_per_block_shift; 783 784 return to_oblock(block_nr); 785 } 786 787 static bool accountable_bio(struct cache *cache, struct bio *bio) 788 { 789 return bio_op(bio) != REQ_OP_DISCARD; 790 } 791 792 static void accounted_begin(struct cache *cache, struct bio *bio) 793 { 794 struct per_bio_data *pb; 795 796 if (accountable_bio(cache, bio)) { 797 pb = get_per_bio_data(bio); 798 pb->len = bio_sectors(bio); 799 dm_iot_io_begin(&cache->tracker, pb->len); 800 } 801 } 802 803 static void accounted_complete(struct cache *cache, struct bio *bio) 804 { 805 struct per_bio_data *pb = get_per_bio_data(bio); 806 807 dm_iot_io_end(&cache->tracker, pb->len); 808 } 809 810 static void accounted_request(struct cache *cache, struct bio *bio) 811 { 812 accounted_begin(cache, bio); 813 dm_submit_bio_remap(bio, NULL); 814 } 815 816 static void issue_op(struct bio *bio, void *context) 817 { 818 struct cache *cache = context; 819 820 accounted_request(cache, bio); 821 } 822 823 /* 824 * When running in writethrough mode we need to send writes to clean blocks 825 * to both the cache and origin devices. Clone the bio and send them in parallel. 
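 * bio_chain() ensures the original bio does not complete until the
 * cloned origin bio has completed.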
826 */ 827 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 828 dm_oblock_t oblock, dm_cblock_t cblock) 829 { 830 struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio, 831 GFP_NOIO, &cache->bs); 832 833 BUG_ON(!origin_bio); 834 835 bio_chain(origin_bio, bio); 836 837 if (bio_data_dir(origin_bio) == WRITE) 838 clear_discard(cache, oblock_to_dblock(cache, oblock)); 839 submit_bio(origin_bio); 840 841 remap_to_cache(cache, bio, cblock); 842 } 843 844 /* 845 *-------------------------------------------------------------- 846 * Failure modes 847 *-------------------------------------------------------------- 848 */ 849 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 850 { 851 return cache->features.mode; 852 } 853 854 static const char *cache_device_name(struct cache *cache) 855 { 856 return dm_table_device_name(cache->ti->table); 857 } 858 859 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 860 { 861 static const char *descs[] = { 862 "write", 863 "read-only", 864 "fail" 865 }; 866 867 dm_table_event(cache->ti->table); 868 DMINFO("%s: switching cache to %s mode", 869 cache_device_name(cache), descs[(int)mode]); 870 } 871 872 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 873 { 874 bool needs_check; 875 enum cache_metadata_mode old_mode = get_cache_mode(cache); 876 877 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 878 DMERR("%s: unable to read needs_check flag, setting failure mode.", 879 cache_device_name(cache)); 880 new_mode = CM_FAIL; 881 } 882 883 if (new_mode == CM_WRITE && needs_check) { 884 DMERR("%s: unable to switch cache to write mode until repaired.", 885 cache_device_name(cache)); 886 if (old_mode != new_mode) 887 new_mode = old_mode; 888 else 889 new_mode = CM_READ_ONLY; 890 } 891 892 /* Never move out of fail mode */ 893 if (old_mode == CM_FAIL) 894 new_mode = CM_FAIL; 895 896 switch (new_mode) { 897 case CM_FAIL: 898 case CM_READ_ONLY: 899 dm_cache_metadata_set_read_only(cache->cmd); 900 break; 901 902 case CM_WRITE: 903 dm_cache_metadata_set_read_write(cache->cmd); 904 break; 905 } 906 907 cache->features.mode = new_mode; 908 909 if (new_mode != old_mode) 910 notify_mode_switch(cache, new_mode); 911 } 912 913 static void abort_transaction(struct cache *cache) 914 { 915 const char *dev_name = cache_device_name(cache); 916 917 if (get_cache_mode(cache) >= CM_READ_ONLY) 918 return; 919 920 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 921 if (dm_cache_metadata_abort(cache->cmd)) { 922 DMERR("%s: failed to abort metadata transaction", dev_name); 923 set_cache_mode(cache, CM_FAIL); 924 } 925 926 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 927 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 928 set_cache_mode(cache, CM_FAIL); 929 } 930 } 931 932 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 933 { 934 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 935 cache_device_name(cache), op, r); 936 abort_transaction(cache); 937 set_cache_mode(cache, CM_READ_ONLY); 938 } 939 940 /*----------------------------------------------------------------*/ 941 942 static void load_stats(struct cache *cache) 943 { 944 struct dm_cache_statistics stats; 945 946 dm_cache_metadata_get_stats(cache->cmd, &stats); 947 atomic_set(&cache->stats.read_hit, stats.read_hits); 948 atomic_set(&cache->stats.read_miss, stats.read_misses); 949 
atomic_set(&cache->stats.write_hit, stats.write_hits); 950 atomic_set(&cache->stats.write_miss, stats.write_misses); 951 } 952 953 static void save_stats(struct cache *cache) 954 { 955 struct dm_cache_statistics stats; 956 957 if (get_cache_mode(cache) >= CM_READ_ONLY) 958 return; 959 960 stats.read_hits = atomic_read(&cache->stats.read_hit); 961 stats.read_misses = atomic_read(&cache->stats.read_miss); 962 stats.write_hits = atomic_read(&cache->stats.write_hit); 963 stats.write_misses = atomic_read(&cache->stats.write_miss); 964 965 dm_cache_metadata_set_stats(cache->cmd, &stats); 966 } 967 968 static void update_stats(struct cache_stats *stats, enum policy_operation op) 969 { 970 switch (op) { 971 case POLICY_PROMOTE: 972 atomic_inc(&stats->promotion); 973 break; 974 975 case POLICY_DEMOTE: 976 atomic_inc(&stats->demotion); 977 break; 978 979 case POLICY_WRITEBACK: 980 atomic_inc(&stats->writeback); 981 break; 982 } 983 } 984 985 /* 986 *--------------------------------------------------------------------- 987 * Migration processing 988 * 989 * Migration covers moving data from the origin device to the cache, or 990 * vice versa. 991 *--------------------------------------------------------------------- 992 */ 993 static void inc_io_migrations(struct cache *cache) 994 { 995 atomic_inc(&cache->nr_io_migrations); 996 } 997 998 static void dec_io_migrations(struct cache *cache) 999 { 1000 atomic_dec(&cache->nr_io_migrations); 1001 } 1002 1003 static bool discard_or_flush(struct bio *bio) 1004 { 1005 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1006 } 1007 1008 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1009 dm_dblock_t *b, dm_dblock_t *e) 1010 { 1011 sector_t sb = bio->bi_iter.bi_sector; 1012 sector_t se = bio_end_sector(bio); 1013 1014 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1015 1016 if (se - sb < cache->discard_block_size) 1017 *e = *b; 1018 else 1019 *e = to_dblock(block_div(se, cache->discard_block_size)); 1020 } 1021 1022 /*----------------------------------------------------------------*/ 1023 1024 static void prevent_background_work(struct cache *cache) 1025 { 1026 lockdep_off(); 1027 down_write(&cache->background_work_lock); 1028 lockdep_on(); 1029 } 1030 1031 static void allow_background_work(struct cache *cache) 1032 { 1033 lockdep_off(); 1034 up_write(&cache->background_work_lock); 1035 lockdep_on(); 1036 } 1037 1038 static bool background_work_begin(struct cache *cache) 1039 { 1040 bool r; 1041 1042 lockdep_off(); 1043 r = down_read_trylock(&cache->background_work_lock); 1044 lockdep_on(); 1045 1046 return r; 1047 } 1048 1049 static void background_work_end(struct cache *cache) 1050 { 1051 lockdep_off(); 1052 up_read(&cache->background_work_lock); 1053 lockdep_on(); 1054 } 1055 1056 /*----------------------------------------------------------------*/ 1057 1058 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1059 { 1060 return (bio_data_dir(bio) == WRITE) && 1061 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1062 } 1063 1064 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1065 { 1066 return writeback_mode(cache) && 1067 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1068 } 1069 1070 static void quiesce(struct dm_cache_migration *mg, 1071 void (*continuation)(struct work_struct *)) 1072 { 1073 init_continuation(&mg->k, continuation); 1074 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, 
&mg->k.ws); 1075 } 1076 1077 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1078 { 1079 struct continuation *k = container_of(ws, struct continuation, ws); 1080 1081 return container_of(k, struct dm_cache_migration, k); 1082 } 1083 1084 static void copy_complete(int read_err, unsigned long write_err, void *context) 1085 { 1086 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1087 1088 if (read_err || write_err) 1089 mg->k.input = BLK_STS_IOERR; 1090 1091 queue_continuation(mg->cache->wq, &mg->k); 1092 } 1093 1094 static void copy(struct dm_cache_migration *mg, bool promote) 1095 { 1096 struct dm_io_region o_region, c_region; 1097 struct cache *cache = mg->cache; 1098 1099 o_region.bdev = cache->origin_dev->bdev; 1100 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1101 o_region.count = cache->sectors_per_block; 1102 1103 c_region.bdev = cache->cache_dev->bdev; 1104 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1105 c_region.count = cache->sectors_per_block; 1106 1107 if (promote) 1108 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1109 else 1110 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1111 } 1112 1113 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1114 { 1115 struct per_bio_data *pb = get_per_bio_data(bio); 1116 1117 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1118 free_prison_cell(cache, pb->cell); 1119 pb->cell = NULL; 1120 } 1121 1122 static void overwrite_endio(struct bio *bio) 1123 { 1124 struct dm_cache_migration *mg = bio->bi_private; 1125 struct cache *cache = mg->cache; 1126 struct per_bio_data *pb = get_per_bio_data(bio); 1127 1128 dm_unhook_bio(&pb->hook_info, bio); 1129 1130 if (bio->bi_status) 1131 mg->k.input = bio->bi_status; 1132 1133 queue_continuation(cache->wq, &mg->k); 1134 } 1135 1136 static void overwrite(struct dm_cache_migration *mg, 1137 void (*continuation)(struct work_struct *)) 1138 { 1139 struct bio *bio = mg->overwrite_bio; 1140 struct per_bio_data *pb = get_per_bio_data(bio); 1141 1142 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1143 1144 /* 1145 * The overwrite bio is part of the copy operation, as such it does 1146 * not set/clear discard or dirty flags. 
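 * mg_complete() brings the dirty bitset back into sync once the
 * migration has finished.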
1147 */ 1148 if (mg->op->op == POLICY_PROMOTE) 1149 remap_to_cache(mg->cache, bio, mg->op->cblock); 1150 else 1151 remap_to_origin(mg->cache, bio); 1152 1153 init_continuation(&mg->k, continuation); 1154 accounted_request(mg->cache, bio); 1155 } 1156 1157 /* 1158 * Migration steps: 1159 * 1160 * 1) exclusive lock preventing WRITEs 1161 * 2) quiesce 1162 * 3) copy or issue overwrite bio 1163 * 4) upgrade to exclusive lock preventing READs and WRITEs 1164 * 5) quiesce 1165 * 6) update metadata and commit 1166 * 7) unlock 1167 */ 1168 static void mg_complete(struct dm_cache_migration *mg, bool success) 1169 { 1170 struct bio_list bios; 1171 struct cache *cache = mg->cache; 1172 struct policy_work *op = mg->op; 1173 dm_cblock_t cblock = op->cblock; 1174 1175 if (success) 1176 update_stats(&cache->stats, op->op); 1177 1178 switch (op->op) { 1179 case POLICY_PROMOTE: 1180 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1181 policy_complete_background_work(cache->policy, op, success); 1182 1183 if (mg->overwrite_bio) { 1184 if (success) 1185 force_set_dirty(cache, cblock); 1186 else if (mg->k.input) 1187 mg->overwrite_bio->bi_status = mg->k.input; 1188 else 1189 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1190 bio_endio(mg->overwrite_bio); 1191 } else { 1192 if (success) 1193 force_clear_dirty(cache, cblock); 1194 dec_io_migrations(cache); 1195 } 1196 break; 1197 1198 case POLICY_DEMOTE: 1199 /* 1200 * We clear dirty here to update the nr_dirty counter. 1201 */ 1202 if (success) 1203 force_clear_dirty(cache, cblock); 1204 policy_complete_background_work(cache->policy, op, success); 1205 dec_io_migrations(cache); 1206 break; 1207 1208 case POLICY_WRITEBACK: 1209 if (success) 1210 force_clear_dirty(cache, cblock); 1211 policy_complete_background_work(cache->policy, op, success); 1212 dec_io_migrations(cache); 1213 break; 1214 } 1215 1216 bio_list_init(&bios); 1217 if (mg->cell) { 1218 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1219 free_prison_cell(cache, mg->cell); 1220 } 1221 1222 free_migration(mg); 1223 defer_bios(cache, &bios); 1224 wake_migration_worker(cache); 1225 1226 background_work_end(cache); 1227 } 1228 1229 static void mg_success(struct work_struct *ws) 1230 { 1231 struct dm_cache_migration *mg = ws_to_mg(ws); 1232 1233 mg_complete(mg, mg->k.input == 0); 1234 } 1235 1236 static void mg_update_metadata(struct work_struct *ws) 1237 { 1238 int r; 1239 struct dm_cache_migration *mg = ws_to_mg(ws); 1240 struct cache *cache = mg->cache; 1241 struct policy_work *op = mg->op; 1242 1243 switch (op->op) { 1244 case POLICY_PROMOTE: 1245 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1246 if (r) { 1247 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1248 cache_device_name(cache)); 1249 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1250 1251 mg_complete(mg, false); 1252 return; 1253 } 1254 mg_complete(mg, true); 1255 break; 1256 1257 case POLICY_DEMOTE: 1258 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1259 if (r) { 1260 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1261 cache_device_name(cache)); 1262 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1263 1264 mg_complete(mg, false); 1265 return; 1266 } 1267 1268 /* 1269 * It would be nice if we only had to commit when a REQ_FLUSH 1270 * comes through. 
But there's one scenario that we have to
		 * look out for:
		 *
		 * - oblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * rollback to having the data for oblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_full_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;
	bool is_policy_promote = (op->op == POLICY_PROMOTE);

	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
	    is_discarded_oblock(cache, op->oblock)) {
		mg_upgrade_lock(ws);
		return;
	}

	init_continuation(&mg->k, mg_upgrade_lock);
	copy(mg, is_policy_promote);
}

static void mg_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * No exclusive lock was held when we last checked if the bio
		 * was optimisable. So we have to check again in case things
		 * have changed (eg, the block may no longer be discarded).
		 */
		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
			/*
			 * Fall back to a real full copy after doing some tidying up.
			 */
			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);

			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
			mg->overwrite_bio = NULL;
			inc_io_migrations(mg->cache);
			mg_full_copy(ws);
			return;
		}

		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
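		 * The continuation goes straight to
		 * mg_update_metadata_after_copy() instead.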
1385 */ 1386 overwrite(mg, mg_update_metadata_after_copy); 1387 1388 } else 1389 mg_full_copy(ws); 1390 } 1391 1392 static int mg_lock_writes(struct dm_cache_migration *mg) 1393 { 1394 int r; 1395 struct dm_cell_key_v2 key; 1396 struct cache *cache = mg->cache; 1397 struct dm_bio_prison_cell_v2 *prealloc; 1398 1399 prealloc = alloc_prison_cell(cache); 1400 1401 /* 1402 * Prevent writes to the block, but allow reads to continue. 1403 * Unless we're using an overwrite bio, in which case we lock 1404 * everything. 1405 */ 1406 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1407 r = dm_cell_lock_v2(cache->prison, &key, 1408 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1409 prealloc, &mg->cell); 1410 if (r < 0) { 1411 free_prison_cell(cache, prealloc); 1412 mg_complete(mg, false); 1413 return r; 1414 } 1415 1416 if (mg->cell != prealloc) 1417 free_prison_cell(cache, prealloc); 1418 1419 if (r == 0) 1420 mg_copy(&mg->k.ws); 1421 else 1422 quiesce(mg, mg_copy); 1423 1424 return 0; 1425 } 1426 1427 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1428 { 1429 struct dm_cache_migration *mg; 1430 1431 if (!background_work_begin(cache)) { 1432 policy_complete_background_work(cache->policy, op, false); 1433 return -EPERM; 1434 } 1435 1436 mg = alloc_migration(cache); 1437 1438 mg->op = op; 1439 mg->overwrite_bio = bio; 1440 1441 if (!bio) 1442 inc_io_migrations(cache); 1443 1444 return mg_lock_writes(mg); 1445 } 1446 1447 /* 1448 *-------------------------------------------------------------- 1449 * invalidation processing 1450 *-------------------------------------------------------------- 1451 */ 1452 1453 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1454 { 1455 struct bio_list bios; 1456 struct cache *cache = mg->cache; 1457 1458 bio_list_init(&bios); 1459 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1460 free_prison_cell(cache, mg->cell); 1461 1462 if (!success && mg->overwrite_bio) 1463 bio_io_error(mg->overwrite_bio); 1464 1465 free_migration(mg); 1466 defer_bios(cache, &bios); 1467 1468 background_work_end(cache); 1469 } 1470 1471 static void invalidate_completed(struct work_struct *ws) 1472 { 1473 struct dm_cache_migration *mg = ws_to_mg(ws); 1474 1475 invalidate_complete(mg, !mg->k.input); 1476 } 1477 1478 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1479 { 1480 int r; 1481 1482 r = policy_invalidate_mapping(cache->policy, cblock); 1483 if (!r) { 1484 r = dm_cache_remove_mapping(cache->cmd, cblock); 1485 if (r) { 1486 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1487 cache_device_name(cache)); 1488 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1489 } 1490 1491 } else if (r == -ENODATA) { 1492 /* 1493 * Harmless, already unmapped. 
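 * The policy has no mapping for this cblock, so there is nothing to
 * remove from the metadata.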
1494 */ 1495 r = 0; 1496 1497 } else 1498 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1499 1500 return r; 1501 } 1502 1503 static void invalidate_remove(struct work_struct *ws) 1504 { 1505 int r; 1506 struct dm_cache_migration *mg = ws_to_mg(ws); 1507 struct cache *cache = mg->cache; 1508 1509 r = invalidate_cblock(cache, mg->invalidate_cblock); 1510 if (r) { 1511 invalidate_complete(mg, false); 1512 return; 1513 } 1514 1515 init_continuation(&mg->k, invalidate_completed); 1516 continue_after_commit(&cache->committer, &mg->k); 1517 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1518 mg->overwrite_bio = NULL; 1519 schedule_commit(&cache->committer); 1520 } 1521 1522 static int invalidate_lock(struct dm_cache_migration *mg) 1523 { 1524 int r; 1525 struct dm_cell_key_v2 key; 1526 struct cache *cache = mg->cache; 1527 struct dm_bio_prison_cell_v2 *prealloc; 1528 1529 prealloc = alloc_prison_cell(cache); 1530 1531 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1532 r = dm_cell_lock_v2(cache->prison, &key, 1533 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1534 if (r < 0) { 1535 free_prison_cell(cache, prealloc); 1536 invalidate_complete(mg, false); 1537 return r; 1538 } 1539 1540 if (mg->cell != prealloc) 1541 free_prison_cell(cache, prealloc); 1542 1543 if (r) 1544 quiesce(mg, invalidate_remove); 1545 1546 else { 1547 /* 1548 * We can't call invalidate_remove() directly here because we 1549 * might still be in request context. 1550 */ 1551 init_continuation(&mg->k, invalidate_remove); 1552 queue_work(cache->wq, &mg->k.ws); 1553 } 1554 1555 return 0; 1556 } 1557 1558 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1559 dm_oblock_t oblock, struct bio *bio) 1560 { 1561 struct dm_cache_migration *mg; 1562 1563 if (!background_work_begin(cache)) 1564 return -EPERM; 1565 1566 mg = alloc_migration(cache); 1567 1568 mg->overwrite_bio = bio; 1569 mg->invalidate_cblock = cblock; 1570 mg->invalidate_oblock = oblock; 1571 1572 return invalidate_lock(mg); 1573 } 1574 1575 /* 1576 *-------------------------------------------------------------- 1577 * bio processing 1578 *-------------------------------------------------------------- 1579 */ 1580 1581 enum busy { 1582 IDLE, 1583 BUSY 1584 }; 1585 1586 static enum busy spare_migration_bandwidth(struct cache *cache) 1587 { 1588 bool idle = dm_iot_idle_for(&cache->tracker, HZ); 1589 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1590 cache->sectors_per_block; 1591 1592 if (idle && current_volume <= cache->migration_threshold) 1593 return IDLE; 1594 else 1595 return BUSY; 1596 } 1597 1598 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1599 { 1600 atomic_inc(bio_data_dir(bio) == READ ? 1601 &cache->stats.read_hit : &cache->stats.write_hit); 1602 } 1603 1604 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1605 { 1606 atomic_inc(bio_data_dir(bio) == READ ? 1607 &cache->stats.read_miss : &cache->stats.write_miss); 1608 } 1609 1610 /*----------------------------------------------------------------*/ 1611 1612 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1613 bool *commit_needed) 1614 { 1615 int r, data_dir; 1616 bool rb, background_queued; 1617 dm_cblock_t cblock; 1618 1619 *commit_needed = false; 1620 1621 rb = bio_detain_shared(cache, block, bio); 1622 if (!rb) { 1623 /* 1624 * An exclusive lock is held for this block, so we have to 1625 * wait. 
We set the commit_needed flag so the current 1626 * transaction will be committed asap, allowing this lock 1627 * to be dropped. 1628 */ 1629 *commit_needed = true; 1630 return DM_MAPIO_SUBMITTED; 1631 } 1632 1633 data_dir = bio_data_dir(bio); 1634 1635 if (optimisable_bio(cache, bio, block)) { 1636 struct policy_work *op = NULL; 1637 1638 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1639 if (unlikely(r && r != -ENOENT)) { 1640 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1641 cache_device_name(cache), r); 1642 bio_io_error(bio); 1643 return DM_MAPIO_SUBMITTED; 1644 } 1645 1646 if (r == -ENOENT && op) { 1647 bio_drop_shared_lock(cache, bio); 1648 BUG_ON(op->op != POLICY_PROMOTE); 1649 mg_start(cache, op, bio); 1650 return DM_MAPIO_SUBMITTED; 1651 } 1652 } else { 1653 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1654 if (unlikely(r && r != -ENOENT)) { 1655 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1656 cache_device_name(cache), r); 1657 bio_io_error(bio); 1658 return DM_MAPIO_SUBMITTED; 1659 } 1660 1661 if (background_queued) 1662 wake_migration_worker(cache); 1663 } 1664 1665 if (r == -ENOENT) { 1666 struct per_bio_data *pb = get_per_bio_data(bio); 1667 1668 /* 1669 * Miss. 1670 */ 1671 inc_miss_counter(cache, bio); 1672 if (pb->req_nr == 0) { 1673 accounted_begin(cache, bio); 1674 remap_to_origin_clear_discard(cache, bio, block); 1675 } else { 1676 /* 1677 * This is a duplicate writethrough io that is no 1678 * longer needed because the block has been demoted. 1679 */ 1680 bio_endio(bio); 1681 return DM_MAPIO_SUBMITTED; 1682 } 1683 } else { 1684 /* 1685 * Hit. 1686 */ 1687 inc_hit_counter(cache, bio); 1688 1689 /* 1690 * Passthrough always maps to the origin, invalidating any 1691 * cache blocks that are written to. 1692 */ 1693 if (passthrough_mode(cache)) { 1694 if (bio_data_dir(bio) == WRITE) { 1695 bio_drop_shared_lock(cache, bio); 1696 atomic_inc(&cache->stats.demotion); 1697 invalidate_start(cache, cblock, block, bio); 1698 } else 1699 remap_to_origin_clear_discard(cache, bio, block); 1700 } else { 1701 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1702 !is_dirty(cache, cblock)) { 1703 remap_to_origin_and_cache(cache, bio, block, cblock); 1704 accounted_begin(cache, bio); 1705 } else 1706 remap_to_cache_dirty(cache, bio, block, cblock); 1707 } 1708 } 1709 1710 /* 1711 * dm core turns FUA requests into a separate payload and FLUSH req. 1712 */ 1713 if (bio->bi_opf & REQ_FUA) { 1714 /* 1715 * issue_after_commit will call accounted_begin a second time. So 1716 * we call accounted_complete() to avoid double accounting. 1717 */ 1718 accounted_complete(cache, bio); 1719 issue_after_commit(&cache->committer, bio); 1720 *commit_needed = true; 1721 return DM_MAPIO_SUBMITTED; 1722 } 1723 1724 return DM_MAPIO_REMAPPED; 1725 } 1726 1727 static bool process_bio(struct cache *cache, struct bio *bio) 1728 { 1729 bool commit_needed; 1730 1731 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1732 dm_submit_bio_remap(bio, NULL); 1733 1734 return commit_needed; 1735 } 1736 1737 /* 1738 * A non-zero return indicates read_only or fail_io mode. 
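 * Committing is only attempted while the cache is in CM_WRITE mode.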
1739 */ 1740 static int commit(struct cache *cache, bool clean_shutdown) 1741 { 1742 int r; 1743 1744 if (get_cache_mode(cache) >= CM_READ_ONLY) 1745 return -EINVAL; 1746 1747 atomic_inc(&cache->stats.commit_count); 1748 r = dm_cache_commit(cache->cmd, clean_shutdown); 1749 if (r) 1750 metadata_operation_failed(cache, "dm_cache_commit", r); 1751 1752 return r; 1753 } 1754 1755 /* 1756 * Used by the batcher. 1757 */ 1758 static blk_status_t commit_op(void *context) 1759 { 1760 struct cache *cache = context; 1761 1762 if (dm_cache_changed_this_transaction(cache->cmd)) 1763 return errno_to_blk_status(commit(cache, false)); 1764 1765 return 0; 1766 } 1767 1768 /*----------------------------------------------------------------*/ 1769 1770 static bool process_flush_bio(struct cache *cache, struct bio *bio) 1771 { 1772 struct per_bio_data *pb = get_per_bio_data(bio); 1773 1774 if (!pb->req_nr) 1775 remap_to_origin(cache, bio); 1776 else 1777 remap_to_cache(cache, bio, 0); 1778 1779 issue_after_commit(&cache->committer, bio); 1780 return true; 1781 } 1782 1783 static bool process_discard_bio(struct cache *cache, struct bio *bio) 1784 { 1785 dm_dblock_t b, e; 1786 1787 /* 1788 * FIXME: do we need to lock the region? Or can we just assume the 1789 * user wont be so foolish as to issue discard concurrently with 1790 * other IO? 1791 */ 1792 calc_discard_block_range(cache, bio, &b, &e); 1793 while (b != e) { 1794 set_discard(cache, b); 1795 b = to_dblock(from_dblock(b) + 1); 1796 } 1797 1798 if (cache->features.discard_passdown) { 1799 remap_to_origin(cache, bio); 1800 dm_submit_bio_remap(bio, NULL); 1801 } else 1802 bio_endio(bio); 1803 1804 return false; 1805 } 1806 1807 static void process_deferred_bios(struct work_struct *ws) 1808 { 1809 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1810 1811 bool commit_needed = false; 1812 struct bio_list bios; 1813 struct bio *bio; 1814 1815 bio_list_init(&bios); 1816 1817 spin_lock_irq(&cache->lock); 1818 bio_list_merge_init(&bios, &cache->deferred_bios); 1819 spin_unlock_irq(&cache->lock); 1820 1821 while ((bio = bio_list_pop(&bios))) { 1822 if (bio->bi_opf & REQ_PREFLUSH) 1823 commit_needed = process_flush_bio(cache, bio) || commit_needed; 1824 1825 else if (bio_op(bio) == REQ_OP_DISCARD) 1826 commit_needed = process_discard_bio(cache, bio) || commit_needed; 1827 1828 else 1829 commit_needed = process_bio(cache, bio) || commit_needed; 1830 cond_resched(); 1831 } 1832 1833 if (commit_needed) 1834 schedule_commit(&cache->committer); 1835 } 1836 1837 /* 1838 *-------------------------------------------------------------- 1839 * Main worker loop 1840 *-------------------------------------------------------------- 1841 */ 1842 static void requeue_deferred_bios(struct cache *cache) 1843 { 1844 struct bio *bio; 1845 struct bio_list bios; 1846 1847 bio_list_init(&bios); 1848 bio_list_merge_init(&bios, &cache->deferred_bios); 1849 1850 while ((bio = bio_list_pop(&bios))) { 1851 bio->bi_status = BLK_STS_DM_REQUEUE; 1852 bio_endio(bio); 1853 cond_resched(); 1854 } 1855 } 1856 1857 /* 1858 * We want to commit periodically so that not too much 1859 * unwritten metadata builds up. 
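 * do_waker() also ticks the policy and re-arms itself every COMMIT_PERIOD.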
1860 */ 1861 static void do_waker(struct work_struct *ws) 1862 { 1863 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1864 1865 policy_tick(cache->policy, true); 1866 wake_migration_worker(cache); 1867 schedule_commit(&cache->committer); 1868 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1869 } 1870 1871 static void check_migrations(struct work_struct *ws) 1872 { 1873 int r; 1874 struct policy_work *op; 1875 struct cache *cache = container_of(ws, struct cache, migration_worker); 1876 enum busy b; 1877 1878 for (;;) { 1879 b = spare_migration_bandwidth(cache); 1880 1881 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1882 if (r == -ENODATA) 1883 break; 1884 1885 if (r) { 1886 DMERR_LIMIT("%s: policy_background_work failed", 1887 cache_device_name(cache)); 1888 break; 1889 } 1890 1891 r = mg_start(cache, op, NULL); 1892 if (r) 1893 break; 1894 1895 cond_resched(); 1896 } 1897 } 1898 1899 /* 1900 *-------------------------------------------------------------- 1901 * Target methods 1902 *-------------------------------------------------------------- 1903 */ 1904 1905 /* 1906 * This function gets called on the error paths of the constructor, so we 1907 * have to cope with a partially initialised struct. 1908 */ 1909 static void __destroy(struct cache *cache) 1910 { 1911 mempool_exit(&cache->migration_pool); 1912 1913 if (cache->prison) 1914 dm_bio_prison_destroy_v2(cache->prison); 1915 1916 if (cache->wq) 1917 destroy_workqueue(cache->wq); 1918 1919 if (cache->dirty_bitset) 1920 free_bitset(cache->dirty_bitset); 1921 1922 if (cache->discard_bitset) 1923 free_bitset(cache->discard_bitset); 1924 1925 if (cache->copier) 1926 dm_kcopyd_client_destroy(cache->copier); 1927 1928 if (cache->cmd) 1929 dm_cache_metadata_close(cache->cmd); 1930 1931 if (cache->metadata_dev) 1932 dm_put_device(cache->ti, cache->metadata_dev); 1933 1934 if (cache->origin_dev) 1935 dm_put_device(cache->ti, cache->origin_dev); 1936 1937 if (cache->cache_dev) 1938 dm_put_device(cache->ti, cache->cache_dev); 1939 1940 if (cache->policy) 1941 dm_cache_policy_destroy(cache->policy); 1942 1943 bioset_exit(&cache->bs); 1944 1945 kfree(cache); 1946 } 1947 1948 static void destroy(struct cache *cache) 1949 { 1950 unsigned int i; 1951 1952 cancel_delayed_work_sync(&cache->waker); 1953 1954 for (i = 0; i < cache->nr_ctr_args ; i++) 1955 kfree(cache->ctr_args[i]); 1956 kfree(cache->ctr_args); 1957 1958 __destroy(cache); 1959 } 1960 1961 static void cache_dtr(struct dm_target *ti) 1962 { 1963 struct cache *cache = ti->private; 1964 1965 destroy(cache); 1966 } 1967 1968 static sector_t get_dev_size(struct dm_dev *dev) 1969 { 1970 return bdev_nr_sectors(dev->bdev); 1971 } 1972 1973 /*----------------------------------------------------------------*/ 1974 1975 /* 1976 * Construct a cache device mapping. 1977 * 1978 * cache <metadata dev> <cache dev> <origin dev> <block size> 1979 * <#feature args> [<feature arg>]* 1980 * <policy> <#policy args> [<policy arg>]* 1981 * 1982 * metadata dev : fast device holding the persistent metadata 1983 * cache dev : fast device holding cached data blocks 1984 * origin dev : slow device holding original data blocks 1985 * block size : cache unit size in sectors 1986 * 1987 * #feature args : number of feature arguments passed 1988 * feature args : writethrough. (The default is writeback.) 
1989 * 1990 * policy : the replacement policy to use 1991 * #policy args : an even number of policy arguments corresponding 1992 * to key/value pairs passed to the policy 1993 * policy args : key/value pairs passed to the policy 1994 * E.g. 'sequential_threshold 1024' 1995 * See cache-policies.txt for details. 1996 * 1997 * Optional feature arguments are: 1998 * writethrough : write through caching that prohibits cache block 1999 * content from being different from origin block content. 2000 * Without this argument, the default behaviour is to write 2001 * back cache block contents later for performance reasons, 2002 * so they may differ from the corresponding origin blocks. 2003 */ 2004 struct cache_args { 2005 struct dm_target *ti; 2006 2007 struct dm_dev *metadata_dev; 2008 2009 struct dm_dev *cache_dev; 2010 sector_t cache_sectors; 2011 2012 struct dm_dev *origin_dev; 2013 2014 uint32_t block_size; 2015 2016 const char *policy_name; 2017 int policy_argc; 2018 const char **policy_argv; 2019 2020 struct cache_features features; 2021 }; 2022 2023 static void destroy_cache_args(struct cache_args *ca) 2024 { 2025 if (ca->metadata_dev) 2026 dm_put_device(ca->ti, ca->metadata_dev); 2027 2028 if (ca->cache_dev) 2029 dm_put_device(ca->ti, ca->cache_dev); 2030 2031 if (ca->origin_dev) 2032 dm_put_device(ca->ti, ca->origin_dev); 2033 2034 kfree(ca); 2035 } 2036 2037 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2038 { 2039 if (!as->argc) { 2040 *error = "Insufficient args"; 2041 return false; 2042 } 2043 2044 return true; 2045 } 2046 2047 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2048 char **error) 2049 { 2050 int r; 2051 sector_t metadata_dev_size; 2052 2053 if (!at_least_one_arg(as, error)) 2054 return -EINVAL; 2055 2056 r = dm_get_device(ca->ti, dm_shift_arg(as), 2057 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev); 2058 if (r) { 2059 *error = "Error opening metadata device"; 2060 return r; 2061 } 2062 2063 metadata_dev_size = get_dev_size(ca->metadata_dev); 2064 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2065 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", 2066 ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS); 2067 2068 return 0; 2069 } 2070 2071 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2072 char **error) 2073 { 2074 int r; 2075 2076 if (!at_least_one_arg(as, error)) 2077 return -EINVAL; 2078 2079 r = dm_get_device(ca->ti, dm_shift_arg(as), 2080 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev); 2081 if (r) { 2082 *error = "Error opening cache device"; 2083 return r; 2084 } 2085 ca->cache_sectors = get_dev_size(ca->cache_dev); 2086 2087 return 0; 2088 } 2089 2090 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2091 char **error) 2092 { 2093 sector_t origin_sectors; 2094 int r; 2095 2096 if (!at_least_one_arg(as, error)) 2097 return -EINVAL; 2098 2099 r = dm_get_device(ca->ti, dm_shift_arg(as), 2100 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev); 2101 if (r) { 2102 *error = "Error opening origin device"; 2103 return r; 2104 } 2105 2106 origin_sectors = get_dev_size(ca->origin_dev); 2107 if (ca->ti->len > origin_sectors) { 2108 *error = "Device size larger than cached device"; 2109 return -EINVAL; 2110 } 2111 2112 return 0; 2113 } 2114 2115 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2116 char **error) 2117 { 2118 unsigned long block_size; 2119 2120 if (!at_least_one_arg(as, error)) 2121 
return -EINVAL; 2122 2123 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2124 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2125 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2126 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2127 *error = "Invalid data block size"; 2128 return -EINVAL; 2129 } 2130 2131 if (block_size > ca->cache_sectors) { 2132 *error = "Data block size is larger than the cache device"; 2133 return -EINVAL; 2134 } 2135 2136 ca->block_size = block_size; 2137 2138 return 0; 2139 } 2140 2141 static void init_features(struct cache_features *cf) 2142 { 2143 cf->mode = CM_WRITE; 2144 cf->io_mode = CM_IO_WRITEBACK; 2145 cf->metadata_version = 1; 2146 cf->discard_passdown = true; 2147 } 2148 2149 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2150 char **error) 2151 { 2152 static const struct dm_arg _args[] = { 2153 {0, 3, "Invalid number of cache feature arguments"}, 2154 }; 2155 2156 int r, mode_ctr = 0; 2157 unsigned int argc; 2158 const char *arg; 2159 struct cache_features *cf = &ca->features; 2160 2161 init_features(cf); 2162 2163 r = dm_read_arg_group(_args, as, &argc, error); 2164 if (r) 2165 return -EINVAL; 2166 2167 while (argc--) { 2168 arg = dm_shift_arg(as); 2169 2170 if (!strcasecmp(arg, "writeback")) { 2171 cf->io_mode = CM_IO_WRITEBACK; 2172 mode_ctr++; 2173 } 2174 2175 else if (!strcasecmp(arg, "writethrough")) { 2176 cf->io_mode = CM_IO_WRITETHROUGH; 2177 mode_ctr++; 2178 } 2179 2180 else if (!strcasecmp(arg, "passthrough")) { 2181 cf->io_mode = CM_IO_PASSTHROUGH; 2182 mode_ctr++; 2183 } 2184 2185 else if (!strcasecmp(arg, "metadata2")) 2186 cf->metadata_version = 2; 2187 2188 else if (!strcasecmp(arg, "no_discard_passdown")) 2189 cf->discard_passdown = false; 2190 2191 else { 2192 *error = "Unrecognised cache feature requested"; 2193 return -EINVAL; 2194 } 2195 } 2196 2197 if (mode_ctr > 1) { 2198 *error = "Duplicate cache io_mode features requested"; 2199 return -EINVAL; 2200 } 2201 2202 return 0; 2203 } 2204 2205 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2206 char **error) 2207 { 2208 static const struct dm_arg _args[] = { 2209 {0, 1024, "Invalid number of policy arguments"}, 2210 }; 2211 2212 int r; 2213 2214 if (!at_least_one_arg(as, error)) 2215 return -EINVAL; 2216 2217 ca->policy_name = dm_shift_arg(as); 2218 2219 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2220 if (r) 2221 return -EINVAL; 2222 2223 ca->policy_argv = (const char **)as->argv; 2224 dm_consume_args(as, ca->policy_argc); 2225 2226 return 0; 2227 } 2228 2229 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2230 char **error) 2231 { 2232 int r; 2233 struct dm_arg_set as; 2234 2235 as.argc = argc; 2236 as.argv = argv; 2237 2238 r = parse_metadata_dev(ca, &as, error); 2239 if (r) 2240 return r; 2241 2242 r = parse_cache_dev(ca, &as, error); 2243 if (r) 2244 return r; 2245 2246 r = parse_origin_dev(ca, &as, error); 2247 if (r) 2248 return r; 2249 2250 r = parse_block_size(ca, &as, error); 2251 if (r) 2252 return r; 2253 2254 r = parse_features(ca, &as, error); 2255 if (r) 2256 return r; 2257 2258 r = parse_policy(ca, &as, error); 2259 if (r) 2260 return r; 2261 2262 return 0; 2263 } 2264 2265 /*----------------------------------------------------------------*/ 2266 2267 static struct kmem_cache *migration_cache = NULL; 2268 2269 #define NOT_CORE_OPTION 1 2270 2271 static int process_config_option(struct cache *cache, const char *key, const char *value) 2272 { 2273 unsigned 
long tmp; 2274 2275 if (!strcasecmp(key, "migration_threshold")) { 2276 if (kstrtoul(value, 10, &tmp)) 2277 return -EINVAL; 2278 2279 cache->migration_threshold = tmp; 2280 return 0; 2281 } 2282 2283 return NOT_CORE_OPTION; 2284 } 2285 2286 static int set_config_value(struct cache *cache, const char *key, const char *value) 2287 { 2288 int r = process_config_option(cache, key, value); 2289 2290 if (r == NOT_CORE_OPTION) 2291 r = policy_set_config_value(cache->policy, key, value); 2292 2293 if (r) 2294 DMWARN("bad config value for %s: %s", key, value); 2295 2296 return r; 2297 } 2298 2299 static int set_config_values(struct cache *cache, int argc, const char **argv) 2300 { 2301 int r = 0; 2302 2303 if (argc & 1) { 2304 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2305 return -EINVAL; 2306 } 2307 2308 while (argc) { 2309 r = set_config_value(cache, argv[0], argv[1]); 2310 if (r) 2311 break; 2312 2313 argc -= 2; 2314 argv += 2; 2315 } 2316 2317 return r; 2318 } 2319 2320 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2321 char **error) 2322 { 2323 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2324 cache->cache_size, 2325 cache->origin_sectors, 2326 cache->sectors_per_block); 2327 if (IS_ERR(p)) { 2328 *error = "Error creating cache's policy"; 2329 return PTR_ERR(p); 2330 } 2331 cache->policy = p; 2332 BUG_ON(!cache->policy); 2333 2334 return 0; 2335 } 2336 2337 /* 2338 * We want the discard block size to be at least the size of the cache 2339 * block size and have no more than 2^14 discard blocks across the origin. 2340 */ 2341 #define MAX_DISCARD_BLOCKS (1 << 14) 2342 2343 static bool too_many_discard_blocks(sector_t discard_block_size, 2344 sector_t origin_size) 2345 { 2346 (void) sector_div(origin_size, discard_block_size); 2347 2348 return origin_size > MAX_DISCARD_BLOCKS; 2349 } 2350 2351 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2352 sector_t origin_size) 2353 { 2354 sector_t discard_block_size = cache_block_size; 2355 2356 if (origin_size) 2357 while (too_many_discard_blocks(discard_block_size, origin_size)) 2358 discard_block_size *= 2; 2359 2360 return discard_block_size; 2361 } 2362 2363 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2364 { 2365 dm_block_t nr_blocks = from_cblock(size); 2366 2367 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2368 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2369 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2370 "Please consider increasing the cache block size to reduce the overall cache block count.", 2371 (unsigned long long) nr_blocks); 2372 2373 cache->cache_size = size; 2374 } 2375 2376 #define DEFAULT_MIGRATION_THRESHOLD 2048 2377 2378 static int cache_create(struct cache_args *ca, struct cache **result) 2379 { 2380 int r = 0; 2381 char **error = &ca->ti->error; 2382 struct cache *cache; 2383 struct dm_target *ti = ca->ti; 2384 dm_block_t origin_blocks; 2385 struct dm_cache_metadata *cmd; 2386 bool may_format = ca->features.mode == CM_WRITE; 2387 2388 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2389 if (!cache) 2390 return -ENOMEM; 2391 2392 cache->ti = ca->ti; 2393 ti->private = cache; 2394 ti->accounts_remapped_io = true; 2395 ti->num_flush_bios = 2; 2396 ti->flush_supported = true; 2397 2398 ti->num_discard_bios = 1; 2399 ti->discards_supported = true; 2400 2401 ti->per_io_data_size = 
sizeof(struct per_bio_data); 2402 2403 cache->features = ca->features; 2404 if (writethrough_mode(cache)) { 2405 /* Create bioset for writethrough bios issued to origin */ 2406 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2407 if (r) 2408 goto bad; 2409 } 2410 2411 cache->metadata_dev = ca->metadata_dev; 2412 cache->origin_dev = ca->origin_dev; 2413 cache->cache_dev = ca->cache_dev; 2414 2415 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2416 2417 origin_blocks = cache->origin_sectors = ti->len; 2418 origin_blocks = block_div(origin_blocks, ca->block_size); 2419 cache->origin_blocks = to_oblock(origin_blocks); 2420 2421 cache->sectors_per_block = ca->block_size; 2422 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2423 r = -EINVAL; 2424 goto bad; 2425 } 2426 2427 if (ca->block_size & (ca->block_size - 1)) { 2428 dm_block_t cache_size = ca->cache_sectors; 2429 2430 cache->sectors_per_block_shift = -1; 2431 cache_size = block_div(cache_size, ca->block_size); 2432 set_cache_size(cache, to_cblock(cache_size)); 2433 } else { 2434 cache->sectors_per_block_shift = __ffs(ca->block_size); 2435 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2436 } 2437 2438 r = create_cache_policy(cache, ca, error); 2439 if (r) 2440 goto bad; 2441 2442 cache->policy_nr_args = ca->policy_argc; 2443 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2444 2445 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2446 if (r) { 2447 *error = "Error setting cache policy's config values"; 2448 goto bad; 2449 } 2450 2451 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2452 ca->block_size, may_format, 2453 dm_cache_policy_get_hint_size(cache->policy), 2454 ca->features.metadata_version); 2455 if (IS_ERR(cmd)) { 2456 *error = "Error creating metadata object"; 2457 r = PTR_ERR(cmd); 2458 goto bad; 2459 } 2460 cache->cmd = cmd; 2461 set_cache_mode(cache, CM_WRITE); 2462 if (get_cache_mode(cache) != CM_WRITE) { 2463 *error = "Unable to get write access to metadata, please check/repair metadata."; 2464 r = -EINVAL; 2465 goto bad; 2466 } 2467 2468 if (passthrough_mode(cache)) { 2469 bool all_clean; 2470 2471 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2472 if (r) { 2473 *error = "dm_cache_metadata_all_clean() failed"; 2474 goto bad; 2475 } 2476 2477 if (!all_clean) { 2478 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2479 r = -EINVAL; 2480 goto bad; 2481 } 2482 2483 policy_allow_migrations(cache->policy, false); 2484 } 2485 2486 spin_lock_init(&cache->lock); 2487 bio_list_init(&cache->deferred_bios); 2488 atomic_set(&cache->nr_allocated_migrations, 0); 2489 atomic_set(&cache->nr_io_migrations, 0); 2490 init_waitqueue_head(&cache->migration_wait); 2491 2492 r = -ENOMEM; 2493 atomic_set(&cache->nr_dirty, 0); 2494 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2495 if (!cache->dirty_bitset) { 2496 *error = "could not allocate dirty bitset"; 2497 goto bad; 2498 } 2499 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2500 2501 cache->discard_block_size = 2502 calculate_discard_block_size(cache->sectors_per_block, 2503 cache->origin_sectors); 2504 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2505 cache->discard_block_size)); 2506 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2507 if (!cache->discard_bitset) { 2508 *error = "could not allocate discard bitset"; 2509 goto bad; 2510 } 2511 
clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2512 2513 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2514 if (IS_ERR(cache->copier)) { 2515 *error = "could not create kcopyd client"; 2516 r = PTR_ERR(cache->copier); 2517 goto bad; 2518 } 2519 2520 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2521 if (!cache->wq) { 2522 *error = "could not create workqueue for metadata object"; 2523 goto bad; 2524 } 2525 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2526 INIT_WORK(&cache->migration_worker, check_migrations); 2527 INIT_DELAYED_WORK(&cache->waker, do_waker); 2528 2529 cache->prison = dm_bio_prison_create_v2(cache->wq); 2530 if (!cache->prison) { 2531 *error = "could not create bio prison"; 2532 goto bad; 2533 } 2534 2535 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2536 migration_cache); 2537 if (r) { 2538 *error = "Error creating cache's migration mempool"; 2539 goto bad; 2540 } 2541 2542 cache->need_tick_bio = true; 2543 cache->sized = false; 2544 cache->invalidate = false; 2545 cache->commit_requested = false; 2546 cache->loaded_mappings = false; 2547 cache->loaded_discards = false; 2548 2549 load_stats(cache); 2550 2551 atomic_set(&cache->stats.demotion, 0); 2552 atomic_set(&cache->stats.promotion, 0); 2553 atomic_set(&cache->stats.copies_avoided, 0); 2554 atomic_set(&cache->stats.cache_cell_clash, 0); 2555 atomic_set(&cache->stats.commit_count, 0); 2556 atomic_set(&cache->stats.discard_count, 0); 2557 2558 spin_lock_init(&cache->invalidation_lock); 2559 INIT_LIST_HEAD(&cache->invalidation_requests); 2560 2561 batcher_init(&cache->committer, commit_op, cache, 2562 issue_op, cache, cache->wq); 2563 dm_iot_init(&cache->tracker); 2564 2565 init_rwsem(&cache->background_work_lock); 2566 prevent_background_work(cache); 2567 2568 *result = cache; 2569 return 0; 2570 bad: 2571 __destroy(cache); 2572 return r; 2573 } 2574 2575 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2576 { 2577 unsigned int i; 2578 const char **copy; 2579 2580 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2581 if (!copy) 2582 return -ENOMEM; 2583 for (i = 0; i < argc; i++) { 2584 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2585 if (!copy[i]) { 2586 while (i--) 2587 kfree(copy[i]); 2588 kfree(copy); 2589 return -ENOMEM; 2590 } 2591 } 2592 2593 cache->nr_ctr_args = argc; 2594 cache->ctr_args = copy; 2595 2596 return 0; 2597 } 2598 2599 static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) 2600 { 2601 int r = -EINVAL; 2602 struct cache_args *ca; 2603 struct cache *cache = NULL; 2604 2605 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2606 if (!ca) { 2607 ti->error = "Error allocating memory for cache"; 2608 return -ENOMEM; 2609 } 2610 ca->ti = ti; 2611 2612 r = parse_cache_args(ca, argc, argv, &ti->error); 2613 if (r) 2614 goto out; 2615 2616 r = cache_create(ca, &cache); 2617 if (r) 2618 goto out; 2619 2620 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2621 if (r) { 2622 __destroy(cache); 2623 goto out; 2624 } 2625 2626 ti->private = cache; 2627 out: 2628 destroy_cache_args(ca); 2629 return r; 2630 } 2631 2632 /*----------------------------------------------------------------*/ 2633 2634 static int cache_map(struct dm_target *ti, struct bio *bio) 2635 { 2636 struct cache *cache = ti->private; 2637 2638 int r; 2639 bool commit_needed; 2640 dm_oblock_t block = get_bio_block(cache, bio); 2641 2642 init_per_bio_data(bio); 2643 if (unlikely(from_oblock(block) >= 
from_oblock(cache->origin_blocks))) { 2644 /* 2645 * This can only occur if the io goes to a partial block at 2646 * the end of the origin device. We don't cache these. 2647 * Just remap to the origin and carry on. 2648 */ 2649 remap_to_origin(cache, bio); 2650 accounted_begin(cache, bio); 2651 return DM_MAPIO_REMAPPED; 2652 } 2653 2654 if (discard_or_flush(bio)) { 2655 defer_bio(cache, bio); 2656 return DM_MAPIO_SUBMITTED; 2657 } 2658 2659 r = map_bio(cache, bio, block, &commit_needed); 2660 if (commit_needed) 2661 schedule_commit(&cache->committer); 2662 2663 return r; 2664 } 2665 2666 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2667 { 2668 struct cache *cache = ti->private; 2669 unsigned long flags; 2670 struct per_bio_data *pb = get_per_bio_data(bio); 2671 2672 if (pb->tick) { 2673 policy_tick(cache->policy, false); 2674 2675 spin_lock_irqsave(&cache->lock, flags); 2676 cache->need_tick_bio = true; 2677 spin_unlock_irqrestore(&cache->lock, flags); 2678 } 2679 2680 bio_drop_shared_lock(cache, bio); 2681 accounted_complete(cache, bio); 2682 2683 return DM_ENDIO_DONE; 2684 } 2685 2686 static int write_dirty_bitset(struct cache *cache) 2687 { 2688 int r; 2689 2690 if (get_cache_mode(cache) >= CM_READ_ONLY) 2691 return -EINVAL; 2692 2693 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2694 if (r) 2695 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2696 2697 return r; 2698 } 2699 2700 static int write_discard_bitset(struct cache *cache) 2701 { 2702 unsigned int i, r; 2703 2704 if (get_cache_mode(cache) >= CM_READ_ONLY) 2705 return -EINVAL; 2706 2707 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2708 cache->discard_nr_blocks); 2709 if (r) { 2710 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2711 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2712 return r; 2713 } 2714 2715 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2716 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2717 is_discarded(cache, to_dblock(i))); 2718 if (r) { 2719 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2720 return r; 2721 } 2722 } 2723 2724 return 0; 2725 } 2726 2727 static int write_hints(struct cache *cache) 2728 { 2729 int r; 2730 2731 if (get_cache_mode(cache) >= CM_READ_ONLY) 2732 return -EINVAL; 2733 2734 r = dm_cache_write_hints(cache->cmd, cache->policy); 2735 if (r) { 2736 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2737 return r; 2738 } 2739 2740 return 0; 2741 } 2742 2743 /* 2744 * returns true on success 2745 */ 2746 static bool sync_metadata(struct cache *cache) 2747 { 2748 int r1, r2, r3, r4; 2749 2750 r1 = write_dirty_bitset(cache); 2751 if (r1) 2752 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2753 2754 r2 = write_discard_bitset(cache); 2755 if (r2) 2756 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2757 2758 save_stats(cache); 2759 2760 r3 = write_hints(cache); 2761 if (r3) 2762 DMERR("%s: could not write hints", cache_device_name(cache)); 2763 2764 /* 2765 * If writing the above metadata failed, we still commit, but don't 2766 * set the clean shutdown flag. This will effectively force every 2767 * dirty bit to be set on reload. 
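 * That is the conservative outcome: spuriously-set dirty bits only cause
 * extra writebacks on the next activation; they cannot cause data loss.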
2768 */ 2769 r4 = commit(cache, !r1 && !r2 && !r3); 2770 if (r4) 2771 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2772 2773 return !r1 && !r2 && !r3 && !r4; 2774 } 2775 2776 static void cache_postsuspend(struct dm_target *ti) 2777 { 2778 struct cache *cache = ti->private; 2779 2780 prevent_background_work(cache); 2781 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2782 2783 cancel_delayed_work_sync(&cache->waker); 2784 drain_workqueue(cache->wq); 2785 WARN_ON(cache->tracker.in_flight); 2786 2787 /* 2788 * If it's a flush suspend there won't be any deferred bios, so this 2789 * call is harmless. 2790 */ 2791 requeue_deferred_bios(cache); 2792 2793 if (get_cache_mode(cache) == CM_WRITE) 2794 (void) sync_metadata(cache); 2795 } 2796 2797 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2798 bool dirty, uint32_t hint, bool hint_valid) 2799 { 2800 struct cache *cache = context; 2801 2802 if (dirty) { 2803 set_bit(from_cblock(cblock), cache->dirty_bitset); 2804 atomic_inc(&cache->nr_dirty); 2805 } else 2806 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2807 2808 return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2809 } 2810 2811 /* 2812 * The discard block size in the on disk metadata is not 2813 * necessarily the same as we're currently using. So we have to 2814 * be careful to only set the discarded attribute if we know it 2815 * covers a complete block of the new size. 2816 */ 2817 struct discard_load_info { 2818 struct cache *cache; 2819 2820 /* 2821 * These blocks are sized using the on disk dblock size, rather 2822 * than the current one. 2823 */ 2824 dm_block_t block_size; 2825 dm_block_t discard_begin, discard_end; 2826 }; 2827 2828 static void discard_load_info_init(struct cache *cache, 2829 struct discard_load_info *li) 2830 { 2831 li->cache = cache; 2832 li->discard_begin = li->discard_end = 0; 2833 } 2834 2835 static void set_discard_range(struct discard_load_info *li) 2836 { 2837 sector_t b, e; 2838 2839 if (li->discard_begin == li->discard_end) 2840 return; 2841 2842 /* 2843 * Convert to sectors. 2844 */ 2845 b = li->discard_begin * li->block_size; 2846 e = li->discard_end * li->block_size; 2847 2848 /* 2849 * Then convert back to the current dblock size. 2850 */ 2851 b = dm_sector_div_up(b, li->cache->discard_block_size); 2852 sector_div(e, li->cache->discard_block_size); 2853 2854 /* 2855 * The origin may have shrunk, so we need to check we're still in 2856 * bounds. 2857 */ 2858 if (e > from_dblock(li->cache->discard_nr_blocks)) 2859 e = from_dblock(li->cache->discard_nr_blocks); 2860 2861 for (; b < e; b++) 2862 set_discard(li->cache, to_dblock(b)); 2863 } 2864 2865 static int load_discard(void *context, sector_t discard_block_size, 2866 dm_dblock_t dblock, bool discard) 2867 { 2868 struct discard_load_info *li = context; 2869 2870 li->block_size = discard_block_size; 2871 2872 if (discard) { 2873 if (from_dblock(dblock) == li->discard_end) 2874 /* 2875 * We're already in a discard range, just extend it. 2876 */ 2877 li->discard_end = li->discard_end + 1ULL; 2878 2879 else { 2880 /* 2881 * Emit the old range and start a new one. 
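 * (begin/end are still in the on-disk dblock size; set_discard_range()
 * converts them to the current discard block size.)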
2882 */ 2883 set_discard_range(li); 2884 li->discard_begin = from_dblock(dblock); 2885 li->discard_end = li->discard_begin + 1ULL; 2886 } 2887 } else { 2888 set_discard_range(li); 2889 li->discard_begin = li->discard_end = 0; 2890 } 2891 2892 return 0; 2893 } 2894 2895 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2896 { 2897 sector_t size = get_dev_size(cache->cache_dev); 2898 (void) sector_div(size, cache->sectors_per_block); 2899 return to_cblock(size); 2900 } 2901 2902 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2903 { 2904 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { 2905 DMERR("%s: unable to extend cache due to missing cache table reload", 2906 cache_device_name(cache)); 2907 return false; 2908 } 2909 2910 /* 2911 * We can't drop a dirty block when shrinking the cache. 2912 */ 2913 if (cache->loaded_mappings) { 2914 new_size = to_cblock(find_next_bit(cache->dirty_bitset, 2915 from_cblock(cache->cache_size), 2916 from_cblock(new_size))); 2917 if (new_size != cache->cache_size) { 2918 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 2919 cache_device_name(cache), 2920 (unsigned long long) from_cblock(new_size)); 2921 return false; 2922 } 2923 } 2924 2925 return true; 2926 } 2927 2928 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 2929 { 2930 int r; 2931 2932 r = dm_cache_resize(cache->cmd, new_size); 2933 if (r) { 2934 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 2935 metadata_operation_failed(cache, "dm_cache_resize", r); 2936 return r; 2937 } 2938 2939 set_cache_size(cache, new_size); 2940 2941 return 0; 2942 } 2943 2944 static int cache_preresume(struct dm_target *ti) 2945 { 2946 int r = 0; 2947 struct cache *cache = ti->private; 2948 dm_cblock_t csize = get_cache_dev_size(cache); 2949 2950 /* 2951 * Check to see if the cache has resized. 2952 */ 2953 if (!cache->sized || csize != cache->cache_size) { 2954 if (!can_resize(cache, csize)) 2955 return -EINVAL; 2956 2957 r = resize_cache_dev(cache, csize); 2958 if (r) 2959 return r; 2960 2961 cache->sized = true; 2962 } 2963 2964 if (!cache->loaded_mappings) { 2965 r = dm_cache_load_mappings(cache->cmd, cache->policy, 2966 load_mapping, cache); 2967 if (r) { 2968 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 2969 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 2970 return r; 2971 } 2972 2973 cache->loaded_mappings = true; 2974 } 2975 2976 if (!cache->loaded_discards) { 2977 struct discard_load_info li; 2978 2979 /* 2980 * The discard bitset could have been resized, or the 2981 * discard block size changed. To be safe we start by 2982 * setting every dblock to not discarded. 
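 * dm_cache_load_discards() below then re-populates the bitset, translating
 * each on-disk range to the current discard block size via
 * struct discard_load_info.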
2983 */ 2984 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2985 2986 discard_load_info_init(cache, &li); 2987 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 2988 if (r) { 2989 DMERR("%s: could not load origin discards", cache_device_name(cache)); 2990 metadata_operation_failed(cache, "dm_cache_load_discards", r); 2991 return r; 2992 } 2993 set_discard_range(&li); 2994 2995 cache->loaded_discards = true; 2996 } 2997 2998 return r; 2999 } 3000 3001 static void cache_resume(struct dm_target *ti) 3002 { 3003 struct cache *cache = ti->private; 3004 3005 cache->need_tick_bio = true; 3006 allow_background_work(cache); 3007 do_waker(&cache->waker.work); 3008 } 3009 3010 static void emit_flags(struct cache *cache, char *result, 3011 unsigned int maxlen, ssize_t *sz_ptr) 3012 { 3013 ssize_t sz = *sz_ptr; 3014 struct cache_features *cf = &cache->features; 3015 unsigned int count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3016 3017 DMEMIT("%u ", count); 3018 3019 if (cf->metadata_version == 2) 3020 DMEMIT("metadata2 "); 3021 3022 if (writethrough_mode(cache)) 3023 DMEMIT("writethrough "); 3024 3025 else if (passthrough_mode(cache)) 3026 DMEMIT("passthrough "); 3027 3028 else if (writeback_mode(cache)) 3029 DMEMIT("writeback "); 3030 3031 else { 3032 DMEMIT("unknown "); 3033 DMERR("%s: internal error: unknown io mode: %d", 3034 cache_device_name(cache), (int) cf->io_mode); 3035 } 3036 3037 if (!cf->discard_passdown) 3038 DMEMIT("no_discard_passdown "); 3039 3040 *sz_ptr = sz; 3041 } 3042 3043 /* 3044 * Status format: 3045 * 3046 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3047 * <cache block size> <#used cache blocks>/<#total cache blocks> 3048 * <#read hits> <#read misses> <#write hits> <#write misses> 3049 * <#demotions> <#promotions> <#dirty> 3050 * <#features> <features>* 3051 * <#core args> <core args> 3052 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3053 */ 3054 static void cache_status(struct dm_target *ti, status_type_t type, 3055 unsigned int status_flags, char *result, unsigned int maxlen) 3056 { 3057 int r = 0; 3058 unsigned int i; 3059 ssize_t sz = 0; 3060 dm_block_t nr_free_blocks_metadata = 0; 3061 dm_block_t nr_blocks_metadata = 0; 3062 char buf[BDEVNAME_SIZE]; 3063 struct cache *cache = ti->private; 3064 dm_cblock_t residency; 3065 bool needs_check; 3066 3067 switch (type) { 3068 case STATUSTYPE_INFO: 3069 if (get_cache_mode(cache) == CM_FAIL) { 3070 DMEMIT("Fail"); 3071 break; 3072 } 3073 3074 /* Commit to ensure statistics aren't out-of-date */ 3075 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3076 (void) commit(cache, false); 3077 3078 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3079 if (r) { 3080 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3081 cache_device_name(cache), r); 3082 goto err; 3083 } 3084 3085 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3086 if (r) { 3087 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3088 cache_device_name(cache), r); 3089 goto err; 3090 } 3091 3092 residency = policy_residency(cache->policy); 3093 3094 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3095 (unsigned int)DM_CACHE_METADATA_BLOCK_SIZE, 3096 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3097 (unsigned long long)nr_blocks_metadata, 3098 (unsigned long long)cache->sectors_per_block, 3099 (unsigned long long) 
from_cblock(residency), 3100 (unsigned long long) from_cblock(cache->cache_size), 3101 (unsigned int) atomic_read(&cache->stats.read_hit), 3102 (unsigned int) atomic_read(&cache->stats.read_miss), 3103 (unsigned int) atomic_read(&cache->stats.write_hit), 3104 (unsigned int) atomic_read(&cache->stats.write_miss), 3105 (unsigned int) atomic_read(&cache->stats.demotion), 3106 (unsigned int) atomic_read(&cache->stats.promotion), 3107 (unsigned long) atomic_read(&cache->nr_dirty)); 3108 3109 emit_flags(cache, result, maxlen, &sz); 3110 3111 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3112 3113 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3114 if (sz < maxlen) { 3115 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3116 if (r) 3117 DMERR("%s: policy_emit_config_values returned %d", 3118 cache_device_name(cache), r); 3119 } 3120 3121 if (get_cache_mode(cache) == CM_READ_ONLY) 3122 DMEMIT("ro "); 3123 else 3124 DMEMIT("rw "); 3125 3126 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3127 3128 if (r || needs_check) 3129 DMEMIT("needs_check "); 3130 else 3131 DMEMIT("- "); 3132 3133 break; 3134 3135 case STATUSTYPE_TABLE: 3136 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3137 DMEMIT("%s ", buf); 3138 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3139 DMEMIT("%s ", buf); 3140 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3141 DMEMIT("%s", buf); 3142 3143 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3144 DMEMIT(" %s", cache->ctr_args[i]); 3145 if (cache->nr_ctr_args) 3146 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3147 break; 3148 3149 case STATUSTYPE_IMA: 3150 DMEMIT_TARGET_NAME_VERSION(ti->type); 3151 if (get_cache_mode(cache) == CM_FAIL) 3152 DMEMIT(",metadata_mode=fail"); 3153 else if (get_cache_mode(cache) == CM_READ_ONLY) 3154 DMEMIT(",metadata_mode=ro"); 3155 else 3156 DMEMIT(",metadata_mode=rw"); 3157 3158 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3159 DMEMIT(",cache_metadata_device=%s", buf); 3160 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3161 DMEMIT(",cache_device=%s", buf); 3162 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3163 DMEMIT(",cache_origin_device=%s", buf); 3164 DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n'); 3165 DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n'); 3166 DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n'); 3167 DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n'); 3168 DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y'); 3169 DMEMIT(";"); 3170 break; 3171 } 3172 3173 return; 3174 3175 err: 3176 DMEMIT("Error"); 3177 } 3178 3179 /* 3180 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3181 * the one-past-the-end value. 3182 */ 3183 struct cblock_range { 3184 dm_cblock_t begin; 3185 dm_cblock_t end; 3186 }; 3187 3188 /* 3189 * A cache block range can take two forms: 3190 * 3191 * i) A single cblock, eg. '3456' 3192 * ii) A begin and end cblock with a dash between, eg. 123-234 3193 */ 3194 static int parse_cblock_range(struct cache *cache, const char *str, 3195 struct cblock_range *result) 3196 { 3197 char dummy; 3198 uint64_t b, e; 3199 int r; 3200 3201 /* 3202 * Try and parse form (ii) first. 
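 * Note the range is half-open: '123-234' covers cblocks 123..233
 * (end is one-past-the-end, as in struct cblock_range).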
3203 */ 3204 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3205 3206 if (r == 2) { 3207 result->begin = to_cblock(b); 3208 result->end = to_cblock(e); 3209 return 0; 3210 } 3211 3212 /* 3213 * That didn't work, try form (i). 3214 */ 3215 r = sscanf(str, "%llu%c", &b, &dummy); 3216 3217 if (r == 1) { 3218 result->begin = to_cblock(b); 3219 result->end = to_cblock(from_cblock(result->begin) + 1u); 3220 return 0; 3221 } 3222 3223 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3224 return -EINVAL; 3225 } 3226
3227 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3228 { 3229 uint64_t b = from_cblock(range->begin); 3230 uint64_t e = from_cblock(range->end); 3231 uint64_t n = from_cblock(cache->cache_size); 3232 3233 if (b >= n) { 3234 DMERR("%s: begin cblock out of range: %llu >= %llu", 3235 cache_device_name(cache), b, n); 3236 return -EINVAL; 3237 } 3238 3239 if (e > n) { 3240 DMERR("%s: end cblock out of range: %llu > %llu", 3241 cache_device_name(cache), e, n); 3242 return -EINVAL; 3243 } 3244 3245 if (b >= e) { 3246 DMERR("%s: invalid cblock range: %llu >= %llu", 3247 cache_device_name(cache), b, e); 3248 return -EINVAL; 3249 } 3250 3251 return 0; 3252 } 3253
3254 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3255 { 3256 return to_cblock(from_cblock(b) + 1); 3257 } 3258
3259 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3260 { 3261 int r = 0; 3262 3263 /* 3264 * We don't need to do any locking here because we know we're in 3265 * passthrough mode. There is potential for a race between an 3266 * invalidation triggered by an io and an invalidation message. This 3267 * is harmless, we need not worry if the policy call fails. 3268 */ 3269 while (range->begin != range->end) { 3270 r = invalidate_cblock(cache, range->begin); 3271 if (r) 3272 return r; 3273 3274 range->begin = cblock_succ(range->begin); 3275 } 3276 3277 cache->commit_requested = true; 3278 return r; 3279 } 3280
3281 static int process_invalidate_cblocks_message(struct cache *cache, unsigned int count, 3282 const char **cblock_ranges) 3283 { 3284 int r = 0; 3285 unsigned int i; 3286 struct cblock_range range; 3287 3288 if (!passthrough_mode(cache)) { 3289 DMERR("%s: cache has to be in passthrough mode for invalidation", 3290 cache_device_name(cache)); 3291 return -EPERM; 3292 } 3293 3294 for (i = 0; i < count; i++) { 3295 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3296 if (r) 3297 break; 3298 3299 r = validate_cblock_range(cache, &range); 3300 if (r) 3301 break; 3302 3303 /* 3304 * Invalidate each cblock in the range, one at a time. 3305 */ 3306 r = request_invalidation(cache, &range); 3307 if (r) 3308 break; 3309 } 3310 3311 return r; 3312 } 3313
3314 /* 3315 * Supports 3316 * "<key> <value>" 3317 * and 3318 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*" 3319 * 3320 * The key migration_threshold is supported by the cache target core.
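 *
 * E.g. (hypothetical dm device name 'my-cache'; invalidation additionally
 * requires the cache to be in passthrough mode):
 *
 *   dmsetup message my-cache 0 migration_threshold 20480
 *   dmsetup message my-cache 0 invalidate_cblocks 0 500-2000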
3321 */ 3322 static int cache_message(struct dm_target *ti, unsigned int argc, char **argv, 3323 char *result, unsigned int maxlen) 3324 { 3325 struct cache *cache = ti->private; 3326 3327 if (!argc) 3328 return -EINVAL; 3329 3330 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3331 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3332 cache_device_name(cache)); 3333 return -EOPNOTSUPP; 3334 } 3335 3336 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3337 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3338 3339 if (argc != 2) 3340 return -EINVAL; 3341 3342 return set_config_value(cache, argv[0], argv[1]); 3343 } 3344 3345 static int cache_iterate_devices(struct dm_target *ti, 3346 iterate_devices_callout_fn fn, void *data) 3347 { 3348 int r = 0; 3349 struct cache *cache = ti->private; 3350 3351 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3352 if (!r) 3353 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3354 3355 return r; 3356 } 3357 3358 /* 3359 * If discard_passdown was enabled verify that the origin device 3360 * supports discards. Disable discard_passdown if not. 3361 */ 3362 static void disable_passdown_if_not_supported(struct cache *cache) 3363 { 3364 struct block_device *origin_bdev = cache->origin_dev->bdev; 3365 struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3366 const char *reason = NULL; 3367 3368 if (!cache->features.discard_passdown) 3369 return; 3370 3371 if (!bdev_max_discard_sectors(origin_bdev)) 3372 reason = "discard unsupported"; 3373 3374 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3375 reason = "max discard sectors smaller than a block"; 3376 3377 if (reason) { 3378 DMWARN("Origin device (%pg) %s: Disabling discard passdown.", 3379 origin_bdev, reason); 3380 cache->features.discard_passdown = false; 3381 } 3382 } 3383 3384 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3385 { 3386 struct block_device *origin_bdev = cache->origin_dev->bdev; 3387 struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3388 3389 if (!cache->features.discard_passdown) { 3390 /* No passdown is done so setting own virtual limits */ 3391 limits->max_hw_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3392 cache->origin_sectors); 3393 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3394 return; 3395 } 3396 3397 /* 3398 * cache_iterate_devices() is stacking both origin and fast device limits 3399 * but discards aren't passed to fast device, so inherit origin's limits. 3400 */ 3401 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3402 limits->discard_granularity = origin_limits->discard_granularity; 3403 limits->discard_alignment = origin_limits->discard_alignment; 3404 } 3405 3406 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3407 { 3408 struct cache *cache = ti->private; 3409 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3410 3411 /* 3412 * If the system-determined stacked limits are compatible with the 3413 * cache's blocksize (io_opt is a factor) do not override them. 
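 * E.g. with 512-sector (256KB) cache blocks, a stacked io_opt of 128KB is
 * overridden (io_min/io_opt become 256KB), while an io_opt of 512KB is a
 * multiple of the block size and is left untouched.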
3414 */ 3415 if (io_opt_sectors < cache->sectors_per_block || 3416 do_div(io_opt_sectors, cache->sectors_per_block)) { 3417 limits->io_min = cache->sectors_per_block << SECTOR_SHIFT; 3418 limits->io_opt = cache->sectors_per_block << SECTOR_SHIFT; 3419 } 3420 3421 disable_passdown_if_not_supported(cache); 3422 set_discard_limits(cache, limits); 3423 } 3424 3425 /*----------------------------------------------------------------*/ 3426
3427 static struct target_type cache_target = { 3428 .name = "cache", 3429 .version = {2, 2, 0}, 3430 .module = THIS_MODULE, 3431 .ctr = cache_ctr, 3432 .dtr = cache_dtr, 3433 .map = cache_map, 3434 .end_io = cache_end_io, 3435 .postsuspend = cache_postsuspend, 3436 .preresume = cache_preresume, 3437 .resume = cache_resume, 3438 .status = cache_status, 3439 .message = cache_message, 3440 .iterate_devices = cache_iterate_devices, 3441 .io_hints = cache_io_hints, 3442 }; 3443
3444 static int __init dm_cache_init(void) 3445 { 3446 int r; 3447 3448 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3449 if (!migration_cache) { 3450 r = -ENOMEM; 3451 goto err; 3452 } 3453 3454 btracker_work_cache = kmem_cache_create("dm_cache_bt_work", 3455 sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL); 3456 if (!btracker_work_cache) { 3457 r = -ENOMEM; 3458 goto err; 3459 } 3460 3461 r = dm_register_target(&cache_target); 3462 if (r) { 3463 DMERR("cache target registration failed: %d", r); goto err; 3464 } 3465 3466 return 0; 3467 3468 err: 3469 kmem_cache_destroy(migration_cache); 3470 kmem_cache_destroy(btracker_work_cache); 3471 return r; 3472 } 3473
3474 static void __exit dm_cache_exit(void) 3475 { 3476 dm_unregister_target(&cache_target); 3477 kmem_cache_destroy(migration_cache); 3478 kmem_cache_destroy(btracker_work_cache); 3479 } 3480 3481 module_init(dm_cache_init); 3482 module_exit(dm_cache_exit); 3483 3484 MODULE_DESCRIPTION(DM_NAME " cache target"); 3485 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3486 MODULE_LICENSE("GPL"); 3487
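
/*
 * Example (a sketch with hypothetical device names, not part of the driver):
 * creating a cache target from userspace with dmsetup, using 512-sector
 * (256KB) cache blocks, the writethrough feature and the smq policy with no
 * policy arguments:
 *
 *   dmsetup create my-cache --table \
 *     "0 $(blockdev --getsz /dev/slow) cache /dev/fast-meta /dev/fast \
 *      /dev/slow 512 1 writethrough smq 0"
 */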