1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 Red Hat. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-bio-prison-v2.h" 10 #include "dm-bio-record.h" 11 #include "dm-cache-metadata.h" 12 #include "dm-io-tracker.h" 13 #include "dm-cache-background-tracker.h" 14 15 #include <linux/dm-io.h> 16 #include <linux/dm-kcopyd.h> 17 #include <linux/jiffies.h> 18 #include <linux/init.h> 19 #include <linux/mempool.h> 20 #include <linux/module.h> 21 #include <linux/rwsem.h> 22 #include <linux/slab.h> 23 #include <linux/vmalloc.h> 24 25 #define DM_MSG_PREFIX "cache" 26 27 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 28 "A percentage of time allocated for copying to and/or from cache"); 29 30 /*----------------------------------------------------------------*/ 31 32 /* 33 * Glossary: 34 * 35 * oblock: index of an origin block 36 * cblock: index of a cache block 37 * promotion: movement of a block from origin to cache 38 * demotion: movement of a block from cache to origin 39 * migration: movement of a block between the origin and cache device, 40 * either direction 41 */ 42 43 /*----------------------------------------------------------------*/ 44 45 /* 46 * Represents a chunk of future work. 'input' allows continuations to pass 47 * values between themselves, typically error values. 48 */ 49 struct continuation { 50 struct work_struct ws; 51 blk_status_t input; 52 }; 53 54 static inline void init_continuation(struct continuation *k, 55 void (*fn)(struct work_struct *)) 56 { 57 INIT_WORK(&k->ws, fn); 58 k->input = 0; 59 } 60 61 static inline void queue_continuation(struct workqueue_struct *wq, 62 struct continuation *k) 63 { 64 queue_work(wq, &k->ws); 65 } 66 67 /*----------------------------------------------------------------*/ 68 69 /* 70 * The batcher collects together pieces of work that need a particular 71 * operation to occur before they can proceed (typically a commit). 72 */ 73 struct batcher { 74 /* 75 * The operation that everyone is waiting for. 76 */ 77 blk_status_t (*commit_op)(void *context); 78 void *commit_context; 79 80 /* 81 * This is how bios should be issued once the commit op is complete 82 * (accounted_request). 83 */ 84 void (*issue_op)(struct bio *bio, void *context); 85 void *issue_context; 86 87 /* 88 * Queued work gets put on here after commit. 89 */ 90 struct workqueue_struct *wq; 91 92 spinlock_t lock; 93 struct list_head work_items; 94 struct bio_list bios; 95 struct work_struct commit_work; 96 97 bool commit_scheduled; 98 }; 99 100 static void __commit(struct work_struct *_ws) 101 { 102 struct batcher *b = container_of(_ws, struct batcher, commit_work); 103 blk_status_t r; 104 struct list_head work_items; 105 struct work_struct *ws, *tmp; 106 struct continuation *k; 107 struct bio *bio; 108 struct bio_list bios; 109 110 INIT_LIST_HEAD(&work_items); 111 bio_list_init(&bios); 112 113 /* 114 * We have to grab these before the commit_op to avoid a race 115 * condition. 
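	 * Only work that was already queued when the commit started may be
	 * released by it; anything queued after the lists are snapshotted
	 * must wait for the next commit.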
116 */ 117 spin_lock_irq(&b->lock); 118 list_splice_init(&b->work_items, &work_items); 119 bio_list_merge_init(&bios, &b->bios); 120 b->commit_scheduled = false; 121 spin_unlock_irq(&b->lock); 122 123 r = b->commit_op(b->commit_context); 124 125 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 126 k = container_of(ws, struct continuation, ws); 127 k->input = r; 128 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 129 queue_work(b->wq, ws); 130 } 131 132 while ((bio = bio_list_pop(&bios))) { 133 if (r) { 134 bio->bi_status = r; 135 bio_endio(bio); 136 } else 137 b->issue_op(bio, b->issue_context); 138 } 139 } 140 141 static void batcher_init(struct batcher *b, 142 blk_status_t (*commit_op)(void *), 143 void *commit_context, 144 void (*issue_op)(struct bio *bio, void *), 145 void *issue_context, 146 struct workqueue_struct *wq) 147 { 148 b->commit_op = commit_op; 149 b->commit_context = commit_context; 150 b->issue_op = issue_op; 151 b->issue_context = issue_context; 152 b->wq = wq; 153 154 spin_lock_init(&b->lock); 155 INIT_LIST_HEAD(&b->work_items); 156 bio_list_init(&b->bios); 157 INIT_WORK(&b->commit_work, __commit); 158 b->commit_scheduled = false; 159 } 160 161 static void async_commit(struct batcher *b) 162 { 163 queue_work(b->wq, &b->commit_work); 164 } 165 166 static void continue_after_commit(struct batcher *b, struct continuation *k) 167 { 168 bool commit_scheduled; 169 170 spin_lock_irq(&b->lock); 171 commit_scheduled = b->commit_scheduled; 172 list_add_tail(&k->ws.entry, &b->work_items); 173 spin_unlock_irq(&b->lock); 174 175 if (commit_scheduled) 176 async_commit(b); 177 } 178 179 /* 180 * Bios are errored if commit failed. 181 */ 182 static void issue_after_commit(struct batcher *b, struct bio *bio) 183 { 184 bool commit_scheduled; 185 186 spin_lock_irq(&b->lock); 187 commit_scheduled = b->commit_scheduled; 188 bio_list_add(&b->bios, bio); 189 spin_unlock_irq(&b->lock); 190 191 if (commit_scheduled) 192 async_commit(b); 193 } 194 195 /* 196 * Call this if some urgent work is waiting for the commit to complete. 197 */ 198 static void schedule_commit(struct batcher *b) 199 { 200 bool immediate; 201 202 spin_lock_irq(&b->lock); 203 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 204 b->commit_scheduled = true; 205 spin_unlock_irq(&b->lock); 206 207 if (immediate) 208 async_commit(b); 209 } 210 211 /* 212 * There are a couple of places where we let a bio run, but want to do some 213 * work before calling its endio function. We do this by temporarily 214 * changing the endio fn. 215 */ 216 struct dm_hook_info { 217 bio_end_io_t *bi_end_io; 218 }; 219 220 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 221 bio_end_io_t *bi_end_io, void *bi_private) 222 { 223 h->bi_end_io = bio->bi_end_io; 224 225 bio->bi_end_io = bi_end_io; 226 bio->bi_private = bi_private; 227 } 228 229 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 230 { 231 bio->bi_end_io = h->bi_end_io; 232 } 233 234 /*----------------------------------------------------------------*/ 235 236 #define MIGRATION_POOL_SIZE 128 237 #define COMMIT_PERIOD HZ 238 #define MIGRATION_COUNT_WINDOW 10 239 240 /* 241 * The block size of the device holding cache data must be 242 * between 32KB and 1GB. 
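 * That is 64 to 2097152 sectors of 512 bytes, as encoded by the two
 * macros below.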
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only. These blocks are marked
	 * dirty. If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin. Blocks are never
	 * dirty. Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots). Reads and writes always go to the
	 * origin. If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned int metadata_version;
	bool discard_passdown:1;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	spinlock_t lock;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	int sectors_per_block_shift;
	sector_t sectors_per_block;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices. Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices. Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io. eg, promotion, writeback.
	 */
	atomic_t nr_io_migrations;

	struct bio_list deferred_bios;

	struct rw_semaphore quiesce_lock;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
365 */ 366 unsigned int nr_ctr_args; 367 const char **ctr_args; 368 369 struct dm_kcopyd_client *copier; 370 struct work_struct deferred_bio_worker; 371 struct work_struct migration_worker; 372 struct workqueue_struct *wq; 373 struct delayed_work waker; 374 struct dm_bio_prison_v2 *prison; 375 376 /* 377 * cache_size entries, dirty if set 378 */ 379 unsigned long *dirty_bitset; 380 atomic_t nr_dirty; 381 382 unsigned int policy_nr_args; 383 struct dm_cache_policy *policy; 384 385 /* 386 * Cache features such as write-through. 387 */ 388 struct cache_features features; 389 390 struct cache_stats stats; 391 392 bool need_tick_bio:1; 393 bool sized:1; 394 bool invalidate:1; 395 bool commit_requested:1; 396 bool loaded_mappings:1; 397 bool loaded_discards:1; 398 399 struct rw_semaphore background_work_lock; 400 401 struct batcher committer; 402 struct work_struct commit_ws; 403 404 struct dm_io_tracker tracker; 405 406 mempool_t migration_pool; 407 408 struct bio_set bs; 409 410 /* 411 * Cache_size entries. Set bits indicate blocks mapped beyond the 412 * target length, which are marked for invalidation. 413 */ 414 unsigned long *invalid_bitset; 415 }; 416 417 struct per_bio_data { 418 bool tick:1; 419 unsigned int req_nr:2; 420 struct dm_bio_prison_cell_v2 *cell; 421 struct dm_hook_info hook_info; 422 sector_t len; 423 }; 424 425 struct dm_cache_migration { 426 struct continuation k; 427 struct cache *cache; 428 429 struct policy_work *op; 430 struct bio *overwrite_bio; 431 struct dm_bio_prison_cell_v2 *cell; 432 433 dm_cblock_t invalidate_cblock; 434 dm_oblock_t invalidate_oblock; 435 }; 436 437 /*----------------------------------------------------------------*/ 438 439 static bool writethrough_mode(struct cache *cache) 440 { 441 return cache->features.io_mode == CM_IO_WRITETHROUGH; 442 } 443 444 static bool writeback_mode(struct cache *cache) 445 { 446 return cache->features.io_mode == CM_IO_WRITEBACK; 447 } 448 449 static inline bool passthrough_mode(struct cache *cache) 450 { 451 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 452 } 453 454 /*----------------------------------------------------------------*/ 455 456 static void wake_deferred_bio_worker(struct cache *cache) 457 { 458 queue_work(cache->wq, &cache->deferred_bio_worker); 459 } 460 461 static void wake_migration_worker(struct cache *cache) 462 { 463 if (passthrough_mode(cache)) 464 return; 465 466 queue_work(cache->wq, &cache->migration_worker); 467 } 468 469 /*----------------------------------------------------------------*/ 470 471 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 472 { 473 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 474 } 475 476 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 477 { 478 dm_bio_prison_free_cell_v2(cache->prison, cell); 479 } 480 481 static struct dm_cache_migration *alloc_migration(struct cache *cache) 482 { 483 struct dm_cache_migration *mg; 484 485 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 486 487 memset(mg, 0, sizeof(*mg)); 488 489 mg->cache = cache; 490 atomic_inc(&cache->nr_allocated_migrations); 491 492 return mg; 493 } 494 495 static void free_migration(struct dm_cache_migration *mg) 496 { 497 struct cache *cache = mg->cache; 498 499 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 500 wake_up(&cache->migration_wait); 501 502 mempool_free(mg, &cache->migration_pool); 503 } 504 505 /*----------------------------------------------------------------*/ 506 507 static 
inline dm_oblock_t oblock_succ(dm_oblock_t b) 508 { 509 return to_oblock(from_oblock(b) + 1ull); 510 } 511 512 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 513 { 514 key->virtual = 0; 515 key->dev = 0; 516 key->block_begin = from_oblock(begin); 517 key->block_end = from_oblock(end); 518 } 519 520 /* 521 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 522 * level 1 which prevents *both* READs and WRITEs. 523 */ 524 #define WRITE_LOCK_LEVEL 0 525 #define READ_WRITE_LOCK_LEVEL 1 526 527 static unsigned int lock_level(struct bio *bio) 528 { 529 return bio_data_dir(bio) == WRITE ? 530 WRITE_LOCK_LEVEL : 531 READ_WRITE_LOCK_LEVEL; 532 } 533 534 /* 535 *-------------------------------------------------------------- 536 * Per bio data 537 *-------------------------------------------------------------- 538 */ 539 540 static struct per_bio_data *get_per_bio_data(struct bio *bio) 541 { 542 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 543 544 BUG_ON(!pb); 545 return pb; 546 } 547 548 static struct per_bio_data *init_per_bio_data(struct bio *bio) 549 { 550 struct per_bio_data *pb = get_per_bio_data(bio); 551 552 pb->tick = false; 553 pb->req_nr = dm_bio_get_target_bio_nr(bio); 554 pb->cell = NULL; 555 pb->len = 0; 556 557 return pb; 558 } 559 560 /*----------------------------------------------------------------*/ 561 562 static void defer_bio(struct cache *cache, struct bio *bio) 563 { 564 spin_lock_irq(&cache->lock); 565 bio_list_add(&cache->deferred_bios, bio); 566 spin_unlock_irq(&cache->lock); 567 568 wake_deferred_bio_worker(cache); 569 } 570 571 static void defer_bios(struct cache *cache, struct bio_list *bios) 572 { 573 spin_lock_irq(&cache->lock); 574 bio_list_merge_init(&cache->deferred_bios, bios); 575 spin_unlock_irq(&cache->lock); 576 577 wake_deferred_bio_worker(cache); 578 } 579 580 /*----------------------------------------------------------------*/ 581 582 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 583 { 584 bool r; 585 struct per_bio_data *pb; 586 struct dm_cell_key_v2 key; 587 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 588 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 589 590 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 591 592 build_key(oblock, end, &key); 593 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 594 if (!r) { 595 /* 596 * Failed to get the lock. 597 */ 598 free_prison_cell(cache, cell_prealloc); 599 return r; 600 } 601 602 if (cell != cell_prealloc) 603 free_prison_cell(cache, cell_prealloc); 604 605 pb = get_per_bio_data(bio); 606 pb->cell = cell; 607 608 return r; 609 } 610 611 /*----------------------------------------------------------------*/ 612 613 static bool is_dirty(struct cache *cache, dm_cblock_t b) 614 { 615 return test_bit(from_cblock(b), cache->dirty_bitset); 616 } 617 618 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 619 { 620 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 621 atomic_inc(&cache->nr_dirty); 622 policy_set_dirty(cache->policy, cblock); 623 } 624 } 625 626 /* 627 * These two are called when setting after migrations to force the policy 628 * and dirty bitset to be in sync. 
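 * Unlike set_dirty() above, the policy is informed even when the bit in
 * the dirty bitset was already in the requested state.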
629 */ 630 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 631 { 632 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 633 atomic_inc(&cache->nr_dirty); 634 policy_set_dirty(cache->policy, cblock); 635 } 636 637 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 638 { 639 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 640 if (atomic_dec_return(&cache->nr_dirty) == 0) 641 dm_table_event(cache->ti->table); 642 } 643 644 policy_clear_dirty(cache->policy, cblock); 645 } 646 647 /*----------------------------------------------------------------*/ 648 649 static bool block_size_is_power_of_two(struct cache *cache) 650 { 651 return cache->sectors_per_block_shift >= 0; 652 } 653 654 static dm_block_t block_div(dm_block_t b, uint32_t n) 655 { 656 do_div(b, n); 657 658 return b; 659 } 660 661 static dm_block_t oblocks_per_dblock(struct cache *cache) 662 { 663 dm_block_t oblocks = cache->discard_block_size; 664 665 if (block_size_is_power_of_two(cache)) 666 oblocks >>= cache->sectors_per_block_shift; 667 else 668 oblocks = block_div(oblocks, cache->sectors_per_block); 669 670 return oblocks; 671 } 672 673 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 674 { 675 return to_dblock(block_div(from_oblock(oblock), 676 oblocks_per_dblock(cache))); 677 } 678 679 static void set_discard(struct cache *cache, dm_dblock_t b) 680 { 681 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 682 atomic_inc(&cache->stats.discard_count); 683 684 spin_lock_irq(&cache->lock); 685 set_bit(from_dblock(b), cache->discard_bitset); 686 spin_unlock_irq(&cache->lock); 687 } 688 689 static void clear_discard(struct cache *cache, dm_dblock_t b) 690 { 691 spin_lock_irq(&cache->lock); 692 clear_bit(from_dblock(b), cache->discard_bitset); 693 spin_unlock_irq(&cache->lock); 694 } 695 696 static bool is_discarded(struct cache *cache, dm_dblock_t b) 697 { 698 int r; 699 700 spin_lock_irq(&cache->lock); 701 r = test_bit(from_dblock(b), cache->discard_bitset); 702 spin_unlock_irq(&cache->lock); 703 704 return r; 705 } 706 707 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 708 { 709 int r; 710 711 spin_lock_irq(&cache->lock); 712 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 713 cache->discard_bitset); 714 spin_unlock_irq(&cache->lock); 715 716 return r; 717 } 718 719 /* 720 * ------------------------------------------------------------- 721 * Remapping 722 *-------------------------------------------------------------- 723 */ 724 static void remap_to_origin(struct cache *cache, struct bio *bio) 725 { 726 bio_set_dev(bio, cache->origin_dev->bdev); 727 } 728 729 static void remap_to_cache(struct cache *cache, struct bio *bio, 730 dm_cblock_t cblock) 731 { 732 sector_t bi_sector = bio->bi_iter.bi_sector; 733 sector_t block = from_cblock(cblock); 734 735 bio_set_dev(bio, cache->cache_dev->bdev); 736 if (!block_size_is_power_of_two(cache)) 737 bio->bi_iter.bi_sector = 738 (block * cache->sectors_per_block) + 739 sector_div(bi_sector, cache->sectors_per_block); 740 else 741 bio->bi_iter.bi_sector = 742 (block << cache->sectors_per_block_shift) | 743 (bi_sector & (cache->sectors_per_block - 1)); 744 } 745 746 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 747 { 748 struct per_bio_data *pb; 749 750 spin_lock_irq(&cache->lock); 751 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 752 bio_op(bio) != REQ_OP_DISCARD) { 753 pb = get_per_bio_data(bio); 754 pb->tick = 
true; 755 cache->need_tick_bio = false; 756 } 757 spin_unlock_irq(&cache->lock); 758 } 759 760 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 761 dm_oblock_t oblock) 762 { 763 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 764 check_if_tick_bio_needed(cache, bio); 765 remap_to_origin(cache, bio); 766 if (bio_data_dir(bio) == WRITE) 767 clear_discard(cache, oblock_to_dblock(cache, oblock)); 768 } 769 770 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 771 dm_oblock_t oblock, dm_cblock_t cblock) 772 { 773 check_if_tick_bio_needed(cache, bio); 774 remap_to_cache(cache, bio, cblock); 775 if (bio_data_dir(bio) == WRITE) { 776 set_dirty(cache, cblock); 777 clear_discard(cache, oblock_to_dblock(cache, oblock)); 778 } 779 } 780 781 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 782 { 783 sector_t block_nr = bio->bi_iter.bi_sector; 784 785 if (!block_size_is_power_of_two(cache)) 786 (void) sector_div(block_nr, cache->sectors_per_block); 787 else 788 block_nr >>= cache->sectors_per_block_shift; 789 790 return to_oblock(block_nr); 791 } 792 793 static bool accountable_bio(struct cache *cache, struct bio *bio) 794 { 795 return bio_op(bio) != REQ_OP_DISCARD; 796 } 797 798 static void accounted_begin(struct cache *cache, struct bio *bio) 799 { 800 struct per_bio_data *pb; 801 802 if (accountable_bio(cache, bio)) { 803 pb = get_per_bio_data(bio); 804 pb->len = bio_sectors(bio); 805 dm_iot_io_begin(&cache->tracker, pb->len); 806 } 807 } 808 809 static void accounted_complete(struct cache *cache, struct bio *bio) 810 { 811 struct per_bio_data *pb = get_per_bio_data(bio); 812 813 dm_iot_io_end(&cache->tracker, pb->len); 814 } 815 816 static void accounted_request(struct cache *cache, struct bio *bio) 817 { 818 accounted_begin(cache, bio); 819 dm_submit_bio_remap(bio, NULL); 820 } 821 822 static void issue_op(struct bio *bio, void *context) 823 { 824 struct cache *cache = context; 825 826 accounted_request(cache, bio); 827 } 828 829 /* 830 * When running in writethrough mode we need to send writes to clean blocks 831 * to both the cache and origin devices. Clone the bio and send them in parallel. 
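 * The clone is chained to the original bio, so the original bio only
 * completes once both the origin and the cache writes have finished.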
832 */ 833 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 834 dm_oblock_t oblock, dm_cblock_t cblock) 835 { 836 struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio, 837 GFP_NOIO, &cache->bs); 838 839 BUG_ON(!origin_bio); 840 841 bio_chain(origin_bio, bio); 842 843 if (bio_data_dir(origin_bio) == WRITE) 844 clear_discard(cache, oblock_to_dblock(cache, oblock)); 845 submit_bio(origin_bio); 846 847 remap_to_cache(cache, bio, cblock); 848 } 849 850 /* 851 *-------------------------------------------------------------- 852 * Failure modes 853 *-------------------------------------------------------------- 854 */ 855 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 856 { 857 return cache->features.mode; 858 } 859 860 static const char *cache_device_name(struct cache *cache) 861 { 862 return dm_table_device_name(cache->ti->table); 863 } 864 865 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 866 { 867 static const char *descs[] = { 868 "write", 869 "read-only", 870 "fail" 871 }; 872 873 dm_table_event(cache->ti->table); 874 DMINFO("%s: switching cache to %s mode", 875 cache_device_name(cache), descs[(int)mode]); 876 } 877 878 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 879 { 880 bool needs_check; 881 enum cache_metadata_mode old_mode = get_cache_mode(cache); 882 883 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 884 DMERR("%s: unable to read needs_check flag, setting failure mode.", 885 cache_device_name(cache)); 886 new_mode = CM_FAIL; 887 } 888 889 if (new_mode == CM_WRITE && needs_check) { 890 DMERR("%s: unable to switch cache to write mode until repaired.", 891 cache_device_name(cache)); 892 if (old_mode != new_mode) 893 new_mode = old_mode; 894 else 895 new_mode = CM_READ_ONLY; 896 } 897 898 /* Never move out of fail mode */ 899 if (old_mode == CM_FAIL) 900 new_mode = CM_FAIL; 901 902 switch (new_mode) { 903 case CM_FAIL: 904 case CM_READ_ONLY: 905 dm_cache_metadata_set_read_only(cache->cmd); 906 break; 907 908 case CM_WRITE: 909 dm_cache_metadata_set_read_write(cache->cmd); 910 break; 911 } 912 913 cache->features.mode = new_mode; 914 915 if (new_mode != old_mode) 916 notify_mode_switch(cache, new_mode); 917 } 918 919 static void abort_transaction(struct cache *cache) 920 { 921 const char *dev_name = cache_device_name(cache); 922 923 if (get_cache_mode(cache) >= CM_READ_ONLY) 924 return; 925 926 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 927 if (dm_cache_metadata_abort(cache->cmd)) { 928 DMERR("%s: failed to abort metadata transaction", dev_name); 929 set_cache_mode(cache, CM_FAIL); 930 } 931 932 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 933 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 934 set_cache_mode(cache, CM_FAIL); 935 } 936 } 937 938 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 939 { 940 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 941 cache_device_name(cache), op, r); 942 abort_transaction(cache); 943 set_cache_mode(cache, CM_READ_ONLY); 944 } 945 946 /*----------------------------------------------------------------*/ 947 948 static void load_stats(struct cache *cache) 949 { 950 struct dm_cache_statistics stats; 951 952 dm_cache_metadata_get_stats(cache->cmd, &stats); 953 atomic_set(&cache->stats.read_hit, stats.read_hits); 954 atomic_set(&cache->stats.read_miss, stats.read_misses); 955 
atomic_set(&cache->stats.write_hit, stats.write_hits); 956 atomic_set(&cache->stats.write_miss, stats.write_misses); 957 } 958 959 static void save_stats(struct cache *cache) 960 { 961 struct dm_cache_statistics stats; 962 963 if (get_cache_mode(cache) >= CM_READ_ONLY) 964 return; 965 966 stats.read_hits = atomic_read(&cache->stats.read_hit); 967 stats.read_misses = atomic_read(&cache->stats.read_miss); 968 stats.write_hits = atomic_read(&cache->stats.write_hit); 969 stats.write_misses = atomic_read(&cache->stats.write_miss); 970 971 dm_cache_metadata_set_stats(cache->cmd, &stats); 972 } 973 974 static void update_stats(struct cache_stats *stats, enum policy_operation op) 975 { 976 switch (op) { 977 case POLICY_PROMOTE: 978 atomic_inc(&stats->promotion); 979 break; 980 981 case POLICY_DEMOTE: 982 atomic_inc(&stats->demotion); 983 break; 984 985 case POLICY_WRITEBACK: 986 atomic_inc(&stats->writeback); 987 break; 988 } 989 } 990 991 /* 992 *--------------------------------------------------------------------- 993 * Migration processing 994 * 995 * Migration covers moving data from the origin device to the cache, or 996 * vice versa. 997 *--------------------------------------------------------------------- 998 */ 999 static void inc_io_migrations(struct cache *cache) 1000 { 1001 atomic_inc(&cache->nr_io_migrations); 1002 } 1003 1004 static void dec_io_migrations(struct cache *cache) 1005 { 1006 atomic_dec(&cache->nr_io_migrations); 1007 } 1008 1009 static bool discard_or_flush(struct bio *bio) 1010 { 1011 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1012 } 1013 1014 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1015 dm_dblock_t *b, dm_dblock_t *e) 1016 { 1017 sector_t sb = bio->bi_iter.bi_sector; 1018 sector_t se = bio_end_sector(bio); 1019 1020 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1021 1022 if (se - sb < cache->discard_block_size) 1023 *e = *b; 1024 else 1025 *e = to_dblock(block_div(se, cache->discard_block_size)); 1026 } 1027 1028 /*----------------------------------------------------------------*/ 1029 1030 static void prevent_background_work(struct cache *cache) 1031 { 1032 lockdep_off(); 1033 down_write(&cache->background_work_lock); 1034 lockdep_on(); 1035 } 1036 1037 static void allow_background_work(struct cache *cache) 1038 { 1039 lockdep_off(); 1040 up_write(&cache->background_work_lock); 1041 lockdep_on(); 1042 } 1043 1044 static bool background_work_begin(struct cache *cache) 1045 { 1046 bool r; 1047 1048 lockdep_off(); 1049 r = down_read_trylock(&cache->background_work_lock); 1050 lockdep_on(); 1051 1052 return r; 1053 } 1054 1055 static void background_work_end(struct cache *cache) 1056 { 1057 lockdep_off(); 1058 up_read(&cache->background_work_lock); 1059 lockdep_on(); 1060 } 1061 1062 /*----------------------------------------------------------------*/ 1063 1064 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1065 { 1066 return (bio_data_dir(bio) == WRITE) && 1067 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1068 } 1069 1070 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1071 { 1072 return writeback_mode(cache) && 1073 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1074 } 1075 1076 static void quiesce(struct dm_cache_migration *mg, 1077 void (*continuation)(struct work_struct *)) 1078 { 1079 init_continuation(&mg->k, continuation); 1080 dm_cell_quiesce_v2(mg->cache->prison, 
mg->cell, &mg->k.ws); 1081 } 1082 1083 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1084 { 1085 struct continuation *k = container_of(ws, struct continuation, ws); 1086 1087 return container_of(k, struct dm_cache_migration, k); 1088 } 1089 1090 static void copy_complete(int read_err, unsigned long write_err, void *context) 1091 { 1092 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1093 1094 if (read_err || write_err) 1095 mg->k.input = BLK_STS_IOERR; 1096 1097 queue_continuation(mg->cache->wq, &mg->k); 1098 } 1099 1100 static void copy(struct dm_cache_migration *mg, bool promote) 1101 { 1102 struct dm_io_region o_region, c_region; 1103 struct cache *cache = mg->cache; 1104 1105 o_region.bdev = cache->origin_dev->bdev; 1106 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1107 o_region.count = cache->sectors_per_block; 1108 1109 c_region.bdev = cache->cache_dev->bdev; 1110 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1111 c_region.count = cache->sectors_per_block; 1112 1113 if (promote) 1114 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1115 else 1116 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1117 } 1118 1119 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1120 { 1121 struct per_bio_data *pb = get_per_bio_data(bio); 1122 1123 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1124 free_prison_cell(cache, pb->cell); 1125 pb->cell = NULL; 1126 } 1127 1128 static void overwrite_endio(struct bio *bio) 1129 { 1130 struct dm_cache_migration *mg = bio->bi_private; 1131 struct cache *cache = mg->cache; 1132 struct per_bio_data *pb = get_per_bio_data(bio); 1133 1134 dm_unhook_bio(&pb->hook_info, bio); 1135 1136 if (bio->bi_status) 1137 mg->k.input = bio->bi_status; 1138 1139 queue_continuation(cache->wq, &mg->k); 1140 } 1141 1142 static void overwrite(struct dm_cache_migration *mg, 1143 void (*continuation)(struct work_struct *)) 1144 { 1145 struct bio *bio = mg->overwrite_bio; 1146 struct per_bio_data *pb = get_per_bio_data(bio); 1147 1148 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1149 1150 /* 1151 * The overwrite bio is part of the copy operation, as such it does 1152 * not set/clear discard or dirty flags. 
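	 * mg_complete() brings the dirty bitset and discard state back into
	 * sync once the migration as a whole has succeeded or failed.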
1153 */ 1154 if (mg->op->op == POLICY_PROMOTE) 1155 remap_to_cache(mg->cache, bio, mg->op->cblock); 1156 else 1157 remap_to_origin(mg->cache, bio); 1158 1159 init_continuation(&mg->k, continuation); 1160 accounted_request(mg->cache, bio); 1161 } 1162 1163 /* 1164 * Migration steps: 1165 * 1166 * 1) exclusive lock preventing WRITEs 1167 * 2) quiesce 1168 * 3) copy or issue overwrite bio 1169 * 4) upgrade to exclusive lock preventing READs and WRITEs 1170 * 5) quiesce 1171 * 6) update metadata and commit 1172 * 7) unlock 1173 */ 1174 static void mg_complete(struct dm_cache_migration *mg, bool success) 1175 { 1176 struct bio_list bios; 1177 struct cache *cache = mg->cache; 1178 struct policy_work *op = mg->op; 1179 dm_cblock_t cblock = op->cblock; 1180 1181 if (success) 1182 update_stats(&cache->stats, op->op); 1183 1184 switch (op->op) { 1185 case POLICY_PROMOTE: 1186 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1187 policy_complete_background_work(cache->policy, op, success); 1188 1189 if (mg->overwrite_bio) { 1190 if (success) 1191 force_set_dirty(cache, cblock); 1192 else if (mg->k.input) 1193 mg->overwrite_bio->bi_status = mg->k.input; 1194 else 1195 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1196 bio_endio(mg->overwrite_bio); 1197 } else { 1198 if (success) 1199 force_clear_dirty(cache, cblock); 1200 dec_io_migrations(cache); 1201 } 1202 break; 1203 1204 case POLICY_DEMOTE: 1205 /* 1206 * We clear dirty here to update the nr_dirty counter. 1207 */ 1208 if (success) 1209 force_clear_dirty(cache, cblock); 1210 policy_complete_background_work(cache->policy, op, success); 1211 dec_io_migrations(cache); 1212 break; 1213 1214 case POLICY_WRITEBACK: 1215 if (success) 1216 force_clear_dirty(cache, cblock); 1217 policy_complete_background_work(cache->policy, op, success); 1218 dec_io_migrations(cache); 1219 break; 1220 } 1221 1222 bio_list_init(&bios); 1223 if (mg->cell) { 1224 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1225 free_prison_cell(cache, mg->cell); 1226 } 1227 1228 free_migration(mg); 1229 defer_bios(cache, &bios); 1230 wake_migration_worker(cache); 1231 1232 background_work_end(cache); 1233 } 1234 1235 static void mg_success(struct work_struct *ws) 1236 { 1237 struct dm_cache_migration *mg = ws_to_mg(ws); 1238 1239 mg_complete(mg, mg->k.input == 0); 1240 } 1241 1242 static void mg_update_metadata(struct work_struct *ws) 1243 { 1244 int r; 1245 struct dm_cache_migration *mg = ws_to_mg(ws); 1246 struct cache *cache = mg->cache; 1247 struct policy_work *op = mg->op; 1248 1249 switch (op->op) { 1250 case POLICY_PROMOTE: 1251 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1252 if (r) { 1253 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1254 cache_device_name(cache)); 1255 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1256 1257 mg_complete(mg, false); 1258 return; 1259 } 1260 mg_complete(mg, true); 1261 break; 1262 1263 case POLICY_DEMOTE: 1264 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1265 if (r) { 1266 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1267 cache_device_name(cache)); 1268 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1269 1270 mg_complete(mg, false); 1271 return; 1272 } 1273 1274 /* 1275 * It would be nice if we only had to commit when a REQ_FLUSH 1276 * comes through. 
But there's one scenario that we have to
		 * look out for:
		 *
		 * - oblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * roll back to having the data for oblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_full_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;
	bool is_policy_promote = (op->op == POLICY_PROMOTE);

	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
	    is_discarded_oblock(cache, op->oblock)) {
		mg_upgrade_lock(ws);
		return;
	}

	init_continuation(&mg->k, mg_upgrade_lock);
	copy(mg, is_policy_promote);
}

static void mg_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * No exclusive lock was held when we last checked if the bio
		 * was optimisable. So we have to check again in case things
		 * have changed (eg, the block may no longer be discarded).
		 */
		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
			/*
			 * Fall back to a real full copy after doing some tidying up.
			 */
			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);

			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
			mg->overwrite_bio = NULL;
			inc_io_migrations(mg->cache);
			mg_full_copy(ws);
			return;
		}

		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
1391 */ 1392 overwrite(mg, mg_update_metadata_after_copy); 1393 1394 } else 1395 mg_full_copy(ws); 1396 } 1397 1398 static int mg_lock_writes(struct dm_cache_migration *mg) 1399 { 1400 int r; 1401 struct dm_cell_key_v2 key; 1402 struct cache *cache = mg->cache; 1403 struct dm_bio_prison_cell_v2 *prealloc; 1404 1405 prealloc = alloc_prison_cell(cache); 1406 1407 /* 1408 * Prevent writes to the block, but allow reads to continue. 1409 * Unless we're using an overwrite bio, in which case we lock 1410 * everything. 1411 */ 1412 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1413 r = dm_cell_lock_v2(cache->prison, &key, 1414 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1415 prealloc, &mg->cell); 1416 if (r < 0) { 1417 free_prison_cell(cache, prealloc); 1418 mg_complete(mg, false); 1419 return r; 1420 } 1421 1422 if (mg->cell != prealloc) 1423 free_prison_cell(cache, prealloc); 1424 1425 if (r == 0) 1426 mg_copy(&mg->k.ws); 1427 else 1428 quiesce(mg, mg_copy); 1429 1430 return 0; 1431 } 1432 1433 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1434 { 1435 struct dm_cache_migration *mg; 1436 1437 if (!background_work_begin(cache)) { 1438 policy_complete_background_work(cache->policy, op, false); 1439 return -EPERM; 1440 } 1441 1442 mg = alloc_migration(cache); 1443 1444 mg->op = op; 1445 mg->overwrite_bio = bio; 1446 1447 if (!bio) 1448 inc_io_migrations(cache); 1449 1450 return mg_lock_writes(mg); 1451 } 1452 1453 /* 1454 *-------------------------------------------------------------- 1455 * invalidation processing 1456 *-------------------------------------------------------------- 1457 */ 1458 1459 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1460 { 1461 struct bio_list bios; 1462 struct cache *cache = mg->cache; 1463 1464 bio_list_init(&bios); 1465 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1466 free_prison_cell(cache, mg->cell); 1467 1468 if (!success && mg->overwrite_bio) 1469 bio_io_error(mg->overwrite_bio); 1470 1471 free_migration(mg); 1472 defer_bios(cache, &bios); 1473 1474 background_work_end(cache); 1475 } 1476 1477 static void invalidate_completed(struct work_struct *ws) 1478 { 1479 struct dm_cache_migration *mg = ws_to_mg(ws); 1480 1481 invalidate_complete(mg, !mg->k.input); 1482 } 1483 1484 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1485 { 1486 int r; 1487 1488 r = policy_invalidate_mapping(cache->policy, cblock); 1489 if (!r) { 1490 r = dm_cache_remove_mapping(cache->cmd, cblock); 1491 if (r) { 1492 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1493 cache_device_name(cache)); 1494 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1495 } 1496 1497 } else if (r == -ENODATA) { 1498 /* 1499 * Harmless, already unmapped. 
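		 * -ENODATA from the policy means there was no mapping for this
		 * cblock, so there is nothing to remove from the metadata either.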
1500 */ 1501 r = 0; 1502 1503 } else 1504 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1505 1506 return r; 1507 } 1508 1509 static void invalidate_remove(struct work_struct *ws) 1510 { 1511 int r; 1512 struct dm_cache_migration *mg = ws_to_mg(ws); 1513 struct cache *cache = mg->cache; 1514 1515 r = invalidate_cblock(cache, mg->invalidate_cblock); 1516 if (r) { 1517 invalidate_complete(mg, false); 1518 return; 1519 } 1520 1521 init_continuation(&mg->k, invalidate_completed); 1522 continue_after_commit(&cache->committer, &mg->k); 1523 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1524 mg->overwrite_bio = NULL; 1525 schedule_commit(&cache->committer); 1526 } 1527 1528 static int invalidate_lock(struct dm_cache_migration *mg) 1529 { 1530 int r; 1531 struct dm_cell_key_v2 key; 1532 struct cache *cache = mg->cache; 1533 struct dm_bio_prison_cell_v2 *prealloc; 1534 1535 prealloc = alloc_prison_cell(cache); 1536 1537 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1538 r = dm_cell_lock_v2(cache->prison, &key, 1539 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1540 if (r < 0) { 1541 free_prison_cell(cache, prealloc); 1542 invalidate_complete(mg, false); 1543 return r; 1544 } 1545 1546 if (mg->cell != prealloc) 1547 free_prison_cell(cache, prealloc); 1548 1549 if (r) 1550 quiesce(mg, invalidate_remove); 1551 1552 else { 1553 /* 1554 * We can't call invalidate_remove() directly here because we 1555 * might still be in request context. 1556 */ 1557 init_continuation(&mg->k, invalidate_remove); 1558 queue_work(cache->wq, &mg->k.ws); 1559 } 1560 1561 return 0; 1562 } 1563 1564 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1565 dm_oblock_t oblock, struct bio *bio) 1566 { 1567 struct dm_cache_migration *mg; 1568 1569 if (!background_work_begin(cache)) 1570 return -EPERM; 1571 1572 mg = alloc_migration(cache); 1573 1574 mg->overwrite_bio = bio; 1575 mg->invalidate_cblock = cblock; 1576 mg->invalidate_oblock = oblock; 1577 1578 return invalidate_lock(mg); 1579 } 1580 1581 /* 1582 *-------------------------------------------------------------- 1583 * bio processing 1584 *-------------------------------------------------------------- 1585 */ 1586 1587 enum busy { 1588 IDLE, 1589 BUSY 1590 }; 1591 1592 static enum busy spare_migration_bandwidth(struct cache *cache) 1593 { 1594 bool idle = dm_iot_idle_for(&cache->tracker, HZ); 1595 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1596 cache->sectors_per_block; 1597 1598 if (idle && current_volume <= cache->migration_threshold) 1599 return IDLE; 1600 else 1601 return BUSY; 1602 } 1603 1604 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1605 { 1606 atomic_inc(bio_data_dir(bio) == READ ? 1607 &cache->stats.read_hit : &cache->stats.write_hit); 1608 } 1609 1610 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1611 { 1612 atomic_inc(bio_data_dir(bio) == READ ? 1613 &cache->stats.read_miss : &cache->stats.write_miss); 1614 } 1615 1616 /*----------------------------------------------------------------*/ 1617 1618 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1619 bool *commit_needed) 1620 { 1621 int r, data_dir; 1622 bool rb, background_queued; 1623 dm_cblock_t cblock; 1624 1625 *commit_needed = false; 1626 1627 rb = bio_detain_shared(cache, block, bio); 1628 if (!rb) { 1629 /* 1630 * An exclusive lock is held for this block, so we have to 1631 * wait. 
We set the commit_needed flag so the current 1632 * transaction will be committed asap, allowing this lock 1633 * to be dropped. 1634 */ 1635 *commit_needed = true; 1636 return DM_MAPIO_SUBMITTED; 1637 } 1638 1639 data_dir = bio_data_dir(bio); 1640 1641 if (optimisable_bio(cache, bio, block)) { 1642 struct policy_work *op = NULL; 1643 1644 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1645 if (unlikely(r && r != -ENOENT)) { 1646 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1647 cache_device_name(cache), r); 1648 bio_io_error(bio); 1649 return DM_MAPIO_SUBMITTED; 1650 } 1651 1652 if (r == -ENOENT && op) { 1653 bio_drop_shared_lock(cache, bio); 1654 BUG_ON(op->op != POLICY_PROMOTE); 1655 mg_start(cache, op, bio); 1656 return DM_MAPIO_SUBMITTED; 1657 } 1658 } else { 1659 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1660 if (unlikely(r && r != -ENOENT)) { 1661 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1662 cache_device_name(cache), r); 1663 bio_io_error(bio); 1664 return DM_MAPIO_SUBMITTED; 1665 } 1666 1667 if (background_queued) 1668 wake_migration_worker(cache); 1669 } 1670 1671 if (r == -ENOENT) { 1672 struct per_bio_data *pb = get_per_bio_data(bio); 1673 1674 /* 1675 * Miss. 1676 */ 1677 inc_miss_counter(cache, bio); 1678 if (pb->req_nr == 0) { 1679 accounted_begin(cache, bio); 1680 remap_to_origin_clear_discard(cache, bio, block); 1681 } else { 1682 /* 1683 * This is a duplicate writethrough io that is no 1684 * longer needed because the block has been demoted. 1685 */ 1686 bio_endio(bio); 1687 return DM_MAPIO_SUBMITTED; 1688 } 1689 } else { 1690 /* 1691 * Hit. 1692 */ 1693 inc_hit_counter(cache, bio); 1694 1695 /* 1696 * Passthrough always maps to the origin, invalidating any 1697 * cache blocks that are written to. 1698 */ 1699 if (passthrough_mode(cache)) { 1700 if (bio_data_dir(bio) == WRITE) { 1701 bio_drop_shared_lock(cache, bio); 1702 atomic_inc(&cache->stats.demotion); 1703 invalidate_start(cache, cblock, block, bio); 1704 } else 1705 remap_to_origin_clear_discard(cache, bio, block); 1706 } else { 1707 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1708 !is_dirty(cache, cblock)) { 1709 remap_to_origin_and_cache(cache, bio, block, cblock); 1710 accounted_begin(cache, bio); 1711 } else 1712 remap_to_cache_dirty(cache, bio, block, cblock); 1713 } 1714 } 1715 1716 /* 1717 * dm core turns FUA requests into a separate payload and FLUSH req. 1718 */ 1719 if (bio->bi_opf & REQ_FUA) { 1720 /* 1721 * issue_after_commit will call accounted_begin a second time. So 1722 * we call accounted_complete() to avoid double accounting. 1723 */ 1724 accounted_complete(cache, bio); 1725 issue_after_commit(&cache->committer, bio); 1726 *commit_needed = true; 1727 return DM_MAPIO_SUBMITTED; 1728 } 1729 1730 return DM_MAPIO_REMAPPED; 1731 } 1732 1733 static bool process_bio(struct cache *cache, struct bio *bio) 1734 { 1735 bool commit_needed; 1736 1737 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1738 dm_submit_bio_remap(bio, NULL); 1739 1740 return commit_needed; 1741 } 1742 1743 /* 1744 * A non-zero return indicates read_only or fail_io mode. 
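 * In those modes commit() refuses to touch the metadata and returns
 * -EINVAL without attempting a commit.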
 */
static int commit(struct cache *cache, bool clean_shutdown)
{
	int r;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return -EINVAL;

	atomic_inc(&cache->stats.commit_count);
	r = dm_cache_commit(cache->cmd, clean_shutdown);
	if (r)
		metadata_operation_failed(cache, "dm_cache_commit", r);

	return r;
}

/*
 * Used by the batcher.
 */
static blk_status_t commit_op(void *context)
{
	struct cache *cache = context;

	if (dm_cache_changed_this_transaction(cache->cmd))
		return errno_to_blk_status(commit(cache, false));

	return 0;
}

/*----------------------------------------------------------------*/

static bool process_flush_bio(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue_after_commit(&cache->committer, bio);
	return true;
}

static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_dblock_t b, e;

	/*
	 * FIXME: do we need to lock the region? Or can we just assume the
	 * user won't be so foolish as to issue discard concurrently with
	 * other IO?
	 */
	calc_discard_block_range(cache, bio, &b, &e);
	while (b != e) {
		set_discard(cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	if (cache->features.discard_passdown) {
		remap_to_origin(cache, bio);
		dm_submit_bio_remap(bio, NULL);
	} else
		bio_endio(bio);

	return false;
}

static void process_deferred_bios(struct work_struct *ws)
{
	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);

	bool commit_needed = false;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irq(&cache->lock);
	bio_list_merge_init(&bios, &cache->deferred_bios);
	spin_unlock_irq(&cache->lock);

	while ((bio = bio_list_pop(&bios))) {
		if (bio->bi_opf & REQ_PREFLUSH)
			commit_needed = process_flush_bio(cache, bio) || commit_needed;

		else if (bio_op(bio) == REQ_OP_DISCARD)
			commit_needed = process_discard_bio(cache, bio) || commit_needed;

		else
			commit_needed = process_bio(cache, bio) || commit_needed;
		cond_resched();
	}

	if (commit_needed)
		schedule_commit(&cache->committer);
}

/*
 *--------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------
 */
static void requeue_deferred_bios(struct cache *cache)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge_init(&bios, &cache->deferred_bios);

	while ((bio = bio_list_pop(&bios))) {
		bio->bi_status = BLK_STS_DM_REQUEUE;
		bio_endio(bio);
		cond_resched();
	}
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
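 * COMMIT_PERIOD is HZ jiffies, i.e. roughly once a second; do_waker()
 * below reschedules itself at that interval.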
1866 */ 1867 static void do_waker(struct work_struct *ws) 1868 { 1869 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1870 1871 policy_tick(cache->policy, true); 1872 wake_migration_worker(cache); 1873 schedule_commit(&cache->committer); 1874 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1875 } 1876 1877 static void check_migrations(struct work_struct *ws) 1878 { 1879 int r; 1880 struct policy_work *op; 1881 struct cache *cache = container_of(ws, struct cache, migration_worker); 1882 enum busy b; 1883 1884 for (;;) { 1885 b = spare_migration_bandwidth(cache); 1886 1887 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1888 if (r == -ENODATA) 1889 break; 1890 1891 if (r) { 1892 DMERR_LIMIT("%s: policy_background_work failed", 1893 cache_device_name(cache)); 1894 break; 1895 } 1896 1897 r = mg_start(cache, op, NULL); 1898 if (r) 1899 break; 1900 1901 cond_resched(); 1902 } 1903 } 1904 1905 /* 1906 *-------------------------------------------------------------- 1907 * Target methods 1908 *-------------------------------------------------------------- 1909 */ 1910 1911 /* 1912 * This function gets called on the error paths of the constructor, so we 1913 * have to cope with a partially initialised struct. 1914 */ 1915 static void __destroy(struct cache *cache) 1916 { 1917 mempool_exit(&cache->migration_pool); 1918 1919 if (cache->prison) 1920 dm_bio_prison_destroy_v2(cache->prison); 1921 1922 if (cache->wq) 1923 destroy_workqueue(cache->wq); 1924 1925 if (cache->dirty_bitset) 1926 free_bitset(cache->dirty_bitset); 1927 1928 if (cache->discard_bitset) 1929 free_bitset(cache->discard_bitset); 1930 1931 if (cache->invalid_bitset) 1932 free_bitset(cache->invalid_bitset); 1933 1934 if (cache->copier) 1935 dm_kcopyd_client_destroy(cache->copier); 1936 1937 if (cache->cmd) 1938 dm_cache_metadata_close(cache->cmd); 1939 1940 if (cache->metadata_dev) 1941 dm_put_device(cache->ti, cache->metadata_dev); 1942 1943 if (cache->origin_dev) 1944 dm_put_device(cache->ti, cache->origin_dev); 1945 1946 if (cache->cache_dev) 1947 dm_put_device(cache->ti, cache->cache_dev); 1948 1949 if (cache->policy) 1950 dm_cache_policy_destroy(cache->policy); 1951 1952 bioset_exit(&cache->bs); 1953 1954 kfree(cache); 1955 } 1956 1957 static void destroy(struct cache *cache) 1958 { 1959 unsigned int i; 1960 1961 cancel_delayed_work_sync(&cache->waker); 1962 1963 for (i = 0; i < cache->nr_ctr_args ; i++) 1964 kfree(cache->ctr_args[i]); 1965 kfree(cache->ctr_args); 1966 1967 __destroy(cache); 1968 } 1969 1970 static void cache_dtr(struct dm_target *ti) 1971 { 1972 struct cache *cache = ti->private; 1973 1974 destroy(cache); 1975 } 1976 1977 static sector_t get_dev_size(struct dm_dev *dev) 1978 { 1979 return bdev_nr_sectors(dev->bdev); 1980 } 1981 1982 /*----------------------------------------------------------------*/ 1983 1984 /* 1985 * Construct a cache device mapping. 1986 * 1987 * cache <metadata dev> <cache dev> <origin dev> <block size> 1988 * <#feature args> [<feature arg>]* 1989 * <policy> <#policy args> [<policy arg>]* 1990 * 1991 * metadata dev : fast device holding the persistent metadata 1992 * cache dev : fast device holding cached data blocks 1993 * origin dev : slow device holding original data blocks 1994 * block size : cache unit size in sectors 1995 * 1996 * #feature args : number of feature arguments passed 1997 * feature args : writethrough. (The default is writeback.) 
1998 * 1999 * policy : the replacement policy to use 2000 * #policy args : an even number of policy arguments corresponding 2001 * to key/value pairs passed to the policy 2002 * policy args : key/value pairs passed to the policy 2003 * E.g. 'sequential_threshold 1024' 2004 * See cache-policies.txt for details. 2005 * 2006 * Optional feature arguments are: 2007 * writethrough : write through caching that prohibits cache block 2008 * content from being different from origin block content. 2009 * Without this argument, the default behaviour is to write 2010 * back cache block contents later for performance reasons, 2011 * so they may differ from the corresponding origin blocks. 2012 */ 2013 struct cache_args { 2014 struct dm_target *ti; 2015 2016 struct dm_dev *metadata_dev; 2017 2018 struct dm_dev *cache_dev; 2019 sector_t cache_sectors; 2020 2021 struct dm_dev *origin_dev; 2022 2023 uint32_t block_size; 2024 2025 const char *policy_name; 2026 int policy_argc; 2027 const char **policy_argv; 2028 2029 struct cache_features features; 2030 }; 2031 2032 static void destroy_cache_args(struct cache_args *ca) 2033 { 2034 if (ca->metadata_dev) 2035 dm_put_device(ca->ti, ca->metadata_dev); 2036 2037 if (ca->cache_dev) 2038 dm_put_device(ca->ti, ca->cache_dev); 2039 2040 if (ca->origin_dev) 2041 dm_put_device(ca->ti, ca->origin_dev); 2042 2043 kfree(ca); 2044 } 2045 2046 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2047 { 2048 if (!as->argc) { 2049 *error = "Insufficient args"; 2050 return false; 2051 } 2052 2053 return true; 2054 } 2055 2056 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2057 char **error) 2058 { 2059 int r; 2060 sector_t metadata_dev_size; 2061 2062 if (!at_least_one_arg(as, error)) 2063 return -EINVAL; 2064 2065 r = dm_get_device(ca->ti, dm_shift_arg(as), 2066 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev); 2067 if (r) { 2068 *error = "Error opening metadata device"; 2069 return r; 2070 } 2071 2072 metadata_dev_size = get_dev_size(ca->metadata_dev); 2073 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2074 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", 2075 ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS); 2076 2077 return 0; 2078 } 2079 2080 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2081 char **error) 2082 { 2083 int r; 2084 2085 if (!at_least_one_arg(as, error)) 2086 return -EINVAL; 2087 2088 r = dm_get_device(ca->ti, dm_shift_arg(as), 2089 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev); 2090 if (r) { 2091 *error = "Error opening cache device"; 2092 return r; 2093 } 2094 ca->cache_sectors = get_dev_size(ca->cache_dev); 2095 2096 return 0; 2097 } 2098 2099 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2100 char **error) 2101 { 2102 sector_t origin_sectors; 2103 int r; 2104 2105 if (!at_least_one_arg(as, error)) 2106 return -EINVAL; 2107 2108 r = dm_get_device(ca->ti, dm_shift_arg(as), 2109 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev); 2110 if (r) { 2111 *error = "Error opening origin device"; 2112 return r; 2113 } 2114 2115 origin_sectors = get_dev_size(ca->origin_dev); 2116 if (ca->ti->len > origin_sectors) { 2117 *error = "Device size larger than cached device"; 2118 return -EINVAL; 2119 } 2120 2121 return 0; 2122 } 2123 2124 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2125 char **error) 2126 { 2127 unsigned long block_size; 2128 2129 if (!at_least_one_arg(as, error)) 2130 
return -EINVAL; 2131 2132 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2133 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2134 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2135 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2136 *error = "Invalid data block size"; 2137 return -EINVAL; 2138 } 2139 2140 if (block_size > ca->cache_sectors) { 2141 *error = "Data block size is larger than the cache device"; 2142 return -EINVAL; 2143 } 2144 2145 ca->block_size = block_size; 2146 2147 return 0; 2148 } 2149 2150 static void init_features(struct cache_features *cf) 2151 { 2152 cf->mode = CM_WRITE; 2153 cf->io_mode = CM_IO_WRITEBACK; 2154 cf->metadata_version = 1; 2155 cf->discard_passdown = true; 2156 } 2157 2158 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2159 char **error) 2160 { 2161 static const struct dm_arg _args[] = { 2162 {0, 3, "Invalid number of cache feature arguments"}, 2163 }; 2164 2165 int r, mode_ctr = 0; 2166 unsigned int argc; 2167 const char *arg; 2168 struct cache_features *cf = &ca->features; 2169 2170 init_features(cf); 2171 2172 r = dm_read_arg_group(_args, as, &argc, error); 2173 if (r) 2174 return -EINVAL; 2175 2176 while (argc--) { 2177 arg = dm_shift_arg(as); 2178 2179 if (!strcasecmp(arg, "writeback")) { 2180 cf->io_mode = CM_IO_WRITEBACK; 2181 mode_ctr++; 2182 } 2183 2184 else if (!strcasecmp(arg, "writethrough")) { 2185 cf->io_mode = CM_IO_WRITETHROUGH; 2186 mode_ctr++; 2187 } 2188 2189 else if (!strcasecmp(arg, "passthrough")) { 2190 cf->io_mode = CM_IO_PASSTHROUGH; 2191 mode_ctr++; 2192 } 2193 2194 else if (!strcasecmp(arg, "metadata2")) 2195 cf->metadata_version = 2; 2196 2197 else if (!strcasecmp(arg, "no_discard_passdown")) 2198 cf->discard_passdown = false; 2199 2200 else { 2201 *error = "Unrecognised cache feature requested"; 2202 return -EINVAL; 2203 } 2204 } 2205 2206 if (mode_ctr > 1) { 2207 *error = "Duplicate cache io_mode features requested"; 2208 return -EINVAL; 2209 } 2210 2211 return 0; 2212 } 2213 2214 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2215 char **error) 2216 { 2217 static const struct dm_arg _args[] = { 2218 {0, 1024, "Invalid number of policy arguments"}, 2219 }; 2220 2221 int r; 2222 2223 if (!at_least_one_arg(as, error)) 2224 return -EINVAL; 2225 2226 ca->policy_name = dm_shift_arg(as); 2227 2228 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2229 if (r) 2230 return -EINVAL; 2231 2232 ca->policy_argv = (const char **)as->argv; 2233 dm_consume_args(as, ca->policy_argc); 2234 2235 return 0; 2236 } 2237 2238 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2239 char **error) 2240 { 2241 int r; 2242 struct dm_arg_set as; 2243 2244 as.argc = argc; 2245 as.argv = argv; 2246 2247 r = parse_metadata_dev(ca, &as, error); 2248 if (r) 2249 return r; 2250 2251 r = parse_cache_dev(ca, &as, error); 2252 if (r) 2253 return r; 2254 2255 r = parse_origin_dev(ca, &as, error); 2256 if (r) 2257 return r; 2258 2259 r = parse_block_size(ca, &as, error); 2260 if (r) 2261 return r; 2262 2263 r = parse_features(ca, &as, error); 2264 if (r) 2265 return r; 2266 2267 r = parse_policy(ca, &as, error); 2268 if (r) 2269 return r; 2270 2271 return 0; 2272 } 2273 2274 /*----------------------------------------------------------------*/ 2275 2276 static struct kmem_cache *migration_cache = NULL; 2277 2278 #define NOT_CORE_OPTION 1 2279 2280 static int process_config_option(struct cache *cache, const char *key, const char *value) 2281 { 2282 unsigned 
long tmp; 2283 2284 if (!strcasecmp(key, "migration_threshold")) { 2285 if (kstrtoul(value, 10, &tmp)) 2286 return -EINVAL; 2287 2288 cache->migration_threshold = tmp; 2289 return 0; 2290 } 2291 2292 return NOT_CORE_OPTION; 2293 } 2294 2295 static int set_config_value(struct cache *cache, const char *key, const char *value) 2296 { 2297 int r = process_config_option(cache, key, value); 2298 2299 if (r == NOT_CORE_OPTION) 2300 r = policy_set_config_value(cache->policy, key, value); 2301 2302 if (r) 2303 DMWARN("bad config value for %s: %s", key, value); 2304 2305 return r; 2306 } 2307 2308 static int set_config_values(struct cache *cache, int argc, const char **argv) 2309 { 2310 int r = 0; 2311 2312 if (argc & 1) { 2313 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2314 return -EINVAL; 2315 } 2316 2317 while (argc) { 2318 r = set_config_value(cache, argv[0], argv[1]); 2319 if (r) 2320 break; 2321 2322 argc -= 2; 2323 argv += 2; 2324 } 2325 2326 return r; 2327 } 2328 2329 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2330 char **error) 2331 { 2332 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2333 cache->cache_size, 2334 cache->origin_sectors, 2335 cache->sectors_per_block); 2336 if (IS_ERR(p)) { 2337 *error = "Error creating cache's policy"; 2338 return PTR_ERR(p); 2339 } 2340 cache->policy = p; 2341 BUG_ON(!cache->policy); 2342 2343 return 0; 2344 } 2345 2346 /* 2347 * We want the discard block size to be at least the size of the cache 2348 * block size and have no more than 2^14 discard blocks across the origin. 2349 */ 2350 #define MAX_DISCARD_BLOCKS (1 << 14) 2351 2352 static bool too_many_discard_blocks(sector_t discard_block_size, 2353 sector_t origin_size) 2354 { 2355 (void) sector_div(origin_size, discard_block_size); 2356 2357 return origin_size > MAX_DISCARD_BLOCKS; 2358 } 2359 2360 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2361 sector_t origin_size) 2362 { 2363 sector_t discard_block_size = cache_block_size; 2364 2365 if (origin_size) 2366 while (too_many_discard_blocks(discard_block_size, origin_size)) 2367 discard_block_size *= 2; 2368 2369 return discard_block_size; 2370 } 2371 2372 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2373 { 2374 dm_block_t nr_blocks = from_cblock(size); 2375 2376 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2377 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2378 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2379 "Please consider increasing the cache block size to reduce the overall cache block count.", 2380 (unsigned long long) nr_blocks); 2381 2382 cache->cache_size = size; 2383 } 2384 2385 #define DEFAULT_MIGRATION_THRESHOLD 2048 2386 2387 static int cache_create(struct cache_args *ca, struct cache **result) 2388 { 2389 int r = 0; 2390 char **error = &ca->ti->error; 2391 struct cache *cache; 2392 struct dm_target *ti = ca->ti; 2393 dm_block_t origin_blocks; 2394 struct dm_cache_metadata *cmd; 2395 bool may_format = ca->features.mode == CM_WRITE; 2396 2397 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2398 if (!cache) 2399 return -ENOMEM; 2400 2401 cache->ti = ca->ti; 2402 ti->private = cache; 2403 ti->accounts_remapped_io = true; 2404 ti->num_flush_bios = 2; 2405 ti->flush_supported = true; 2406 2407 ti->num_discard_bios = 1; 2408 ti->discards_supported = true; 2409 2410 ti->per_io_data_size = 
sizeof(struct per_bio_data); 2411 2412 cache->features = ca->features; 2413 if (writethrough_mode(cache)) { 2414 /* Create bioset for writethrough bios issued to origin */ 2415 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2416 if (r) 2417 goto bad; 2418 } 2419 2420 cache->metadata_dev = ca->metadata_dev; 2421 cache->origin_dev = ca->origin_dev; 2422 cache->cache_dev = ca->cache_dev; 2423 2424 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2425 2426 origin_blocks = cache->origin_sectors = ti->len; 2427 origin_blocks = block_div(origin_blocks, ca->block_size); 2428 cache->origin_blocks = to_oblock(origin_blocks); 2429 2430 cache->sectors_per_block = ca->block_size; 2431 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2432 r = -EINVAL; 2433 goto bad; 2434 } 2435 2436 if (ca->block_size & (ca->block_size - 1)) { 2437 dm_block_t cache_size = ca->cache_sectors; 2438 2439 cache->sectors_per_block_shift = -1; 2440 cache_size = block_div(cache_size, ca->block_size); 2441 set_cache_size(cache, to_cblock(cache_size)); 2442 } else { 2443 cache->sectors_per_block_shift = __ffs(ca->block_size); 2444 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2445 } 2446 2447 r = create_cache_policy(cache, ca, error); 2448 if (r) 2449 goto bad; 2450 2451 cache->policy_nr_args = ca->policy_argc; 2452 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2453 2454 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2455 if (r) { 2456 *error = "Error setting cache policy's config values"; 2457 goto bad; 2458 } 2459 2460 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2461 ca->block_size, may_format, 2462 dm_cache_policy_get_hint_size(cache->policy), 2463 ca->features.metadata_version); 2464 if (IS_ERR(cmd)) { 2465 *error = "Error creating metadata object"; 2466 r = PTR_ERR(cmd); 2467 goto bad; 2468 } 2469 cache->cmd = cmd; 2470 set_cache_mode(cache, CM_WRITE); 2471 if (get_cache_mode(cache) != CM_WRITE) { 2472 *error = "Unable to get write access to metadata, please check/repair metadata."; 2473 r = -EINVAL; 2474 goto bad; 2475 } 2476 2477 if (passthrough_mode(cache)) { 2478 bool all_clean; 2479 2480 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2481 if (r) { 2482 *error = "dm_cache_metadata_all_clean() failed"; 2483 goto bad; 2484 } 2485 2486 if (!all_clean) { 2487 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2488 r = -EINVAL; 2489 goto bad; 2490 } 2491 2492 policy_allow_migrations(cache->policy, false); 2493 } 2494 2495 spin_lock_init(&cache->lock); 2496 bio_list_init(&cache->deferred_bios); 2497 atomic_set(&cache->nr_allocated_migrations, 0); 2498 atomic_set(&cache->nr_io_migrations, 0); 2499 init_waitqueue_head(&cache->migration_wait); 2500 2501 r = -ENOMEM; 2502 atomic_set(&cache->nr_dirty, 0); 2503 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2504 if (!cache->dirty_bitset) { 2505 *error = "could not allocate dirty bitset"; 2506 goto bad; 2507 } 2508 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2509 2510 cache->discard_block_size = 2511 calculate_discard_block_size(cache->sectors_per_block, 2512 cache->origin_sectors); 2513 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2514 cache->discard_block_size)); 2515 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2516 if (!cache->discard_bitset) { 2517 *error = "could not allocate discard bitset"; 2518 goto bad; 2519 } 2520 
clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2521 2522 cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2523 if (!cache->invalid_bitset) { 2524 *error = "could not allocate bitset for invalid blocks"; 2525 goto bad; 2526 } 2527 clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); 2528 2529 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2530 if (IS_ERR(cache->copier)) { 2531 *error = "could not create kcopyd client"; 2532 r = PTR_ERR(cache->copier); 2533 goto bad; 2534 } 2535 2536 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2537 if (!cache->wq) { 2538 *error = "could not create workqueue for metadata object"; 2539 goto bad; 2540 } 2541 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2542 INIT_WORK(&cache->migration_worker, check_migrations); 2543 INIT_DELAYED_WORK(&cache->waker, do_waker); 2544 2545 cache->prison = dm_bio_prison_create_v2(cache->wq); 2546 if (!cache->prison) { 2547 *error = "could not create bio prison"; 2548 goto bad; 2549 } 2550 2551 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2552 migration_cache); 2553 if (r) { 2554 *error = "Error creating cache's migration mempool"; 2555 goto bad; 2556 } 2557 2558 cache->need_tick_bio = true; 2559 cache->sized = false; 2560 cache->invalidate = false; 2561 cache->commit_requested = false; 2562 cache->loaded_mappings = false; 2563 cache->loaded_discards = false; 2564 2565 load_stats(cache); 2566 2567 atomic_set(&cache->stats.demotion, 0); 2568 atomic_set(&cache->stats.promotion, 0); 2569 atomic_set(&cache->stats.copies_avoided, 0); 2570 atomic_set(&cache->stats.cache_cell_clash, 0); 2571 atomic_set(&cache->stats.commit_count, 0); 2572 atomic_set(&cache->stats.discard_count, 0); 2573 2574 spin_lock_init(&cache->invalidation_lock); 2575 INIT_LIST_HEAD(&cache->invalidation_requests); 2576 2577 batcher_init(&cache->committer, commit_op, cache, 2578 issue_op, cache, cache->wq); 2579 dm_iot_init(&cache->tracker); 2580 2581 init_rwsem(&cache->background_work_lock); 2582 prevent_background_work(cache); 2583 2584 *result = cache; 2585 return 0; 2586 bad: 2587 __destroy(cache); 2588 return r; 2589 } 2590 2591 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2592 { 2593 unsigned int i; 2594 const char **copy; 2595 2596 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2597 if (!copy) 2598 return -ENOMEM; 2599 for (i = 0; i < argc; i++) { 2600 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2601 if (!copy[i]) { 2602 while (i--) 2603 kfree(copy[i]); 2604 kfree(copy); 2605 return -ENOMEM; 2606 } 2607 } 2608 2609 cache->nr_ctr_args = argc; 2610 cache->ctr_args = copy; 2611 2612 return 0; 2613 } 2614 2615 static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) 2616 { 2617 int r = -EINVAL; 2618 struct cache_args *ca; 2619 struct cache *cache = NULL; 2620 2621 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2622 if (!ca) { 2623 ti->error = "Error allocating memory for cache"; 2624 return -ENOMEM; 2625 } 2626 ca->ti = ti; 2627 2628 r = parse_cache_args(ca, argc, argv, &ti->error); 2629 if (r) 2630 goto out; 2631 2632 r = cache_create(ca, &cache); 2633 if (r) 2634 goto out; 2635 2636 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2637 if (r) { 2638 __destroy(cache); 2639 goto out; 2640 } 2641 2642 ti->private = cache; 2643 out: 2644 destroy_cache_args(ca); 2645 return r; 2646 } 2647 2648 /*----------------------------------------------------------------*/ 2649 2650 
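/*
 * Illustrative example only: a table line that cache_ctr() above would
 * accept.  All device paths and sizes are made up; it assumes a 20GiB
 * origin, 128-sector (64KiB) cache blocks, the writethrough feature and
 * the smq policy with no policy arguments:
 *
 *   dmsetup create cached --table \
 *     "0 41943040 cache /dev/fast-meta /dev/fast /dev/slow 128 1 writethrough smq 0"
 */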
static int cache_map(struct dm_target *ti, struct bio *bio) 2651 { 2652 struct cache *cache = ti->private; 2653 2654 int r; 2655 bool commit_needed; 2656 dm_oblock_t block = get_bio_block(cache, bio); 2657 2658 init_per_bio_data(bio); 2659 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2660 /* 2661 * This can only occur if the io goes to a partial block at 2662 * the end of the origin device. We don't cache these. 2663 * Just remap to the origin and carry on. 2664 */ 2665 remap_to_origin(cache, bio); 2666 accounted_begin(cache, bio); 2667 return DM_MAPIO_REMAPPED; 2668 } 2669 2670 if (discard_or_flush(bio)) { 2671 defer_bio(cache, bio); 2672 return DM_MAPIO_SUBMITTED; 2673 } 2674 2675 r = map_bio(cache, bio, block, &commit_needed); 2676 if (commit_needed) 2677 schedule_commit(&cache->committer); 2678 2679 return r; 2680 } 2681 2682 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2683 { 2684 struct cache *cache = ti->private; 2685 unsigned long flags; 2686 struct per_bio_data *pb = get_per_bio_data(bio); 2687 2688 if (pb->tick) { 2689 policy_tick(cache->policy, false); 2690 2691 spin_lock_irqsave(&cache->lock, flags); 2692 cache->need_tick_bio = true; 2693 spin_unlock_irqrestore(&cache->lock, flags); 2694 } 2695 2696 bio_drop_shared_lock(cache, bio); 2697 accounted_complete(cache, bio); 2698 2699 return DM_ENDIO_DONE; 2700 } 2701 2702 static int write_dirty_bitset(struct cache *cache) 2703 { 2704 int r; 2705 2706 if (get_cache_mode(cache) >= CM_READ_ONLY) 2707 return -EINVAL; 2708 2709 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2710 if (r) 2711 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2712 2713 return r; 2714 } 2715 2716 static int write_discard_bitset(struct cache *cache) 2717 { 2718 unsigned int i, r; 2719 2720 if (get_cache_mode(cache) >= CM_READ_ONLY) 2721 return -EINVAL; 2722 2723 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2724 cache->discard_nr_blocks); 2725 if (r) { 2726 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2727 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2728 return r; 2729 } 2730 2731 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2732 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2733 is_discarded(cache, to_dblock(i))); 2734 if (r) { 2735 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2736 return r; 2737 } 2738 } 2739 2740 return 0; 2741 } 2742 2743 static int write_hints(struct cache *cache) 2744 { 2745 int r; 2746 2747 if (get_cache_mode(cache) >= CM_READ_ONLY) 2748 return -EINVAL; 2749 2750 r = dm_cache_write_hints(cache->cmd, cache->policy); 2751 if (r) { 2752 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2753 return r; 2754 } 2755 2756 return 0; 2757 } 2758 2759 /* 2760 * returns true on success 2761 */ 2762 static bool sync_metadata(struct cache *cache) 2763 { 2764 int r1, r2, r3, r4; 2765 2766 r1 = write_dirty_bitset(cache); 2767 if (r1) 2768 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2769 2770 r2 = write_discard_bitset(cache); 2771 if (r2) 2772 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2773 2774 save_stats(cache); 2775 2776 r3 = write_hints(cache); 2777 if (r3) 2778 DMERR("%s: could not write hints", cache_device_name(cache)); 2779 2780 /* 2781 * If writing the above metadata failed, we still commit, but don't 2782 * set the clean 
shutdown flag. This will effectively force every 2783 * dirty bit to be set on reload. 2784 */ 2785 r4 = commit(cache, !r1 && !r2 && !r3); 2786 if (r4) 2787 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2788 2789 return !r1 && !r2 && !r3 && !r4; 2790 } 2791 2792 static void cache_postsuspend(struct dm_target *ti) 2793 { 2794 struct cache *cache = ti->private; 2795 2796 prevent_background_work(cache); 2797 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2798 2799 cancel_delayed_work_sync(&cache->waker); 2800 drain_workqueue(cache->wq); 2801 WARN_ON(cache->tracker.in_flight); 2802 2803 /* 2804 * If it's a flush suspend there won't be any deferred bios, so this 2805 * call is harmless. 2806 */ 2807 requeue_deferred_bios(cache); 2808 2809 if (get_cache_mode(cache) == CM_WRITE) 2810 (void) sync_metadata(cache); 2811 } 2812 2813 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2814 bool dirty, uint32_t hint, bool hint_valid) 2815 { 2816 struct cache *cache = context; 2817 2818 if (dirty) { 2819 set_bit(from_cblock(cblock), cache->dirty_bitset); 2820 atomic_inc(&cache->nr_dirty); 2821 } else 2822 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2823 2824 return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2825 } 2826 2827 static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2828 bool dirty, uint32_t hint, bool hint_valid) 2829 { 2830 struct cache *cache = context; 2831 2832 if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { 2833 if (dirty) { 2834 DMERR("%s: unable to shrink origin; cache block %u is dirty", 2835 cache_device_name(cache), from_cblock(cblock)); 2836 return -EFBIG; 2837 } 2838 set_bit(from_cblock(cblock), cache->invalid_bitset); 2839 return 0; 2840 } 2841 2842 return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); 2843 } 2844 2845 /* 2846 * The discard block size in the on disk metadata is not 2847 * necessarily the same as we're currently using. So we have to 2848 * be careful to only set the discarded attribute if we know it 2849 * covers a complete block of the new size. 2850 */ 2851 struct discard_load_info { 2852 struct cache *cache; 2853 2854 /* 2855 * These blocks are sized using the on disk dblock size, rather 2856 * than the current one. 2857 */ 2858 dm_block_t block_size; 2859 dm_block_t discard_begin, discard_end; 2860 }; 2861 2862 static void discard_load_info_init(struct cache *cache, 2863 struct discard_load_info *li) 2864 { 2865 li->cache = cache; 2866 li->discard_begin = li->discard_end = 0; 2867 } 2868 2869 static void set_discard_range(struct discard_load_info *li) 2870 { 2871 sector_t b, e; 2872 2873 if (li->discard_begin == li->discard_end) 2874 return; 2875 2876 /* 2877 * Convert to sectors. 2878 */ 2879 b = li->discard_begin * li->block_size; 2880 e = li->discard_end * li->block_size; 2881 2882 /* 2883 * Then convert back to the current dblock size. 2884 */ 2885 b = dm_sector_div_up(b, li->cache->discard_block_size); 2886 sector_div(e, li->cache->discard_block_size); 2887 2888 /* 2889 * The origin may have shrunk, so we need to check we're still in 2890 * bounds. 
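 * bounds.
 *
 * Worked example (made-up numbers): an on-disk dblock size of 128 sectors
 * and a loaded range of [2, 4) covers sectors [256, 512).  With a current
 * discard_block_size of 192 sectors the conversion above rounds inwards to
 * dblocks [2, 2), i.e. nothing is marked discarded, because neither
 * new-sized block is completely covered.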
2891 */ 2892 if (e > from_dblock(li->cache->discard_nr_blocks)) 2893 e = from_dblock(li->cache->discard_nr_blocks); 2894 2895 for (; b < e; b++) 2896 set_discard(li->cache, to_dblock(b)); 2897 } 2898 2899 static int load_discard(void *context, sector_t discard_block_size, 2900 dm_dblock_t dblock, bool discard) 2901 { 2902 struct discard_load_info *li = context; 2903 2904 li->block_size = discard_block_size; 2905 2906 if (discard) { 2907 if (from_dblock(dblock) == li->discard_end) 2908 /* 2909 * We're already in a discard range, just extend it. 2910 */ 2911 li->discard_end = li->discard_end + 1ULL; 2912 2913 else { 2914 /* 2915 * Emit the old range and start a new one. 2916 */ 2917 set_discard_range(li); 2918 li->discard_begin = from_dblock(dblock); 2919 li->discard_end = li->discard_begin + 1ULL; 2920 } 2921 } else { 2922 set_discard_range(li); 2923 li->discard_begin = li->discard_end = 0; 2924 } 2925 2926 return 0; 2927 } 2928 2929 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2930 { 2931 sector_t size = get_dev_size(cache->cache_dev); 2932 (void) sector_div(size, cache->sectors_per_block); 2933 return to_cblock(size); 2934 } 2935 2936 static bool can_resume(struct cache *cache) 2937 { 2938 /* 2939 * Disallow retrying the resume operation for devices that failed the 2940 * first resume attempt, as the failure leaves the policy object partially 2941 * initialized. Retrying could trigger BUG_ON when loading cache mappings 2942 * into the incomplete policy object. 2943 */ 2944 if (cache->sized && !cache->loaded_mappings) { 2945 if (get_cache_mode(cache) != CM_WRITE) 2946 DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", 2947 cache_device_name(cache)); 2948 else 2949 DMERR("%s: unable to resume cache due to missing proper cache table reload", 2950 cache_device_name(cache)); 2951 return false; 2952 } 2953 2954 return true; 2955 } 2956 2957 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2958 { 2959 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { 2960 DMERR("%s: unable to extend cache due to missing cache table reload", 2961 cache_device_name(cache)); 2962 return false; 2963 } 2964 2965 /* 2966 * We can't drop a dirty block when shrinking the cache. 
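 *
 * For example (illustrative numbers): shrinking from 1000 to 800 cblocks
 * is refused if any cblock in [800, 1000) still has its dirty bit set;
 * the find_next_bit() call below locates the first such block so it can
 * be reported.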
2967 */ 2968 if (cache->loaded_mappings) { 2969 new_size = to_cblock(find_next_bit(cache->dirty_bitset, 2970 from_cblock(cache->cache_size), 2971 from_cblock(new_size))); 2972 if (new_size != cache->cache_size) { 2973 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 2974 cache_device_name(cache), 2975 (unsigned long long) from_cblock(new_size)); 2976 return false; 2977 } 2978 } 2979 2980 return true; 2981 } 2982 2983 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 2984 { 2985 int r; 2986 2987 r = dm_cache_resize(cache->cmd, new_size); 2988 if (r) { 2989 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 2990 metadata_operation_failed(cache, "dm_cache_resize", r); 2991 return r; 2992 } 2993 2994 set_cache_size(cache, new_size); 2995 2996 return 0; 2997 } 2998 2999 static int truncate_oblocks(struct cache *cache) 3000 { 3001 uint32_t nr_blocks = from_cblock(cache->cache_size); 3002 uint32_t i; 3003 int r; 3004 3005 for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { 3006 r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); 3007 if (r) { 3008 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 3009 cache_device_name(cache)); 3010 return r; 3011 } 3012 } 3013 3014 return 0; 3015 } 3016 3017 static int cache_preresume(struct dm_target *ti) 3018 { 3019 int r = 0; 3020 struct cache *cache = ti->private; 3021 dm_cblock_t csize = get_cache_dev_size(cache); 3022 3023 if (!can_resume(cache)) 3024 return -EINVAL; 3025 3026 /* 3027 * Check to see if the cache has resized. 3028 */ 3029 if (!cache->sized || csize != cache->cache_size) { 3030 if (!can_resize(cache, csize)) 3031 return -EINVAL; 3032 3033 r = resize_cache_dev(cache, csize); 3034 if (r) 3035 return r; 3036 3037 cache->sized = true; 3038 } 3039 3040 if (!cache->loaded_mappings) { 3041 /* 3042 * The fast device could have been resized since the last 3043 * failed preresume attempt. To be safe we start by a blank 3044 * bitset for cache blocks. 3045 */ 3046 clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); 3047 3048 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3049 load_filtered_mapping, cache); 3050 if (r) { 3051 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3052 if (r != -EFBIG) 3053 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3054 return r; 3055 } 3056 3057 r = truncate_oblocks(cache); 3058 if (r) { 3059 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 3060 return r; 3061 } 3062 3063 cache->loaded_mappings = true; 3064 } 3065 3066 if (!cache->loaded_discards) { 3067 struct discard_load_info li; 3068 3069 /* 3070 * The discard bitset could have been resized, or the 3071 * discard block size changed. To be safe we start by 3072 * setting every dblock to not discarded. 
3073 */ 3074 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3075 3076 discard_load_info_init(cache, &li); 3077 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3078 if (r) { 3079 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3080 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3081 return r; 3082 } 3083 set_discard_range(&li); 3084 3085 cache->loaded_discards = true; 3086 } 3087 3088 return r; 3089 } 3090 3091 static void cache_resume(struct dm_target *ti) 3092 { 3093 struct cache *cache = ti->private; 3094 3095 cache->need_tick_bio = true; 3096 allow_background_work(cache); 3097 do_waker(&cache->waker.work); 3098 } 3099 3100 static void emit_flags(struct cache *cache, char *result, 3101 unsigned int maxlen, ssize_t *sz_ptr) 3102 { 3103 ssize_t sz = *sz_ptr; 3104 struct cache_features *cf = &cache->features; 3105 unsigned int count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3106 3107 DMEMIT("%u ", count); 3108 3109 if (cf->metadata_version == 2) 3110 DMEMIT("metadata2 "); 3111 3112 if (writethrough_mode(cache)) 3113 DMEMIT("writethrough "); 3114 3115 else if (passthrough_mode(cache)) 3116 DMEMIT("passthrough "); 3117 3118 else if (writeback_mode(cache)) 3119 DMEMIT("writeback "); 3120 3121 else { 3122 DMEMIT("unknown "); 3123 DMERR("%s: internal error: unknown io mode: %d", 3124 cache_device_name(cache), (int) cf->io_mode); 3125 } 3126 3127 if (!cf->discard_passdown) 3128 DMEMIT("no_discard_passdown "); 3129 3130 *sz_ptr = sz; 3131 } 3132 3133 /* 3134 * Status format: 3135 * 3136 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3137 * <cache block size> <#used cache blocks>/<#total cache blocks> 3138 * <#read hits> <#read misses> <#write hits> <#write misses> 3139 * <#demotions> <#promotions> <#dirty> 3140 * <#features> <features>* 3141 * <#core args> <core args> 3142 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3143 */ 3144 static void cache_status(struct dm_target *ti, status_type_t type, 3145 unsigned int status_flags, char *result, unsigned int maxlen) 3146 { 3147 int r = 0; 3148 unsigned int i; 3149 ssize_t sz = 0; 3150 dm_block_t nr_free_blocks_metadata = 0; 3151 dm_block_t nr_blocks_metadata = 0; 3152 char buf[BDEVNAME_SIZE]; 3153 struct cache *cache = ti->private; 3154 dm_cblock_t residency; 3155 bool needs_check; 3156 3157 switch (type) { 3158 case STATUSTYPE_INFO: 3159 if (get_cache_mode(cache) == CM_FAIL) { 3160 DMEMIT("Fail"); 3161 break; 3162 } 3163 3164 /* Commit to ensure statistics aren't out-of-date */ 3165 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3166 (void) commit(cache, false); 3167 3168 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3169 if (r) { 3170 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3171 cache_device_name(cache), r); 3172 goto err; 3173 } 3174 3175 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3176 if (r) { 3177 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3178 cache_device_name(cache), r); 3179 goto err; 3180 } 3181 3182 residency = policy_residency(cache->policy); 3183 3184 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3185 (unsigned int)DM_CACHE_METADATA_BLOCK_SIZE, 3186 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3187 (unsigned long long)nr_blocks_metadata, 3188 (unsigned long long)cache->sectors_per_block, 3189 (unsigned long long) 
from_cblock(residency), 3190 (unsigned long long) from_cblock(cache->cache_size), 3191 (unsigned int) atomic_read(&cache->stats.read_hit), 3192 (unsigned int) atomic_read(&cache->stats.read_miss), 3193 (unsigned int) atomic_read(&cache->stats.write_hit), 3194 (unsigned int) atomic_read(&cache->stats.write_miss), 3195 (unsigned int) atomic_read(&cache->stats.demotion), 3196 (unsigned int) atomic_read(&cache->stats.promotion), 3197 (unsigned long) atomic_read(&cache->nr_dirty)); 3198 3199 emit_flags(cache, result, maxlen, &sz); 3200 3201 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3202 3203 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3204 if (sz < maxlen) { 3205 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3206 if (r) 3207 DMERR("%s: policy_emit_config_values returned %d", 3208 cache_device_name(cache), r); 3209 } 3210 3211 if (get_cache_mode(cache) == CM_READ_ONLY) 3212 DMEMIT("ro "); 3213 else 3214 DMEMIT("rw "); 3215 3216 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3217 3218 if (r || needs_check) 3219 DMEMIT("needs_check "); 3220 else 3221 DMEMIT("- "); 3222 3223 break; 3224 3225 case STATUSTYPE_TABLE: 3226 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3227 DMEMIT("%s ", buf); 3228 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3229 DMEMIT("%s ", buf); 3230 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3231 DMEMIT("%s", buf); 3232 3233 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3234 DMEMIT(" %s", cache->ctr_args[i]); 3235 if (cache->nr_ctr_args) 3236 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3237 break; 3238 3239 case STATUSTYPE_IMA: 3240 DMEMIT_TARGET_NAME_VERSION(ti->type); 3241 if (get_cache_mode(cache) == CM_FAIL) 3242 DMEMIT(",metadata_mode=fail"); 3243 else if (get_cache_mode(cache) == CM_READ_ONLY) 3244 DMEMIT(",metadata_mode=ro"); 3245 else 3246 DMEMIT(",metadata_mode=rw"); 3247 3248 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3249 DMEMIT(",cache_metadata_device=%s", buf); 3250 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3251 DMEMIT(",cache_device=%s", buf); 3252 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3253 DMEMIT(",cache_origin_device=%s", buf); 3254 DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n'); 3255 DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n'); 3256 DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n'); 3257 DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n'); 3258 DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y'); 3259 DMEMIT(";"); 3260 break; 3261 } 3262 3263 return; 3264 3265 err: 3266 DMEMIT("Error"); 3267 } 3268 3269 /* 3270 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3271 * the one-past-the-end value. 3272 */ 3273 struct cblock_range { 3274 dm_cblock_t begin; 3275 dm_cblock_t end; 3276 }; 3277 3278 /* 3279 * A cache block range can take two forms: 3280 * 3281 * i) A single cblock, eg. '3456' 3282 * ii) A begin and end cblock with a dash between, eg. 123-234 3283 */ 3284 static int parse_cblock_range(struct cache *cache, const char *str, 3285 struct cblock_range *result) 3286 { 3287 char dummy; 3288 uint64_t b, e; 3289 int r; 3290 3291 /* 3292 * Try and parse form (ii) first. 
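 * A string like "123-234" satisfies the first sscanf() below, while a
 * bare "3456" falls through to the form (i) handling further down.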
3293 */ 3294 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3295 3296 if (r == 2) { 3297 result->begin = to_cblock(b); 3298 result->end = to_cblock(e); 3299 return 0; 3300 } 3301 3302 /* 3303 * That didn't work, try form (i). 3304 */ 3305 r = sscanf(str, "%llu%c", &b, &dummy); 3306 3307 if (r == 1) { 3308 result->begin = to_cblock(b); 3309 result->end = to_cblock(from_cblock(result->begin) + 1u); 3310 return 0; 3311 } 3312 3313 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3314 return -EINVAL; 3315 } 3316 3317 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3318 { 3319 uint64_t b = from_cblock(range->begin); 3320 uint64_t e = from_cblock(range->end); 3321 uint64_t n = from_cblock(cache->cache_size); 3322 3323 if (b >= n) { 3324 DMERR("%s: begin cblock out of range: %llu >= %llu", 3325 cache_device_name(cache), b, n); 3326 return -EINVAL; 3327 } 3328 3329 if (e > n) { 3330 DMERR("%s: end cblock out of range: %llu > %llu", 3331 cache_device_name(cache), e, n); 3332 return -EINVAL; 3333 } 3334 3335 if (b >= e) { 3336 DMERR("%s: invalid cblock range: %llu >= %llu", 3337 cache_device_name(cache), b, e); 3338 return -EINVAL; 3339 } 3340 3341 return 0; 3342 } 3343 3344 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3345 { 3346 return to_cblock(from_cblock(b) + 1); 3347 } 3348 3349 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3350 { 3351 int r = 0; 3352 3353 /* 3354 * We don't need to do any locking here because we know we're in 3355 * passthrough mode. There is potential for a race between an 3356 * invalidation triggered by an io and an invalidation message. This 3357 * is harmless, we needn't worry if the policy call fails. 3358 */ 3359 while (range->begin != range->end) { 3360 r = invalidate_cblock(cache, range->begin); 3361 if (r) 3362 return r; 3363 3364 range->begin = cblock_succ(range->begin); 3365 } 3366 3367 cache->commit_requested = true; 3368 return r; 3369 } 3370 3371 static int process_invalidate_cblocks_message(struct cache *cache, unsigned int count, 3372 const char **cblock_ranges) 3373 { 3374 int r = 0; 3375 unsigned int i; 3376 struct cblock_range range; 3377 3378 if (!passthrough_mode(cache)) { 3379 DMERR("%s: cache has to be in passthrough mode for invalidation", 3380 cache_device_name(cache)); 3381 return -EPERM; 3382 } 3383 3384 for (i = 0; i < count; i++) { 3385 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3386 if (r) 3387 break; 3388 3389 r = validate_cblock_range(cache, &range); 3390 if (r) 3391 break; 3392 3393 /* 3394 * Invalidate the cache blocks in this range. 3395 */ 3396 r = request_invalidation(cache, &range); 3397 if (r) 3398 break; 3399 } 3400 3401 return r; 3402 } 3403 3404 /* 3405 * Supports 3406 * "<key> <value>" 3407 * and 3408 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*" 3409 * 3410 * The key migration_threshold is supported by the cache target core.
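 *
 * Illustrative usage from userspace (the device name "cached" is made up):
 *
 *   dmsetup message cached 0 migration_threshold 4096
 *   dmsetup message cached 0 invalidate_cblocks 3456 7000-8000
 *
 * invalidate_cblocks is only accepted while the target is in passthrough
 * mode (see process_invalidate_cblocks_message() above).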
3411 */ 3412 static int cache_message(struct dm_target *ti, unsigned int argc, char **argv, 3413 char *result, unsigned int maxlen) 3414 { 3415 struct cache *cache = ti->private; 3416 3417 if (!argc) 3418 return -EINVAL; 3419 3420 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3421 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3422 cache_device_name(cache)); 3423 return -EOPNOTSUPP; 3424 } 3425 3426 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3427 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3428 3429 if (argc != 2) 3430 return -EINVAL; 3431 3432 return set_config_value(cache, argv[0], argv[1]); 3433 } 3434 3435 static int cache_iterate_devices(struct dm_target *ti, 3436 iterate_devices_callout_fn fn, void *data) 3437 { 3438 int r = 0; 3439 struct cache *cache = ti->private; 3440 3441 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3442 if (!r) 3443 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3444 3445 return r; 3446 } 3447 3448 /* 3449 * If discard_passdown was enabled verify that the origin device 3450 * supports discards. Disable discard_passdown if not. 3451 */ 3452 static void disable_passdown_if_not_supported(struct cache *cache) 3453 { 3454 struct block_device *origin_bdev = cache->origin_dev->bdev; 3455 struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3456 const char *reason = NULL; 3457 3458 if (!cache->features.discard_passdown) 3459 return; 3460 3461 if (!bdev_max_discard_sectors(origin_bdev)) 3462 reason = "discard unsupported"; 3463 3464 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3465 reason = "max discard sectors smaller than a block"; 3466 3467 if (reason) { 3468 DMWARN("Origin device (%pg) %s: Disabling discard passdown.", 3469 origin_bdev, reason); 3470 cache->features.discard_passdown = false; 3471 } 3472 } 3473 3474 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3475 { 3476 struct block_device *origin_bdev = cache->origin_dev->bdev; 3477 struct queue_limits *origin_limits = bdev_limits(origin_bdev); 3478 3479 if (!cache->features.discard_passdown) { 3480 /* No passdown is done so setting own virtual limits */ 3481 limits->max_hw_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3482 cache->origin_sectors); 3483 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3484 return; 3485 } 3486 3487 /* 3488 * cache_iterate_devices() is stacking both origin and fast device limits 3489 * but discards aren't passed to fast device, so inherit origin's limits. 3490 */ 3491 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3492 limits->discard_granularity = origin_limits->discard_granularity; 3493 limits->discard_alignment = origin_limits->discard_alignment; 3494 } 3495 3496 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3497 { 3498 struct cache *cache = ti->private; 3499 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3500 3501 /* 3502 * If the system-determined stacked limits are compatible with the 3503 * cache's blocksize (io_opt is a factor) do not override them. 
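 *
 * e.g. (made-up figures) with 512-sector cache blocks, a stacked io_opt of
 * 1024 sectors is left untouched, whereas 768 sectors would be replaced by
 * the cache block size.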
3504 */ 3505 if (io_opt_sectors < cache->sectors_per_block || 3506 do_div(io_opt_sectors, cache->sectors_per_block)) { 3507 limits->io_min = cache->sectors_per_block << SECTOR_SHIFT; 3508 limits->io_opt = cache->sectors_per_block << SECTOR_SHIFT; 3509 } 3510 3511 disable_passdown_if_not_supported(cache); 3512 set_discard_limits(cache, limits); 3513 } 3514 3515 /*----------------------------------------------------------------*/ 3516 3517 static struct target_type cache_target = { 3518 .name = "cache", 3519 .version = {2, 3, 0}, 3520 .module = THIS_MODULE, 3521 .ctr = cache_ctr, 3522 .dtr = cache_dtr, 3523 .map = cache_map, 3524 .end_io = cache_end_io, 3525 .postsuspend = cache_postsuspend, 3526 .preresume = cache_preresume, 3527 .resume = cache_resume, 3528 .status = cache_status, 3529 .message = cache_message, 3530 .iterate_devices = cache_iterate_devices, 3531 .io_hints = cache_io_hints, 3532 }; 3533 3534 static int __init dm_cache_init(void) 3535 { 3536 int r; 3537 3538 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3539 if (!migration_cache) { 3540 r = -ENOMEM; 3541 goto err; 3542 } 3543 3544 btracker_work_cache = kmem_cache_create("dm_cache_bt_work", 3545 sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL); 3546 if (!btracker_work_cache) { 3547 r = -ENOMEM; 3548 goto err; 3549 } 3550 3551 r = dm_register_target(&cache_target); 3552 if (r) { 3553 goto err; 3554 } 3555 3556 return 0; 3557 3558 err: 3559 kmem_cache_destroy(migration_cache); 3560 kmem_cache_destroy(btracker_work_cache); 3561 return r; 3562 } 3563 3564 static void __exit dm_cache_exit(void) 3565 { 3566 dm_unregister_target(&cache_target); 3567 kmem_cache_destroy(migration_cache); 3568 kmem_cache_destroy(btracker_work_cache); 3569 } 3570 3571 module_init(dm_cache_init); 3572 module_exit(dm_cache_exit); 3573 3574 MODULE_DESCRIPTION(DM_NAME " cache target"); 3575 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3576 MODULE_LICENSE("GPL"); 3577
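/*
 * Note (illustrative): once the module is loaded, the registered target and
 * its version can be listed from userspace with "dmsetup targets"; the entry
 * for this target should read something like "cache v2.3.0", matching the
 * version in cache_target above (exact output format depends on the dmsetup
 * version).
 */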